Diffstat (limited to 'llvm/lib/Target/AMDGPU')
152 files changed, 18265 insertions, 8988 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 677c49331cd5..ca088e63e03c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -15,10 +15,10 @@ namespace llvm { -class AMDGPUTargetMachine; class FunctionPass; class GCNTargetMachine; class ImmutablePass; +class MachineFunctionPass; class ModulePass; class Pass; class Target; @@ -51,12 +51,12 @@ FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); -FunctionPass *createSIAddIMGInitPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); +FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); @@ -72,7 +72,10 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass(); FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); +ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); +ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); +FunctionPass *createGCNPreRAOptimizationsPass(); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} @@ -94,6 +97,8 @@ extern char &AMDGPUMachineCFGStructurizerID; void initializeAMDGPUAlwaysInlinePass(PassRegistry&); Pass *createAMDGPUAnnotateKernelFeaturesPass(); +Pass *createAMDGPUAttributorPass(); +void initializeAMDGPUAttributorPass(PassRegistry &); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; @@ -146,6 +151,21 @@ private: TargetMachine &TM; }; +void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &); +extern char &AMDGPUReplaceLDSUseWithPointerID; + +struct AMDGPUReplaceLDSUseWithPointerPass + : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +void initializeAMDGPULowerModuleLDSPass(PassRegistry &); +extern char &AMDGPULowerModuleLDSID; + +struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; @@ -197,14 +217,11 @@ extern char &SIWholeQuadModeID; void initializeSILowerControlFlowPass(PassRegistry &); extern char &SILowerControlFlowID; -void initializeSIRemoveShortExecBranchesPass(PassRegistry &); -extern char &SIRemoveShortExecBranchesID; - void initializeSIPreEmitPeepholePass(PassRegistry &); extern char &SIPreEmitPeepholeID; -void initializeSIInsertSkipsPass(PassRegistry &); -extern char &SIInsertSkipsPassID; +void initializeSILateBranchLoweringPass(PassRegistry &); +extern char &SILateBranchLoweringPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; @@ -218,9 +235,6 @@ extern char &AMDGPUSimplifyLibCallsID; void initializeAMDGPUUseNativeCallsPass(PassRegistry &); extern char &AMDGPUUseNativeCallsID; -void 
initializeSIAddIMGInitPass(PassRegistry &); -extern char &SIAddIMGInitID; - void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); extern char &AMDGPUPerfHintAnalysisID; @@ -271,6 +285,9 @@ ModulePass *createAMDGPUPrintfRuntimeBinding(); void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&); extern char &AMDGPUPrintfRuntimeBindingID; +void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); +extern char &AMDGPUResourceUsageAnalysisID; + struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); @@ -287,6 +304,9 @@ struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> { void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&); extern char &SIOptimizeExecMaskingPreRAID; +void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &); +extern char &SIOptimizeVGPRLiveRangeID; + void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); extern char &AMDGPUAnnotateUniformValuesPassID; @@ -331,12 +351,12 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; -void initializeGCNRegBankReassignPass(PassRegistry &); -extern char &GCNRegBankReassignID; - void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeGCNPreRAOptimizationsPass(PassRegistry &); +extern char &GCNPreRAOptimizationsID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c352c0097c5c..7991f3d2a6b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -51,6 +51,12 @@ def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", "Most fp64 instructions are half rate instead of quarter" >; +def FullRate64Ops : SubtargetFeature<"full-rate-64-ops", + "FullRate64Ops", + "true", + "Most fp64 instructions are full rate" +>; + def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", @@ -148,6 +154,12 @@ def FeatureXNACK : SubtargetFeature<"xnack", "Enable XNACK support" >; +def FeatureTgSplit : SubtargetFeature<"tgsplit", + "EnableTgSplit", + "true", + "Enable threadgroup split execution" +>; + def FeatureCuMode : SubtargetFeature<"cumode", "EnableCuMode", "true", @@ -214,10 +226,28 @@ def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" >; +def FeatureNSAClauseBug : SubtargetFeature<"nsa-clause-bug", + "HasNSAClauseBug", + "true", + "MIMG-NSA in a hard clause has unpredictable results on GFX10.1" +>; + def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", "HasFlatSegmentOffsetBug", "true", - "GFX10 bug, inst_offset ignored in flat segment" + "GFX10 bug where inst_offset is ignored when flat instructions access global memory" +>; + +def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug", + "NegativeScratchOffsetBug", + "true", + "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9" +>; + +def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug", + "NegativeUnalignedScratchOffsetBug", + "true", + "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10" >; def FeatureOffset3fBug : 
SubtargetFeature<"offset-3f-bug", @@ -272,6 +302,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "Additional instructions for GFX9+" >; +def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", + "GFX90AInsts", + "true", + "Additional instructions for GFX90A+" +>; + def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", "GFX10Insts", "true", @@ -387,6 +423,18 @@ def FeatureDPP8 : SubtargetFeature<"dpp8", "Support DPP8 (Data Parallel Primitives) extension" >; +def Feature64BitDPP : SubtargetFeature<"dpp-64bit", + "Has64BitDPP", + "true", + "Support DPP (Data Parallel Primitives) extension" +>; + +def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops", + "HasPackedFP32Ops", + "true", + "Support packed fp32 instructions" +>; + def FeatureR128A16 : SubtargetFeature<"r128-a16", "HasR128A16", "true", @@ -411,6 +459,18 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "Support NSA encoding for image instructions" >; +def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts", + "HasExtendedImageInsts", + "true", + "Support mips != 0, lod != 0, gather4, and get_lod" +>; + +def FeatureGFX10_AEncoding : SubtargetFeature<"gfx10_a-encoding", + "GFX10_AEncoding", + "true", + "Has BVH ray tracing instructions" +>; + def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding", "GFX10_BEncoding", "true", @@ -444,7 +504,7 @@ def FeatureDot1Insts : SubtargetFeature<"dot1-insts", def FeatureDot2Insts : SubtargetFeature<"dot2-insts", "HasDot2Insts", "true", - "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" + "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions" >; def FeatureDot3Insts : SubtargetFeature<"dot3-insts", @@ -471,6 +531,12 @@ def FeatureDot6Insts : SubtargetFeature<"dot6-insts", "Has v_dot4c_i32_i8 instruction" >; +def FeatureDot7Insts : SubtargetFeature<"dot7-insts", + "HasDot7Insts", + "true", + "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" +>; + def FeatureMAIInsts : SubtargetFeature<"mai-insts", "HasMAIInsts", "true", @@ -527,6 +593,12 @@ def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst", "Has s_memtime instruction" >; +def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register", + "HasShaderCyclesRegister", + "true", + "Has SHADER_CYCLES hardware register" +>; + def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", "HasMadMacF32Insts", "true", @@ -557,6 +629,16 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", "Does not need SW waitstates" >; +class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature < + "nsa-max-size-"#Value, + "NSAMaxSize", + !cast<string>(Value), + "The maximum non-sequential address size in VGPRs." 
+>; + +def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>; +def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -659,6 +741,18 @@ def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", " supports it" >; +def FeaturePackedTID : SubtargetFeature<"packed-tid", + "HasPackedTID", + "true", + "Workitem IDs are packed into v0 at kernel launch" +>; + +def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch", + "HasArchitectedFlatScratch", + "true", + "Flat Scratch register is a readonly SPI initialized architected register" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -675,7 +769,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange] + FeatureTrigReducedRange, FeatureExtendedImageInsts + ] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -684,7 +779,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureUnalignedBufferAccess] + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess + ] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -697,7 +793,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess] + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, + FeatureUnalignedBufferAccess + ] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", @@ -712,9 +810,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, - FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureSupportsXNACK] + FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, + FeatureNegativeScratchOffsetBug + ] >; def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", @@ -729,9 +828,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, - FeatureVOP3Literal, FeatureDPP8, + FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, - FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, + FeatureGFX10A16, FeatureSMemTimeInst, 
FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess ] >; @@ -816,17 +915,26 @@ def FeatureISAVersion9_0_0 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureFmaMixInsts, FeatureImageGather4D16Bug]>; @@ -835,9 +943,13 @@ def FeatureISAVersion9_0_6 : FeatureSet< HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, + FeatureDot7Insts, FeatureSupportsSRAMECC, FeatureImageGather4D16Bug]>; @@ -846,6 +958,9 @@ def FeatureISAVersion9_0_8 : FeatureSet< HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, @@ -853,6 +968,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeatureDot4Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureMAIInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, @@ -864,13 +980,41 @@ def FeatureISAVersion9_0_9 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; +def FeatureISAVersion9_0_A : FeatureSet< + [FeatureGFX9, + FeatureGFX90AInsts, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureDot7Insts, + Feature64BitDPP, + FeaturePackedFP32Ops, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddInsts, + FeatureMadMacF32Insts, + FeatureSupportsSRAMECC, + FeaturePackedTID, + FullRate64Ops]>; + def FeatureISAVersion9_0_C : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureXNACK, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; // TODO: Organize more features into groups. 
@@ -884,8 +1028,10 @@ def FeatureGroup { FeatureVcmpxExecWARHazard, FeatureLdsBranchVmemWARHazard, FeatureNSAtoVMEMBug, + FeatureNSAClauseBug, FeatureOffset3fBug, - FeatureFlatSegmentOffsetBug + FeatureFlatSegmentOffsetBug, + FeatureNegativeUnalignedScratchOffsetBug ]; } @@ -895,12 +1041,12 @@ def FeatureISAVersion10_1_0 : FeatureSet< FeatureLDSBankCount32, FeatureDLInsts, FeatureNSAEncoding, + FeatureNSAMaxSize5, FeatureWavefrontSize32, FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, FeatureGetWaveIdInst, - FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, @@ -915,13 +1061,14 @@ def FeatureISAVersion10_1_1 : FeatureSet< FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureNSAEncoding, + FeatureNSAMaxSize5, FeatureWavefrontSize32, FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, FeatureGetWaveIdInst, - FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, @@ -936,13 +1083,32 @@ def FeatureISAVersion10_1_2 : FeatureSet< FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, + FeatureNSAEncoding, + FeatureNSAMaxSize5, + FeatureWavefrontSize32, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, + FeatureLdsMisalignedBug, + FeatureSupportsXNACK])>; + +def FeatureISAVersion10_1_3 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureGFX10_AEncoding, + FeatureLDSBankCount32, + FeatureDLInsts, FeatureNSAEncoding, + FeatureNSAMaxSize5, FeatureWavefrontSize32, FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, FeatureGetWaveIdInst, - FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, @@ -950,6 +1116,7 @@ def FeatureISAVersion10_1_2 : FeatureSet< def FeatureISAVersion10_3_0 : FeatureSet< [FeatureGFX10, + FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, FeatureLDSBankCount32, @@ -958,8 +1125,11 @@ def FeatureISAVersion10_3_0 : FeatureSet< FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, + FeatureDot7Insts, FeatureNSAEncoding, - FeatureWavefrontSize32]>; + FeatureNSAMaxSize13, + FeatureWavefrontSize32, + FeatureShaderCyclesRegister]>; //===----------------------------------------------------------------------===// @@ -1077,6 +1247,14 @@ def isGFX6GFX7GFX8GFX9 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of (not FeatureGFX10Insts))>; +def isGFX6GFX7GFX8GFX9NotGFX90A : + Predicate<"!Subtarget->hasGFX90AInsts() &&" + "(Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, + AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -1097,6 +1275,32 @@ def isGFX9Only : Predicate < "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>; +def isGCN3ExcludingGFX90A : + Predicate<"Subtarget->isGCN3Encoding() && !Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; + +def isGFX90APlus : + 
Predicate<"Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + +def isNotGFX90APlus : + Predicate<"!Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of (not FeatureGFX90AInsts))>; + +def isGFX8GFX9NotGFX90A : + Predicate<"!Subtarget->hasGFX90AInsts() &&" + "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, + AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; + +def isGFX90AOnly : + Predicate<"Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + +def isGFX908orGFX90A : + Predicate<"Subtarget->hasMAIInsts()">, + AssemblerPredicate<(all_of FeatureMAIInsts)>; + def isGFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, @@ -1126,6 +1330,9 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; +def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, + AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; + def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>; @@ -1177,6 +1384,19 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">, def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>; +def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">, + AssemblerPredicate<(all_of Feature64BitDPP)>; + +def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">, + AssemblerPredicate<(all_of FeaturePackedFP32Ops)>; + +def HasFmaakFmamkF32Insts : + Predicate<"Subtarget->hasFmaakFmamkF32Insts()">, + AssemblerPredicate<(any_of FeatureGFX10Insts)>; + +def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">, + AssemblerPredicate<(all_of FeatureExtendedImageInsts)>; + def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, AssemblerPredicate<(all_of FeatureR128A16)>; @@ -1238,6 +1458,9 @@ def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">, def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, AssemblerPredicate<(all_of FeatureDot6Insts)>; +def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">, + AssemblerPredicate<(all_of FeatureDot7Insts)>; + def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; @@ -1250,7 +1473,8 @@ def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">, def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, AssemblerPredicate<(all_of FeatureSMemTimeInst)>; -def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">; +def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">, + AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>; def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; @@ -1267,9 +1491,6 @@ def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; -def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">, - AssemblerPredicate<(all_of FeatureOffset3fBug)>; - def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 0ed89e9ca8d6..88b88a04a7d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -41,24 +41,28 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } -// These arrays are indexed by address space value enum elements 0 ... to 7 -static const AliasResult ASAliasRules[8][8] = { - /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */ - /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, - /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}, - /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias}, - /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, - /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}, - /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias}, - /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias}, - /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias} -}; - static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range"); if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) - return MayAlias; + return AliasResult::MayAlias; + +#define ASMay AliasResult::MayAlias +#define ASNo AliasResult::NoAlias + // This array is indexed by address space value enum elements 0 ... to 7 + static const AliasResult ASAliasRules[8][8] = { + /* Flat Global Region Group Constant Private Const32 Buf Fat Ptr */ + /* Flat */ {ASMay, ASMay, ASNo, ASMay, ASMay, ASMay, ASMay, ASMay}, + /* Global */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay}, + /* Region */ {ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo, ASNo}, + /* Group */ {ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo}, + /* Constant */ {ASMay, ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASMay}, + /* Private */ {ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASNo, ASNo}, + /* Constant 32-bit */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASMay}, + /* Buffer Fat Ptr */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay} + }; +#undef ASMay +#undef ASNo return ASAliasRules[AS1][AS2]; } @@ -70,7 +74,7 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); AliasResult Result = getAliasResult(asA, asB); - if (Result == NoAlias) + if (Result == AliasResult::NoAlias) return Result; // In general, FLAT (generic) pointers could be aliased to LOCAL or PRIVATE @@ -87,21 +91,21 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, if (asA == AMDGPUAS::FLAT_ADDRESS && (asB == AMDGPUAS::LOCAL_ADDRESS || asB == AMDGPUAS::PRIVATE_ADDRESS)) { const auto *ObjA = - getUnderlyingObject(A.Ptr->stripPointerCastsAndInvariantGroups()); + getUnderlyingObject(A.Ptr->stripPointerCastsForAliasAnalysis()); if (const LoadInst *LI = dyn_cast<LoadInst>(ObjA)) { // If a generic pointer is loaded from the constant address space, it // could only be a GLOBAL or CONSTANT one as that address space is soley // prepared on the host side, where only GLOBAL or CONSTANT variables are // visible. Note that this even holds for regular functions. 
if (LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) - return NoAlias; + return AliasResult::NoAlias; } else if (const Argument *Arg = dyn_cast<Argument>(ObjA)) { const Function *F = Arg->getParent(); switch (F->getCallingConv()) { case CallingConv::AMDGPU_KERNEL: // In the kernel function, kernel arguments won't alias to (local) // variables in shared or private address space. - return NoAlias; + return AliasResult::NoAlias; default: // TODO: In the regular function, if that local variable in the // location B is not captured, that argument pointer won't alias to it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 51af25050950..2af9fc955875 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -120,10 +120,10 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) { for (GlobalVariable &GV : M.globals()) { // TODO: Region address unsigned AS = GV.getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) - continue; - - recursivelyVisitUsers(GV, FuncsToAlwaysInline); + if ((AS == AMDGPUAS::REGION_ADDRESS) || + (AS == AMDGPUAS::LOCAL_ADDRESS && + !AMDGPUTargetMachine::EnableLowerModuleLDS)) + recursivelyVisitUsers(GV, FuncsToAlwaysInline); } if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index a4e72f787230..af6dfc07eb50 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -25,6 +25,13 @@ using namespace llvm; namespace { +static constexpr StringLiteral ImplicitAttrNames[] = { + // X ids unnecessarily propagated to kernels. + "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", + "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", + "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", + "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", + "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"}; class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: @@ -194,18 +201,10 @@ static bool handleAttr(Function &Parent, const Function &Callee, static void copyFeaturesToFunction(Function &Parent, const Function &Callee, bool &NeedQueuePtr) { - // X ids unnecessarily propagated to kernels. - static constexpr StringLiteral AttrNames[] = { - "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", - "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", - "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", - "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", - "amdgpu-implicitarg-ptr"}; - if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; - for (StringRef AttrName : AttrNames) + for (StringRef AttrName : ImplicitAttrNames) handleAttr(Parent, Callee, AttrName); } @@ -268,7 +267,20 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { bool Changed = false; bool NeedQueuePtr = false; bool HaveCall = false; + bool HasIndirectCall = false; bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + CallingConv::ID CC = F.getCallingConv(); + bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); + + // If this function hasAddressTaken() = true + // then add all attributes corresponding to the implicit args. 
+ if (CallingConvSupportsAllImplicits && + F.hasAddressTaken(nullptr, true, true, true)) { + for (StringRef AttrName : ImplicitAttrNames) { + F.addFnAttr(AttrName); + } + Changed = true; + } for (BasicBlock &BB : F) { for (Instruction &I : BB) { @@ -281,10 +293,12 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { const Function *Callee = dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts()); - // TODO: Do something with indirect calls. + // Note the occurence of indirect call. if (!Callee) { - if (!CB->isInlineAsm()) + if (!CB->isInlineAsm()) { + HasIndirectCall = true; HaveCall = true; + } continue; } @@ -351,6 +365,28 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { Changed = true; } + // This pass cannot copy attributes from callees to callers + // if there is an indirect call and in thus such cases, + // hasAddressTaken() would be false for kernels and functions + // making an indirect call (if they are themselves not indirectly called). + // We must tag all such kernels/functions with all implicits attributes + // for correctness. + // e.g. + // 1. Kernel K1 makes an indirect call to function F1. + // Without detecting an indirect call in K1, this pass will not + // add all implicit args to K1 (which is incorrect). + // 2. Kernel K1 makes direct call to F1 which makes indirect call to function + // F2. + // Without detecting an indirect call in F1 (whose hasAddressTaken() is + // false), the pass will not add all implicit args to F1 (which is + // essential for correctness). + if (CallingConvSupportsAllImplicits && HasIndirectCall) { + for (StringRef AttrName : ImplicitAttrNames) { + F.addFnAttr(AttrName); + } + Changed = true; + } + return Changed; } @@ -367,9 +403,11 @@ bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { } Function *F = I->getFunction(); - // Add feature attributes - if (!F || F->isDeclaration()) + // Ignore functions with graphics calling conventions, these are currently + // not allowed to have kernel arguments. 
+ if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv())) continue; + // Add feature attributes Changed |= addFeatureAttributes(*F); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index c2a4d67ea98e..7d6845b287bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -14,10 +14,8 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/InstVisitor.h" #include "llvm/InitializePasses.h" @@ -30,8 +28,7 @@ namespace { class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor<AMDGPUAnnotateUniformValues> { LegacyDivergenceAnalysis *DA; - MemoryDependenceResults *MDR; - LoopInfo *LI; + MemorySSA *MSSA; DenseMap<Value*, GetElementPtrInst*> noClobberClones; bool isEntryFunc; @@ -46,8 +43,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LegacyDivergenceAnalysis>(); - AU.addRequired<MemoryDependenceWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<MemorySSAWrapperPass>(); AU.setPreservesAll(); } @@ -61,8 +57,7 @@ public: INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) @@ -75,49 +70,14 @@ static void setNoClobberMetadata(Instruction *I) { I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); } -static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) { - for (auto I : predecessors(Root)) - if (Set.insert(I)) - DFS(I, Set); -} - bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { - // 1. get Loop for the Load->getparent(); - // 2. if it exists, collect all the BBs from the most outer - // loop and check for the writes. If NOT - start DFS over all preds. - // 3. Start DFS over all preds from the most outer loop header. - SetVector<BasicBlock *> Checklist; - BasicBlock *Start = Load->getParent(); - Checklist.insert(Start); - const Value *Ptr = Load->getPointerOperand(); - const Loop *L = LI->getLoopFor(Start); - if (L) { - const Loop *P = L; - do { - L = P; - P = P->getParentLoop(); - } while (P); - Checklist.insert(L->block_begin(), L->block_end()); - Start = L->getHeader(); - } - - DFS(Start, Checklist); - for (auto &BB : Checklist) { - BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ? - BasicBlock::iterator(Load) : BB->end(); - auto Q = MDR->getPointerDependencyFrom( - MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load); - if (Q.isClobber() || Q.isUnknown() || - // Store defines the load and thus clobbers it. 
- (Q.isDef() && Q.getInst()->mayWriteToMemory())) - return true; - } - return false; + const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load); + return !MSSA->isLiveOnEntryDef(MA); } void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { if (DA->isUniform(&I)) - setUniformMetadata(I.getParent()->getTerminator()); + setUniformMetadata(&I); } void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { @@ -154,9 +114,9 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Idx = Constant::getIntegerValue( Type::getInt32Ty(Ptr->getContext()), APInt(64, 0)); // Insert GEP at the entry to make it dominate all uses - PtrI = GetElementPtrInst::Create( - Ptr->getType()->getPointerElementType(), Ptr, - ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI()); + PtrI = GetElementPtrInst::Create(I.getType(), Ptr, + ArrayRef<Value *>(Idx), Twine(""), + F->getEntryBlock().getFirstNonPHI()); } I.replaceUsesOfWith(Ptr, PtrI); } @@ -177,9 +137,8 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { if (skipFunction(F)) return false; - DA = &getAnalysis<LegacyDivergenceAnalysis>(); - MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DA = &getAnalysis<LegacyDivergenceAnalysis>(); + MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); visit(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index fb273a1650ae..aab76d27ef11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -92,7 +92,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue( case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: { return std::make_tuple(PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr, - &AMDGPU::SGPR_128RegClass, LLT::vector(4, 32)); + &AMDGPU::SGPR_128RegClass, LLT::fixed_vector(4, 32)); } case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR: return std::make_tuple(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 139ac3bab14c..e9ed45d8cd14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -143,7 +143,8 @@ struct AMDGPUFunctionArgInfo { // Input registers for non-HSA ABI ArgDescriptor ImplicitBufferPtr; - // VGPRs inputs. These are always v0, v1 and v2 for entry functions. + // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed + // into v0, 10 bits per dimension if packed-tid is set. ArgDescriptor WorkItemIDX; ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c655e5ec87b7..cbc4ab212566 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -18,6 +18,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" #include "AMDGPUHSAMetadataStreamer.h" +#include "AMDGPUResourceUsageAnalysis.h" #include "AMDKernelCodeT.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUInstPrinter.h" @@ -39,22 +40,6 @@ using namespace llvm; using namespace llvm::AMDGPU; -// We need to tell the runtime some amount ahead of time if we don't know the -// true stack size. 
Assume a smaller number if this is only due to dynamic / -// non-entry block allocas. -static cl::opt<uint32_t> AssumedStackSizeForExternalCall( - "amdgpu-assume-external-call-stack-size", - cl::desc("Assumed stack use of any external call (in bytes)"), - cl::Hidden, - cl::init(16384)); - -static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects( - "amdgpu-assume-dynamic-stack-object-size", - cl::desc("Assumed extra stack use if there are any " - "variable sized objects (in bytes)"), - cl::Hidden, - cl::init(4096)); - // This should get the default rounding mode from the kernel. We just set the // default here, but this could change if the OpenCL rounding mode pragmas are // used. @@ -97,12 +82,14 @@ extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)) { + : AsmPrinter(TM, std::move(Streamer)) { if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { if (isHsaAbiVersion2(getGlobalSTI())) { HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2()); - } else { + } else if (isHsaAbiVersion3(getGlobalSTI())) { HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3()); + } else { + HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4()); } } } @@ -122,34 +109,34 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { - if (isHsaAbiVersion3(getGlobalSTI())) { - std::string ExpectedTarget; - raw_string_ostream ExpectedTargetOS(ExpectedTarget); - IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS); - - getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); - } + // TODO: Which one is called first, emitStartOfAsmFile or + // emitFunctionBodyStart? + if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) + initializeTargetID(M); if (TM.getTargetTriple().getOS() != Triple::AMDHSA && TM.getTargetTriple().getOS() != Triple::AMDPAL) return; + if (isHsaAbiVersion3Or4(getGlobalSTI())) + getTargetStreamer()->EmitDirectiveAMDGCNTarget(); + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) - HSAMetadataStream->begin(M); + HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) getTargetStreamer()->getPALMetadata()->readFromIR(M); - if (isHsaAbiVersion3(getGlobalSTI())) + if (isHsaAbiVersion3Or4(getGlobalSTI())) return; - // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. + // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2. if (TM.getTargetTriple().getOS() == Triple::AMDHSA) getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); - // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. + // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2. IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); - getTargetStreamer()->EmitDirectiveHSACodeObjectISA( + getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2( Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } @@ -159,15 +146,11 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { return; if (TM.getTargetTriple().getOS() != Triple::AMDHSA || - isHsaAbiVersion2(getGlobalSTI())) { - // Emit ISA Version (NT_AMD_AMDGPU_ISA). 
- std::string ISAVersionString; - raw_string_ostream ISAVersionStream(ISAVersionString); - IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream); - getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); - } + isHsaAbiVersion2(getGlobalSTI())) + getTargetStreamer()->EmitISAVersion(); // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). + // Emit HSA Metadata (NT_AMD_HSA_METADATA). if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { HSAMetadataStream->end(); bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); @@ -192,11 +175,37 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( void AMDGPUAsmPrinter::emitFunctionBodyStart() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + const Function &F = MF->getFunction(); + + // TODO: Which one is called first, emitStartOfAsmFile or + // emitFunctionBodyStart? + if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) + initializeTargetID(*F.getParent()); + + const auto &FunctionTargetID = STM.getTargetID(); + // Make sure function's xnack settings are compatible with module's + // xnack settings. + if (FunctionTargetID.isXnackSupported() && + FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && + FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { + OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) + + "' function does not match module xnack setting"); + return; + } + // Make sure function's sramecc settings are compatible with module's + // sramecc settings. + if (FunctionTargetID.isSramEccSupported() && + FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && + FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { + OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) + + "' function does not match module sramecc setting"); + return; + } + if (!MFI.isEntryFunction()) return; - const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); - const Function &F = MF->getFunction(); if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) && (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL)) { @@ -232,26 +241,25 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { if (ReadOnlySection.getAlignment() < 64) ReadOnlySection.setAlignment(Align(64)); - const MCSubtargetInfo &STI = MF->getSubtarget(); + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(&STI, + IsaInfo::getNumExtraSGPRs(&STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed), - CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, - hasXNACK(STI)); + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); Streamer.PopSection(); } void AMDGPUAsmPrinter::emitFunctionEntryLabel() { if (TM.getTargetTriple().getOS() == Triple::AMDHSA && - isHsaAbiVersion3(getGlobalSTI())) { + isHsaAbiVersion3Or4(getGlobalSTI())) { AsmPrinter::emitFunctionEntryLabel(); return; } @@ -322,17 +330,15 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } bool 
AMDGPUAsmPrinter::doFinalization(Module &M) { - CallGraphResourceInfo.clear(); - // Pad with s_code_end to help tools and guard against instruction prefetch // causing stale data in caches. Arguably this should be done by the linker, // which is why this isn't done for Mesa. const MCSubtargetInfo &STI = *getGlobalSTI(); - if (AMDGPU::isGFX10Plus(STI) && + if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && (STI.getTargetTriple().getOS() == Triple::AMDHSA || STI.getTargetTriple().getOS() == Triple::AMDPAL)) { OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer()->EmitCodeEnd(); + getTargetStreamer()->EmitCodeEnd(STI); } return AsmPrinter::doFinalization(M); @@ -400,6 +406,9 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( const MachineFunction &MF, const SIProgramInfo &PI) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + const Function &F = MF.getFunction(); + amdhsa::kernel_descriptor_t KernelDescriptor; memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); @@ -409,14 +418,24 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( KernelDescriptor.group_segment_fixed_size = PI.LDSSize; KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; + + Align MaxKernArgAlign; + KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); + KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(); KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); + if (STM.hasGFX90AInsts()) + KernelDescriptor.compute_pgm_rsrc3 = + CurrentProgramInfo.ComputePGMRSrc3GFX90A; + return KernelDescriptor; } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); CurrentProgramInfo = SIProgramInfo(); const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); @@ -438,12 +457,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (MFI->isModuleEntryFunction()) { getSIProgramInfo(CurrentProgramInfo, MF); - } else { - auto I = CallGraphResourceInfo.insert( - std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); - SIFunctionResourceInfo &Info = I.first->second; - assert(I.second && "should only be called once per function"); - Info = analyzeResourceUsage(MF); } if (STM.isAmdPalOS()) { @@ -480,7 +493,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); - SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; + const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = + ResourceUsage->getResourceInfo(&MF.getFunction()); emitCommonFunctionComments( Info.NumVGPR, STM.hasMAIInsts() ? 
Info.NumAGPR : Optional<uint32_t>(), @@ -521,6 +535,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " NumVGPRsForWavesPerEU: " + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + if (STM.hasGFX90AInsts()) + OutStreamer->emitRawComment( + " AccumOffset: " + + Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false); + OutStreamer->emitRawComment( " Occupancy: " + Twine(CurrentProgramInfo.Occupancy), false); @@ -550,6 +569,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), false); + + assert(STM.hasGFX90AInsts() || + CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); + if (STM.hasGFX90AInsts()) { + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + + Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), + false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + + Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), + false); + } } if (DumpCodeInstEmitter) { @@ -572,6 +606,36 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } +// TODO: Fold this into emitFunctionBodyStart. +void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { + // In the beginning all features are either 'Any' or 'NotSupported', + // depending on global target features. This will cover empty modules. + getTargetStreamer()->initializeTargetID( + *getGlobalSTI(), getGlobalSTI()->getFeatureString()); + + // If module is empty, we are done. + if (M.empty()) + return; + + // If module is not empty, need to find first 'Off' or 'On' feature + // setting per feature from functions in module. 
+ for (auto &F : M) { + auto &TSTargetID = getTargetStreamer()->getTargetID(); + if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && + (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) + break; + + const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); + const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); + if (TSTargetID->isXnackSupported()) + if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) + TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); + if (TSTargetID->isSramEccSupported()) + if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) + TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); + } +} + uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = STM.getInstrInfo(); @@ -593,398 +657,17 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const return CodeSize; } -static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, - const SIInstrInfo &TII, - unsigned Reg) { - for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { - if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) - return true; - } - - return false; -} - -int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( - const GCNSubtarget &ST) const { - return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST, - UsesVCC, UsesFlatScratch); -} - -int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( - const GCNSubtarget &ST) const { - return std::max(NumVGPR, NumAGPR); -} - -static const Function *getCalleeFunction(const MachineOperand &Op) { - if (Op.isImm()) { - assert(Op.getImm() == 0); - return nullptr; - } - - return cast<Function>(Op.getGlobal()); -} - -AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( - const MachineFunction &MF) const { - SIFunctionResourceInfo Info; - - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || - MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); - - // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat - // instructions aren't used to access the scratch buffer. Inline assembly may - // need it though. - // - // If we only have implicit uses of flat_scr on flat instructions, it is not - // really needed. - if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && - (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && - !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && - !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { - Info.UsesFlatScratch = false; - } - - Info.PrivateSegmentSize = FrameInfo.getStackSize(); - - // Assume a big number if there are any unknown sized objects. 
- Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); - if (Info.HasDynamicallySizedStack) - Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects; - - if (MFI->isStackRealigned()) - Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); - - Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || - MRI.isPhysRegUsed(AMDGPU::VCC_HI); - - // If there are no calls, MachineRegisterInfo can tell us the used register - // count easily. - // A tail call isn't considered a call for MachineFrameInfo's purposes. - if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; - } - } - - if (ST.hasMAIInsts()) { - MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestAGPRReg = Reg; - break; - } - } - Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestAGPRReg) + 1; - } - - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; - } - } - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; - - return Info; - } - - int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; - uint64_t CalleeFrameSize = 0; - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? 
- for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; - - if (!MO.isReg()) - continue; - - Register Reg = MO.getReg(); - switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SGPR_NULL: - case AMDGPU::MODE: - continue; - - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - llvm_unreachable("src_pops_exiting_wave_id should not be used"); - - case AMDGPU::NoRegister: - assert(MI.isDebugInstr() && "Instruction uses invalid noreg register"); - continue; - - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - - case AMDGPU::XNACK_MASK: - case AMDGPU::XNACK_MASK_LO: - case AMDGPU::XNACK_MASK_HI: - llvm_unreachable("xnack_mask registers should not be used"); - - case AMDGPU::LDS_DIRECT: - llvm_unreachable("lds_direct register should not be used"); - - case AMDGPU::TBA: - case AMDGPU::TBA_LO: - case AMDGPU::TBA_HI: - case AMDGPU::TMA: - case AMDGPU::TMA_LO: - case AMDGPU::TMA_HI: - llvm_unreachable("trap handler registers should not be used"); - - case AMDGPU::SRC_VCCZ: - llvm_unreachable("src_vccz register should not be used"); - - case AMDGPU::SRC_EXECZ: - llvm_unreachable("src_execz register should not be used"); - - case AMDGPU::SRC_SCC: - llvm_unreachable("src_scc register should not be used"); - - default: - break; - } - - if (AMDGPU::SReg_32RegClass.contains(Reg) || - AMDGPU::SReg_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && - "trap handler registers should not be used"); - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_LO16RegClass.contains(Reg) || - AMDGPU::VGPR_HI16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && - "trap handler registers should not be used"); - IsSGPR = true; - Width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && - "trap handler registers should not be used"); - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if 
(AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && - "trap handler registers should not be used"); - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && - "trap handler registers should not be used"); - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned HWReg = TRI.getHWRegIndex(Reg); - int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; - } - } - - if (MI.isCall()) { - // Pseudo used just to encode the underlying global. Is there a better - // way to track this? - - const MachineOperand *CalleeOp - = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - - const Function *Callee = getCalleeFunction(*CalleeOp); - DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I = - CallGraphResourceInfo.end(); - bool IsExternal = !Callee || Callee->isDeclaration(); - if (!IsExternal) - I = CallGraphResourceInfo.find(Callee); - - if (IsExternal || I == CallGraphResourceInfo.end()) { - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - // FIXME: The verifier shouldn't allow this. - if (!IsExternal && - AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - - // If this is a call to an external function, we can't do much. Make - // conservative guesses. - - // 48 SGPRs - vcc, - flat_scr, -xnack - int MaxSGPRGuess = - 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); - MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); - MaxVGPR = std::max(MaxVGPR, 23); - MaxAGPR = std::max(MaxAGPR, 23); - - CalleeFrameSize = std::max(CalleeFrameSize, - static_cast<uint64_t>(AssumedStackSizeForExternalCall)); - - Info.UsesVCC = true; - Info.UsesFlatScratch = ST.hasFlatAddressSpace(); - Info.HasDynamicallySizedStack = true; - } else { - // We force CodeGen to run in SCC order, so the callee's register - // usage etc. should be the cumulative usage of all callees. 
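For orientation: the removed loop above (this patch relocates it into the new AMDGPUResourceUsageAnalysis pass) folds each known callee's requirements into the caller by taking maxima over register counts, while only the largest callee frame is later added once to the caller's own private segment size. A rough standalone illustration of that rule, using invented names rather than the LLVM types:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Toy resource record, loosely mirroring SIFunctionResourceInfo.
struct Resources {
  int NumVGPR = 0;
  int NumSGPR = 0;
  uint64_t FrameSize = 0; // the function's own stack frame
  bool UsesVCC = false;
};

// Fold one known callee into the caller's running maxima.
void foldCallee(Resources &Caller, uint64_t &MaxCalleeFrame,
                const Resources &Callee) {
  Caller.NumVGPR = std::max(Caller.NumVGPR, Callee.NumVGPR);
  Caller.NumSGPR = std::max(Caller.NumSGPR, Callee.NumSGPR);
  Caller.UsesVCC |= Callee.UsesVCC;
  // Only one call is active at a time, so the largest callee frame bounds
  // the extra stack the caller must assume.
  MaxCalleeFrame = std::max(MaxCalleeFrame, Callee.FrameSize);
}

int main() {
  Resources Caller{10, 20, 64, false};
  Resources A{24, 32, 128, true}, B{8, 48, 256, false};
  uint64_t MaxCalleeFrame = 0;
  foldCallee(Caller, MaxCalleeFrame, A);
  foldCallee(Caller, MaxCalleeFrame, B);
  Caller.FrameSize += MaxCalleeFrame; // done once, after all call sites
  std::cout << Caller.NumVGPR << ' ' << Caller.NumSGPR << ' '
            << Caller.FrameSize << ' ' << Caller.UsesVCC << '\n'; // 24 48 320 1
}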
- - MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); - MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); - MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); - CalleeFrameSize - = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); - Info.UsesVCC |= I->second.UsesVCC; - Info.UsesFlatScratch |= I->second.UsesFlatScratch; - Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; - Info.HasRecursion |= I->second.HasRecursion; - } - - // FIXME: Call site could have norecurse on it - if (!Callee || !Callee->doesNotRecurse()) - Info.HasRecursion = true; - } - } - } - - Info.NumExplicitSGPR = MaxSGPR + 1; - Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; - Info.PrivateSegmentSize += CalleeFrameSize; - - return Info; -} - void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { - SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = + ResourceUsage->getResourceInfo(&MF.getFunction()); const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); ProgInfo.NumArchVGPR = Info.NumVGPR; ProgInfo.NumAccVGPR = Info.NumAGPR; ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); + ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1; + ProgInfo.TgSplit = STM.isTgSplitEnabled(); ProgInfo.NumSGPR = Info.NumExplicitSGPR; ProgInfo.ScratchSize = Info.PrivateSegmentSize; ProgInfo.VCCUsed = Info.UsesVCC; @@ -1001,7 +684,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are + // The calculations related to SGPR/VGPR blocks are // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( @@ -1163,6 +846,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + if (STM.hasGFX90AInsts()) { + AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + ProgInfo.AccumOffset); + AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, + ProgInfo.TgSplit); + } + ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU); @@ -1262,10 +954,16 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { auto *MD = getTargetStreamer()->getPALMetadata(); const MachineFrameInfo &MFI = MF.getFrameInfo(); MD->setFunctionScratchSize(MF, MFI.getStackSize()); + // Set compute registers MD->setRsrc1(CallingConv::AMDGPU_CS, CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS)); MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2); + + // Set optional info + MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize); + MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU); } // This is supposed to be log2(Size) @@ -1383,3 +1081,9 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } return true; } + +void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AMDGPUResourceUsageAnalysis>(); + AU.addPreserved<AMDGPUResourceUsageAnalysis>(); + AsmPrinter::getAnalysisUsage(AU); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 9e1e26d65d8c..d3a555bc228f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -22,6 +22,7 @@ struct amd_kernel_code_t; namespace llvm { class AMDGPUMachineFunction; +struct AMDGPUResourceUsageAnalysis; class AMDGPUTargetStreamer; class MCCodeEmitter; class MCOperand; @@ -39,32 +40,17 @@ struct kernel_descriptor_t; class AMDGPUAsmPrinter final : public AsmPrinter { private: - // Track resource usage for callee functions. - struct SIFunctionResourceInfo { - // Track the number of explicitly used VGPRs. Special registers reserved at - // the end are tracked separately. 
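As an aside on the new GFX90A fields set a few hunks above: AccumOffset comes from alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1, i.e. the architected VGPR count rounded up to a granule of four and encoded as granules minus one. A small worked example of just that arithmetic, independent of the LLVM helpers:

#include <algorithm>
#include <cassert>
#include <cstdio>

// Same arithmetic as alignTo(std::max(1, NumVGPR), 4) / 4 - 1, spelled out.
static unsigned accumOffset(int NumArchVGPR) {
  int N = std::max(1, NumArchVGPR); // at least one granule
  int RoundedUp = (N + 3) / 4 * 4;  // align to a multiple of 4
  return RoundedUp / 4 - 1;         // the field stores granules minus one
}

int main() {
  assert(accumOffset(0) == 0);  // empty still occupies one 4-VGPR granule
  assert(accumOffset(4) == 0);
  assert(accumOffset(5) == 1);  // 5 VGPRs round up to 8, i.e. 2 granules
  assert(accumOffset(23) == 5); // 23 rounds up to 24, i.e. 6 granules
  std::printf("ok\n");
}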
- int32_t NumVGPR = 0; - int32_t NumAGPR = 0; - int32_t NumExplicitSGPR = 0; - uint64_t PrivateSegmentSize = 0; - bool UsesVCC = false; - bool UsesFlatScratch = false; - bool HasDynamicallySizedStack = false; - bool HasRecursion = false; - - int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; - int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; - }; + void initializeTargetID(const Module &M); + + AMDGPUResourceUsageAnalysis *ResourceUsage; SIProgramInfo CurrentProgramInfo; - DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream; MCCodeEmitter *DumpCodeInstEmitter = nullptr; uint64_t getFunctionCodeSize(const MachineFunction &MF) const; - SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, @@ -146,6 +132,8 @@ public: const char *ExtraCode, raw_ostream &O) override; protected: + void getAnalysisUsage(AnalysisUsage &AU) const override; + std::vector<std::string> DisasmLines, HexLines; size_t DisasmLineMaxLen; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index aae2a54c198b..3e9fdcb1618e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -48,6 +48,8 @@ private: const GCNSubtarget *ST; bool IsPixelShader; + Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const; Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; @@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, return B.CreateSelect(Cond, LHS, RHS); } +// Use the builder to create a reduction of V across the wavefront, with all +// lanes active, returning the same result in all lanes. +Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B, + AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + + // Reduce within each row of 16 lanes. + for (unsigned Idx = 0; Idx < 4; Idx++) { + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + } + + // Reduce within each pair of rows (i.e. 32 lanes). + assert(ST->hasPermLaneX16()); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateIntrinsic( + Intrinsic::amdgcn_permlanex16, {}, + {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()})); + + if (ST->isWave32()) + return V; + + // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and + // combine them with a scalar operation. + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); + Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); + return buildNonAtomicBinOp(B, Op, Lane0, Lane32); +} + // Use the builder to create an inclusive scan of V across the wavefront, with // all lanes active. 
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, @@ -287,10 +328,6 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); - Function *PermLaneX16 = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -317,9 +354,10 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). - Value *const PermX = - B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1), - B.getFalse(), B.getFalse()}); + assert(ST->hasPermLaneX16()); + Value *const PermX = B.CreateIntrinsic( + Intrinsic::amdgcn_permlanex16, {}, + {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp( B, Op, V, B.CreateCall(UpdateDPP, @@ -327,7 +365,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63. - Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)}); + Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {V, B.getInt32(31)}); V = buildNonAtomicBinOp( B, Op, V, B.CreateCall(UpdateDPP, @@ -346,10 +385,6 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. @@ -357,6 +392,11 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Function *WriteLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. Value *Old = V; @@ -480,6 +520,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Value *ExclScan = nullptr; Value *NewV = nullptr; + const bool NeedResult = !I.use_empty(); + // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { @@ -489,35 +531,27 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; - NewV = buildScan(B, ScanOp, NewV, Identity); - ExclScan = buildShiftRight(B, NewV, Identity); - - // Read the value from the last lane, which has accumlated the values of - // each active lane in the wavefront. This will be our new value which we - // will provide to the atomic operation. 
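The comment above captures the overall data flow of the optimizer: an inclusive scan across the wave, a one-lane shift right to obtain the exclusive scan, and the last lane holding the wave-wide total handed to the single atomic (the new buildReduction path can skip the scan entirely when no per-lane result is needed and permlanex16 is available). A plain C++ simulation of that flow over a 64-lane wave, with arrays standing in for DPP/permlane and all names illustrative:

#include <array>
#include <cassert>
#include <cstdint>

constexpr unsigned WaveSize = 64;
using Wave = std::array<uint32_t, WaveSize>;

// Inclusive scan: lane i ends up with the sum of lanes 0..i.
Wave inclusiveScan(const Wave &V) {
  Wave R = V;
  for (unsigned I = 1; I < WaveSize; ++I)
    R[I] += R[I - 1];
  return R;
}

// Exclusive scan: the inclusive scan shifted right by one lane, with the
// identity in lane 0.
Wave shiftRight(const Wave &V, uint32_t Identity) {
  Wave R;
  R[0] = Identity;
  for (unsigned I = 1; I < WaveSize; ++I)
    R[I] = V[I - 1];
  return R;
}

int main() {
  Wave V;
  for (unsigned I = 0; I < WaveSize; ++I)
    V[I] = I + 1; // lane i contributes i + 1

  Wave Incl = inclusiveScan(V);
  Wave Excl = shiftRight(Incl, /*Identity=*/0);

  // The last lane of the inclusive scan is the wave-wide reduction that one
  // lane hands to the atomic; each lane's exclusive-scan value is its offset
  // into the atomic's old result.
  assert(Incl[WaveSize - 1] == 64 * 65 / 2);
  assert(Excl[0] == 0 && Excl[10] == Incl[9]);
  return 0;
}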
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); - CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); - CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); - NewV = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { + if (!NeedResult && ST->hasPermLaneX16()) { + // On GFX10 the permlanex16 instruction helps us build a reduction without + // too many readlanes and writelanes, which are generally bad for + // performance. + NewV = buildReduction(B, ScanOp, NewV, Identity); + } else { + NewV = buildScan(B, ScanOp, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); + + // Read the value from the last lane, which has accumlated the values of + // each active lane in the wavefront. This will be our new value which we + // will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + assert(TyBitWidth == 32); NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx}); - } else { - llvm_unreachable("Unhandled atomic bit width"); } // Finally mark the readlanes in the WWM section. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { switch (Op) { default: @@ -583,7 +617,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // original instruction. B.SetInsertPoint(&I); - const bool NeedResult = !I.use_empty(); if (NeedResult) { // Create a PHI node to get our new atomic result into the exit block. PHINode *const PHI = B.CreatePHI(Ty, 2); @@ -621,7 +654,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // from the first lane, to get our lane's index into the atomic result. Value *LaneOffset = nullptr; if (ValDivergent) { - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + LaneOffset = + B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); } else { switch (Op) { default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp new file mode 100644 index 000000000000..61b1d22edc33 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -0,0 +1,528 @@ +//===- AMDGPUAttributor.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This pass uses Attributor framework to deduce AMDGPU attributes. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/Attributor.h" + +#define DEBUG_TYPE "amdgpu-attributor" + +using namespace llvm; + +static constexpr StringLiteral ImplicitAttrNames[] = { + // X ids unnecessarily propagated to kernels. + "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", + "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", + "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", + "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", + "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"}; + +// We do not need to note the x workitem or workgroup id because they are always +// initialized. +// +// TODO: We should not add the attributes if the known compile time workgroup +// size is 1 for y/z. +static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly, + bool &IsQueuePtr) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + NonKernelOnly = true; + return "amdgpu-work-item-id-x"; + case Intrinsic::amdgcn_workgroup_id_x: + NonKernelOnly = true; + return "amdgpu-work-group-id-x"; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + return "amdgpu-work-item-id-y"; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + return "amdgpu-work-item-id-z"; + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::r600_read_tgid_y: + return "amdgpu-work-group-id-y"; + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::r600_read_tgid_z: + return "amdgpu-work-group-id-z"; + case Intrinsic::amdgcn_dispatch_ptr: + return "amdgpu-dispatch-ptr"; + case Intrinsic::amdgcn_dispatch_id: + return "amdgpu-dispatch-id"; + case Intrinsic::amdgcn_kernarg_segment_ptr: + return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-implicitarg-ptr"; + case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + // TODO: Does not require queue ptr on gfx9+ + case Intrinsic::trap: + case Intrinsic::debugtrap: + IsQueuePtr = true; + return "amdgpu-queue-ptr"; + default: + return ""; + } +} + +static bool castRequiresQueuePtr(unsigned SrcAS) { + return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS; +} + +static bool isDSAddress(const Constant *C) { + const GlobalValue *GV = dyn_cast<GlobalValue>(C); + if (!GV) + return false; + unsigned AS = GV->getAddressSpace(); + return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; +} + +class AMDGPUInformationCache : public InformationCache { +public: + AMDGPUInformationCache(const Module &M, AnalysisGetter &AG, + BumpPtrAllocator &Allocator, + SetVector<Function *> *CGSCC, TargetMachine &TM) + : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {} + TargetMachine &TM; + + enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 }; + + /// Check if the subtarget has aperture regs. + bool hasApertureRegs(Function &F) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + return ST.hasApertureRegs(); + } + +private: + /// Check if the ConstantExpr \p CE requires queue ptr attribute. 
+ static bool visitConstExpr(const ConstantExpr *CE) { + if (CE->getOpcode() == Instruction::AddrSpaceCast) { + unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); + return castRequiresQueuePtr(SrcAS); + } + return false; + } + + /// Get the constant access bitmap for \p C. + uint8_t getConstantAccess(const Constant *C) { + auto It = ConstantStatus.find(C); + if (It != ConstantStatus.end()) + return It->second; + + uint8_t Result = 0; + if (isDSAddress(C)) + Result = DS_GLOBAL; + + if (const auto *CE = dyn_cast<ConstantExpr>(C)) + if (visitConstExpr(CE)) + Result |= ADDR_SPACE_CAST; + + for (const Use &U : C->operands()) { + const auto *OpC = dyn_cast<Constant>(U); + if (!OpC) + continue; + + Result |= getConstantAccess(OpC); + } + return Result; + } + +public: + /// Returns true if \p Fn needs a queue ptr attribute because of \p C. + bool needsQueuePtr(const Constant *C, Function &Fn) { + bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv()); + bool HasAperture = hasApertureRegs(Fn); + + // No need to explore the constants. + if (!IsNonEntryFunc && HasAperture) + return false; + + uint8_t Access = getConstantAccess(C); + + // We need to trap on DS globals in non-entry functions. + if (IsNonEntryFunc && (Access & DS_GLOBAL)) + return true; + + return !HasAperture && (Access & ADDR_SPACE_CAST); + } + +private: + /// Used to determine if the Constant needs a queue ptr attribute. + DenseMap<const Constant *, uint8_t> ConstantStatus; +}; + +struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> { + using Base = StateWrapper<BooleanState, AbstractAttribute>; + AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDAttributes &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// See AbstractAttribute::getName(). + const std::string getName() const override { return "AAAMDAttributes"; } + + /// See AbstractAttribute::getIdAddr(). + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDAttributes. + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + virtual const DenseSet<StringRef> &getAttributes() const = 0; + + /// Unique ID (due to the unique address) + static const char ID; +}; +const char AAAMDAttributes::ID = 0; + +struct AAAMDWorkGroupSize + : public StateWrapper<BooleanState, AbstractAttribute> { + using Base = StateWrapper<BooleanState, AbstractAttribute>; + AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {} + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// See AbstractAttribute::getName(). + const std::string getName() const override { return "AAAMDWorkGroupSize"; } + + /// See AbstractAttribute::getIdAddr(). + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDAttributes. 
+ static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + /// Unique ID (due to the unique address) + static const char ID; +}; +const char AAAMDWorkGroupSize::ID = 0; + +struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize { + AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A) + : AAAMDWorkGroupSize(IRP, A) {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + CallingConv::ID CC = F->getCallingConv(); + + if (CC != CallingConv::AMDGPU_KERNEL) + return; + + bool InitialValue = false; + if (F->hasFnAttribute("uniform-work-group-size")) + InitialValue = F->getFnAttribute("uniform-work-group-size") + .getValueAsString() + .equals("true"); + + if (InitialValue) + indicateOptimisticFixpoint(); + else + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + auto CheckCallSite = [&](AbstractCallSite CS) { + Function *Caller = CS.getInstruction()->getFunction(); + LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName() + << "->" << getAssociatedFunction()->getName() << "\n"); + + const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>( + *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + + Change = Change | clampStateAndIndicateChange(this->getState(), + CallerInfo.getState()); + + return true; + }; + + bool AllCallSitesKnown = true; + if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + indicatePessimisticFixpoint(); + + return Change; + } + + ChangeStatus manifest(Attributor &A) override { + SmallVector<Attribute, 8> AttrList; + LLVMContext &Ctx = getAssociatedFunction()->getContext(); + + AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size", + getAssumed() ? "true" : "false")); + return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, + /* ForceReplace */ true); + } + + bool isValidState() const override { + // This state is always valid, even when the state is false. + return true; + } + + const std::string getAsStr() const override { + return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]"; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP, + Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A); + llvm_unreachable("AAAMDWorkGroupSize is only valid for function position"); +} + +struct AAAMDAttributesFunction : public AAAMDAttributes { + AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A) + : AAAMDAttributes(IRP, A) {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + CallingConv::ID CC = F->getCallingConv(); + bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); + + // Don't add attributes to instrinsics + if (F->isIntrinsic()) { + indicatePessimisticFixpoint(); + return; + } + + // Ignore functions with graphics calling conventions, these are currently + // not allowed to have kernel arguments. + if (AMDGPU::isGraphics(F->getCallingConv())) { + indicatePessimisticFixpoint(); + return; + } + + for (StringRef Attr : ImplicitAttrNames) { + if (F->hasFnAttribute(Attr)) + Attributes.insert(Attr); + } + + // TODO: We shouldn't need this in the future. 
+ if (CallingConvSupportsAllImplicits && + F->hasAddressTaken(nullptr, true, true, true)) { + for (StringRef AttrName : ImplicitAttrNames) { + Attributes.insert(AttrName); + } + } + } + + ChangeStatus updateImpl(Attributor &A) override { + Function *F = getAssociatedFunction(); + ChangeStatus Change = ChangeStatus::UNCHANGED; + bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); + CallingConv::ID CC = F->getCallingConv(); + bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + + auto AddAttribute = [&](StringRef AttrName) { + if (Attributes.insert(AttrName).second) + Change = ChangeStatus::CHANGED; + }; + + // Check for Intrinsics and propagate attributes. + const AACallEdges &AAEdges = A.getAAFor<AACallEdges>( + *this, this->getIRPosition(), DepClassTy::REQUIRED); + + // We have to assume that we can reach a function with these attributes. + // We do not consider inline assembly as a unknown callee. + if (CallingConvSupportsAllImplicits && AAEdges.hasNonAsmUnknownCallee()) { + for (StringRef AttrName : ImplicitAttrNames) { + AddAttribute(AttrName); + } + } + + bool NeedsQueuePtr = false; + bool HasCall = false; + for (Function *Callee : AAEdges.getOptimisticEdges()) { + Intrinsic::ID IID = Callee->getIntrinsicID(); + if (IID != Intrinsic::not_intrinsic) { + if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) { + AddAttribute("amdgpu-kernarg-segment-ptr"); + continue; + } + + bool NonKernelOnly = false; + StringRef AttrName = + intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr); + + if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly)) + AddAttribute(AttrName); + + continue; + } + + HasCall = true; + const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>( + *this, IRPosition::function(*Callee), DepClassTy::REQUIRED); + const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes(); + // Propagate implicit attributes from called function. + for (StringRef AttrName : ImplicitAttrNames) + if (CalleeAttributes.count(AttrName)) + AddAttribute(AttrName); + } + + HasCall |= AAEdges.hasUnknownCallee(); + if (!IsNonEntryFunc && HasCall) + AddAttribute("amdgpu-calls"); + + // Check the function body. + auto CheckAlloca = [&](Instruction &I) { + AddAttribute("amdgpu-stack-objects"); + return false; + }; + + bool UsedAssumedInformation = false; + A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca}, + UsedAssumedInformation); + + // If we found that we need amdgpu-queue-ptr, nothing else to do. + if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) { + AddAttribute("amdgpu-queue-ptr"); + return Change; + } + + auto CheckAddrSpaceCasts = [&](Instruction &I) { + unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace(); + if (castRequiresQueuePtr(SrcAS)) { + NeedsQueuePtr = true; + return false; + } + return true; + }; + + bool HasApertureRegs = InfoCache.hasApertureRegs(*F); + + // `checkForAllInstructions` is much more cheaper than going through all + // instructions, try it first. + + // amdgpu-queue-ptr is not needed if aperture regs is present. + if (!HasApertureRegs) + A.checkForAllInstructions(CheckAddrSpaceCasts, *this, + {Instruction::AddrSpaceCast}, + UsedAssumedInformation); + + // If we found that we need amdgpu-queue-ptr, nothing else to do. 
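The update rule in this hunk is monotone: a function accumulates every implicit-argument attribute required by any of its optimistically known callees, so the Attributor's iteration reaches a fixed point over the call graph. A minimal fixed-point sketch of that propagation, outside the Attributor framework and with invented names:

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

using AttrSet = std::set<std::string>;

int main() {
  // Call graph: foo -> bar -> baz; only baz directly needs the dispatch ptr.
  std::map<std::string, std::vector<std::string>> Callees = {
      {"foo", {"bar"}}, {"bar", {"baz"}}, {"baz", {}}};
  std::map<std::string, AttrSet> Attrs = {
      {"foo", {}}, {"bar", {}}, {"baz", {"amdgpu-dispatch-ptr"}}};

  // Iterate until nothing changes: callers absorb their callees' attributes.
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (auto &[F, Cs] : Callees)
      for (const std::string &C : Cs)
        for (const std::string &A : Attrs[C])
          Changed |= Attrs[F].insert(A).second;
  }

  std::printf("foo needs dispatch ptr: %d\n",
              (int)Attrs["foo"].count("amdgpu-dispatch-ptr")); // prints 1
}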
+ if (NeedsQueuePtr) { + AddAttribute("amdgpu-queue-ptr"); + return Change; + } + + if (!IsNonEntryFunc && HasApertureRegs) + return Change; + + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + for (const Use &U : I.operands()) { + if (const auto *C = dyn_cast<Constant>(U)) { + if (InfoCache.needsQueuePtr(C, *F)) { + AddAttribute("amdgpu-queue-ptr"); + return Change; + } + } + } + } + } + + return Change; + } + + ChangeStatus manifest(Attributor &A) override { + SmallVector<Attribute, 8> AttrList; + LLVMContext &Ctx = getAssociatedFunction()->getContext(); + + for (StringRef AttrName : Attributes) + AttrList.push_back(Attribute::get(Ctx, AttrName)); + + return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, + /* ForceReplace */ true); + } + + const std::string getAsStr() const override { + return "AMDInfo[" + std::to_string(Attributes.size()) + "]"; + } + + const DenseSet<StringRef> &getAttributes() const override { + return Attributes; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + +private: + DenseSet<StringRef> Attributes; +}; + +AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, + Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDAttributesFunction(IRP, A); + llvm_unreachable("AAAMDAttributes is only valid for function position"); +} + +class AMDGPUAttributor : public ModulePass { +public: + AMDGPUAttributor() : ModulePass(ID) {} + + /// doInitialization - Virtual method overridden by subclasses to do + /// any necessary initialization before any pass is run. + bool doInitialization(Module &) override { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + report_fatal_error("TargetMachine is required"); + + TM = &TPC->getTM<TargetMachine>(); + return false; + } + + bool runOnModule(Module &M) override { + SetVector<Function *> Functions; + AnalysisGetter AG; + for (Function &F : M) + Functions.insert(&F); + + CallGraphUpdater CGUpdater; + BumpPtrAllocator Allocator; + AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM); + Attributor A(Functions, InfoCache, CGUpdater); + + for (Function &F : M) { + A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F)); + A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F)); + } + + ChangeStatus Change = A.run(); + return Change == ChangeStatus::CHANGED; + } + + StringRef getPassName() const override { return "AMDGPU Attributor"; } + TargetMachine *TM; + static char ID; +}; + +char AMDGPUAttributor::ID = 0; + +Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); } +INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 852a05b3c181..b9faad453aba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -29,44 +29,39 @@ using namespace llvm; namespace { -struct AMDGPUValueHandler : public CallLowering::ValueHandler { - AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B, - MachineRegisterInfo &MRI, CCAssignFn *AssignFn) - : ValueHandler(IsIncoming, B, MRI, AssignFn) {} - - /// Wrapper around extendRegister to ensure we extend to a full 32-bit - /// register. - Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) { - if (VA.getLocVT().getSizeInBits() < 32) { - // 16-bit types are reported as legal for 32-bit registers. 
We need to - // extend and do a 32-bit copy to avoid the verifier complaining about it. - return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0); - } - - return extendRegister(ValVReg, VA); +/// Wrapper around extendRegister to ensure we extend to a full 32-bit register. +static Register extendRegisterMin32(CallLowering::ValueHandler &Handler, + Register ValVReg, CCValAssign &VA) { + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to + // extend and do a 32-bit copy to avoid the verifier complaining about it. + return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0); } -}; -struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler { + return Handler.extendRegister(ValVReg, VA); +} + +struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler { AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {} + MachineInstrBuilder MIB) + : OutgoingValueHandler(B, MRI), MIB(MIB) {} MachineInstrBuilder MIB; Register getStackAddress(uint64_t Size, int64_t Offset, - MachinePointerInfo &MPO) override { + MachinePointerInfo &MPO, + ISD::ArgFlagsTy Flags) override { llvm_unreachable("not implemented"); } - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, MachinePointerInfo &MPO, CCValAssign &VA) override { llvm_unreachable("not implemented"); } void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { - Register ExtReg = extendRegisterMin32(ValVReg, VA); + Register ExtReg = extendRegisterMin32(*this, ValVReg, VA); // If this is a scalar return, insert a readfirstlane just in case the value // ends up in a VGPR. @@ -83,27 +78,23 @@ struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler { MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); } - - bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, - ISD::ArgFlagsTy Flags, - CCState &State) override { - return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); - } }; -struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler { +struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler { uint64_t StackUsed = 0; - AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, - CCAssignFn *AssignFn) - : AMDGPUValueHandler(true, B, MRI, AssignFn) {} + AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI) + : IncomingValueHandler(B, MRI) {} Register getStackAddress(uint64_t Size, int64_t Offset, - MachinePointerInfo &MPO) override { + MachinePointerInfo &MPO, + ISD::ArgFlagsTy Flags) override { auto &MFI = MIRBuilder.getMF().getFrameInfo(); - int FI = MFI.CreateFixedObject(Size, Offset, true); + + // Byval is assumed to be writable memory, but other stack passed arguments + // are not. + const bool IsImmutable = !Flags.isByVal(); + int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); auto AddrReg = MIRBuilder.buildFrameIndex( LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI); @@ -119,35 +110,24 @@ struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler { // 16-bit types are reported as legal for 32-bit registers. 
We need to do // a 32-bit copy, and truncate to avoid the verifier complaining about it. auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg); - MIRBuilder.buildTrunc(ValVReg, Copy); + + // If we have signext/zeroext, it applies to the whole 32-bit register + // before truncation. + auto Extended = + buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT())); + MIRBuilder.buildTrunc(ValVReg, Extended); return; } - switch (VA.getLocInfo()) { - case CCValAssign::LocInfo::SExt: - case CCValAssign::LocInfo::ZExt: - case CCValAssign::LocInfo::AExt: { - auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); - MIRBuilder.buildTrunc(ValVReg, Copy); - break; - } - default: - MIRBuilder.buildCopy(ValVReg, PhysReg); - break; - } + IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA); } - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize, + void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); - // The reported memory location may be wider than the value. - const LLT RegTy = MRI.getType(ValVReg); - MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize); - - // FIXME: Get alignment auto MMO = MF.getMachineMemOperand( - MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize, + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } @@ -159,9 +139,8 @@ struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler { }; struct FormalArgHandler : public AMDGPUIncomingArgHandler { - FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, - CCAssignFn *AssignFn) - : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {} + FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI) + : AMDGPUIncomingArgHandler(B, MRI) {} void markPhysRegUsed(unsigned PhysReg) override { MIRBuilder.getMBB().addLiveIn(PhysReg); @@ -170,8 +149,8 @@ struct FormalArgHandler : public AMDGPUIncomingArgHandler { struct CallReturnHandler : public AMDGPUIncomingArgHandler { CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + MachineInstrBuilder MIB) + : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {} void markPhysRegUsed(unsigned PhysReg) override { MIB.addDef(PhysReg, RegState::Implicit); @@ -180,10 +159,7 @@ struct CallReturnHandler : public AMDGPUIncomingArgHandler { MachineInstrBuilder MIB; }; -struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler { - MachineInstrBuilder MIB; - CCAssignFn *AssignFnVarArg; - +struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { /// For tail calls, the byte offset of the call's argument area from the /// callee's. Unused elsewhere. 
int FPDiff; @@ -195,20 +171,23 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler { AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstrBuilder MIB, - CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg, bool IsTailCall = false, int FPDiff = 0) - : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB), - AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) { - } + : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff), + IsTailCall(IsTailCall) {} Register getStackAddress(uint64_t Size, int64_t Offset, - MachinePointerInfo &MPO) override { + MachinePointerInfo &MPO, + ISD::ArgFlagsTy Flags) override { MachineFunction &MF = MIRBuilder.getMF(); const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32); const LLT S32 = LLT::scalar(32); if (IsTailCall) { - llvm_unreachable("implement me"); + Offset += FPDiff; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI); + MPO = MachinePointerInfo::getFixedStack(MF, FI); + return FIReg.getReg(0); } const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -226,35 +205,29 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { MIB.addUse(PhysReg, RegState::Implicit); - Register ExtReg = extendRegisterMin32(ValVReg, VA); + Register ExtReg = extendRegisterMin32(*this, ValVReg, VA); MIRBuilder.buildCopy(PhysReg, ExtReg); } - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); uint64_t LocMemOffset = VA.getLocMemOffset(); const auto &ST = MF.getSubtarget<GCNSubtarget>(); auto MMO = MF.getMachineMemOperand( - MPO, MachineMemOperand::MOStore, Size, - commonAlignment(ST.getStackAlignment(), LocMemOffset)); + MPO, MachineMemOperand::MOStore, MemTy, + commonAlignment(ST.getStackAlignment(), LocMemOffset)); MIRBuilder.buildStore(ValVReg, Addr, *MMO); } - void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr, - uint64_t MemSize, MachinePointerInfo &MPO, - CCValAssign &VA) override { + void assignValueToAddress(const CallLowering::ArgInfo &Arg, + unsigned ValRegIndex, Register Addr, LLT MemTy, + MachinePointerInfo &MPO, CCValAssign &VA) override { Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt - ? extendRegister(Arg.Regs[0], VA) - : Arg.Regs[0]; - - // If we extended the value type we might need to adjust the MMO's - // Size. This happens if ComputeValueVTs widened a small type value to a - // legal register type (e.g. s8->s16) - const LLT RegTy = MRI.getType(ValVReg); - MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes()); - assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA); + ? extendRegister(Arg.Regs[ValRegIndex], VA) + : Arg.Regs[ValRegIndex]; + assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA); } }; } @@ -277,149 +250,6 @@ static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) { } } -// FIXME: This should move to generic code. 
-void AMDGPUCallLowering::splitToValueTypes(MachineIRBuilder &B, - const ArgInfo &OrigArg, - SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, - CallingConv::ID CallConv) const { - const SITargetLowering &TLI = *getTLI<SITargetLowering>(); - LLVMContext &Ctx = OrigArg.Ty->getContext(); - - SmallVector<EVT, 4> SplitVTs; - ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs); - - assert(OrigArg.Regs.size() == SplitVTs.size()); - - if (SplitVTs.size() == 0) - return; - - if (SplitVTs.size() == 1) { - // No splitting to do, but we want to replace the original type (e.g. [1 x - // double] -> double). - SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags[0], OrigArg.IsFixed); - return; - } - - // Create one ArgInfo for each virtual register in the original ArgInfo. - assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); - - bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( - OrigArg.Ty, CallConv, false); - for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { - Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); - SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], - OrigArg.IsFixed); - if (NeedsRegBlock) - SplitArgs.back().Flags[0].setInConsecutiveRegs(); - } - - SplitArgs.back().Flags[0].setInConsecutiveRegsLast(); -} - -void AMDGPUCallLowering::processSplitArgs( - MachineIRBuilder &B, const ArgInfo &OrigArg, - const SmallVectorImpl<ArgInfo> &SplitArg, - SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, - CallingConv::ID CallConv, bool IsOutgoing, - SplitArgTy PerformArgSplit) const { - LLVMContext &Ctx = OrigArg.Ty->getContext(); - const SITargetLowering &TLI = *getTLI<SITargetLowering>(); - - // FIXME: This is mostly nasty pre-processing before handleAssignments. Most - // of this should be performed by handleAssignments. - - for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) { - const ArgInfo &CurSplitArg = SplitArg[SplitIdx]; - Register Reg = OrigArg.Regs[SplitIdx]; - EVT VT = EVT::getEVT(CurSplitArg.Ty); - LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL); - - unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); - MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); - - if (NumParts == 1) { - // No splitting to do, but we want to replace the original type (e.g. [1 x - // double] -> double). - SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags, - OrigArg.IsFixed); - continue; - } - - SmallVector<Register, 8> SplitRegs; - Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx); - LLT PartLLT = getLLTForType(*PartTy, DL); - MachineRegisterInfo &MRI = *B.getMRI(); - - // FIXME: Should we be reporting all of the part registers for a single - // argument, and let handleAssignments take care of the repacking? 
- for (unsigned i = 0; i < NumParts; ++i) { - Register PartReg = MRI.createGenericVirtualRegister(PartLLT); - SplitRegs.push_back(PartReg); - SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags); - } - - PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx); - } -} - -// TODO: Move to generic code -static void unpackRegsToOrigType(MachineIRBuilder &B, - ArrayRef<Register> DstRegs, - Register SrcReg, - const CallLowering::ArgInfo &Info, - LLT SrcTy, - LLT PartTy) { - assert(DstRegs.size() > 1 && "Nothing to unpack"); - - const unsigned PartSize = PartTy.getSizeInBits(); - - if (SrcTy.isVector() && !PartTy.isVector() && - PartSize > SrcTy.getElementType().getSizeInBits()) { - // Vector was scalarized, and the elements extended. - auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg); - for (int i = 0, e = DstRegs.size(); i != e; ++i) - B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i)); - return; - } - - LLT GCDTy = getGCDType(SrcTy, PartTy); - if (GCDTy == PartTy) { - // If this already evenly divisible, we can create a simple unmerge. - B.buildUnmerge(DstRegs, SrcReg); - return; - } - - MachineRegisterInfo &MRI = *B.getMRI(); - LLT DstTy = MRI.getType(DstRegs[0]); - LLT LCMTy = getLCMType(SrcTy, PartTy); - - const unsigned LCMSize = LCMTy.getSizeInBits(); - const unsigned DstSize = DstTy.getSizeInBits(); - const unsigned SrcSize = SrcTy.getSizeInBits(); - - Register UnmergeSrc = SrcReg; - if (LCMSize != SrcSize) { - // Widen to the common type. - Register Undef = B.buildUndef(SrcTy).getReg(0); - SmallVector<Register, 8> MergeParts(1, SrcReg); - for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize) - MergeParts.push_back(Undef); - - UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0); - } - - // Unmerge to the original registers and pad with dead defs. - SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end()); - for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize; - Size += DstSize) { - UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy)); - } - - B.buildUnmerge(UnmergeResults, UnmergeSrc); -} - bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv, SmallVectorImpl<BaseArgInfo> &Outs, @@ -458,18 +288,12 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, assert(VRegs.size() == SplitEVTs.size() && "For each split Type there should be exactly one VReg."); - // We pre-process the return value decomposed into EVTs. - SmallVector<ArgInfo, 8> PreSplitRetInfos; - - // Further processing is applied to split the arguments from PreSplitRetInfos - // into 32-bit pieces in SplitRetInfos before passing off to - // handleAssignments. 
SmallVector<ArgInfo, 8> SplitRetInfos; for (unsigned i = 0; i < SplitEVTs.size(); ++i) { EVT VT = SplitEVTs[i]; Register Reg = VRegs[i]; - ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx)); + ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0); setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); if (VT.isScalarInteger()) { @@ -497,23 +321,15 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); } - splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC); - - // FIXME: This splitting should mostly be done by handleAssignments - processSplitArgs(B, RetInfo, - PreSplitRetInfos, SplitRetInfos, DL, CC, true, - [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, - LLT PartLLT, int VTSplitIdx) { - unpackRegsToOrigType(B, Regs, SrcReg, - PreSplitRetInfos[VTSplitIdx], LLTy, - PartLLT); - }); - PreSplitRetInfos.clear(); + splitToValueTypes(RetInfo, SplitRetInfos, DL, CC); } CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg()); - AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn); - return handleAssignments(B, SplitRetInfos, RetHandler); + + OutgoingValueAssigner Assigner(AssignFn); + AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret); + return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B, + CC, F.isVarArg()); } bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, @@ -568,7 +384,6 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, } void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B, - Type *ParamTy, uint64_t Offset) const { MachineFunction &MF = B.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -582,26 +397,45 @@ void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B, B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg); } -void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, - uint64_t Offset, Align Alignment, - Register DstReg) const { +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg, + uint64_t Offset, + Align Alignment) const { MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - unsigned TypeSize = DL.getTypeStoreSize(ParamTy); LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); - Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy); - lowerParameterPtr(PtrReg, B, ParamTy, Offset); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - TypeSize, Alignment); + SmallVector<ArgInfo, 32> SplitArgs; + SmallVector<uint64_t> FieldOffsets; + splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets); + + unsigned Idx = 0; + for (ArgInfo &SplitArg : SplitArgs) { + Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy); + lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]); + + LLT ArgTy = getLLTForType(*SplitArg.Ty, DL); + if (SplitArg.Flags[0].isPointer()) { + // Compensate for losing pointeriness in splitValueTypes. + LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(), + ArgTy.getScalarSizeInBits()); + ArgTy = ArgTy.isVector() ? 
LLT::vector(ArgTy.getElementCount(), PtrTy) + : PtrTy; + } + + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + ArgTy, commonAlignment(Alignment, FieldOffsets[Idx])); - B.buildLoad(DstReg, PtrReg, *MMO); + assert(SplitArg.Regs.size() == 1); + + B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO); + ++Idx; + } } // Allocate special inputs passed in user SGPRs. @@ -665,9 +499,10 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const SITargetLowering &TLI = *getTLI<SITargetLowering>(); - const DataLayout &DL = F.getParent()->getDataLayout(); + Info->allocateModuleLDSGlobal(F.getParent()); + SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -706,24 +541,19 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( assert(VRegs[i].size() == 1 && "expected only one register for byval pointers"); if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) { - lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset); + lowerParameterPtr(VRegs[i][0], B, ArgOffset); } else { const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy); - lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset); + lowerParameterPtr(PtrReg, B, ArgOffset); B.buildAddrSpaceCast(VRegs[i][0], PtrReg); } } else { - ArrayRef<Register> OrigArgRegs = VRegs[i]; - Register ArgReg = - OrigArgRegs.size() == 1 - ? OrigArgRegs[0] - : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); - - lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg); - if (OrigArgRegs.size() > 1) - unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); + ArgInfo OrigArg(VRegs[i], Arg, i); + const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex; + setArgFlags(OrigArg, OrigArgIdx, DL, F); + lowerParameter(B, OrigArg, ArgOffset, Alignment); } ++i; @@ -734,117 +564,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( return true; } -/// Pack values \p SrcRegs to cover the vector type result \p DstRegs. -static MachineInstrBuilder mergeVectorRegsToResultRegs( - MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) { - MachineRegisterInfo &MRI = *B.getMRI(); - LLT LLTy = MRI.getType(DstRegs[0]); - LLT PartLLT = MRI.getType(SrcRegs[0]); - - // Deal with v3s16 split into v2s16 - LLT LCMTy = getLCMType(LLTy, PartLLT); - if (LCMTy == LLTy) { - // Common case where no padding is needed. - assert(DstRegs.size() == 1); - return B.buildConcatVectors(DstRegs[0], SrcRegs); - } - - const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); - Register Undef = B.buildUndef(PartLLT).getReg(0); - - // Build vector of undefs. - SmallVector<Register, 8> WidenedSrcs(NumWide, Undef); - - // Replace the first sources with the real registers. - std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); - - auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs); - int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits(); - - SmallVector<Register, 8> PadDstRegs(NumDst); - std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin()); - - // Create the excess dead defs for the unmerge. 
- for (int I = DstRegs.size(); I != NumDst; ++I) - PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy); - - return B.buildUnmerge(PadDstRegs, Widened); -} - -// TODO: Move this to generic code -static void packSplitRegsToOrigType(MachineIRBuilder &B, - ArrayRef<Register> OrigRegs, - ArrayRef<Register> Regs, - LLT LLTy, - LLT PartLLT) { - MachineRegisterInfo &MRI = *B.getMRI(); - - if (!LLTy.isVector() && !PartLLT.isVector()) { - assert(OrigRegs.size() == 1); - LLT OrigTy = MRI.getType(OrigRegs[0]); - - unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size(); - if (SrcSize == OrigTy.getSizeInBits()) - B.buildMerge(OrigRegs[0], Regs); - else { - auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs); - B.buildTrunc(OrigRegs[0], Widened); - } - - return; - } - - if (LLTy.isVector() && PartLLT.isVector()) { - assert(OrigRegs.size() == 1); - assert(LLTy.getElementType() == PartLLT.getElementType()); - mergeVectorRegsToResultRegs(B, OrigRegs, Regs); - return; - } - - assert(LLTy.isVector() && !PartLLT.isVector()); - - LLT DstEltTy = LLTy.getElementType(); - - // Pointer information was discarded. We'll need to coerce some register types - // to avoid violating type constraints. - LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType(); - - assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits()); - - if (DstEltTy == PartLLT) { - // Vector was trivially scalarized. - - if (RealDstEltTy.isPointer()) { - for (Register Reg : Regs) - MRI.setType(Reg, RealDstEltTy); - } - - B.buildBuildVector(OrigRegs[0], Regs); - } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) { - // Deal with vector with 64-bit elements decomposed to 32-bit - // registers. Need to create intermediate 64-bit elements. - SmallVector<Register, 8> EltMerges; - int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits(); - - assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0); - - for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) { - auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt)); - // Fix the type in case this is really a vector of pointers. - MRI.setType(Merge.getReg(0), RealDstEltTy); - EltMerges.push_back(Merge.getReg(0)); - Regs = Regs.drop_front(PartsPerElt); - } - - B.buildBuildVector(OrigRegs[0], EltMerges); - } else { - // Vector was split, and elements promoted to a wider type. 
- LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT); - auto BV = B.buildBuildVector(BVType, Regs); - B.buildTrunc(OrigRegs[0], BV); - } -} - bool AMDGPUCallLowering::lowerFormalArguments( MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const { @@ -867,6 +586,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); + Info->allocateModuleLDSGlobal(F.getParent()); SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -885,7 +605,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( CCInfo.AllocateReg(ImplicitBufferPtrReg); } - SmallVector<ArgInfo, 8> SplitArg; SmallVector<ArgInfo, 32> SplitArgs; unsigned Idx = 0; unsigned PSInputNum = 0; @@ -931,23 +650,11 @@ bool AMDGPUCallLowering::lowerFormalArguments( } } - ArgInfo OrigArg(VRegs[Idx], Arg.getType()); + ArgInfo OrigArg(VRegs[Idx], Arg, Idx); const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex; setArgFlags(OrigArg, OrigArgIdx, DL, F); - SplitArg.clear(); - splitToValueTypes(B, OrigArg, SplitArg, DL, CC); - - processSplitArgs(B, OrigArg, SplitArg, SplitArgs, DL, CC, false, - // FIXME: We should probably be passing multiple registers - // to handleAssignments to do this - [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy, - LLT PartLLT, int VTSplitIdx) { - assert(DstReg == VRegs[Idx][VTSplitIdx]); - packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs, - LLTy, PartLLT); - }); - + splitToValueTypes(OrigArg, SplitArgs, DL, CC); ++Idx; } @@ -1004,10 +711,16 @@ bool AMDGPUCallLowering::lowerFormalArguments( TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } - FormalArgHandler Handler(B, MRI, AssignFn); - if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler)) + IncomingValueAssigner Assigner(AssignFn); + if (!determineAssignments(Assigner, SplitArgs, CCInfo)) + return false; + + FormalArgHandler Handler(B, MRI); + if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; + uint64_t StackOffset = Assigner.StackOffset; + if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { // Special inputs come after user arguments. TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); @@ -1022,6 +735,12 @@ bool AMDGPUCallLowering::lowerFormalArguments( TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } + // When we tail call, we need to check if the callee's arguments will fit on + // the caller's stack. So, whenever we lower formal arguments, we should keep + // track of this information, since we might lower a tail call in this + // function later. + Info->setBytesInStackArgArea(StackOffset); + // Move back to the end of the basic block. B.setMBB(MBB); @@ -1184,7 +903,7 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) { static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall) { - return AMDGPU::SI_CALL; + return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL; } // Add operands to call instruction to track the callee. 
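The patch records, at formal-argument lowering time, how many bytes of incoming stack arguments the function owns (setBytesInStackArgArea), so that a later tail call can check whether the callee's outgoing stack arguments fit into that area. A minimal sketch of that bookkeeping, using hypothetical byte counts and stand-in types rather than SIMachineFunctionInfo:

#include <cassert>
#include <cstdint>

// Stand-in for the per-function info: remembers how many bytes of incoming
// stack arguments the caller already owns.
struct FunctionStackInfo {
  uint64_t BytesInStackArgArea = 0;
};

// At formal-argument lowering time, record the assigned stack size.
void recordIncomingStackArgs(FunctionStackInfo &FI, uint64_t StackOffset) {
  FI.BytesInStackArgArea = StackOffset;
}

// At tail-call lowering time, the callee's outgoing stack arguments must fit
// into the area the caller already reserved, otherwise the call cannot be
// lowered as a tail call.
bool outgoingArgsFitForTailCall(const FunctionStackInfo &FI,
                                uint64_t CalleeStackBytes) {
  return CalleeStackBytes <= FI.BytesInStackArgArea;
}

int main() {
  FunctionStackInfo FI;
  recordIncomingStackArgs(FI, 32);              // hypothetical: 32 bytes incoming
  assert(outgoingArgsFitForTailCall(FI, 16));   // fits, tail call allowed
  assert(!outgoingArgsFitForTailCall(FI, 48));  // does not fit, must use a normal call
  return 0;
}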
@@ -1208,6 +927,317 @@ static bool addCallTargetOperands(MachineInstrBuilder &CallInst, return true; } +bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &InArgs) const { + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + + // If the calling conventions match, then everything must be the same. + if (CalleeCC == CallerCC) + return true; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + + // Make sure that the caller and callee preserve all of the same registers. + auto TRI = ST.getRegisterInfo(); + + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) + return false; + + // Check if the caller and callee will handle arguments in the same way. + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + CCAssignFn *CalleeAssignFnFixed; + CCAssignFn *CalleeAssignFnVarArg; + std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) = + getAssignFnsForCC(CalleeCC, TLI); + + CCAssignFn *CallerAssignFnFixed; + CCAssignFn *CallerAssignFnVarArg; + std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) = + getAssignFnsForCC(CallerCC, TLI); + + // FIXME: We are not accounting for potential differences in implicitly passed + // inputs, but only the fixed ABI is supported now anyway. + IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed, + CalleeAssignFnVarArg); + IncomingValueAssigner CallerAssigner(CallerAssignFnFixed, + CallerAssignFnVarArg); + return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner); +} + +bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable( + CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &OutArgs) const { + // If there are no outgoing arguments, then we are done. + if (OutArgs.empty()) + return true; + + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + // We have outgoing arguments. Make sure that we can tail call with them. + SmallVector<CCValAssign, 16> OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext()); + OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg); + + if (!determineAssignments(Assigner, OutArgs, OutInfo)) { + LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n"); + return false; + } + + // Make sure that they can fit on the caller's stack. + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) { + LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n"); + return false; + } + + // Verify that the parameters in callee-saved registers match. 
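These compatibility checks come down to comparing the call-preserved register masks of the two calling conventions; a regmask is a bit vector packed into 32-bit words in which a set bit marks a register that survives the call. A toy subset test over raw words, for illustration only (the in-tree checks go through TargetRegisterInfo):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Returns true if every register preserved in Inner is also preserved in
// Outer, i.e. Inner's preserved set is a subset of Outer's.
static bool preservedSubset(const uint32_t *Inner, const uint32_t *Outer,
                            size_t NumWords) {
  for (size_t I = 0; I != NumWords; ++I)
    if (Inner[I] & ~Outer[I]) // preserved by Inner but clobbered by Outer
      return false;
  return true;
}

int main() {
  // Hypothetical 64-register target: the callee preserves regs 32-63, the
  // caller preserves regs 16-63.
  uint32_t Callee[2] = {0x00000000u, 0xffffffffu};
  uint32_t Caller[2] = {0xffff0000u, 0xffffffffu};
  assert(preservedSubset(Callee, Caller, 2));
  assert(!preservedSubset(Caller, Callee, 2));
  return 0;
}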
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC); + MachineRegisterInfo &MRI = MF.getRegInfo(); + return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs); +} + +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return CC == CallingConv::Fast; +} + +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + case CallingConv::AMDGPU_Gfx: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +bool AMDGPUCallLowering::isEligibleForTailCallOptimization( + MachineIRBuilder &B, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const { + // Must pass all target-independent checks in order to tail call optimize. + if (!Info.IsTailCall) + return false; + + MachineFunction &MF = B.getMF(); + const Function &CallerF = MF.getFunction(); + CallingConv::ID CalleeCC = Info.CallConv; + CallingConv::ID CallerCC = CallerF.getCallingConv(); + + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + // Kernels aren't callable, and don't have a live in return address so it + // doesn't make sense to do a tail call with entry functions. + if (!CallerPreserved) + return false; + + if (!mayTailCallThisCC(CalleeCC)) { + LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n"); + return false; + } + + if (any_of(CallerF.args(), [](const Argument &A) { + return A.hasByValAttr() || A.hasSwiftErrorAttr(); + })) { + LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval " + "or swifterror arguments\n"); + return false; + } + + // If we have -tailcallopt, then we're done. + if (MF.getTarget().Options.GuaranteedTailCallOpt) + return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv(); + + // Verify that the incoming and outgoing arguments from the callee are + // safe to tail call. + if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) { + LLVM_DEBUG( + dbgs() + << "... Caller and callee have incompatible calling conventions.\n"); + return false; + } + + if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs)) + return false; + + LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n"); + return true; +} + +// Insert outgoing implicit arguments for a call, by inserting copies to the +// implicit argument registers and adding the necessary implicit uses to the +// call instruction. +void AMDGPUCallLowering::handleImplicitCallArguments( + MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, + const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo, + ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const { + if (!ST.enableFlatScratch()) { + // Insert copies for the SRD. In the HSA case, this should be an identity + // copy. 
+ auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32), + FuncInfo.getScratchRSrcReg()); + MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit); + } + + for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) { + MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second); + CallInst.addReg(ArgReg.first, RegState::Implicit); + } +} + +bool AMDGPUCallLowering::lowerTailCall( + MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &OutArgs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + + // True when we're tail calling, but without -tailcallopt. + bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt; + + // Find out which ABI gets to decide where things go. + CallingConv::ID CalleeCC = Info.CallConv; + CCAssignFn *AssignFnFixed; + CCAssignFn *AssignFnVarArg; + std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI); + + MachineInstrBuilder CallSeqStart; + if (!IsSibCall) + CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP); + + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); + auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + if (!addCallTargetOperands(MIB, MIRBuilder, Info)) + return false; + + // Byte offset for the tail call. When we are sibcalling, this will always + // be 0. + MIB.addImm(0); + + // Tell the call which registers are clobbered. + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC); + MIB.addRegMask(Mask); + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. + int FPDiff = 0; + + // This will be 0 for sibcalls, potentially nonzero for tail calls produced + // by -tailcallopt. For sibcalls, the memory operands for the call are + // already available in the caller's incoming argument space. + unsigned NumBytes = 0; + if (!IsSibCall) { + // We aren't sibcalling, so we need to compute FPDiff. We need to do this + // before handling assignments, because FPDiff must be known for memory + // arguments. + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + SmallVector<CCValAssign, 16> OutLocs; + CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext()); + + // FIXME: Not accounting for callee implicit inputs + OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg); + if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo)) + return false; + + // The callee will pop the argument stack as a tail call. Thus, we must + // keep it 16-byte aligned. + NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment()); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // actually shrink the stack. 
+ FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(isAligned(ST.getStackAlignment(), FPDiff) && + "unaligned stack on tail call"); + } + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext()); + + // We could pass MIB and directly add the implicit uses to the call + // now. However, as an aesthetic choice, place implicit argument operands + // after the ordinary user argument registers. + SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; + + if (AMDGPUTargetMachine::EnableFixedFunctionABI && + Info.CallConv != CallingConv::AMDGPU_Gfx) { + // With a fixed ABI, allocate fixed registers before user arguments. + if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) + return false; + } + + OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg); + + if (!determineAssignments(Assigner, OutArgs, CCInfo)) + return false; + + // Do the actual argument marshalling. + AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff); + if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder)) + return false; + + handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs); + + // If we have -tailcallopt, we need to adjust the stack. We'll do the call + // sequence start and end here. + if (!IsSibCall) { + MIB->getOperand(1).setImm(FPDiff); + CallSeqStart.addImm(NumBytes).addImm(0); + // End the call sequence *before* emitting the call. Normally, we would + // tidy the frame up after the call. However, here, we've laid out the + // parameters so that when SP is reset, they will be in the correct + // location. + MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0); + } + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific + // instruction, it must have a register class matching the + // constraint of that instruction. + + // FIXME: We should define regbankselectable call instructions to handle + // divergent call targets. 
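For the FPDiff bookkeeping above, a small worked example with hypothetical sizes and the 16-byte stack alignment the surrounding comments assume:

#include <cassert>
#include <cstdint>

static uint64_t alignTo16(uint64_t N) { return (N + 15) & ~uint64_t(15); }

int main() {
  // The caller reserved 32 bytes of incoming stack arguments when its own
  // formal arguments were lowered.
  uint64_t NumReusableBytes = 32;
  // The tail callee needs 41 bytes of outgoing stack arguments, padded up to
  // the 16-byte stack alignment.
  uint64_t NumBytes = alignTo16(41); // 48
  // Negative: the callee needs 16 bytes more than the caller's incoming area.
  int64_t FPDiff = int64_t(NumReusableBytes) - int64_t(NumBytes);
  assert(FPDiff == -16);
  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  return 0;
}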
+ if (MIB->getOperand(0).isReg()) { + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB, + MIB->getDesc(), MIB->getOperand(0), 0)); + } + + MF.getFrameInfo().setHasTailCall(); + Info.LoweredTailCall = true; + return true; +} + bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { if (Info.IsVarArg) { @@ -1223,39 +1253,24 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI = MF.getRegInfo(); const SITargetLowering &TLI = *getTLI<SITargetLowering>(); const DataLayout &DL = F.getParent()->getDataLayout(); - CallingConv::ID CallConv = F.getCallingConv(); if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + Info.CallConv != CallingConv::AMDGPU_Gfx) { LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n"); return false; } - if (AMDGPU::isShader(CallConv)) { - LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n"); - return false; - } - SmallVector<ArgInfo, 8> OutArgs; + for (auto &OrigArg : Info.OrigArgs) + splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv); - SmallVector<ArgInfo, 8> SplitArg; - for (auto &OrigArg : Info.OrigArgs) { - splitToValueTypes(MIRBuilder, OrigArg, SplitArg, DL, Info.CallConv); - - processSplitArgs( - MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true, - // FIXME: We should probably be passing multiple registers to - // handleAssignments to do this - [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT, - int VTSplitIdx) { - unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT); - }); - - SplitArg.clear(); - } + SmallVector<ArgInfo, 8> InArgs; + if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) + splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv); // If we can lower as a tail call, do that instead. - bool CanTailCallOpt = false; + bool CanTailCallOpt = + isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs); // We must emit a tail call if we have musttail. if (Info.IsMustTailCall && !CanTailCallOpt) { @@ -1263,6 +1278,9 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; } + if (CanTailCallOpt) + return lowerTailCall(MIRBuilder, Info, OutArgs); + // Find out which ABI gets to decide where things go. CCAssignFn *AssignFnFixed; CCAssignFn *AssignFnVarArg; @@ -1295,7 +1313,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI) { + if (AMDGPUTargetMachine::EnableFixedFunctionABI && + Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1303,26 +1322,18 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Do the actual argument marshalling. 
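Marshalling is now split into two phases: an assigner first decides, for each split argument, whether it lands in a register or at a stack offset, and a handler then emits the corresponding copies or stores. A schematic sketch of that split, using made-up types rather than the GlobalISel CallLowering interfaces:

#include <iostream>
#include <vector>

// Phase 1 result: where one (already split and legalized) value goes.
struct Assignment {
  bool InReg;
  unsigned RegOrOffset; // register number, or byte offset on the stack
};

// Phase 1: a toy assigner with four 32-bit argument registers, everything
// else spilled to 4-byte stack slots.
std::vector<Assignment> determineToyAssignments(unsigned NumParts) {
  std::vector<Assignment> Locs;
  unsigned NextReg = 0, NextOffset = 0;
  for (unsigned I = 0; I != NumParts; ++I) {
    if (NextReg < 4) {
      Locs.push_back({true, NextReg++});
    } else {
      Locs.push_back({false, NextOffset});
      NextOffset += 4;
    }
  }
  return Locs;
}

// Phase 2: a toy handler that "emits" the copies/stores for each location.
void handleToyAssignments(const std::vector<Assignment> &Locs) {
  for (const Assignment &A : Locs) {
    if (A.InReg)
      std::cout << "copy part to reg " << A.RegOrOffset << "\n";
    else
      std::cout << "store part to [sp+" << A.RegOrOffset << "]\n";
  }
}

int main() {
  handleToyAssignments(determineToyAssignments(6));
  return 0;
}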
SmallVector<Register, 8> PhysRegs; - AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, - AssignFnVarArg, false); - if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler)) + + OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg); + if (!determineAssignments(Assigner, OutArgs, CCInfo)) + return false; + + AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false); + if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder)) return false; const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (!ST.enableFlatScratch()) { - // Insert copies for the SRD. In the HSA case, this should be an identity - // copy. - auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32), - MFI->getScratchRSrcReg()); - MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); - MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit); - } - - for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) { - MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second); - MIB.addReg(ArgReg.first, RegState::Implicit); - } + handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -1340,55 +1351,32 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, 1)); } - auto OrigInsertPt = MIRBuilder.getInsertPt(); - // Now we can add the actual call instruction to the correct position. MIRBuilder.insertInstr(MIB); - // Insert this now to give us an anchor point for managing the insert point. - MachineInstrBuilder CallSeqEnd = - MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN); - - SmallVector<ArgInfo, 8> InArgs; - if (!Info.CanLowerReturn) { - insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs, - Info.DemoteRegister, Info.DemoteStackIndex); - } else if (!Info.OrigRet.Ty->isVoidTy()) { - SmallVector<ArgInfo, 8> PreSplitRetInfos; - - splitToValueTypes( - MIRBuilder, Info.OrigRet, PreSplitRetInfos/*InArgs*/, DL, Info.CallConv); - - processSplitArgs(MIRBuilder, Info.OrigRet, - PreSplitRetInfos, InArgs/*SplitRetInfos*/, DL, Info.CallConv, false, - [&](ArrayRef<Register> Regs, Register DstReg, - LLT LLTy, LLT PartLLT, int VTSplitIdx) { - assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]); - packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx], - Regs, LLTy, PartLLT); - }); - } - - // Make sure the raw argument copies are inserted before the marshalling to - // the original types. - MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd); - // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) { CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, Info.IsVarArg); - CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); - if (!handleAssignments(MIRBuilder, InArgs, Handler)) + IncomingValueAssigner Assigner(RetAssignFn); + CallReturnHandler Handler(MIRBuilder, MRI, MIB); + if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder, + Info.CallConv, Info.IsVarArg)) return false; } uint64_t CalleePopBytes = NumBytes; - CallSeqEnd.addImm(0) + + MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN) + .addImm(0) .addImm(CalleePopBytes); - // Restore the insert point to after the call sequence. 
- MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt); + if (!Info.CanLowerReturn) { + insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs, + Info.DemoteRegister, Info.DemoteStackIndex); + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 1312388e4a38..569c6d75204d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -19,28 +19,16 @@ namespace llvm { class AMDGPUTargetLowering; +class GCNSubtarget; class MachineInstrBuilder; +class SIMachineFunctionInfo; class AMDGPUCallLowering final : public CallLowering { - void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy, + void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, uint64_t Offset) const; - void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, - Align Alignment, Register DstReg) const; - - /// A function of this type is used to perform value split action. - using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>; - - void splitToValueTypes(MachineIRBuilder &B, const ArgInfo &OrigArgInfo, - SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, CallingConv::ID CallConv) const; - - void processSplitArgs(MachineIRBuilder &B, const ArgInfo &OrigArgInfo, - const SmallVectorImpl<ArgInfo> &SplitArg, - SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, CallingConv::ID CallConv, - bool IsOutgoing, - SplitArgTy PerformArgSplit) const; + void lowerParameter(MachineIRBuilder &B, ArgInfo &AI, uint64_t Offset, + Align Alignment) const; bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv, SmallVectorImpl<BaseArgInfo> &Outs, @@ -68,6 +56,29 @@ public: SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs, CallLoweringInfo &Info) const; + bool + doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl<ArgInfo> &InArgs) const; + + bool + areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &OutArgs) const; + + /// Returns true if the call can be lowered as a tail call. 
+ bool + isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &InArgs, + SmallVectorImpl<ArgInfo> &OutArgs) const; + + void handleImplicitCallArguments( + MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, + const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI, + ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const; + + bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &OutArgs) const; bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 250c42776297..90b52395b76c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -34,16 +34,13 @@ def CC_SI_Gfx : CallingConv<[ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 ]>>>, - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, - CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, - CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, - CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>, - CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, - CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> ]>; def RetCC_SI_Gfx : CallingConv<[ + CCIfType<[i1], CCPromoteToType<i32>>, + CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, + // 0-3 are reserved for the stack buffer descriptor // 32 is reserved for the stack pointer CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ @@ -74,14 +71,6 @@ def RetCC_SI_Gfx : CallingConv<[ VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 ]>>>, - - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, - CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, - CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, - CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>, - CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, - CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> ]>; def CC_SI_SHADER : CallingConv<[ @@ -118,6 +107,7 @@ def CC_SI_SHADER : CallingConv<[ ]>; def RetCC_SI_Shader : CallingConv<[ + CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, CCIfType<[i32, i16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, @@ -175,6 +165,10 @@ def CSR_AMDGPU_VGPRs : CalleeSavedRegs< (sequence "VGPR%u", 248, 255)) >; +def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs< + (sequence "AGPR%u", 32, 255) +>; + def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< (sequence "SGPR%u", 32, 105) >; @@ -184,6 +178,13 @@ def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< (sequence "VGPR%u", 0, 255) >; +def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs< + (sequence "AGPR%u", 0, 255) +>; +def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs< + (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs) +>; + // Just to get the regmask, not for calling convention purposes. 
def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI) @@ -193,6 +194,10 @@ def CSR_AMDGPU_HighRegs : CalleeSavedRegs< (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) >; +def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs< + (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255) +>; + def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; // Calling convention for leaf functions @@ -205,13 +210,7 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, - CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, - CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, - CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>, - CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, - CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> ]>; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 2556996df97f..60e79c2c6c2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -200,6 +201,7 @@ public: AMDGPUCodeGenPrepare() : FunctionPass(ID) {} bool visitFDiv(BinaryOperator &I); + bool visitXor(BinaryOperator &I); bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); @@ -807,9 +809,34 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { return !!NewFDiv; } +bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) { + // Match the Xor instruction, its type and its operands + IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0)); + ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1)); + if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1) + return visitBinaryOperator(I); + + // Check if the Call is an intrinsic intruction to amdgcn_class intrinsic + // has only one use + if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class || + !IntrinsicCall->hasOneUse()) + return visitBinaryOperator(I); + + // "Not" the second argument of the intrinsic call + ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1)); + if (!Arg) + return visitBinaryOperator(I); + + IntrinsicCall->setOperand( + 1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff)); + I.replaceAllUsesWith(IntrinsicCall); + I.eraseFromParent(); + return true; +} + static bool hasUnsafeFPMath(const Function &F) { Attribute Attr = F.getFnAttribute("unsafe-fp-math"); - return Attr.getValueAsString() == "true"; + return Attr.getValueAsBool(); } static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index a8399176bb4a..c6273adca50f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -37,25 +37,54 @@ def cvt_f32_ubyteN : GICombineRule< [{ return 
PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]), (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; +def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">; + +def clamp_i64_to_i16 : GICombineRule< + (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo), + (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16, + [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]), + (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>; + +def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">; + +def int_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_SMAX, + G_SMIN, + G_UMAX, + G_UMIN):$min_or_max, + [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + +def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; + +def remove_fcanonicalize : GICombineRule< + (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo), + (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize, + [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]), + (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; - def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< - "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> { + "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> { let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; + let StateClass = "AMDGPUPreLegalizerCombinerHelperState"; } def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, - uchar_to_float, cvt_f32_ubyteN]> { + uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; let AdditionalArguments = []; } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", []> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; + let StateClass = "AMDGPURegBankCombinerHelperState"; + let AdditionalArguments = []; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h index 041d6deef243..87b459f7b1e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H + #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> @@ -14,3 +17,5 @@ namespace llvm { std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation(); } // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index bba03736d01a..521c8f261a00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -70,10 +70,10 @@ def gi_smrd_sgpr : def gi_flat_offset : GIComplexOperandMatcher<s64, "selectFlatOffset">, - GIComplexPatternEquiv<FLATOffset>; -def gi_flat_offset_signed : - GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">, - GIComplexPatternEquiv<FLATOffsetSigned>; + GIComplexPatternEquiv<FlatOffset>; +def gi_global_offset : + GIComplexOperandMatcher<s64, "selectGlobalOffset">, + GIComplexPatternEquiv<GlobalOffset>; def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; @@ -86,7 +86,7 @@ def gi_mubuf_scratch_offen : GIComplexPatternEquiv<MUBUFScratchOffen>; def gi_flat_scratch_offset : - GIComplexOperandMatcher<s32, "selectFlatOffsetSigned">, + GIComplexOperandMatcher<s32, "selectScratchOffset">, GIComplexPatternEquiv<ScratchOffset>; def gi_flat_scratch_saddr : @@ -113,14 +113,6 @@ def gi_mubuf_offset : GIComplexOperandMatcher<s64, "selectMUBUFOffset">, GIComplexPatternEquiv<MUBUFOffset>; -def gi_mubuf_addr64_atomic : - GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">, - GIComplexPatternEquiv<MUBUFAddr64Atomic>; - -def gi_mubuf_offset_atomic : - GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">, - GIComplexPatternEquiv<MUBUFOffsetAtomic>; - def gi_smrd_buffer_imm : GIComplexOperandMatcher<s64, "selectSMRDBufferImm">, GIComplexPatternEquiv<SMRDBufferImm>; @@ -136,6 +128,8 @@ def gi_smrd_buffer_imm32 : def : GINodeEquiv<G_LOAD, AMDGPUld_glue> { let CheckMMOIsNonAtomic = 1; + let IfSignExtend = G_SEXTLOAD; + let IfZeroExtend = G_ZEXTLOAD; } def : GINodeEquiv<G_STORE, AMDGPUst_glue> { @@ -174,6 +168,10 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>; def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>; def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; +def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; +def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; +def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; + def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>; def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>; @@ -216,6 +214,8 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; @@ -302,16 +302,16 @@ foreach Ty = [i64, p0, p1, p4] in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; } -def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">, +def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm">, GISDNodeXFormEquiv<as_i32timm>; -def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">, +def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">, GISDNodeXFormEquiv<as_i16timm>; -def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm8">, +def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm">, GISDNodeXFormEquiv<as_i8timm>; -def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">, +def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm">, GISDNodeXFormEquiv<as_i1timm>; def gi_NegateImm : 
GICustomOperandRenderer<"renderNegateImm">, @@ -323,17 +323,14 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">, def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">, GISDNodeXFormEquiv<IMMPopCount>; -def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">, - GISDNodeXFormEquiv<extract_glc>; - -def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">, - GISDNodeXFormEquiv<extract_slc>; - -def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">, - GISDNodeXFormEquiv<extract_dlc>; +def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">, + GISDNodeXFormEquiv<extract_cpol>; def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">, GISDNodeXFormEquiv<extract_swz>; +def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">, + GISDNodeXFormEquiv<set_glc>; + def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">, GISDNodeXFormEquiv<frameindex_to_targetframeindex>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index b3bafc5b2720..cabdc6998011 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -41,6 +41,20 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { return std::make_pair(Def->getOperand(1).getReg(), Offset); } + // Handle G_PTRTOINT (G_PTR_ADD base, const) case + if (Def->getOpcode() == TargetOpcode::G_PTRTOINT) { + MachineInstr *Base; + if (mi_match(Def->getOperand(1).getReg(), MRI, + m_GPtrAdd(m_MInstr(Base), m_ICst(Offset)))) { + // If Base was int converted to pointer, simply return int and offset. + if (Base->getOpcode() == TargetOpcode::G_INTTOPTR) + return std::make_pair(Base->getOperand(1).getReg(), Offset); + + // Register returned here will be of pointer type. 
+ return std::make_pair(Base->getOperand(0).getReg(), Offset); + } + } + return std::make_pair(Reg, 0); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 404e0fcd1166..14d3a3fb7997 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/Register.h" #include <utility> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 39f9092ce77c..8eeda7b67b73 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -226,8 +226,8 @@ MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF, void MetadataStreamerV2::emitVersion() { auto &Version = HSAMetadata.mVersion; - Version.push_back(VersionMajor); - Version.push_back(VersionMinor); + Version.push_back(VersionMajorV2); + Version.push_back(VersionMinorV2); } void MetadataStreamerV2::emitPrintf(const Module &Mod) { @@ -435,7 +435,8 @@ bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { return TargetStreamer.EmitHSAMetadata(getHSAMetadata()); } -void MetadataStreamerV2::begin(const Module &Mod) { +void MetadataStreamerV2::begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) { emitVersion(); emitPrintf(Mod); } @@ -608,8 +609,8 @@ MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const { void MetadataStreamerV3::emitVersion() { auto Version = HSAMetadataDoc->getArrayNode(); - Version.push_back(Version.getDocument()->getNode(VersionMajor)); - Version.push_back(Version.getDocument()->getNode(VersionMinor)); + Version.push_back(Version.getDocument()->getNode(VersionMajorV3)); + Version.push_back(Version.getDocument()->getNode(VersionMinorV3)); getRootMetadata("amdhsa.version") = Version; } @@ -881,7 +882,8 @@ bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) { return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true); } -void MetadataStreamerV3::begin(const Module &Mod) { +void MetadataStreamerV3::begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) { emitVersion(); emitPrintf(Mod); getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); @@ -921,6 +923,30 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, Kernels.push_back(Kern); } +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV4 +//===----------------------------------------------------------------------===// + +void MetadataStreamerV4::emitVersion() { + auto Version = HSAMetadataDoc->getArrayNode(); + Version.push_back(Version.getDocument()->getNode(VersionMajorV4)); + Version.push_back(Version.getDocument()->getNode(VersionMinorV4)); + getRootMetadata("amdhsa.version") = Version; +} + +void MetadataStreamerV4::emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID) { + getRootMetadata("amdhsa.target") = + HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true); +} + +void MetadataStreamerV4::begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) { + emitVersion(); + emitTargetID(TargetID); + emitPrintf(Mod); + getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); +} + } // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 1c6db14b85cd..4824b4cf37c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/Alignment.h" @@ -40,7 +41,8 @@ public: virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0; - virtual void begin(const Module &Mod) = 0; + virtual void begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) = 0; virtual void end() = 0; @@ -48,8 +50,9 @@ public: const SIProgramInfo &ProgramInfo) = 0; }; -class MetadataStreamerV3 final : public MetadataStreamer { -private: +// TODO: Rename MetadataStreamerV3 -> MetadataStreamerMsgPackV3. +class MetadataStreamerV3 : public MetadataStreamer { +protected: std::unique_ptr<msgpack::Document> HSAMetadataDoc = std::make_unique<msgpack::Document>(); @@ -108,7 +111,8 @@ public: bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; - void begin(const Module &Mod) override; + void begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) override; void end() override; @@ -116,6 +120,21 @@ public: const SIProgramInfo &ProgramInfo) override; }; +// TODO: Rename MetadataStreamerV4 -> MetadataStreamerMsgPackV4. +class MetadataStreamerV4 final : public MetadataStreamerV3 { + void emitVersion(); + + void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID); + +public: + MetadataStreamerV4() = default; + ~MetadataStreamerV4() = default; + + void begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) override; +}; + +// TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2. class MetadataStreamerV2 final : public MetadataStreamer { private: Metadata HSAMetadata; @@ -172,7 +191,8 @@ public: bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; - void begin(const Module &Mod) override; + void begin(const Module &Mod, + const IsaInfo::AMDGPUTargetID &TargetID) override; void end() override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 340f4ac6f57a..a3106ded1e38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -107,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool EnableLateStructurizeCFG; + // Instructions that will be lowered with a final instruction that zeros the + // high result bits. 
+ bool fp16SrcZerosHighBits(unsigned Opc) const; + public: explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default) @@ -188,15 +192,9 @@ private: SDValue &Offset1, unsigned Size) const; bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; + SDValue &Idxen, SDValue &Addr64) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, SDValue &Offset, - SDValue &SLC) const; + SDValue &SOffset, SDValue &Offset) const; bool SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const; @@ -204,17 +202,17 @@ private: SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - template <bool IsSigned> + bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset, uint64_t FlatVariant) const; bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset) const; + bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset) const; + bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, @@ -322,6 +320,16 @@ static SDValue stripBitcast(SDValue Val) { // Figure out if this is really an extract of the high 16-bits of a dword. static bool isExtractHiElt(SDValue In, SDValue &Out) { In = stripBitcast(In); + + if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { + if (!Idx->isOne()) + return false; + Out = In.getOperand(0); + return true; + } + } + if (In.getOpcode() != ISD::TRUNCATE) return false; @@ -341,6 +349,13 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) { // Look through operations that obscure just looking at the low 16-bits of the // same register. static SDValue stripExtractLoElt(SDValue In) { + if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { + if (Idx->isNullValue() && In.getValueSizeInBits() <= 32) + return In.getOperand(0); + } + } + if (In.getOpcode() == ISD::TRUNCATE) { SDValue Src = In.getOperand(0); if (Src.getValueType().getSizeInBits() == 32) @@ -391,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { return SelectionDAGISel::runOnMachineFunction(MF); } +bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { + // XXX - only need to list legal operations. 
+ switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FCANONICALIZE: + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::FABS: + // Fabs is lowered to a bit operation, but it's an and which will clear the + // high bits anyway. + case ISD::FSQRT: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FPOWI: + case ISD::FPOW: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case AMDGPUISD::FRACT: + case AMDGPUISD::CLAMP: + case AMDGPUISD::COS_HW: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: + case AMDGPUISD::LDEXP: + // On gfx10, all 16-bit instructions preserve the high bits. + return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9; + case ISD::FP_ROUND: + // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the + // high bits on gfx9. + // TODO: If we had the source node we could see if the source was fma/mad + return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case ISD::FMA: + case ISD::FMAD: + case AMDGPUISD::DIV_FIXUP: + return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + default: + // fcopysign, select and others may be lowered to 32-bit bit operations + // which don't zero the high bits. + return false; + } +} + bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const { assert(Subtarget->d16PreservesUnusedBits()); MVT VT = N->getValueType(0).getSimpleVT(); @@ -1374,13 +1451,10 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, return true; } -bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, - SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const { +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, + SDValue &Offen, SDValue &Idxen, + SDValue &Addr64) const { // Subtarget prefers to use flat instruction // FIXME: This should be a pattern predicate and not reach here if (Subtarget->useFlatForGlobal()) @@ -1388,14 +1462,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDLoc DL(Addr); - if (!GLC.getNode()) - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - if (!SLC.getNode()) - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); - Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1472,9 +1538,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, - SDValue &DLC, SDValue &SWZ) const { + SDValue &Offset) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. 
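One thing a query like fp16SrcZerosHighBits enables is dropping an explicit mask when two 16-bit halves are packed into a 32-bit register: if the producer of the low half is known to have written zeros to bits 16-31, the pack needs only a shift and an or. A scalar illustration with hypothetical values (the real decision is per-opcode and per-subtarget, as the switch above shows):

#include <cassert>
#include <cstdint>

// Pack two 16-bit values held in 32-bit registers into one 32-bit word.
// Conservative form: mask the low half in case its producer left garbage in
// bits 16-31.
uint32_t packConservative(uint32_t Lo, uint32_t Hi) {
  return (Lo & 0xffffu) | (Hi << 16);
}

// Form usable when the producer of Lo is known to zero bits 16-31, which is
// the property the query checks for: the mask can be dropped.
uint32_t packKnownZeroHigh(uint32_t Lo, uint32_t Hi) {
  return Lo | (Hi << 16);
}

int main() {
  uint32_t Lo = 0x00003c00u; // producer already zeroed the high half
  uint32_t Hi = 0x0000bc00u;
  assert(packConservative(Lo, Hi) == packKnownZeroHigh(Lo, Hi));
  return 0;
}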
@@ -1482,8 +1546,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (!Subtarget->hasAddr64()) return false; - if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC, SWZ)) + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); @@ -1500,21 +1563,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; } -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { - SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC, SWZ; - - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); -} - -static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { - auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); - return PSV && PSV->isStack(); -} - std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { SDLoc DL(N); @@ -1551,13 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits); VAddr = SDValue(MovHighBits, 0); - // In a call sequence, stores to the argument stack area are relative to the - // stack pointer. - const MachinePointerInfo &PtrInfo - = cast<MemSDNode>(Parent)->getPointerInfo(); - SOffset = isStackPtrRelative(PtrInfo) - ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) - : CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); return true; } @@ -1600,44 +1642,65 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, return true; } +static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) { + if (Val.getOpcode() != ISD::CopyFromReg) + return false; + auto RC = + TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg()); + return RC && TRI.isSGPRClass(RC); +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { - ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); - if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) - return false; - - SDLoc DL(Addr); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + SDLoc DL(Addr); - SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + // CopyFromReg <sgpr> + if (IsCopyFromSGPR(*TRI, Addr)) { + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; + } - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); + ConstantSDNode *CAddr; + if (Addr.getOpcode() == ISD::ADD) { + // Add (CopyFromReg <sgpr>) <constant> + CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); + if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) + return false; + if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0))) + return false; - // FIXME: Get from MachinePointerInfo? We should only be using the frame - // offset if we know this is in a call sequence. - SOffset = isStackPtrRelative(PtrInfo) - ? 
CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) - : CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = Addr.getOperand(0); + } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) && + SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) { + // <constant> + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } else { + return false; + } + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &SOffset, SDValue &Offset, - SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const { + SDValue &SOffset, SDValue &Offset + ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC, SWZ)) + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && @@ -1656,21 +1719,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return false; } -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset - ) const { - SDValue GLC, SLC, TFE, DLC, SWZ; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); -} -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset, - SDValue &SLC) const { - SDValue GLC, TFE, DLC, SWZ; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); -} - // Find a load or store from corresponding pattern root. // Roots may be build_vector, bitconvert or their combinations. 
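The scratch-offset matcher above splits an address of the form sgpr_base + constant into an SGPR SOffset plus an instruction immediate, provided the constant is a legal MUBUF immediate offset; the bound used below (a 12-bit unsigned value) is an assumption for illustration. A sketch of that decomposition:

#include <cassert>
#include <cstdint>
#include <optional>

struct ScratchAddr {
  unsigned SOffsetReg; // SGPR holding the dynamic part of the address
  uint32_t ImmOffset;  // immediate folded into the instruction
};

// Assumed bound for the MUBUF immediate offset field (12-bit unsigned).
static bool isLegalImmOffset(int64_t Imm) { return Imm >= 0 && Imm < 4096; }

// Try to match an "sgpr + imm" address; fail if the constant does not fit the
// immediate field, in which case extra address math would be needed.
std::optional<ScratchAddr> matchScratchOffset(unsigned BaseSGPR,
                                              int64_t Offset) {
  if (!isLegalImmOffset(Offset))
    return std::nullopt;
  return ScratchAddr{BaseSGPR, uint32_t(Offset)};
}

int main() {
  assert(matchScratchOffset(/*BaseSGPR=*/5, 4095).has_value());
  assert(!matchScratchOffset(/*BaseSGPR=*/5, 4096).has_value());
  return 0;
}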
static MemSDNode* findMemSDNode(SDNode *N) { @@ -1685,24 +1733,25 @@ static MemSDNode* findMemSDNode(SDNode *N) { llvm_unreachable("cannot find MemSDNode in the pattern!"); } -template <bool IsSigned> -bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset) const { +bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, + SDValue &VAddr, SDValue &Offset, + uint64_t FlatVariant) const { int64_t OffsetVal = 0; unsigned AS = findMemSDNode(N)->getAddressSpace(); - if (Subtarget->hasFlatInstOffsets() && - (!Subtarget->hasFlatSegmentOffsetBug() || - AS != AMDGPUAS::FLAT_ADDRESS)) { + bool CanHaveFlatSegmentOffsetBug = + Subtarget->hasFlatSegmentOffsetBug() && + FlatVariant == SIInstrFlags::FLAT && + (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS); + + if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { SDValue N0, N1; if (isBaseWithConstantOffset64(Addr, N0, N1)) { - uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { Addr = N0; OffsetVal = COffsetVal; } else { @@ -1719,8 +1768,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDLoc DL(N); uint64_t RemainderOffset; - std::tie(OffsetVal, RemainderOffset) - = TII->splitFlatOffset(COffsetVal, AS, IsSigned); + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); SDValue AddOffsetLo = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); @@ -1777,6 +1826,25 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, return true; } +bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr, + SDValue &VAddr, + SDValue &Offset) const { + return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT); +} + +bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr, + SDValue &VAddr, + SDValue &Offset) const { + return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal); +} + +bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, + SDValue &VAddr, + SDValue &Offset) const { + return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, + SIInstrFlags::FlatScratch); +} + // If this matches zero_extend i32:x, return x static SDValue matchZExtFromI32(SDValue Op) { if (Op.getOpcode() != ISD::ZERO_EXTEND) @@ -1802,126 +1870,144 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) { + if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; - } else if (!LHS->isDivergent() && COffsetVal > 0) { - SDLoc SL(N); - // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) + - // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) - = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true); - - if (isUInt<32>(RemainderOffset)) { - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SL, MVT::i32, - CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - SAddr = LHS; - Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), 
MVT::i16); - return true; + } else if (!LHS->isDivergent()) { + if (COffsetVal > 0) { + SDLoc SL(N); + // saddr + large_offset -> saddr + + // (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); + SAddr = LHS; + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + return true; + } } + + // We are adding a 64 bit SGPR and a constant. If constant bus limit + // is 1 we would need to perform 1 or 2 extra moves for each half of + // the constant and it is better to do a scalar add and then issue a + // single VALU instruction to materialize zero. Otherwise it is less + // instructions to perform VALU adds with immediates or inline literals. + unsigned NumLiterals = + !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) + + !TII->isInlineConstant(APInt(32, COffsetVal >> 32)); + if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) + return false; } } // Match the variable offset. - if (Addr.getOpcode() != ISD::ADD) { - if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || - isa<ConstantSDNode>(Addr)) - return false; - - // It's cheaper to materialize a single 32-bit zero for vaddr than the two - // moves required to copy a 64-bit SGPR to VGPR. - SAddr = Addr; - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, - CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); - return true; - } + if (Addr.getOpcode() == ISD::ADD) { + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); - LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); + if (!LHS->isDivergent()) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + SAddr = LHS; + VOffset = ZextRHS; + } + } - if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { - SAddr = LHS; - VOffset = ZextRHS; + if (!SAddr && !RHS->isDivergent()) { + // add (zero_extend (i32 vgpr)), (i64 sgpr) + if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + SAddr = RHS; + VOffset = ZextLHS; + } } - } - if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { - SAddr = RHS; - VOffset = ZextLHS; + if (SAddr) { + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; } } - if (!SAddr) + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || + isa<ConstantSDNode>(Addr)) return false; + // It's cheaper to materialize a single 32-bit zero for vaddr than the two + // moves required to copy a 64-bit SGPR to VGPR. 
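// As a rough sketch (the exact copy expansion is subtarget-dependent and the
// register numbers below are only illustrative):
//   copy s[0:1] into a VGPR pair for vaddr:   v_mov_b32 v0, s0
//                                             v_mov_b32 v1, s1
//   materialize a zero voffset instead:       v_mov_b32 v0, 0
// and keep the 64-bit SGPR pair as the saddr operand of the global instruction,
// so only one VALU move is needed.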
+ SAddr = Addr; + SDNode *VMov = + CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); return true; } +static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { + if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) { + SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); + } else if (SAddr.getOpcode() == ISD::ADD && + isa<FrameIndexSDNode>(SAddr.getOperand(0))) { + // Materialize this into a scalar move for scalar address to avoid + // readfirstlane. + auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0)); + SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), + FI->getValueType(0)); + SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr), + MVT::i32, TFI, SAddr.getOperand(1)), + 0); + } + + return SAddr; +} + // Match (32-bit SGPR base) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N, - SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, SDValue &SAddr, SDValue &Offset) const { if (Addr->isDivergent()) return false; - SAddr = Addr; + SDLoc DL(Addr); + int64_t COffsetVal = 0; if (CurDAG->isBaseWithConstantOffset(Addr)) { COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); + } else { + SAddr = Addr; } - if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) { - SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); - } else if (SAddr.getOpcode() == ISD::ADD && - isa<FrameIndexSDNode>(SAddr.getOperand(0))) { - // Materialize this into a scalar move for scalar address to avoid - // readfirstlane. - auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0)); - SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), - FI->getValueType(0)); - SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr), - MVT::i32, TFI, SAddr.getOperand(1)), - 0); - } + SAddr = SelectSAddrFI(CurDAG, SAddr); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) { - int64_t RemainderOffset = COffsetVal; - int64_t ImmField = 0; - const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true); - // Use signed division by a power of two to truncate towards 0. - int64_t D = 1LL << (NumBits - 1); - RemainderOffset = (COffsetVal / D) * D; - ImmField = COffsetVal - RemainderOffset; - - assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true)); - assert(RemainderOffset + ImmField == COffsetVal); + if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch)) { + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch); - COffsetVal = ImmField; + COffsetVal = SplitImmOffset; - SDLoc DL(N); SDValue AddOffset = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32, - SAddr, AddOffset), 0); + SAddr.getOpcode() == ISD::TargetFrameIndex + ? 
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) + : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32); + SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32, + SAddr, AddOffset), + 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); return true; } @@ -2364,35 +2450,32 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MachineSDNode *CmpSwap = nullptr; if (Subtarget->hasAddr64()) { - SDValue SRsrc, VAddr, SOffset, Offset, SLC; + SDValue SRsrc, VAddr, SOffset, Offset; - if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { + if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) { unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; SDValue CmpVal = Mem->getOperand(2); - SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1); + SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); // XXX - Do we care about glue operands? - SDValue Ops[] = { - CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain() - }; + SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol, + Mem->getChain()}; CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); } } if (!CmpSwap) { - SDValue SRsrc, SOffset, Offset, SLC; - if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { + SDValue SRsrc, SOffset, Offset; + if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) { unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; SDValue CmpVal = Mem->getOperand(2); - SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1); - SDValue Ops[] = { - CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain() - }; + SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); + SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()}; CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); } @@ -2623,7 +2706,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { Opcode = AMDGPU::SOFT_WQM; break; case Intrinsic::amdgcn_wwm: - Opcode = AMDGPU::WWM; + case Intrinsic::amdgcn_strict_wwm: + Opcode = AMDGPU::STRICT_WWM; + break; + case Intrinsic::amdgcn_strict_wqm: + Opcode = AMDGPU::STRICT_WQM; break; case Intrinsic::amdgcn_interp_p1_f16: SelectInterpP1F16(N); @@ -2773,18 +2860,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, if (isExtractHiElt(Hi, Hi)) Mods |= SISrcMods::OP_SEL_1; + unsigned VecSize = Src.getValueSizeInBits(); Lo = stripExtractLoElt(Lo); Hi = stripExtractLoElt(Hi); + if (Lo.getValueSizeInBits() > VecSize) { + Lo = CurDAG->getTargetExtractSubreg( + (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), + MVT::getIntegerVT(VecSize), Lo); + } + + if (Hi.getValueSizeInBits() > VecSize) { + Hi = CurDAG->getTargetExtractSubreg( + (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), + MVT::getIntegerVT(VecSize), Hi); + } + + assert(Lo.getValueSizeInBits() <= VecSize && + Hi.getValueSizeInBits() <= VecSize); + if (Lo == Hi && !isInlineImmediate(Lo.getNode())) { // Really a scalar input. Just select from the low half of the register to // avoid packing. 
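// For example (an assumed v2f16 operand, purely illustrative): if both halves
// of the packed source are extracts of element 0 of the same 32-bit value x,
// the operand is effectively scalar, so x can be used directly instead of
// emitting a REG_SEQUENCE / pack that would rebuild <lo, lo>.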
- Src = Lo; + if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { + Src = Lo; + } else { + assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); + + SDLoc SL(In); + SDValue Undef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, + Lo.getValueType()), 0); + auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID + : AMDGPU::SReg_64RegClassID; + const SDValue Ops[] = { + CurDAG->getTargetConstant(RC, SL, MVT::i32), + Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) }; + + Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, + Src.getValueType(), Ops), 0); + } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } + if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) { + uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF() + .bitcastToAPInt().getZExtValue(); + if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) { + Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + } + Mods = VecMods; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0b4b4776ad39..d68488ccb342 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -78,6 +78,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v5f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::LOAD, MVT::v6f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32); + + setOperationAction(ISD::LOAD, MVT::v7f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32); + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); @@ -99,9 +105,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v3i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32); + setOperationAction(ISD::LOAD, MVT::v4i64, Promote); AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v3f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32); + setOperationAction(ISD::LOAD, MVT::v4f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32); @@ -173,12 +185,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand); @@ -198,6 +212,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, 
MVT::v5f32, Promote); AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::STORE, MVT::v6f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32); + + setOperationAction(ISD::STORE, MVT::v7f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32); + setOperationAction(ISD::STORE, MVT::v8f32, Promote); AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); @@ -219,6 +239,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v3i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32); + + setOperationAction(ISD::STORE, MVT::v3f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32); + setOperationAction(ISD::STORE, MVT::v4i64, Promote); AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32); @@ -261,6 +287,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand); + setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); @@ -325,8 +356,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); @@ -335,6 +372,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); @@ -343,6 +384,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom); + 
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom); @@ -412,8 +455,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32 - }; + MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32}; for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. @@ -454,8 +496,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32 - }; + MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32}; for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); @@ -505,6 +546,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v5f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::SELECT, MVT::v6f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32); + + setOperationAction(ISD::SELECT, MVT::v7f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32); + // There are no libcalls of any kind. for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); @@ -846,9 +893,9 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64 || - (Subtarget->has16BitInsts() && VT == MVT::f16) || - (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16); + // Report this based on the end legalized type. 
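// e.g. a v4f32 fneg is reported free because its legalized f32 elements can
// typically fold the negation into a source modifier (assumed; the exact
// folding depends on the instructions ultimately selected).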
+ VT = VT.getScalarType(); + return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16; } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, @@ -1257,8 +1304,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); - case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); - case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: @@ -1304,7 +1352,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { - if (!MFI->isModuleEntryFunction()) { + if (!MFI->isModuleEntryFunction() && + !GV->getName().equals("llvm.amdgcn.module.lds")) { SDLoc DL(Op); const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported BadLDSDecl( @@ -1368,6 +1417,14 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SmallVector<SDValue, 8> Args; unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); EVT VT = Op.getValueType(); + EVT SrcVT = Op.getOperand(0).getValueType(); + + // For these types, we have some TableGen patterns except if the index is 1 + if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) || + (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) && + Start != 1) + return Op; + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); @@ -2579,33 +2636,77 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, return LowerINT_TO_FP64(Op, DAG, true); } -SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, +SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + assert(SrcVT == MVT::f32 || SrcVT == MVT::f64); - SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, - MVT::f64); - SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, - MVT::f64); - // TODO: Should this propagate fast-math-flags? - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + // The basic idea of converting a floating point number into a pair of 32-bit + // integers is illustrated as follows: + // + // tf := trunc(val); + // hif := floor(tf * 2^-32); + // lof := tf - hif * 2^32; // lof is always positive due to floor. + // hi := fptoi(hif); + // lo := fptoi(lof); + // + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src); + SDValue Sign; + if (Signed && SrcVT == MVT::f32) { + // However, a 32-bit floating point number has only 23 bits mantissa and + // it's not enough to hold all the significant bits of `lof` if val is + // negative. To avoid the loss of precision, We need to take the absolute + // value after truncating and flip the result back based on the original + // signedness. 
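// Worked example (assumed input, unsigned f64 case): val = 2^33 + 5
//   tf  = 8589934597.0
//   hif = floor(tf * 2^-32) = 2.0
//   lof = fma(hif, -2^32, tf) = 8589934597.0 - 8589934592.0 = 5.0
//   hi  = 2, lo = 5  =>  bitcasting <lo, hi> to i64 gives (2 << 32) | 5 = val
// For the signed f32 case handled below, the same math runs on fabs(trunc(val))
// and the result is flipped afterwards via (r ^ sign) - sign, where sign is
// all-ones when the truncated value was negative.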
+ Sign = DAG.getNode(ISD::SRA, SL, MVT::i32, + DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc), + DAG.getConstant(31, SL, MVT::i32)); + Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc); + } - SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + SDValue K0, K1; + if (SrcVT == MVT::f64) { + K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)), + SL, SrcVT); + K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), + SL, SrcVT); + } else { + K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL, + SrcVT); + K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL, + SrcVT); + } + // TODO: Should this propagate fast-math-flags? + SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0); + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul); - SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc); - SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, - MVT::i32, FloorMul); + SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT + : ISD::FP_TO_UINT, + SL, MVT::i32, FloorMul); SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); - SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}); + SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, + DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi})); + + if (Signed && SrcVT == MVT::f32) { + assert(Sign); + // Flip the result based on the signedness, which is either all 0s or 1s. + Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64, + DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign})); + // r := xor(r, sign) - sign; + Result = + DAG.getNode(ISD::SUB, SL, MVT::i64, + DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign); + } - return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); + return Result; } SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { @@ -2707,44 +2808,37 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); } -SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, - SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + unsigned OpOpcode = Op.getOpcode(); + EVT SrcVT = Src.getValueType(); + EVT DestVT = Op.getValueType(); - // TODO: Factor out code common with LowerFP_TO_UINT. + // Will be selected natively + if (SrcVT == MVT::f16 && DestVT == MVT::i16) + return Op; - EVT SrcVT = Src.getValueType(); - if (SrcVT == MVT::f16 || - (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { + // Promote i16 to i32 + if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { SDLoc DL(Op); - SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src); - return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32); + SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); } - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - // TODO: Factor out code common with LowerFP_TO_SINT. 
- - EVT SrcVT = Src.getValueType(); if (SrcVT == MVT::f16 || (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { SDLoc DL(Op); - SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32); + SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); + unsigned Ext = + OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); } - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, false); + if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) + return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); return SDValue(); } @@ -2787,8 +2881,8 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24; } -static SDValue simplifyI24(SDNode *Node24, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue simplifyMul24(SDNode *Node24, + TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; @@ -2890,9 +2984,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // Expand unaligned loads earlier than legalization. Due to visitation order // problems during legalization, the emitted instructions to pack and unpack // the bytes again are not eliminated in the case of an unaligned copy. - if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(), - LN->getMemOperand()->getFlags(), - &IsFast)) { + if (!allowsMisalignedMemoryAccesses( + VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { SDValue Ops[2]; if (VT.isVector()) @@ -2946,9 +3039,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, // order problems during legalization, the emitted instructions to pack and // unpack the bytes again are not eliminated in the case of an unaligned // copy. - if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(), - SN->getMemOperand()->getFlags(), - &IsFast)) { + if (!allowsMisalignedMemoryAccesses( + VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) return scalarizeVectorStore(SN, DAG); @@ -3010,7 +3102,7 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( switch (IID) { case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_mul_u24: - return simplifyI24(N, DCI); + return simplifyMul24(N, DCI); case Intrinsic::amdgcn_fract: case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rcp_legacy: @@ -3312,6 +3404,13 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. + if (!N->isDivergent()) + return SDValue(); + unsigned Size = VT.getSizeInBits(); if (VT.isVector() || Size > 64) return SDValue(); @@ -3362,6 +3461,15 @@ SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, if (!Subtarget->hasMulI24() || VT.isVector()) return SDValue(); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. 
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a + // valu op anyway) + if (Subtarget->hasSMulHi() && !N->isDivergent()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -3386,6 +3494,15 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) return SDValue(); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. + // This doesn't apply if no s_mul_hi is available (since we'll end up with a + // valu op anyway) + if (Subtarget->hasSMulHi() && !N->isDivergent()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -3985,11 +4102,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::MUL_I24: case AMDGPUISD::MUL_U24: case AMDGPUISD::MULHI_I24: - case AMDGPUISD::MULHI_U24: { - if (SDValue V = simplifyI24(N, DCI)) - return V; - return SDValue(); - } + case AMDGPUISD::MULHI_U24: + return simplifyMul24(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); case ISD::FNEG: @@ -4159,8 +4273,13 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); + // Stores to the argument stack area are relative to the stack pointer. + SDValue SP = + DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32); + Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr); SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), MachineMemOperand::MODereferenceable); return Store; @@ -4297,7 +4416,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_PK_I16_I32) NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(FP_TO_FP16) - NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) @@ -4350,6 +4468,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) + NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } @@ -4425,8 +4545,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( break; } - case AMDGPUISD::FP_TO_FP16: - case AMDGPUISD::FP16_ZEXT: { + case AMDGPUISD::FP_TO_FP16: { unsigned BitWidth = Known.getBitWidth(); // High bits are zero. 
@@ -4573,7 +4692,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( case AMDGPUISD::BUFFER_LOAD_USHORT: return 16; case AMDGPUISD::FP_TO_FP16: - case AMDGPUISD::FP16_ZEXT: return 16; default: return 1; @@ -4727,3 +4845,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::None; } } + +bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal( + unsigned Opc, LLT Ty1, LLT Ty2) const { + return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index ce3618f83130..e61021d451f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -64,10 +64,9 @@ protected: SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; @@ -328,6 +327,9 @@ public: } AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + + bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1, + LLT Ty2) const override; }; namespace AMDGPUISD { @@ -458,9 +460,6 @@ enum NodeType : unsigned { // are known 0. FP_TO_FP16, - // Wrapper around fp16 results that are known to zero the high bits. - FP16_ZEXT, - /// This node is for VLIW targets and it is used to represent a vector /// that is stored in consecutive registers with the same channel. /// For example: @@ -523,6 +522,8 @@ enum NodeType : unsigned { BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_FMIN, + BUFFER_ATOMIC_FMAX, LAST_AMDGPU_ISD_NUMBER }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 894677ec68b6..0f9cb712f820 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; -def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; @@ -213,6 +212,8 @@ def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", SDTIntToFPOp, []>; +def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", + AMDGPUIntPackOp, []>; // urecip - This operation is a helper for integer division, it returns the // result of 1 / a as a fractional unsigned integer. 
@@ -311,7 +312,7 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", SDTCisInt<4>]>, []>; -def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; +def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ @@ -461,3 +462,7 @@ def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp) def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc), [(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc), (AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>; + +def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_perm node:$src0, node:$src1, node:$src2), + (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index bd577a6fb8c5..323aaaf70cd4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -18,6 +18,7 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -59,11 +60,13 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } -void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, - CodeGenCoverage &CoverageInfo) { +void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB, + CodeGenCoverage &CoverageInfo, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { MRI = &MF.getRegInfo(); Subtarget = &MF.getSubtarget<GCNSubtarget>(); - InstructionSelector::setupMF(MF, KB, CoverageInfo); + InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); } bool AMDGPUInstructionSelector::isVCC(Register Reg, @@ -136,20 +139,29 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { const TargetRegisterClass *SrcRC = TRI.getConstrainedRegClassForOperand(Src, *MRI); - Register MaskedReg = MRI->createVirtualRegister(SrcRC); + Optional<ValueAndVReg> ConstVal = + getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true); + if (ConstVal) { + unsigned MovOpc = + STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg) + .addImm(ConstVal->Value.getBoolValue() ? -1 : 0); + } else { + Register MaskedReg = MRI->createVirtualRegister(SrcRC); - // We can't trust the high bits at this point, so clear them. + // We can't trust the high bits at this point, so clear them. - // TODO: Skip masking high bits if def is known boolean. + // TODO: Skip masking high bits if def is known boolean. - unsigned AndOpc = TRI.isSGPRClass(SrcRC) ? - AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; - BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) - .addImm(1) - .addReg(SrcReg); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) - .addImm(0) - .addReg(MaskedReg); + unsigned AndOpc = + TRI.isSGPRClass(SrcRC) ? 
AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; + BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) + .addImm(1) + .addReg(SrcReg); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) + .addImm(0) + .addReg(MaskedReg); + } if (!MRI->getRegClassOrNull(SrcReg)) MRI->setRegClass(SrcReg, SrcRC); @@ -578,7 +590,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( return true; const LLT S32 = LLT::scalar(32); - const LLT V2S16 = LLT::vector(2, 16); + const LLT V2S16 = LLT::fixed_vector(2, 16); Register Dst = MI.getOperand(0).getReg(); if (MRI->getType(Dst) != V2S16) @@ -743,6 +755,30 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID && + "scalar BFX instructions are expanded in regbankselect"); + assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 && + "64-bit vector BFX instructions are expanded in regbankselect"); + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + + bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX; + unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg) + .addReg(SrcReg) + .addReg(OffsetReg) + .addReg(WidthReg); + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { if (STI.getLDSBankCount() != 16) return selectImpl(MI, *CoverageInfo); @@ -916,8 +952,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return constrainCopyLikeIntrin(I, AMDGPU::WQM); case Intrinsic::amdgcn_softwqm: return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); + case Intrinsic::amdgcn_strict_wwm: case Intrinsic::amdgcn_wwm: - return constrainCopyLikeIntrin(I, AMDGPU::WWM); + return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM); + case Intrinsic::amdgcn_strict_wqm: + return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM); case Intrinsic::amdgcn_writelane: return selectWritelane(I); case Intrinsic::amdgcn_div_scale: @@ -1375,7 +1414,24 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - MIB.addReg(VSrc); + + if (STI.needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(VSrc, 0, MI.getOperand(1).getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + MIB.addReg(NewVR, 0, AMDGPU::sub0); + MIB.addReg(NewVR, RegState::Implicit); + } else { + MIB.addReg(VSrc); + } + if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) return false; } @@ -1446,24 +1502,6 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, return TexFailCtrl == 0; } -static bool parseCachePolicy(uint64_t Value, - bool *GLC, bool *SLC, bool *DLC) { - if (GLC) { - *GLC = (Value & 0x1) ? 
1 : 0; - Value &= ~(uint64_t)0x1; - } - if (SLC) { - *SLC = (Value & 0x2) ? 1 : 0; - Value &= ~(uint64_t)0x2; - } - if (DLC) { - *DLC = (Value & 0x4) ? 1 : 0; - Value &= ~(uint64_t)0x4; - } - - return Value == 0; -} - bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); @@ -1504,8 +1542,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( const bool IsA16 = (Flags & 1) != 0; const bool IsG16 = (Flags & 2) != 0; - // A16 implies 16 bit gradients - if (IsA16 && !IsG16) + // A16 implies 16 bit gradients if subtarget doesn't support G16 + if (IsA16 && !STI.hasG16() && !IsG16) return false; unsigned DMask = 0; @@ -1589,21 +1627,11 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( // TODO: Check this in verifier. assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); - bool GLC = false; - bool SLC = false; - bool DLC = false; - if (BaseOpcode->Atomic) { - GLC = true; // TODO no-return optimization - if (!parseCachePolicy( - MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr, - &SLC, IsGFX10Plus ? &DLC : nullptr)) - return false; - } else { - if (!parseCachePolicy( - MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC, - &SLC, IsGFX10Plus ? &DLC : nullptr)) - return false; - } + unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); + if (BaseOpcode->Atomic) + CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + if (CPol & ~AMDGPU::CPol::ALL) + return false; int NumVAddrRegs = 0; int NumVAddrDwords = 0; @@ -1661,8 +1689,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; MIB.addDef(TmpReg); - BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) - .addReg(TmpReg, RegState::Kill, SubReg); + if (!MRI->use_empty(VDataOut)) { + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) + .addReg(TmpReg, RegState::Kill, SubReg); + } } else { MIB.addDef(VDataOut); // vdata output @@ -1689,11 +1719,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (IsGFX10Plus) MIB.addImm(DimInfo->Encoding); MIB.addImm(Unorm); - if (IsGFX10Plus) - MIB.addImm(DLC); - MIB.addImm(GLC); - MIB.addImm(SLC); + MIB.addImm(CPol); MIB.addImm(IsA16 && // a16 or r128 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); if (IsGFX10Plus) @@ -1706,6 +1733,38 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (BaseOpcode->HasD16) MIB.addImm(IsD16 ? -1 : 0); + if (IsTexFail) { + // An image load instruction with TFE/LWE only conditionally writes to its + // result registers. Initialize them to zero so that we always get well + // defined result values. + assert(VDataOut && !VDataIn); + Register Tied = MRI->cloneVirtualRegister(VDataOut); + Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero) + .addImm(0); + auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4); + if (STI.usePRTStrictNull()) { + // With enable-prt-strict-null enabled, initialize all result registers to + // zero. + auto RegSeq = + BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); + for (auto Sub : Parts) + RegSeq.addReg(Zero).addImm(Sub); + } else { + // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE + // result register. 
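// e.g. (a hypothetical image load with 4 data dwords plus the TFE/LWE status
// dword, i.e. 5 result sub-registers):
//   enable-prt-strict-null on : tied init = { 0, 0, 0, 0, 0 }
//   enable-prt-strict-null off: tied init = { undef, undef, undef, undef, 0 }
// i.e. without strict-null only the conditionally written status dword is
// pre-zeroed.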
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); + auto RegSeq = + BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); + for (auto Sub : Parts.drop_back(1)) + RegSeq.addReg(Undef).addImm(Sub); + RegSeq.addReg(Zero).addImm(Parts.back()); + } + MIB.addReg(Tied, RegState::Implicit); + MIB->tieOperands(0, MIB->getNumOperands() - 1); + } + MI.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } @@ -1733,7 +1792,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); case Intrinsic::amdgcn_global_atomic_fadd: - return selectGlobalAtomicFaddIntrinsic(I); + return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); default: { return selectImpl(I, *CoverageInfo); } @@ -1848,7 +1907,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { return false; } - if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { + if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); @@ -2336,6 +2395,13 @@ void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( MachineInstr &I) const { + if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) { + const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); + unsigned AS = PtrTy.getAddressSpace(); + if (AS == AMDGPUAS::GLOBAL_ADDRESS) + return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2)); + } + initM0(I); return selectImpl(I, *CoverageInfo); } @@ -2386,8 +2452,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( MIB.addImm(0); MIB.addImm(Offset); - MIB.addImm(1); // glc - MIB.addImm(0); // slc + MIB.addImm(AMDGPU::CPol::GLC); MIB.cloneMemRefs(MI); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) @@ -2772,7 +2837,7 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( Register Src1Reg = MI.getOperand(2).getReg(); ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); - const LLT V2S16 = LLT::vector(2, 16); + const LLT V2S16 = LLT::fixed_vector(2, 16); if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) return false; @@ -2895,6 +2960,8 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( MachineInstr &MI) const { + if (STI.hasGFX90AInsts()) + return selectImpl(MI, *CoverageInfo); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2951,7 +3018,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { - Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) .addReg(VIndex.getReg()) .addImm(AMDGPU::sub0) @@ -2968,7 +3035,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( I.add(MI.getOperand(2)); // rsrc I.add(SOffset); I.addImm(Offset); - renderExtractSLC(I, MI, 7); + I.addImm(MI.getOperand(7).getImm()); // cpol I.cloneMemRefs(MI); MI.eraseFromParent(); @@ -2976,8 +3043,14 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( return true; } -bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( - 
MachineInstr &MI) const{ +bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( + MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { + + if (STI.hasGFX90AInsts()) { + // gfx90a adds return versions of the global atomic fadd instructions so no + // special handling is required. + return selectImpl(MI, *CoverageInfo); + } MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2994,16 +3067,16 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( // FIXME: This is only needed because tablegen requires number of dst operands // in match and replace pattern to be the same. Otherwise patterns can be // exported from SDag path. - auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2)); + auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal); - Register Data = MI.getOperand(3).getReg(); + Register Data = DataOp.getReg(); const unsigned Opc = MRI->getType(Data).isVector() ? AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) .addReg(Addr.first) .addReg(Data) .addImm(Addr.second) - .addImm(0) // SLC + .addImm(0) // cpol .cloneMemRefs(MI); MI.eraseFromParent(); @@ -3140,6 +3213,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectBVHIntrinsic(I); case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: return selectAMDGPU_BUFFER_ATOMIC_FADD(I); + case AMDGPU::G_SBFX: + case AMDGPU::G_UBFX: + return selectG_SBFX_UBFX(I); default: return selectImpl(I, *CoverageInfo); } @@ -3282,7 +3358,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( if (MI && MI->getOpcode() == AMDGPU::G_FNEG && // It's possible to see an f32 fneg here, but unlikely. // TODO: Treat f32 fneg as only high bit. - MRI.getType(Src) == LLT::vector(2, 16)) { + MRI.getType(Src) == LLT::fixed_vector(2, 16)) { Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); Src = MI->getOperand(1).getReg(); MI = MRI.getVRegDef(Src); @@ -3408,9 +3484,9 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { }}; } -template <bool Signed> std::pair<Register, int> -AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, + uint64_t FlatVariant) const { MachineInstr *MI = Root.getParent(); auto Default = std::make_pair(Root.getReg(), 0); @@ -3426,7 +3502,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); - if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed)) + if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant)) return Default; return std::make_pair(PtrBase, ConstOffset); @@ -3434,7 +3510,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { - auto PtrWithOffset = selectFlatOffsetImpl<false>(Root); + auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, @@ -3443,8 +3519,18 @@ AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { - auto PtrWithOffset = selectFlatOffsetImpl<true>(Root); +AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const { + auto PtrWithOffset = 
selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { + auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, @@ -3483,39 +3569,56 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { - if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) { + if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal)) { Addr = PtrBase; ImmOffset = ConstOffset; - } else if (ConstOffset > 0) { + } else { auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); if (!PtrBaseDef) return None; if (isSGPR(PtrBaseDef->Reg)) { - // Offset is too large. - // - // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) - // + (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) - = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true); - - if (isUInt<32>(RemainderOffset)) { - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - Register HighBits - = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), - HighBits) - .addImm(RemainderOffset); - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, - }}; + if (ConstOffset > 0) { + // Offset is too large. + // + // saddr + large_offset -> saddr + + // (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( + ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + + if (isUInt<32>(RemainderOffset)) { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + Register HighBits = + MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), + HighBits) + .addImm(RemainderOffset); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr + [=](MachineInstrBuilder &MIB) { + MIB.addReg(HighBits); + }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + }}; + } } + + // We are adding a 64 bit SGPR and a constant. If constant bus limit + // is 1 we would need to perform 1 or 2 extra moves for each half of + // the constant and it is better to do a scalar add and then issue a + // single VALU instruction to materialize zero. Otherwise it is less + // instructions to perform VALU adds with immediates or inline literals. + unsigned NumLiterals = + !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) + + !TII.isInlineConstant(APInt(32, ConstOffset >> 32)); + if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) + return None; } } } @@ -3525,57 +3628,50 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { return None; // Match the variable offset. 
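// Roughly, the form matched here is (illustrative gMIR, register names assumed):
//   %zext:vgpr(s64) = G_ZEXT %vofs:vgpr(s32)
//   %addr:vgpr(p1)  = G_PTR_ADD %base, %zext   ; %base copied from an SGPR
// which selects to saddr = the SGPR source of %base, voffset = %vofs, plus the
// immediate offset accumulated above.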
- if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) { - // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and - // drop this. - if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || - AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT) - return None; - - // It's cheaper to materialize a single 32-bit zero for vaddr than the two - // moves required to copy a 64-bit SGPR to VGPR. - const Register SAddr = AddrDef->Reg; - if (!isSGPR(SAddr)) - return None; - - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), - VOffset) - .addImm(0); - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset - }}; + if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { + // Look through the SGPR->VGPR copy. + Register SAddr = + getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); + + if (SAddr && isSGPR(SAddr)) { + Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); + + // It's possible voffset is an SGPR here, but the copy to VGPR will be + // inserted later. + if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }}}; + } + } } - // Look through the SGPR->VGPR copy. - Register SAddr = - getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); - if (!SAddr || !isSGPR(SAddr)) + // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and + // drop this. + if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || + AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) return None; - Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); + // It's cheaper to materialize a single 32-bit zero for vaddr than the two + // moves required to copy a 64-bit SGPR to VGPR. + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // It's possible voffset is an SGPR here, but the copy to VGPR will be - // inserted later. 
- Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset); - if (!VOffset) - return None; + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) + .addImm(0); - return {{[=](MachineInstrBuilder &MIB) { // saddr - MIB.addReg(SAddr); - }, - [=](MachineInstrBuilder &MIB) { // voffset - MIB.addReg(VOffset); - }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(ImmOffset); - }}}; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + }}; } InstructionSelector::ComplexRendererFns @@ -3590,7 +3686,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0 && - TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch)) { Addr = PtrBase; ImmOffset = ConstOffset; } @@ -3624,9 +3721,9 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { const DebugLoc &DL = I.getDebugLoc(); SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr) - .addFrameIndex(FI) - .addReg(RHSDef->Reg); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) + .addFrameIndex(FI) + .addReg(RHSDef->Reg); } } @@ -3639,11 +3736,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { }}; } -static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { - auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); - return PSV && PSV->isStack(); -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); @@ -3685,23 +3777,19 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { Optional<int> FI; Register VAddr = Root.getReg(); if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { - if (isBaseWithConstantOffset(Root, *MRI)) { - const MachineOperand &LHS = RootDef->getOperand(1); - const MachineOperand &RHS = RootDef->getOperand(2); - const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); - const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); - if (LHSDef && RHSDef) { - int64_t PossibleOffset = - RHSDef->getOperand(1).getCImm()->getSExtValue(); - if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && - (!STI.privateMemoryResourceIsRangeChecked() || - KnownBits->signBitIsZero(LHS.getReg()))) { - if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) - FI = LHSDef->getOperand(1).getIndex(); - else - VAddr = LHS.getReg(); - Offset = PossibleOffset; - } + Register PtrBase; + int64_t ConstOffset; + std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); + if (ConstOffset != 0) { + if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) && + (!STI.privateMemoryResourceIsRangeChecked() || + KnownBits->signBitIsZero(PtrBase))) { + const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase); + if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX) + FI = PtrBaseDef->getOperand(1).getIndex(); + else + VAddr = PtrBase; + Offset = ConstOffset; } } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { FI = RootDef->getOperand(1).getIndex(); @@ -3769,18 +3857,13 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( const 
MachineFunction *MF = MBB->getParent(); const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - const MachineMemOperand *MMO = *MI->memoperands_begin(); - const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); return {{ [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); }, [=](MachineInstrBuilder &MIB) { // soffset - if (isStackPtrRelative(PtrInfo)) - MIB.addReg(Info->getStackPtrOffsetReg()); - else - MIB.addImm(0); + MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; @@ -4130,10 +4213,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(Offset); }, - addZeroImm, // glc - addZeroImm, // slc + addZeroImm, // cpol addZeroImm, // tfe - addZeroImm, // dlc addZeroImm // swz }}; } @@ -4158,11 +4239,9 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset - addZeroImm, // glc - addZeroImm, // slc + addZeroImm, // cpol addZeroImm, // tfe - addZeroImm, // dlc - addZeroImm // swz + addZeroImm, // swz }}; } @@ -4194,7 +4273,9 @@ AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(Offset); }, - addZeroImm // slc + [=](MachineInstrBuilder &MIB) { + MIB.addImm(AMDGPU::CPol::GLC); // cpol + } }}; } @@ -4218,7 +4299,7 @@ AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset - addZeroImm // slc + [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol }}; } @@ -4308,32 +4389,25 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, MIB.addImm(MI.getOperand(OpIdx).getImm()); } -void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { +void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); + MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL); } -void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { - assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); -} - -void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, +void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); + MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); } -void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { +void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); + MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC); } void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index d70f18098cd7..cb05a1cb6369 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -36,6 +36,8 @@ struct ImageDimIntrinsicInfo; class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class AMDGPUTargetMachine; +class BlockFrequencyInfo; +class ProfileSummaryInfo; class GCNSubtarget; class MachineInstr; class MachineIRBuilder; @@ -45,6 +47,7 @@ class RegisterBank; class SIInstrInfo; class SIMachineFunctionInfo; class SIRegisterInfo; +class TargetRegisterClass; class AMDGPUInstructionSelector final : public InstructionSelector { private: @@ -59,8 +62,9 @@ public: bool select(MachineInstr &I) override; static const char *getName(); - void setupMF(MachineFunction &MF, GISelKnownBits &KB, - CodeGenCoverage &CoverageInfo) override; + void setupMF(MachineFunction &MF, GISelKnownBits *KB, + CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) override; private: struct GEPInfo { @@ -105,6 +109,7 @@ private: bool selectG_PTR_ADD(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; bool selectG_INSERT(MachineInstr &I) const; + bool selectG_SBFX_UBFX(MachineInstr &I) const; bool selectInterpP1F16(MachineInstr &MI) const; bool selectWritelane(MachineInstr &MI) const; @@ -143,7 +148,8 @@ private: bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; - bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const; + bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, + MachineOperand &DataOp) const; bool selectBVHIntrinsic(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, @@ -187,14 +193,15 @@ private: InstructionSelector::ComplexRendererFns selectSmrdSgpr(MachineOperand &Root) const; - template <bool Signed> - std::pair<Register, int> - selectFlatOffsetImpl(MachineOperand &Root) const; + std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root, + uint64_t FlatVariant) const; InstructionSelector::ComplexRendererFns selectFlatOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectFlatOffsetSigned(MachineOperand &Root) const; + selectGlobalOffset(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectScratchOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; @@ -274,26 +281,6 @@ private: void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; - void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const { - renderTruncTImm(MIB, MI, OpIdx); - } - - void renderTruncTImm8(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const { - renderTruncTImm(MIB, MI, OpIdx); - } - - void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const { - renderTruncTImm(MIB, MI, OpIdx); - } - - void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const { - renderTruncTImm(MIB, MI, OpIdx); - } - void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; @@ -302,14 +289,13 @@ private: void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; - void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; - void 
renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; - void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; + void renderExtractCPol(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 8ef9c99e8b35..119c4089d6c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -83,8 +83,7 @@ def FalsePredicate : Predicate<"false">; // Add a predicate to the list if does not already exist to deduplicate it. class PredConcat<list<Predicate> lst, Predicate pred> { list<Predicate> ret = - !listconcat([pred], !filter(item, lst, - !ne(!cast<string>(item), !cast<string>(pred)))); + !listconcat([pred], !filter(item, lst, !ne(item, pred))); } class PredicateControl { @@ -185,6 +184,28 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< }]; } +class is_canonicalized<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1), + (op $src0, $src1), + [{ + const SITargetLowering &Lowering = + *static_cast<const SITargetLowering *>(getTargetLowering()); + + return Lowering.isCanonicalized(*CurDAG, N->getOperand(0)) && + Lowering.isCanonicalized(*CurDAG, N->getOperand(1)); + }]> { + + // TODO: Improve the Legalizer for g_build_vector in Global Isel to match this class + let GISelPredicateCode = [{ + const SITargetLowering *TLI = static_cast<const SITargetLowering *>( + MF.getSubtarget().getTargetLowering()); + + return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) && + TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF)); + }]; +} + + let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; @@ -596,12 +617,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat < (vt rc:$addr) >; -// fshr pattern -class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat < - (fshr i32:$src0, i32:$src1, i32:$src2), - (BIT_ALIGN $src0, $src1, $src2) ->; - // rotr pattern class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < (rotr i32:$src0, i32:$src1), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 8aea33cf289d..4971b010870d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -165,10 +165,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS); PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS); auto *NewPtr = IRB.CreateBitCast( - IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy), - Offset - Adjust), + IRB.CreateConstGEP1_64( + IRB.getInt8Ty(), + IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy), + Offset - Adjust), Int32PtrTy); - LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4)); + LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4)); NewLd->copyMetadata(LI); NewLd->setMetadata(LLVMContext::MD_range, nullptr); diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9f359c232981..c1a9b30a509e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -18,7 +18,9 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -47,7 +49,7 @@ static constexpr unsigned MaxRegisterSize = 1024; static LLT getPow2VectorType(LLT Ty) { unsigned NElts = Ty.getNumElements(); unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); - return Ty.changeNumElements(Pow2NElts); + return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); } // Round the number of bits to the next power of two bits @@ -93,7 +95,8 @@ static LegalizeMutation oneMoreElement(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; const LLT EltTy = Ty.getElementType(); - return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); + return std::make_pair(TypeIdx, + LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); }; } @@ -104,7 +107,9 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { unsigned Size = Ty.getSizeInBits(); unsigned Pieces = (Size + 63) / 64; unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; - return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); + return std::make_pair( + TypeIdx, + LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); }; } @@ -122,7 +127,7 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { assert(EltSize < 32); const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; - return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); + return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); }; } @@ -136,7 +141,7 @@ static LLT getBitcastRegisterType(const LLT Ty) { return LLT::scalar(Size); } - return LLT::scalarOrVector(Size / 32, 32); + return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); } static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { @@ -151,7 +156,8 @@ static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { const LLT Ty = Query.Types[TypeIdx]; unsigned Size = Ty.getSizeInBits(); assert(Size % 32 == 0); - return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); + return std::make_pair( + TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); }; } @@ -220,11 +226,13 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { }; } -static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { +// If we have a truncating store or an extending load with a data size larger +// than 32-bits, we need to reduce to a 32-bit type. 
+static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; return !Ty.isVector() && Ty.getSizeInBits() > 32 && - Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); + Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); }; } @@ -257,15 +265,14 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, } static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, - const LegalityQuery &Query, - unsigned Opcode) { + const LegalityQuery &Query) { const LLT Ty = Query.Types[0]; // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD - const bool IsLoad = Opcode != AMDGPU::G_STORE; + const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; unsigned RegSize = Ty.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); unsigned AlignBits = Query.MMODescrs[0].AlignInBits; unsigned AS = Query.Types[1].getAddressSpace(); @@ -273,6 +280,10 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return false; + // Do not handle extending vector loads. + if (Ty.isVector() && MemSize != RegSize) + return false; + // TODO: We should be able to widen loads if the alignment is high enough, but // we also need to modify the memory access size. #if 0 @@ -341,33 +352,37 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { return EltSize != 32 && EltSize != 64; } -static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, - unsigned Opcode) { +static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { const LLT Ty = Query.Types[0]; - return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && + return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && !loadStoreBitcastWorkaround(Ty); } /// Return true if a load or store of the type should be lowered with a bitcast /// to a different type. static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, - const unsigned MemSizeInBits) { + const LLT MemTy) { + const unsigned MemSizeInBits = MemTy.getSizeInBits(); const unsigned Size = Ty.getSizeInBits(); - if (Size != MemSizeInBits) - return Size <= 32 && Ty.isVector(); + if (Size != MemSizeInBits) + return Size <= 32 && Ty.isVector(); if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) return true; - return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && + + // Don't try to handle bitcasting vector ext loads for now. + return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && + (Size <= 32 || isRegisterSize(Size)) && !isRegisterVectorElementType(Ty.getElementType()); } /// Return true if we should legalize a load by widening an odd sized memory /// access up to the alignment. Note this case when the memory access itself /// changes, not the size of the result register. -static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, +static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, unsigned AlignInBits, unsigned AddrSpace, unsigned Opcode) { + unsigned SizeInBits = MemoryTy.getSizeInBits(); // We don't want to widen cases that are naturally legal. 
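A simplified model of the decision shouldWidenLoad is making (the real check, continued below, also filters by address space and subtarget features): only a non-power-of-2 access is a candidate, and the rounded-up size must still be covered by the known alignment, e.g. a 96-bit load with 128-bit alignment is widened to 128 bits.

// Simplified sketch; assumes the caller already verified the access is
// non-atomic and in a widenable address space.
bool shouldWidenLoadSketch(unsigned MemSizeInBits, unsigned AlignInBits) {
  if ((MemSizeInBits & (MemSizeInBits - 1)) == 0)
    return false; // already a power of 2, naturally legal
  unsigned Widened = 1;
  while (Widened < MemSizeInBits)
    Widened <<= 1; // PowerOf2Ceil
  return AlignInBits >= Widened; // widening must not outrun the alignment
}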
if (isPowerOf2_32(SizeInBits)) return false; @@ -403,7 +418,7 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) return false; - return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, + return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, Query.MMODescrs[0].AlignInBits, Query.Types[1].getAddressSpace(), Opcode); } @@ -427,35 +442,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT S512 = LLT::scalar(512); const LLT MaxScalar = LLT::scalar(MaxRegisterSize); - const LLT V2S8 = LLT::vector(2, 8); - const LLT V2S16 = LLT::vector(2, 16); - const LLT V4S16 = LLT::vector(4, 16); - - const LLT V2S32 = LLT::vector(2, 32); - const LLT V3S32 = LLT::vector(3, 32); - const LLT V4S32 = LLT::vector(4, 32); - const LLT V5S32 = LLT::vector(5, 32); - const LLT V6S32 = LLT::vector(6, 32); - const LLT V7S32 = LLT::vector(7, 32); - const LLT V8S32 = LLT::vector(8, 32); - const LLT V9S32 = LLT::vector(9, 32); - const LLT V10S32 = LLT::vector(10, 32); - const LLT V11S32 = LLT::vector(11, 32); - const LLT V12S32 = LLT::vector(12, 32); - const LLT V13S32 = LLT::vector(13, 32); - const LLT V14S32 = LLT::vector(14, 32); - const LLT V15S32 = LLT::vector(15, 32); - const LLT V16S32 = LLT::vector(16, 32); - const LLT V32S32 = LLT::vector(32, 32); - - const LLT V2S64 = LLT::vector(2, 64); - const LLT V3S64 = LLT::vector(3, 64); - const LLT V4S64 = LLT::vector(4, 64); - const LLT V5S64 = LLT::vector(5, 64); - const LLT V6S64 = LLT::vector(6, 64); - const LLT V7S64 = LLT::vector(7, 64); - const LLT V8S64 = LLT::vector(8, 64); - const LLT V16S64 = LLT::vector(16, 64); + const LLT V2S8 = LLT::fixed_vector(2, 8); + const LLT V2S16 = LLT::fixed_vector(2, 16); + const LLT V4S16 = LLT::fixed_vector(4, 16); + + const LLT V2S32 = LLT::fixed_vector(2, 32); + const LLT V3S32 = LLT::fixed_vector(3, 32); + const LLT V4S32 = LLT::fixed_vector(4, 32); + const LLT V5S32 = LLT::fixed_vector(5, 32); + const LLT V6S32 = LLT::fixed_vector(6, 32); + const LLT V7S32 = LLT::fixed_vector(7, 32); + const LLT V8S32 = LLT::fixed_vector(8, 32); + const LLT V9S32 = LLT::fixed_vector(9, 32); + const LLT V10S32 = LLT::fixed_vector(10, 32); + const LLT V11S32 = LLT::fixed_vector(11, 32); + const LLT V12S32 = LLT::fixed_vector(12, 32); + const LLT V13S32 = LLT::fixed_vector(13, 32); + const LLT V14S32 = LLT::fixed_vector(14, 32); + const LLT V15S32 = LLT::fixed_vector(15, 32); + const LLT V16S32 = LLT::fixed_vector(16, 32); + const LLT V32S32 = LLT::fixed_vector(32, 32); + + const LLT V2S64 = LLT::fixed_vector(2, 64); + const LLT V3S64 = LLT::fixed_vector(3, 64); + const LLT V4S64 = LLT::fixed_vector(4, 64); + const LLT V5S64 = LLT::fixed_vector(5, 64); + const LLT V6S64 = LLT::fixed_vector(6, 64); + const LLT V7S64 = LLT::fixed_vector(7, 64); + const LLT V8S64 = LLT::fixed_vector(8, 64); + const LLT V16S64 = LLT::fixed_vector(16, 64); std::initializer_list<LLT> AllS32Vectors = {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, @@ -495,8 +510,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; - setAction({G_BRCOND, S1}, Legal); // VCC branches - setAction({G_BRCOND, S32}, Legal); // SCC branches + // s1 for VCC branches, s32 for SCC branches. 
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more // elements for v3s16 @@ -579,11 +594,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .lower(); } - getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) - .customFor({S32, S64}) - .clampScalar(0, S32, S64) - .widenScalarToNextPow2(0, 32) - .scalarize(0); + getActionDefinitionsBuilder( + {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) + .customFor({S32, S64}) + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(0, 32) + .scalarize(0); auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) .legalFor({S32}) @@ -643,7 +659,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16); - setAction({G_FRAME_INDEX, PrivatePtr}, Legal); + getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); // If the amount is divergent, we have to do a wave reduction to get the // maximum value, so this is expanded during RegBankSelect. @@ -653,7 +669,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_GLOBAL_VALUE) .customIf(typeIsNot(0, PrivatePtr)); - setAction({G_BLOCK_ADDR, CodePtr}, Legal); + getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); auto &FPOpActions = getActionDefinitionsBuilder( { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) @@ -809,7 +825,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) - .customFor({{S64, S64}}) + .customFor({{S64, S32}, {S64, S64}}) .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); if (ST.has16BitInsts()) FPToI.legalFor({{S16, S16}}); @@ -817,6 +833,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FPToI.minScalar(1, S32); FPToI.minScalar(0, S32) + .widenScalarToNextPow2(0, 32) .scalarize(0) .lower(); @@ -935,10 +952,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); + // S64 is only legal on SALU, and needs to be broken into 32-bit elements in + // RegBankSelect. 
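The decomposition that comment refers to is the identity that reversing 64 bits is two 32-bit reversals with the halves swapped; a small sketch of the equivalence (how RegBankSelect actually emits it for divergent values is an assumption here):

#include <cstdint>

static uint32_t Rev32(uint32_t V) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I) {
    R = (R << 1) | (V & 1);
    V >>= 1;
  }
  return R;
}

// bitreverse(x:64) == concat(bitreverse(lo(x)), bitreverse(hi(x)))
static uint64_t Rev64(uint64_t V) {
  uint32_t Lo = static_cast<uint32_t>(V);
  uint32_t Hi = static_cast<uint32_t>(V >> 32);
  return (static_cast<uint64_t>(Rev32(Lo)) << 32) | Rev32(Hi);
}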
getActionDefinitionsBuilder(G_BITREVERSE) - .legalFor({S32}) - .clampScalar(0, S32, S32) - .scalarize(0); + .legalFor({S32, S64}) + .clampScalar(0, S32, S64) + .scalarize(0) + .widenScalarToNextPow2(0); if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_BSWAP) @@ -951,7 +971,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); if (ST.hasVOP3PInsts()) { - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16, V2S16}) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .clampMaxNumElements(0, S16, 2) @@ -960,7 +980,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); } else { - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16}) .widenScalarToNextPow2(0) .minScalar(0, S16) @@ -979,7 +999,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32}) .minScalar(0, S32) .widenScalarToNextPow2(0) @@ -1029,7 +1049,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT DstTy = Query.Types[0]; // Split vector extloads. - unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); unsigned AlignBits = Query.MMODescrs[0].AlignInBits; if (MemSize < DstTy.getSizeInBits()) @@ -1078,35 +1098,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, auto &Actions = getActionDefinitionsBuilder(Op); // Explicitly list some common cases. // TODO: Does this help compile time at all? 
- Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, - {V2S32, GlobalPtr, 64, GlobalAlign32}, - {V4S32, GlobalPtr, 128, GlobalAlign32}, - {S64, GlobalPtr, 64, GlobalAlign32}, - {V2S64, GlobalPtr, 128, GlobalAlign32}, - {V2S16, GlobalPtr, 32, GlobalAlign32}, - {S32, GlobalPtr, 8, GlobalAlign8}, - {S32, GlobalPtr, 16, GlobalAlign16}, - - {S32, LocalPtr, 32, 32}, - {S64, LocalPtr, 64, 32}, - {V2S32, LocalPtr, 64, 32}, - {S32, LocalPtr, 8, 8}, - {S32, LocalPtr, 16, 16}, - {V2S16, LocalPtr, 32, 32}, - - {S32, PrivatePtr, 32, 32}, - {S32, PrivatePtr, 8, 8}, - {S32, PrivatePtr, 16, 16}, - {V2S16, PrivatePtr, 32, 32}, - - {S32, ConstantPtr, 32, GlobalAlign32}, - {V2S32, ConstantPtr, 64, GlobalAlign32}, - {V4S32, ConstantPtr, 128, GlobalAlign32}, - {S64, ConstantPtr, 64, GlobalAlign32}, - {V2S32, ConstantPtr, 32, GlobalAlign32}}); + Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, + {V2S32, GlobalPtr, V2S32, GlobalAlign32}, + {V4S32, GlobalPtr, V4S32, GlobalAlign32}, + {S64, GlobalPtr, S64, GlobalAlign32}, + {V2S64, GlobalPtr, V2S64, GlobalAlign32}, + {V2S16, GlobalPtr, V2S16, GlobalAlign32}, + {S32, GlobalPtr, S8, GlobalAlign8}, + {S32, GlobalPtr, S16, GlobalAlign16}, + + {S32, LocalPtr, S32, 32}, + {S64, LocalPtr, S64, 32}, + {V2S32, LocalPtr, V2S32, 32}, + {S32, LocalPtr, S8, 8}, + {S32, LocalPtr, S16, 16}, + {V2S16, LocalPtr, S32, 32}, + + {S32, PrivatePtr, S32, 32}, + {S32, PrivatePtr, S8, 8}, + {S32, PrivatePtr, S16, 16}, + {V2S16, PrivatePtr, S32, 32}, + + {S32, ConstantPtr, S32, GlobalAlign32}, + {V2S32, ConstantPtr, V2S32, GlobalAlign32}, + {V4S32, ConstantPtr, V4S32, GlobalAlign32}, + {S64, ConstantPtr, S64, GlobalAlign32}, + {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); Actions.legalIf( [=](const LegalityQuery &Query) -> bool { - return isLoadStoreLegal(ST, Query, Op); + return isLoadStoreLegal(ST, Query); }); // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to @@ -1125,7 +1145,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Actions.bitcastIf( [=](const LegalityQuery &Query) -> bool { return shouldBitcastLoadStoreType(ST, Query.Types[0], - Query.MMODescrs[0].SizeInBits); + Query.MMODescrs[0].MemoryTy); }, bitcastToRegisterType(0)); if (!IsStore) { @@ -1148,7 +1168,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT PtrTy = Query.Types[1]; const unsigned DstSize = DstTy.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); // Split extloads. if (DstSize > MemSize) @@ -1196,16 +1216,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: 3 element stores scalarized on SI // Split if it's too large for the address space. - if (Query.MMODescrs[0].SizeInBits > MaxSize) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + if (MemSize > MaxSize) { unsigned NumElts = DstTy.getNumElements(); unsigned EltSize = EltTy.getSizeInBits(); if (MaxSize % EltSize == 0) { return std::make_pair( - 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); + 0, LLT::scalarOrVector( + ElementCount::getFixed(MaxSize / EltSize), EltTy)); } - unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; + unsigned NumPieces = MemSize / MaxSize; // FIXME: Refine when odd breakdowns handled // The scalars will need to be re-legalized. 
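As a worked example of the MemSize > MaxSize path just above (numbers are illustrative; the per-address-space limit comes from maxSizeForAddrSpace): a 256-bit v8s32 store where the widest legal access is 128 bits is first clamped to a v4s32 result type, so the store is later split into two pieces.

#include <cstdio>

int main() {
  // Illustrative numbers only.
  unsigned MemSize = 256, MaxSize = 128, EltSize = 32;
  unsigned EltsPerPiece = MaxSize / EltSize; // 4 -> each piece is v4s32
  unsigned NumPieces = MemSize / MaxSize;    // 2 stores after fewerElements
  std::printf("%u pieces of v%us32\n", NumPieces, EltsPerPiece);
  return 0;
}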
@@ -1213,12 +1235,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, NumElts % NumPieces != 0) return std::make_pair(0, EltTy); - return std::make_pair(0, - LLT::vector(NumElts / NumPieces, EltTy)); + return std::make_pair( + 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); } // FIXME: We could probably handle weird extending loads better. - unsigned MemSize = Query.MMODescrs[0].SizeInBits; if (DstTy.getSizeInBits() > MemSize) return std::make_pair(0, EltTy); @@ -1230,48 +1251,58 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // should be OK, since the new parts will be further legalized. unsigned FloorSize = PowerOf2Floor(DstSize); return std::make_pair( - 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); + 0, LLT::scalarOrVector( + ElementCount::getFixed(FloorSize / EltSize), EltTy)); } // Need to split because of alignment. unsigned Align = Query.MMODescrs[0].AlignInBits; if (EltSize > Align && (EltSize / Align < DstTy.getNumElements())) { - return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); + return std::make_pair( + 0, LLT::fixed_vector(EltSize / Align, EltTy)); } // May need relegalization for the scalars. return std::make_pair(0, EltTy); }) .lowerIfMemSizeNotPow2() - .minScalar(0, S32); - - if (IsStore) - Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); - - Actions - .widenScalarToNextPow2(0) - .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) - .lower(); + .minScalar(0, S32) + .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) + .widenScalarToNextPow2(0) + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) + .lower(); } + // FIXME: Unaligned accesses not lowered. auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, - {S32, GlobalPtr, 16, 2 * 8}, - {S32, LocalPtr, 8, 8}, - {S32, LocalPtr, 16, 16}, - {S32, PrivatePtr, 8, 8}, - {S32, PrivatePtr, 16, 16}, - {S32, ConstantPtr, 8, 8}, - {S32, ConstantPtr, 16, 2 * 8}}); + .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, + {S32, GlobalPtr, S16, 2 * 8}, + {S32, LocalPtr, S8, 8}, + {S32, LocalPtr, S16, 16}, + {S32, PrivatePtr, S8, 8}, + {S32, PrivatePtr, S16, 16}, + {S32, ConstantPtr, S8, 8}, + {S32, ConstantPtr, S16, 2 * 8}}) + .legalIf( + [=](const LegalityQuery &Query) -> bool { + return isLoadStoreLegal(ST, Query); + }); + if (ST.hasFlatAddressSpace()) { ExtLoads.legalForTypesWithMemDesc( - {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); + {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); } + // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to + // 64-bits. + // + // TODO: Should generalize bitcast action into coerce, which will also cover + // inserting addrspacecasts. 
+ ExtLoads.customIf(typeIs(1, Constant32Ptr)); + ExtLoads.clampScalar(0, S32, S32) .widenScalarToNextPow2(0) - .unsupportedIfMemSizeNotPow2() .lower(); auto &Atomics = getActionDefinitionsBuilder( @@ -1286,10 +1317,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); if (ST.hasLDSFPAtomics()) { - getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); + Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); + if (ST.hasGFX90AInsts()) + Atomic.legalFor({{S64, LocalPtr}}); } + if (ST.hasAtomicFaddInsts()) + Atomic.legalFor({{S32, GlobalPtr}}); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling @@ -1302,19 +1337,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Condition should be s32 for scalar, s1 for vector. getActionDefinitionsBuilder(G_SELECT) - .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, - GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, - LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) - .clampScalar(0, S16, S64) - .scalarize(1) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .fewerElementsIf(numElementsNotEven(0), scalarize(0)) - .clampMaxNumElements(0, S32, 2) - .clampMaxNumElements(0, LocalPtr, 2) - .clampMaxNumElements(0, PrivatePtr, 2) - .scalarize(0) - .widenScalarToNextPow2(0) - .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); + .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, + LocalPtr, FlatPtr, PrivatePtr, + LLT::fixed_vector(2, LocalPtr), + LLT::fixed_vector(2, PrivatePtr)}, + {S1, S32}) + .clampScalar(0, S16, S64) + .scalarize(1) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .fewerElementsIf(numElementsNotEven(0), scalarize(0)) + .clampMaxNumElements(0, S32, 2) + .clampMaxNumElements(0, LocalPtr, 2) + .clampMaxNumElements(0, PrivatePtr, 2) + .scalarize(0) + .widenScalarToNextPow2(0) + .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can // be more flexible with the shift amount type. @@ -1393,7 +1430,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; return std::make_pair( - VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); + VecTypeIdx, + LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); }) .clampScalar(EltTypeIdx, S32, S64) .clampScalar(VecTypeIdx, S32, S64) @@ -1590,17 +1628,44 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, S32, S64) .lower(); + // TODO: Only Try to form v2s16 with legal packed instructions. 
getActionDefinitionsBuilder(G_FSHR) .legalFor({{S32, S32}}) + .lowerFor({{V2S16, V2S16}}) + .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) .scalarize(0) .lower(); + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder(G_FSHL) + .lowerFor({{V2S16, V2S16}}) + .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .scalarize(0) + .lower(); + } else { + getActionDefinitionsBuilder(G_FSHL) + .scalarize(0) + .lower(); + } + getActionDefinitionsBuilder(G_READCYCLECOUNTER) .legalFor({S64}); getActionDefinitionsBuilder(G_FENCE) .alwaysLegal(); + getActionDefinitionsBuilder({G_SMULO, G_UMULO}) + .scalarize(0) + .minScalar(0, S32) + .lower(); + + getActionDefinitionsBuilder({G_SBFX, G_UBFX}) + .legalFor({{S32, S32}, {S64, S32}}) + .clampScalar(1, S32, S32) + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(0) + .scalarize(0); + getActionDefinitionsBuilder({ // TODO: Verify V_BFI_B32 is generated from expanded bit ops G_FCOPYSIGN, @@ -1614,16 +1679,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_SADDO, G_SSUBO, // TODO: Implement - G_FMINIMUM, G_FMAXIMUM, - G_FSHL - }).lower(); + G_FMINIMUM, G_FMAXIMUM}).lower(); getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) .unsupported(); - computeTables(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -1668,6 +1731,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, case TargetOpcode::G_GLOBAL_VALUE: return legalizeGlobalValue(MI, MRI, B); case TargetOpcode::G_LOAD: + case TargetOpcode::G_SEXTLOAD: + case TargetOpcode::G_ZEXTLOAD: return legalizeLoad(Helper, MI); case TargetOpcode::G_FMAD: return legalizeFMad(MI, MRI, B); @@ -1675,10 +1740,12 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeFDIV(MI, MRI, B); case TargetOpcode::G_UDIV: case TargetOpcode::G_UREM: - return legalizeUDIV_UREM(MI, MRI, B); + case TargetOpcode::G_UDIVREM: + return legalizeUnsignedDIV_REM(MI, MRI, B); case TargetOpcode::G_SDIV: case TargetOpcode::G_SREM: - return legalizeSDIV_SREM(MI, MRI, B); + case TargetOpcode::G_SDIVREM: + return legalizeSignedDIV_REM(MI, MRI, B); case TargetOpcode::G_ATOMIC_CMPXCHG: return legalizeAtomicCmpXChg(MI, MRI, B); case TargetOpcode::G_FLOG: @@ -1751,7 +1818,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, - 4, commonAlignment(Align(64), StructOffset)); + LLT::scalar(32), commonAlignment(Align(64), StructOffset)); Register LoadAddr; @@ -2021,9 +2088,10 @@ bool AMDGPULegalizerInfo::legalizeITOFP( // TODO: Copied from DAG implementation. Verify logic and document how this // actually works. 
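The legalizeFPTOI hunk below turns a wide floating-point value into a 64-bit integer by splitting it into two 32-bit conversions (the tf/hif/lof recurrence spelled out in the new comment). A host-side sketch of the unsigned flavour, assuming the input is already non-negative and in range for a 64-bit result:

#include <cmath>
#include <cstdint>

// lof is always non-negative because hif is obtained with floor().
uint64_t FPToUI64Sketch(double Val) {
  double T  = std::trunc(Val);           // tf
  double Hi = std::floor(T * 0x1p-32);   // hif: upper 32 bits, still a double
  double Lo = std::fma(Hi, -0x1p32, T);  // lof = tf - hif * 2^32
  return (static_cast<uint64_t>(static_cast<uint32_t>(Hi)) << 32) |
         static_cast<uint32_t>(Lo);
}

GlobalISel builds the same sequence with G_INTRINSIC_TRUNC, G_FFLOOR, G_FMA and two 32-bit FP-to-int conversions, merging the two halves into the s64 destination.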
-bool AMDGPULegalizerInfo::legalizeFPTOI( - MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, bool Signed) const { +bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool Signed) const { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -2031,24 +2099,57 @@ bool AMDGPULegalizerInfo::legalizeFPTOI( const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); - assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); + const LLT SrcLT = MRI.getType(Src); + assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); unsigned Flags = MI.getFlags(); - auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); - auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); - auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); + // The basic idea of converting a floating point number into a pair of 32-bit + // integers is illustrated as follows: + // + // tf := trunc(val); + // hif := floor(tf * 2^-32); + // lof := tf - hif * 2^32; // lof is always positive due to floor. + // hi := fptoi(hif); + // lo := fptoi(lof); + // + auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); + MachineInstrBuilder Sign; + if (Signed && SrcLT == S32) { + // However, a 32-bit floating point number has only 23 bits mantissa and + // it's not enough to hold all the significant bits of `lof` if val is + // negative. To avoid the loss of precision, We need to take the absolute + // value after truncating and flip the result back based on the original + // signedness. + Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); + Trunc = B.buildFAbs(S32, Trunc, Flags); + } + MachineInstrBuilder K0, K1; + if (SrcLT == S64) { + K0 = B.buildFConstant(S64, + BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); + K1 = B.buildFConstant(S64, + BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); + } else { + K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); + K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); + } - auto Mul = B.buildFMul(S64, Trunc, K0, Flags); - auto FloorMul = B.buildFFloor(S64, Mul, Flags); - auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); + auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); + auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); + auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); - auto Hi = Signed ? - B.buildFPTOSI(S32, FloorMul) : - B.buildFPTOUI(S32, FloorMul); + auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) + : B.buildFPTOUI(S32, FloorMul); auto Lo = B.buildFPTOUI(S32, Fma); - B.buildMerge(Dst, { Lo, Hi }); + if (Signed && SrcLT == S32) { + // Flip the result based on the signedness, which is either all 0s or 1s. 
+ Sign = B.buildMerge(S64, {Sign, Sign}); + // r := xor({lo, hi}, sign) - sign; + B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); + } else + B.buildMerge(Dst, {Lo, Hi}); MI.eraseFromParent(); return true; @@ -2141,7 +2242,7 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( bool AMDGPULegalizerInfo::legalizeShuffleVector( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - const LLT V2S16 = LLT::vector(2, 16); + const LLT V2S16 = LLT::fixed_vector(2, 16); Register Dst = MI.getOperand(0).getReg(); Register Src0 = MI.getOperand(1).getReg(); @@ -2258,7 +2359,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - if (!MFI->isModuleEntryFunction()) { + if (!MFI->isModuleEntryFunction() && + !GV->getName().equals("llvm.amdgcn.module.lds")) { const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported BadLDSDecl( Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), @@ -2334,11 +2436,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); + LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; MachineMemOperand *GOTMMO = MF.getMachineMemOperand( MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, - 8 /*Size*/, Align(8)); + LoadTy, Align(8)); buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); @@ -2355,7 +2458,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( static LLT widenToNextPowerOf2(LLT Ty) { if (Ty.isVector()) - return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); + return Ty.changeElementCount( + ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); } @@ -2378,17 +2482,21 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, return true; } + if (MI.getOpcode() != AMDGPU::G_LOAD) + return false; + Register ValReg = MI.getOperand(0).getReg(); LLT ValTy = MRI.getType(ValReg); MachineMemOperand *MMO = *MI.memoperands_begin(); const unsigned ValSize = ValTy.getSizeInBits(); - const unsigned MemSize = 8 * MMO->getSize(); + const LLT MemTy = MMO->getMemoryType(); const Align MemAlign = MMO->getAlign(); + const unsigned MemSize = MemTy.getSizeInBits(); const unsigned AlignInBits = 8 * MemAlign.value(); // Widen non-power-of-2 loads to the alignment if needed - if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { + if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { const unsigned WideMemSize = PowerOf2Ceil(MemSize); // This was already the correct extending load result type, so just adjust @@ -2472,7 +2580,7 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( "this should not have been custom lowered"); LLT ValTy = MRI.getType(CmpVal); - LLT VecTy = LLT::vector(2, ValTy); + LLT VecTy = LLT::fixed_vector(2, ValTy); Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); @@ -2624,7 +2732,7 @@ bool AMDGPULegalizerInfo::legalizeBuildVector( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { Register Dst = MI.getOperand(0).getReg(); const LLT S32 = LLT::scalar(32); - assert(MRI.getType(Dst) == LLT::vector(2, 16)); + assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); Register Src0 = 
MI.getOperand(1).getReg(); Register Src1 = MI.getOperand(2).getReg(); @@ -2762,11 +2870,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, return false; } -void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, - Register DstReg, - Register X, - Register Y, - bool IsDiv) const { +void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, + Register DstDivReg, + Register DstRemReg, + Register X, + Register Y) const { const LLT S1 = LLT::scalar(1); const LLT S32 = LLT::scalar(32); @@ -2792,28 +2900,17 @@ void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, // First quotient/remainder refinement. auto One = B.buildConstant(S32, 1); auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); - if (IsDiv) + if (DstDivReg) Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); // Second quotient/remainder refinement. Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); - if (IsDiv) - B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); - else - B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); -} + if (DstDivReg) + B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); -bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { - const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; - Register DstReg = MI.getOperand(0).getReg(); - Register Num = MI.getOperand(1).getReg(); - Register Den = MI.getOperand(2).getReg(); - legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); - MI.eraseFromParent(); - return true; + if (DstRemReg) + B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); } // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 @@ -2859,11 +2956,11 @@ static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, return {ResultLo.getReg(0), ResultHi.getReg(0)}; } -void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, - Register DstReg, - Register Numer, - Register Denom, - bool IsDiv) const { +void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, + Register DstDivReg, + Register DstRemReg, + Register Numer, + Register Denom) const { const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT S1 = LLT::scalar(1); @@ -2959,57 +3056,74 @@ void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, // endif C6 // endif C3 - if (IsDiv) { + if (DstDivReg) { auto Sel1 = B.buildSelect( S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); - B.buildSelect(DstReg, - B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); - } else { + B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), + Sel1, MulHi3); + } + + if (DstRemReg) { auto Sel2 = B.buildSelect( S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); - B.buildSelect(DstReg, - B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); + B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), + Sel2, Sub1); } } -bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register DstDivReg, DstRemReg; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case AMDGPU::G_UDIV: { + DstDivReg = MI.getOperand(0).getReg(); + break; + } + case AMDGPU::G_UREM: { + DstRemReg = 
MI.getOperand(0).getReg(); + break; + } + case AMDGPU::G_UDIVREM: { + DstDivReg = MI.getOperand(0).getReg(); + DstRemReg = MI.getOperand(1).getReg(); + break; + } + } + const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); - const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; - Register DstReg = MI.getOperand(0).getReg(); - Register Num = MI.getOperand(1).getReg(); - Register Den = MI.getOperand(2).getReg(); - LLT Ty = MRI.getType(DstReg); + const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); + Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); + Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); if (Ty == S32) - legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); + legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); else if (Ty == S64) - legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); + legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); else return false; MI.eraseFromParent(); return true; - } -bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); - Register DstReg = MI.getOperand(0).getReg(); - const LLT Ty = MRI.getType(DstReg); + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); if (Ty != S32 && Ty != S64) return false; - const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; - - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); + Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); + Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); @@ -3021,20 +3135,45 @@ bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); - Register UDivRem = MRI.createGenericVirtualRegister(Ty); + Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case AMDGPU::G_SDIV: { + DstDivReg = MI.getOperand(0).getReg(); + TmpDivReg = MRI.createGenericVirtualRegister(Ty); + break; + } + case AMDGPU::G_SREM: { + DstRemReg = MI.getOperand(0).getReg(); + TmpRemReg = MRI.createGenericVirtualRegister(Ty); + break; + } + case AMDGPU::G_SDIVREM: { + DstDivReg = MI.getOperand(0).getReg(); + DstRemReg = MI.getOperand(1).getReg(); + TmpDivReg = MRI.createGenericVirtualRegister(Ty); + TmpRemReg = MRI.createGenericVirtualRegister(Ty); + break; + } + } + if (Ty == S32) - legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); + legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); else - legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); + legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); - Register Sign; - if (IsDiv) - Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); - else - Sign = LHSign.getReg(0); // Remainder sign is the same as LHS + if (DstDivReg) { + auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); + auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); + B.buildSub(DstDivReg, SignXor, Sign); + } - UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); - B.buildSub(DstReg, UDivRem, Sign); + if (DstRemReg) { + auto Sign = 
LHSign.getReg(0); // Remainder sign is the same as LHS + auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); + B.buildSub(DstRemReg, SignXor, Sign); + } MI.eraseFromParent(); return true; @@ -3511,18 +3650,21 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, // (the offset that is excluded from bounds checking and swizzling, to go in // the instruction's soffset field). This function takes the first kind of // offset and figures out how to split it between voffset and immoffset. -std::tuple<Register, unsigned, unsigned> +std::pair<Register, unsigned> AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { const unsigned MaxImm = 4095; Register BaseReg; - unsigned TotalConstOffset; + unsigned ImmOffset; const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = *B.getMRI(); - std::tie(BaseReg, TotalConstOffset) = - AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); + std::tie(BaseReg, ImmOffset) = + AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); - unsigned ImmOffset = TotalConstOffset; + // If BaseReg is a pointer, convert it to int. + if (MRI.getType(BaseReg).isPointer()) + BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); // If the immediate value is too big for the immoffset field, put the value // and -4096 into the immoffset field so that the value that is copied/added @@ -3550,7 +3692,32 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, if (!BaseReg) BaseReg = B.buildConstant(S32, 0).getReg(0); - return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); + return std::make_pair(BaseReg, ImmOffset); +} + +/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. +void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, + Register VOffset, Register SOffset, + unsigned ImmOffset, Register VIndex, + MachineRegisterInfo &MRI) const { + Optional<ValueAndVReg> MaybeVOffsetVal = + getConstantVRegValWithLookThrough(VOffset, MRI); + Optional<ValueAndVReg> MaybeSOffsetVal = + getConstantVRegValWithLookThrough(SOffset, MRI); + Optional<ValueAndVReg> MaybeVIndexVal = + getConstantVRegValWithLookThrough(VIndex, MRI); + // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, + // update the MMO with that offset. The stride is unknown so we can only do + // this if VIndex is constant 0. + if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && + MaybeVIndexVal->Value == 0) { + uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + + MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; + MMO->setOffset(TotalOffset); + } else { + // We don't have a constant combined offset to use in the MMO. Give up. + MMO->setValue((Value *)nullptr); + } } /// Handle register layout difference for f16 images for some subtargets. 
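A minimal sketch of the voffset/immoffset split performed by splitBufferOffsets for a fully-constant offset, ignoring the -4096 alignment special case described above; when voffset, soffset and a zero vindex are all known constants, updateBufferMMO records their sum plus the immediate as the MMO offset:

#include <cstdint>
#include <utility>

// The buffer immediate offset field holds at most 4095 (12 bits unsigned).
constexpr uint32_t MaxImm = 4095;

std::pair<uint32_t, uint32_t> splitBufferOffsetSketch(uint32_t OrigOffset) {
  uint32_t ImmOffset = OrigOffset & MaxImm;  // instruction immediate
  uint32_t VOffset = OrigOffset - ImmOffset; // remainder goes in a VGPR
  return {VOffset, ImmOffset};
}

// With constant inputs and VIndex == 0, the memory operand's offset becomes
// VOffset + SOffset + ImmOffset; otherwise the MMO value is dropped.
uint64_t combinedMMOOffset(uint32_t VOffset, uint32_t SOffset,
                           uint32_t ImmOffset) {
  return uint64_t(VOffset) + SOffset + ImmOffset;
}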
@@ -3572,7 +3739,8 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, int NumElts = StoreVT.getNumElements(); - return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); + return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) + .getReg(0); } if (ImageStore && ST.hasImageStoreD16Bug()) { @@ -3581,7 +3749,8 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, Reg = B.buildBitcast(S32, Reg).getReg(0); PackedRegs.push_back(Reg); PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); - return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); + return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) + .getReg(0); } if (StoreVT.getNumElements() == 3) { @@ -3590,18 +3759,19 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) PackedRegs.push_back(Unmerge.getReg(I)); PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); - Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); - return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); + Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); + return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); } if (StoreVT.getNumElements() == 4) { SmallVector<Register, 4> PackedRegs; - Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); + Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); auto Unmerge = B.buildUnmerge(S32, Reg); for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) PackedRegs.push_back(Unmerge.getReg(I)); PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); - return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); + return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) + .getReg(0); } llvm_unreachable("invalid data type"); @@ -3651,7 +3821,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, const int MemSize = MMO->getSize(); unsigned ImmOffset; - unsigned TotalOffset; // The typed intrinsics add an immediate after the registers. const unsigned NumVIndexOps = IsTyped ? 8 : 7; @@ -3663,6 +3832,8 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, if (HasVIndex) { VIndex = MI.getOperand(3).getReg(); OpOffset = 1; + } else { + VIndex = B.buildConstant(S32, 0).getReg(0); } Register VOffset = MI.getOperand(3 + OpOffset).getReg(); @@ -3676,9 +3847,8 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); - std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); - if (TotalOffset != 0) - MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); unsigned Opc; if (IsTyped) { @@ -3701,9 +3871,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, } } - if (!VIndex) - VIndex = B.buildConstant(S32, 0).getReg(0); - auto MIB = B.buildInstr(Opc) .addUse(VData) // vdata .addUse(RSrc) // rsrc @@ -3730,7 +3897,7 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, bool IsTyped) const { // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
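A brief editor's sketch (not part of the patch) of the rule the new updateBufferMMO helper applies in the buffer store/load/atomic legalizations in this file: the machine memory operand can only be given a concrete offset when voffset, soffset and vindex are all known constants and vindex is zero, because the per-record stride multiplied by a non-zero vindex is unknown at this point; otherwise the MMO's value is dropped.

#include <cstdint>
#include <optional>

static std::optional<uint64_t>
combinedBufferOffset(std::optional<uint64_t> VOffset,
                     std::optional<uint64_t> SOffset, unsigned ImmOffset,
                     std::optional<uint64_t> VIndex) {
  if (VOffset && SOffset && VIndex && *VIndex == 0)
    return *VOffset + *SOffset + ImmOffset; // constant combined offset
  return std::nullopt; // caller would clear the MMO value instead
}

int main() {
  // e.g. a raw buffer access with all-constant offsets and vindex == 0
  auto Off = combinedBufferOffset(16, 32, 4, 0);
  return (Off && *Off == 52) ? 0 : 1;
}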
MachineMemOperand *MMO = *MI.memoperands_begin(); - const int MemSize = MMO->getSize(); + const LLT MemTy = MMO->getMemoryType(); const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); @@ -3746,6 +3913,8 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, if (HasVIndex) { VIndex = MI.getOperand(3).getReg(); OpOffset = 1; + } else { + VIndex = B.buildConstant(S32, 0).getReg(0); } Register VOffset = MI.getOperand(3 + OpOffset).getReg(); @@ -3759,16 +3928,14 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); unsigned ImmOffset; - unsigned TotalOffset; LLT Ty = MRI.getType(Dst); LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const bool Unpacked = ST.hasUnpackedD16VMem(); - std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); - if (TotalOffset != 0) - MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); unsigned Opc; @@ -3779,11 +3946,11 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; } else { - switch (MemSize) { - case 1: + switch (MemTy.getSizeInBits()) { + case 8: Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; break; - case 2: + case 16: Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; break; default: @@ -3794,7 +3961,8 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, Register LoadDstReg; - bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); + bool IsExtLoad = + (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); LLT UnpackedTy = Ty.changeElementSize(32); if (IsExtLoad) @@ -3804,9 +3972,6 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, else LoadDstReg = Dst; - if (!VIndex) - VIndex = B.buildConstant(S32, 0).getReg(0); - auto MIB = B.buildInstr(Opc) .addDef(LoadDstReg) // vdata .addUse(RSrc) // rsrc @@ -3898,9 +4063,16 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; + case Intrinsic::amdgcn_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; default: llvm_unreachable("unhandled atomic opcode"); } @@ -3940,6 +4112,8 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, if (HasVIndex) { VIndex = MI.getOperand(4 + OpOffset).getReg(); ++OpOffset; + } else { + VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); } Register VOffset = MI.getOperand(4 + OpOffset).getReg(); @@ -3949,13 +4123,8 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, MachineMemOperand *MMO = *MI.memoperands_begin(); unsigned ImmOffset; - unsigned TotalOffset; - std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); - if (TotalOffset != 0) - MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); - - if 
(!VIndex) - VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); @@ -3980,14 +4149,16 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, return true; } -/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized +/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized /// vector with s16 typed elements. -static void packImageA16AddressToDwords( - MachineIRBuilder &B, MachineInstr &MI, - SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, - const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { +static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, + SmallVectorImpl<Register> &PackedAddrs, + unsigned ArgOffset, + const AMDGPU::ImageDimIntrinsicInfo *Intr, + bool IsA16, bool IsG16) { const LLT S16 = LLT::scalar(16); - const LLT V2S16 = LLT::vector(2, 16); + const LLT V2S16 = LLT::fixed_vector(2, 16); + auto EndIdx = Intr->VAddrEnd; for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); @@ -3996,7 +4167,10 @@ static void packImageA16AddressToDwords( Register AddrReg = SrcOp.getReg(); - if (I < Intr->GradientStart) { + if ((I < Intr->GradientStart) || + (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || + (I >= Intr->CoordStart && !IsA16)) { + // Handle any gradient or coordinate operands that should not be packed AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); PackedAddrs.push_back(AddrReg); } else { @@ -4041,16 +4215,16 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int NumAddrRegs = AddrRegs.size(); if (NumAddrRegs != 1) { - // Round up to 8 elements for v5-v7 - // FIXME: Missing intermediate sized register classes and instructions. - if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { + // Above 8 elements round up to next power of 2 (i.e. 16). + if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); auto Undef = B.buildUndef(S32); AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); NumAddrRegs = RoundedNumRegs; } - auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); + auto VAddr = + B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); } @@ -4091,7 +4265,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( MachineRegisterInfo *MRI = B.getMRI(); const LLT S32 = LLT::scalar(32); const LLT S16 = LLT::scalar(16); - const LLT V2S16 = LLT::vector(2, 16); + const LLT V2S16 = LLT::fixed_vector(2, 16); unsigned DMask = 0; @@ -4146,7 +4320,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (BaseOpcode->AtomicX2) { Register VData1 = MI.getOperand(3).getReg(); // The two values are packed in one register. - LLT PackedTy = LLT::vector(2, Ty); + LLT PackedTy = LLT::fixed_vector(2, Ty); auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); MI.getOperand(2).setReg(Concat.getReg(0)); MI.getOperand(3).setReg(AMDGPU::NoRegister); @@ -4194,35 +4368,30 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } // Rewrite the addressing register layout before doing anything else. 
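For reference, an editor-added sketch (not part of the patch) of what the renamed packImage16bitOpsToDwords helper above does at the value level: pairs of 16-bit address, gradient or coordinate operands are packed into single 32-bit dwords (v2s16), with element 0 in the low half, while operands that stay 32-bit are only bitcast to v2s16 rather than packed pairwise, as the new IsA16/IsG16 checks in the loop decide.

#include <cassert>
#include <cstdint>

// Hypothetical scalar equivalent of building one v2s16 dword from two s16 values.
static uint32_t packV2S16(uint16_t Elt0, uint16_t Elt1) {
  return (uint32_t)Elt0 | ((uint32_t)Elt1 << 16);
}

int main() {
  assert(packV2S16(0x1234, 0xABCD) == 0xABCD1234u);
  return 0;
}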
- if (IsA16 || IsG16) { - if (IsA16) { - // Target must support the feature and gradients need to be 16 bit too - if (!ST.hasA16() || !IsG16) - return false; - } else if (!ST.hasG16()) - return false; + if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { + // 16 bit gradients are supported, but are tied to the A16 control + // so both gradients and addresses must be 16 bit + return false; + } + if (IsA16 && !ST.hasA16()) { + // A16 not supported + return false; + } + + if (IsA16 || IsG16) { if (Intr->NumVAddrs > 1) { SmallVector<Register, 4> PackedRegs; - // Don't compress addresses for G16 - const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; - packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, - PackEndIdx); - - if (!IsA16) { - // Add uncompressed address - for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { - int AddrReg = MI.getOperand(ArgOffset + I).getReg(); - assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); - PackedRegs.push_back(AddrReg); - } - } + + packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, + IsG16); // See also below in the non-a16 branch - const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); + const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && + PackedRegs.size() <= ST.getNSAMaxSize(); if (!UseNSA && PackedRegs.size() > 1) { - LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); + LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); PackedRegs[0] = Concat.getReg(0); PackedRegs.resize(1); @@ -4256,7 +4425,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. - const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); + const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && + CorrectedNumVAddrs <= ST.getNSAMaxSize(); if (!UseNSA && Intr->NumVAddrs > 1) convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, @@ -4299,7 +4469,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( return false; const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; - const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); + const LLT AdjustedTy = + Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); // The raw dword aligned data component of the load. The only legal cases // where this matters should be when using the packed D16 format, for @@ -4313,15 +4484,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( LLT RegTy; if (IsD16 && ST.hasUnpackedD16VMem()) { - RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); - TFETy = LLT::vector(AdjustedNumElts + 1, 32); + RoundedTy = + LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); + TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); RegTy = S32; } else { unsigned EltSize = EltTy.getSizeInBits(); unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; unsigned RoundedSize = 32 * RoundedElts; - RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); - TFETy = LLT::vector(RoundedSize / 32 + 1, S32); + RoundedTy = LLT::scalarOrVector( + ElementCount::getFixed(RoundedSize / EltSize), EltSize); + TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); RegTy = !IsTFE && EltSize == 16 ? 
V2S16 : S32; } @@ -4435,10 +4608,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; // Deal with the one annoying legal case. - const LLT V3S16 = LLT::vector(3, 16); + const LLT V3S16 = LLT::fixed_vector(3, 16); if (Ty == V3S16) { padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); - auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); + auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); return true; } @@ -4460,7 +4633,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( Observer.changingInstr(MI); - if (shouldBitcastLoadStoreType(ST, Ty, Size)) { + if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { Ty = getBitcastRegisterType(Ty); Helper.bitcastDst(MI, Ty, 0); Dst = MI.getOperand(0).getReg(); @@ -4502,27 +4675,55 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction - if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !ST.isTrapHandlerEnabled()) { - B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); - } else { - // Pass queue pointer to trap handler as input, and insert trap instruction - // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi - MachineRegisterInfo &MRI = *B.getMRI(); + if (!ST.isTrapHandlerEnabled() || + ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) + return legalizeTrapEndpgm(MI, MRI, B); + + if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { + switch (*HsaAbiVer) { + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + return legalizeTrapHsaQueuePtr(MI, MRI, B); + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + return ST.supportsGetDoorbellID() ? 
+ legalizeTrapHsa(MI, MRI, B) : + legalizeTrapHsaQueuePtr(MI, MRI, B); + } + } - Register LiveIn = - MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) - return false; + llvm_unreachable("Unknown trap handler"); +} - Register SGPR01(AMDGPU::SGPR0_SGPR1); - B.buildCopy(SGPR01, LiveIn); - B.buildInstr(AMDGPU::S_TRAP) - .addImm(GCNSubtarget::TrapIDLLVMTrap) - .addReg(SGPR01, RegState::Implicit); - } +bool AMDGPULegalizerInfo::legalizeTrapEndpgm( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + // Pass queue pointer to trap handler as input, and insert trap instruction + // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi + Register LiveIn = + MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) + return false; + Register SGPR01(AMDGPU::SGPR0_SGPR1); + B.buildCopy(SGPR01, LiveIn); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) + .addReg(SGPR01, RegState::Implicit); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeTrapHsa( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); MI.eraseFromParent(); return true; } @@ -4531,8 +4732,8 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { // Is non-HSA path or trap-handler disabled? then, report a warning // accordingly - if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !ST.isTrapHandlerEnabled()) { + if (!ST.isTrapHandlerEnabled() || + ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning); @@ -4540,7 +4741,8 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( Ctx.diagnose(NoTrap); } else { // Insert debug-trap instruction - B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); } MI.eraseFromParent(); @@ -4561,6 +4763,14 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, Register RayInvDir = MI.getOperand(6).getReg(); Register TDescr = MI.getOperand(7).getReg(); + if (!ST.hasGFX10_AEncoding()) { + DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), + "intrinsic not supported on subtarget", + MI.getDebugLoc()); + B.getMF().getFunction().getContext().diagnose(BadIntrin); + return false; + } + bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa @@ -4810,6 +5020,11 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 87e8b2128a25..d4fefd89b487 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -99,25 +99,19 @@ public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; - bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; + bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; - void legalizeUDIV_UREM32Impl(MachineIRBuilder &B, - Register DstReg, Register Num, Register Den, - bool IsRem) const; - bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; - bool legalizeSDIV_SREM32(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; + void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, + Register DstRemReg, Register Num, + Register Den) const; - void legalizeUDIV_UREM64Impl(MachineIRBuilder &B, - Register DstReg, Register Numer, Register Denom, - bool IsDiv) const; + void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, + Register DstRemReg, Register Numer, + Register Denom) const; - bool legalizeUDIV_UREM64(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; - bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; + bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -148,8 +142,11 @@ public: bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const; - std::tuple<Register, unsigned, unsigned> - splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + std::pair<Register, unsigned> splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const; + void updateBufferMMO(MachineMemOperand *MMO, Register VOffset, + Register SOffset, unsigned ImmOffset, Register VIndex, + MachineRegisterInfo &MRI) const; Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore = false) const; @@ -183,6 +180,12 @@ public: bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 6b7f57252b7a..1ee6933bd7ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" @@ -476,7 +477,7 @@ bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const { return true; const Function *F = CI->getParent()->getParent(); Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - return Attr.getValueAsString() == "true"; + return Attr.getValueAsBool(); } bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { @@ -1369,9 +1370,9 @@ bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) { StringRef CPU = TM->getTargetCPU(); StringRef Features = TM->getTargetFeatureString(); - if ((CPU.empty() || CPU.equals_lower("generic")) && + if ((CPU.empty() || CPU.equals_insensitive("generic")) && (Features.empty() || - Features.find_lower("wavefrontsize") == StringRef::npos)) + Features.find_insensitive("wavefrontsize") == StringRef::npos)) return false; Function *F = CI->getParent()->getParent(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 646087cdb7db..32262ea75fd3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -19,10 +19,16 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +static cl::opt<bool> EnableOCLManglingMismatchWA( + "amdgpu-enable-ocl-mangling-mismatch-workaround", cl::init(true), + cl::ReallyHidden, + cl::desc("Enable the workaround for OCL name mangling mismatch.")); + namespace { enum EManglingParam { @@ -826,7 +832,8 @@ public: unsigned AS = UseAddrSpace ? 
AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind) : 0; - if (AS != 0) os << "U3AS" << AS; + if (EnableOCLManglingMismatchWA || AS != 0) + os << "U3AS" << AS; Ptr = p; p.PtrKind = 0; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 8fb4f93fd4b3..0f157e53c3db 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -15,6 +15,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 9ab6a5246ce5..08a1b970648d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -67,7 +67,7 @@ static bool processUse(CallInst *CI) { const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; const bool HasUniformWorkGroupSize = - F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true"; + F->getFnAttribute("uniform-work-group-size").getValueAsBool(); if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize) return false; @@ -249,9 +249,9 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { } INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, - "AMDGPU IR optimizations", false, false) -INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations", - false, false) + "AMDGPU Kernel Attributes", false, false) +INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, + "AMDGPU Kernel Attributes", false, false) char AMDGPULowerKernelAttributes::ID = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp new file mode 100644 index 000000000000..70ecea8dbc3e --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -0,0 +1,400 @@ +//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates LDS uses from non-kernel functions. +// +// The strategy is to create a new struct with a field for each LDS variable +// and allocate that struct at the same address for every kernel. Uses of the +// original LDS variables are then replaced with compile time offsets from that +// known address. AMDGPUMachineFunction allocates the LDS global. +// +// Local variables with constant annotation or non-undef initializer are passed +// through unchanged for simplication or error diagnostics in later passes. +// +// To reduce the memory overhead variables that are only used by kernels are +// excluded from this transform. The analysis to determine whether a variable +// is only used by a kernel is cheap and conservative so this may allocate +// a variable in every kernel when it was not strictly necessary to do so. +// +// A possible future refinement is to specialise the structure per-kernel, so +// that fields can be elided based on more expensive analysis. 
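To make the layout strategy described in this header concrete, here is a small editor-supplied sketch (not part of the patch, and ignoring the OptimizedStructLayout reordering the pass actually performs): each LDS variable becomes a field of one struct, padding is inserted so every field keeps its alignment, and uses are rewritten to constant offsets from the struct's base.

#include <cassert>
#include <cstdint>
#include <vector>

struct Field { uint64_t Size; uint64_t Align; }; // hypothetical per-variable info

static std::vector<uint64_t> layoutOffsets(const std::vector<Field> &Fields) {
  std::vector<uint64_t> Offsets;
  uint64_t Offset = 0;
  for (const Field &F : Fields) {
    if (uint64_t Rem = Offset % F.Align)
      Offset += F.Align - Rem; // padding bytes, as in processUsedLDS below
    Offsets.push_back(Offset);
    Offset += F.Size;
  }
  return Offsets;
}

int main() {
  // e.g. an i32 followed by an 8-byte-aligned i64: the i64 lands at offset 8.
  auto Offs = layoutOffsets({{4, 4}, {8, 8}});
  assert(Offs[0] == 0 && Offs[1] == 8);
  return 0;
}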
+// +// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct +// type which would cause allocating huge memory for struct instance within +// every kernel. Hence, before running this pass, it is advisable to run the +// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within +// non-kernel functions by pointers and thereby minimizes the unnecessary per +// kernel allocation of LDS memory. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDGPULDSUtils.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/OptimizedStructLayout.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include <vector> + +#define DEBUG_TYPE "amdgpu-lower-module-lds" + +using namespace llvm; + +static cl::opt<bool> SuperAlignLDSGlobals( + "amdgpu-super-align-lds-globals", + cl::desc("Increase alignment of LDS if it is not on align boundary"), + cl::init(true), cl::Hidden); + +namespace { + +class AMDGPULowerModuleLDS : public ModulePass { + + static void removeFromUsedList(Module &M, StringRef Name, + SmallPtrSetImpl<Constant *> &ToRemove) { + GlobalVariable *GV = M.getNamedGlobal(Name); + if (!GV || ToRemove.empty()) { + return; + } + + SmallVector<Constant *, 16> Init; + auto *CA = cast<ConstantArray>(GV->getInitializer()); + for (auto &Op : CA->operands()) { + // ModuleUtils::appendToUsed only inserts Constants + Constant *C = cast<Constant>(Op); + if (!ToRemove.contains(C->stripPointerCasts())) { + Init.push_back(C); + } + } + + if (Init.size() == CA->getNumOperands()) { + return; // none to remove + } + + GV->eraseFromParent(); + + for (Constant *C : ToRemove) { + C->removeDeadConstantUsers(); + } + + if (!Init.empty()) { + ArrayType *ATy = + ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size()); + GV = + new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, Init), Name); + GV->setSection("llvm.metadata"); + } + } + + static void + removeFromUsedLists(Module &M, + const std::vector<GlobalVariable *> &LocalVars) { + SmallPtrSet<Constant *, 32> LocalVarsSet; + for (size_t I = 0; I < LocalVars.size(); I++) { + if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) { + LocalVarsSet.insert(C); + } + } + removeFromUsedList(M, "llvm.used", LocalVarsSet); + removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); + } + + static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, + GlobalVariable *SGV) { + // The llvm.amdgcn.module.lds instance is implicitly used by all kernels + // that might call a function which accesses a field within it. This is + // presently approximated to 'all kernels' if there are any such functions + // in the module. This implicit use is reified as an explicit use here so + // that later passes, specifically PromoteAlloca, account for the required + // memory without any knowledge of this transform. + + // An operand bundle on llvm.donothing works because the call instruction + // survives until after the last pass that needs to account for LDS. It is + // better than inline asm as the latter survives until the end of codegen. 
A + // totally robust solution would be a function with the same semantics as + // llvm.donothing that takes a pointer to the instance and is lowered to a + // no-op after LDS is allocated, but that is not presently necessary. + + LLVMContext &Ctx = Func->getContext(); + + Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI()); + + FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {}); + + Function *Decl = + Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + + Value *UseInstance[1] = {Builder.CreateInBoundsGEP( + SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))}; + + Builder.CreateCall(FTy, Decl, {}, + {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)}, + ""); + } + +private: + SmallPtrSet<GlobalValue *, 32> UsedList; + +public: + static char ID; + + AMDGPULowerModuleLDS() : ModulePass(ID) { + initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + UsedList = AMDGPU::getUsedList(M); + + bool Changed = processUsedLDS(M); + + for (Function &F : M.functions()) { + // Only lower compute kernels' LDS. + if (!AMDGPU::isKernel(F.getCallingConv())) + continue; + Changed |= processUsedLDS(M, &F); + } + + UsedList.clear(); + return Changed; + } + +private: + bool processUsedLDS(Module &M, Function *F = nullptr) { + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + + // Find variables to move into new struct instance + std::vector<GlobalVariable *> FoundLocalVars = + AMDGPU::findVariablesToLower(M, F); + + if (FoundLocalVars.empty()) { + // No variables to rewrite, no changes made. + return false; + } + + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. 
+ if (SuperAlignLDSGlobals) { + for (auto *GV : FoundLocalVars) { + Align Alignment = AMDGPU::getAlign(DL, GV); + TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + GV->setAlignment(Alignment); + } + } + + SmallVector<OptimizedStructLayoutField, 8> LayoutFields; + LayoutFields.reserve(FoundLocalVars.size()); + for (GlobalVariable *GV : FoundLocalVars) { + OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()), + AMDGPU::getAlign(DL, GV)); + LayoutFields.emplace_back(F); + } + + performOptimizedStructLayout(LayoutFields); + + std::vector<GlobalVariable *> LocalVars; + LocalVars.reserve(FoundLocalVars.size()); // will be at least this large + { + // This usually won't need to insert any padding, perhaps avoid the alloc + uint64_t CurrentOffset = 0; + for (size_t I = 0; I < LayoutFields.size(); I++) { + GlobalVariable *FGV = static_cast<GlobalVariable *>( + const_cast<void *>(LayoutFields[I].Id)); + Align DataAlign = LayoutFields[I].Alignment; + + uint64_t DataAlignV = DataAlign.value(); + if (uint64_t Rem = CurrentOffset % DataAlignV) { + uint64_t Padding = DataAlignV - Rem; + + // Append an array of padding bytes to meet alignment requested + // Note (o + (a - (o % a)) ) % a == 0 + // (offset + Padding ) % align == 0 + + Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding); + LocalVars.push_back(new GlobalVariable( + M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), + "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, + false)); + CurrentOffset += Padding; + } + + LocalVars.push_back(FGV); + CurrentOffset += LayoutFields[I].Size; + } + } + + std::vector<Type *> LocalVarTypes; + LocalVarTypes.reserve(LocalVars.size()); + std::transform( + LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes), + [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); + + std::string VarName( + F ? 
(Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str() + : "llvm.amdgcn.module.lds"); + StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t"); + + Align StructAlign = + AMDGPU::getAlign(DL, LocalVars[0]); + + GlobalVariable *SGV = new GlobalVariable( + M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy), + VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, + false); + SGV->setAlignment(StructAlign); + if (!F) { + appendToCompilerUsed( + M, {static_cast<GlobalValue *>( + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + } + + // The verifier rejects used lists containing an inttoptr of a constant + // so remove the variables from these lists before replaceAllUsesWith + removeFromUsedLists(M, LocalVars); + + // Replace uses of ith variable with a constantexpr to the ith field of the + // instance that will be allocated by AMDGPUMachineFunction + Type *I32 = Type::getInt32Ty(Ctx); + for (size_t I = 0; I < LocalVars.size(); I++) { + GlobalVariable *GV = LocalVars[I]; + Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; + Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx); + if (F) { + // Replace all constant uses with instructions if they belong to the + // current kernel. + for (User *U : make_early_inc_range(GV->users())) { + if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) + AMDGPU::replaceConstantUsesInFunction(C, F); + } + + GV->removeDeadConstantUsers(); + + GV->replaceUsesWithIf(GEP, [F](Use &U) { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + return I && I->getFunction() == F; + }); + } else { + GV->replaceAllUsesWith(GEP); + } + if (GV->use_empty()) { + UsedList.erase(GV); + GV->eraseFromParent(); + } + + uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I); + Align A = commonAlignment(StructAlign, Off); + refineUsesAlignment(GEP, A, DL); + } + + // Mark kernels with asm that reads the address of the allocated structure + // This is not necessary for lowering. This lets other passes, specifically + // PromoteAlloca, accurately calculate how much LDS will be used by the + // kernel after lowering. + if (!F) { + IRBuilder<> Builder(Ctx); + SmallPtrSet<Function *, 32> Kernels; + for (auto &I : M.functions()) { + Function *Func = &I; + if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) { + markUsedByKernel(Builder, Func, SGV); + Kernels.insert(Func); + } + } + } + return true; + } + + void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL, + unsigned MaxDepth = 5) { + if (!MaxDepth || A == 1) + return; + + for (User *U : Ptr->users()) { + if (auto *LI = dyn_cast<LoadInst>(U)) { + LI->setAlignment(std::max(A, LI->getAlign())); + continue; + } + if (auto *SI = dyn_cast<StoreInst>(U)) { + if (SI->getPointerOperand() == Ptr) + SI->setAlignment(std::max(A, SI->getAlign())); + continue; + } + if (auto *AI = dyn_cast<AtomicRMWInst>(U)) { + // None of atomicrmw operations can work on pointers, but let's + // check it anyway in case it will or we will process ConstantExpr. 
+ if (AI->getPointerOperand() == Ptr) + AI->setAlignment(std::max(A, AI->getAlign())); + continue; + } + if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) { + if (AI->getPointerOperand() == Ptr) + AI->setAlignment(std::max(A, AI->getAlign())); + continue; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) { + unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); + APInt Off(BitWidth, 0); + if (GEP->getPointerOperand() == Ptr && + GEP->accumulateConstantOffset(DL, Off)) { + Align GA = commonAlignment(A, Off.getLimitedValue()); + refineUsesAlignment(GEP, GA, DL, MaxDepth - 1); + } + continue; + } + if (auto *I = dyn_cast<Instruction>(U)) { + if (I->getOpcode() == Instruction::BitCast || + I->getOpcode() == Instruction::AddrSpaceCast) + refineUsesAlignment(I, A, DL, MaxDepth - 1); + } + } + } +}; + +} // namespace +char AMDGPULowerModuleLDS::ID = 0; + +char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID; + +INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE, + "Lower uses of LDS variables from non-kernel functions", false, + false) + +ModulePass *llvm::createAMDGPULowerModuleLDSPass() { + return new AMDGPULowerModuleLDS(); +} + +PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, + ModuleAnalysisManager &) { + return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index a8cba3f5cc5c..3dd27f1996d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -41,9 +41,6 @@ class AMDGPUMCInstLower { const TargetSubtargetInfo &ST; const AsmPrinter &AP; - const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, - const MachineOperand &MO) const; - public: AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST, const AsmPrinter &AP); @@ -95,54 +92,21 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { } } -const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr( - const MachineBasicBlock &SrcBB, - const MachineOperand &MO) const { - const MCExpr *DestBBSym - = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); - const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); - - // FIXME: The first half of this assert should be removed. This should - // probably be PC relative instead of using the source block symbol, and - // therefore the indirect branch expansion should use a bundle. - assert( - skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() == - AMDGPU::S_GETPC_B64 && - ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); - - // s_getpc_b64 returns the address of next instruction. 
- const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); - SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx); - - if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD) - return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx); - - assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD); - return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx); -} - bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { switch (MO.getType()) { default: - llvm_unreachable("unknown operand type"); + break; case MachineOperand::MO_Immediate: MCOp = MCOperand::createImm(MO.getImm()); return true; case MachineOperand::MO_Register: MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); return true; - case MachineOperand::MO_MachineBasicBlock: { - if (MO.getTargetFlags() != 0) { - MCOp = MCOperand::createExpr( - getLongBranchBlockExpr(*MO.getParent()->getParent(), MO)); - } else { - MCOp = MCOperand::createExpr( + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr( MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); - } - return true; - } case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); SmallString<128> SymbolName; @@ -168,7 +132,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, case MachineOperand::MO_RegisterMask: // Regmasks are like implicit defs. return false; + case MachineOperand::MO_MCSymbol: + if (MO.getTargetFlags() == SIInstrInfo::MO_FAR_BRANCH_OFFSET) { + MCSymbol *Sym = MO.getMCSymbol(); + MCOp = MCOperand::createExpr(Sym->getVariableValue()); + return true; + } + break; } + llvm_unreachable("unknown operand type"); } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { @@ -274,24 +246,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { ++I; } } else { - // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are + // We don't want these pseudo instructions encoded. They are // placeholder terminator instructions and should only be printed as // comments. 
- if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) { - if (isVerbose()) { - SmallVector<char, 16> BBStr; - raw_svector_ostream Str(BBStr); - - const MachineBasicBlock *MBB = MI->getOperand(0).getMBB(); - const MCSymbolRefExpr *Expr - = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - Expr->print(Str, MAI); - OutStreamer->emitRawComment(Twine(" mask branch ") + BBStr); - } - - return; - } - if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { if (isVerbose()) OutStreamer->emitRawComment(" return to shader part epilog"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index b6a69b2819ee..697513b5db7a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1419,11 +1419,7 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) { static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices, unsigned Index) { - for (auto i : PHIRegionIndices) { - if (i == Index) - return true; - } - return false; + return llvm::is_contained(PHIRegionIndices, Index); } bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 717145b7af53..0c743a77092c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -28,12 +28,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) const Function &F = MF.getFunction(); Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound"); - MemoryBound = MemBoundAttr.isStringAttribute() && - MemBoundAttr.getValueAsString() == "true"; + MemoryBound = MemBoundAttr.getValueAsBool(); Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter"); - WaveLimiter = WaveLimitAttr.isStringAttribute() && - WaveLimitAttr.getValueAsString() == "true"; + WaveLimiter = WaveLimitAttr.getValueAsBool(); CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) @@ -64,6 +62,18 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, return Offset; } +void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) { + if (isModuleEntryFunction()) { + const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds"); + if (GV) { + unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV); + (void)Offset; + assert(Offset == 0 && + "Module LDS expected to be allocated before other LDS"); + } + } +} + void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV) { assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 07cac776082d..10ff50040c6a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -94,6 +94,7 @@ public: } unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); + void allocateModuleLDSGlobal(const Module *M); Align getDynLDSAlign() const { return DynLDSAlign; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h index 82c6d75bb060..ad198a301dbe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h @@ -6,6 +6,9 @@ // 
//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H + #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> @@ -17,3 +20,5 @@ namespace llvm { std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation(); } // llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h index 756bc948b1dd..8af7979dba8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -26,22 +26,6 @@ const char SectionName[] = ".note"; const char NoteNameV2[] = "AMD"; const char NoteNameV3[] = "AMDGPU"; -// TODO: Remove this file once we drop code object v2. -enum NoteType{ - NT_AMDGPU_HSA_RESERVED_0 = 0, - NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, - NT_AMDGPU_HSA_HSAIL = 2, - NT_AMDGPU_HSA_ISA = 3, - NT_AMDGPU_HSA_PRODUCER = 4, - NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, - NT_AMDGPU_HSA_EXTENSION = 6, - NT_AMDGPU_HSA_RESERVED_7 = 7, - NT_AMDGPU_HSA_RESERVED_8 = 8, - NT_AMDGPU_HSA_RESERVED_9 = 9, - NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, - NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 -}; - } // End namespace ElfNote } // End namespace AMDGPU } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 2f6220e425cc..2aa02299ecdc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" @@ -208,19 +209,22 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); for (auto &I : B) { - if (getMemoryInstrPtr(&I)) { + if (const Value *Ptr = getMemoryInstrPtr(&I)) { + unsigned Size = divideCeil( + Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), + 32); if (isIndirectAccess(&I)) - ++FI.IAMInstCount; + FI.IAMInstCost += Size; if (isLargeStride(&I)) - ++FI.LSMInstCount; - ++FI.MemInstCount; - ++FI.InstCount; + FI.LSMInstCost += Size; + FI.MemInstCost += Size; + FI.InstCost += Size; continue; } if (auto *CB = dyn_cast<CallBase>(&I)) { Function *Callee = CB->getCalledFunction(); if (!Callee || Callee->isDeclaration()) { - ++FI.InstCount; + ++FI.InstCost; continue; } if (&F == Callee) // Handle immediate recursion @@ -230,10 +234,10 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { if (Loc == FIM.end()) continue; - FI.MemInstCount += Loc->second.MemInstCount; - FI.InstCount += Loc->second.InstCount; - FI.IAMInstCount += Loc->second.IAMInstCount; - FI.LSMInstCount += Loc->second.LSMInstCount; + FI.MemInstCost += Loc->second.MemInstCost; + FI.InstCost += Loc->second.InstCost; + FI.IAMInstCost += Loc->second.IAMInstCost; + FI.LSMInstCost += Loc->second.LSMInstCost; } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { TargetLoweringBase::AddrMode AM; auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL); @@ -243,9 +247,9 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { GEP->getPointerAddressSpace())) // Offset will likely be folded into load or store continue; - ++FI.InstCount; + ++FI.InstCost; } else { - 
++FI.InstCount; + ++FI.InstCost; } } } @@ -263,11 +267,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F); - LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount + LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost << '\n' - << " IAMInst: " << Info->IAMInstCount << '\n' - << " LSMInst: " << Info->LSMInstCount << '\n' - << " TotalInst: " << Info->InstCount << '\n'); + << " IAMInst cost: " << Info->IAMInstCost << '\n' + << " LSMInst cost: " << Info->LSMInstCost << '\n' + << " TotalInst cost: " << Info->InstCost << '\n'); if (isMemBound(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); @@ -285,13 +289,12 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { - return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh; + return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh; } bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { - return ((FI.MemInstCount + FI.IAMInstCount * IAWeight + - FI.LSMInstCount * LSWeight) * - 100 / FI.InstCount) > LimitWaveThresh; + return ((FI.MemInstCost + FI.IAMInstCost * IAWeight + + FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh; } bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index 99dbf5080741..31ff80f5f431 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -37,12 +37,11 @@ public: bool needsWaveLimiter(const Function *F) const; struct FuncInfo { - unsigned MemInstCount; - unsigned InstCount; - unsigned IAMInstCount; // Indirect access memory instruction count - unsigned LSMInstCount; // Large stride memory instruction count - FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0), - LSMInstCount(0) {} + unsigned MemInstCost; + unsigned InstCost; + unsigned IAMInstCost; // Indirect access memory instruction count + unsigned LSMInstCost; // Large stride memory instruction count + FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {} }; typedef ValueMap<const Function*, FuncInfo> FuncInfoMap; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 09e2c762abdb..728be811afae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -66,6 +66,8 @@ public: bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo); void applyCvtF32UByteN(MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo); + + bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg); }; bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy( @@ -245,6 +247,14 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN( MI.eraseFromParent(); } +bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize( + MachineInstr &MI, Register &Reg) { + const SITargetLowering *TLI = static_cast<const SITargetLowering *>( + MF.getSubtarget().getTargetLowering()); + Reg = MI.getOperand(1).getReg(); + return TLI->isCanonicalized(Reg, MF); +} + class AMDGPUPostLegalizerCombinerHelperState { protected: CombinerHelper &Helper; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 
e4b628bf6b23..13f09ab8f164 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -12,6 +12,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPULegalizerInfo.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -26,6 +29,141 @@ using namespace llvm; using namespace MIPatternMatch; +class AMDGPUPreLegalizerCombinerHelper { +protected: + MachineIRBuilder &B; + MachineFunction &MF; + MachineRegisterInfo &MRI; + CombinerHelper &Helper; + +public: + AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) + : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){}; + + struct ClampI64ToI16MatchInfo { + int64_t Cmp1 = 0; + int64_t Cmp2 = 0; + Register Origin; + }; + + bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineFunction &MF, + ClampI64ToI16MatchInfo &MatchInfo); + + void applyClampI64ToI16(MachineInstr &MI, + const ClampI64ToI16MatchInfo &MatchInfo); +}; + +bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF, + ClampI64ToI16MatchInfo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!"); + + // Try to find a pattern where an i64 value should get clamped to short. + const LLT SrcType = MRI.getType(MI.getOperand(1).getReg()); + if (SrcType != LLT::scalar(64)) + return false; + + const LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + if (DstType != LLT::scalar(16)) + return false; + + Register Base; + + auto IsApplicableForCombine = [&MatchInfo]() -> bool { + const auto Cmp1 = MatchInfo.Cmp1; + const auto Cmp2 = MatchInfo.Cmp2; + const auto Diff = std::abs(Cmp2 - Cmp1); + + // If the difference between both comparison values is 0 or 1, there is no + // need to clamp. + if (Diff == 0 || Diff == 1) + return false; + + const int64_t Min = std::numeric_limits<int16_t>::min(); + const int64_t Max = std::numeric_limits<int16_t>::max(); + + // Check if the comparison values are between SHORT_MIN and SHORT_MAX. + return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) || + (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)); + }; + + // Try to match a combination of min / max MIR opcodes. + if (mi_match(MI.getOperand(1).getReg(), MRI, + m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { + if (mi_match(Base, MRI, + m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { + return IsApplicableForCombine(); + } + } + + if (mi_match(MI.getOperand(1).getReg(), MRI, + m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { + if (mi_match(Base, MRI, + m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { + return IsApplicableForCombine(); + } + } + + return false; +} + +// We want to find a combination of instructions that +// gets generated when an i64 gets clamped to i16. +// The corresponding pattern is: +// G_MAX / G_MAX for i16 <= G_TRUNC i64. 
+// This can be efficiently written as following: +// v_cvt_pk_i16_i32 v0, v0, v1 +// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max +void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( + MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) { + + Register Src = MatchInfo.Origin; + assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == + LLT::scalar(64)); + const LLT S32 = LLT::scalar(32); + + B.setMBB(*MI.getParent()); + B.setInstrAndDebugLoc(MI); + + auto Unmerge = B.buildUnmerge(S32, Src); + + assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32); + + const LLT V2S16 = LLT::fixed_vector(2, 16); + auto CvtPk = + B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16}, + {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags()); + + auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2); + auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2); + auto MinBoundaryDst = B.buildConstant(S32, MinBoundary); + auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary); + + auto Bitcast = B.buildBitcast({S32}, CvtPk); + + auto Med3 = B.buildInstr( + AMDGPU::G_AMDGPU_SMED3, {S32}, + {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)}, + MI.getFlags()); + + B.buildTrunc(MI.getOperand(0).getReg(), Med3); + + MI.eraseFromParent(); +} + +class AMDGPUPreLegalizerCombinerHelperState { +protected: + CombinerHelper &Helper; + AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper; + +public: + AMDGPUPreLegalizerCombinerHelperState( + CombinerHelper &Helper, + AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper) + : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {} +}; + #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenPreLegalizeGICombiner.inc" #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS @@ -59,12 +197,16 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg); + AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper); + AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper, + PreLegalizerHelper); if (Generated.tryCombineAll(Observer, MI, B, Helper)) return true; switch (MI.getOpcode()) { + case TargetOpcode::G_MEMCPY_INLINE: + return Helper.tryEmitMemcpyInline(MI); case TargetOpcode::G_CONCAT_VECTORS: return Helper.tryCombineConcatVectors(MI); case TargetOpcode::G_SHUFFLE_VECTOR: @@ -109,6 +251,9 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); } + + AU.addRequired<GISelCSEAnalysisWrapperPass>(); + AU.addPreserved<GISelCSEAnalysisWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -130,8 +275,13 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(), KB, MDT); + // Enable CSE. 
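// A note on the CSE hookup just below (descriptive only, no new behaviour is implied
// here): handing a GISelCSEInfo to Combiner::combineMachineInstrs, instead of the
// previous nullptr, makes the combiner build replacement instructions through a
// CSE-aware builder, so the constants and other CSE-able generic instructions emitted
// by combines such as applyClampI64ToI16 are reused rather than duplicated.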
+ GISelCSEAnalysisWrapper &Wrapper = + getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); + auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); + Combiner C(PCInfo, TPC); - return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); + return C.combineMachineInstrs(MF, CSEInfo); } char AMDGPUPreLegalizerCombiner::ID = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index c8bd9b96b44f..7b6959b56145 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -323,7 +323,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { Type *SizetTy = Type::getInt32Ty(Ctx); Type *Tys_alloc[1] = {SizetTy}; - Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1); + Type *I8Ty = Type::getInt8Ty(Ctx); + Type *I8Ptr = PointerType::get(I8Ty, 1); FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); FunctionCallee PrintfAllocFn = M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr); @@ -355,9 +356,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { // basicblock splits after buffer overflow check // ConstantPointerNull *zeroIntPtr = - ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1)); - ICmpInst *cmp = - dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); + ConstantPointerNull::get(PointerType::get(I8Ty, 1)); + auto *cmp = cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); if (!CI->use_empty()) { Value *result = Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res"); @@ -371,13 +371,9 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { // store unique printf id in the buffer // - SmallVector<Value *, 1> ZeroIdxList; - ConstantInt *zeroInt = - ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10)); - ZeroIdxList.push_back(zeroInt); - GetElementPtrInst *BufferIdx = GetElementPtrInst::Create( - nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch); + I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID", + Brnch); Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); Value *id_gep_cast = @@ -385,14 +381,11 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch); - SmallVector<Value *, 2> FourthIdxList; - ConstantInt *fourInt = - ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10)); - - FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id + // 1st 4 bytes hold the printf_id // the following GEP is the buffer pointer - BufferIdx = GetElementPtrInst::Create(nullptr, pcall, FourthIdxList, - "PrintBuffGep", Brnch); + BufferIdx = GetElementPtrInst::Create( + I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 4)), "PrintBuffGep", + Brnch); Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); @@ -533,7 +526,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { (void)StBuff; if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands()) break; - BufferIdx = GetElementPtrInst::Create(nullptr, BufferIdx, BuffOffset, + BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch); LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" << *BufferIdx << '\n'); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 2a6ea838efc0..3f1f21a33f7e 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -126,8 +126,13 @@ public: char AMDGPUPromoteAlloca::ID = 0; char AMDGPUPromoteAllocaToVector::ID = 0; -INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, - "AMDGPU promote alloca to vector or LDS", false, false) +INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE, + "AMDGPU promote alloca to vector or LDS", false, false) +// Move LDS uses from functions to kernels before promote alloca for accurate +// estimation of LDS available +INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS) +INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE, + "AMDGPU promote alloca to vector or LDS", false, false) INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector", "AMDGPU promote alloca to vector", false, false) @@ -656,6 +661,11 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( continue; } + // Do not promote vector/aggregate type instructions. It is hard to track + // their users. + if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User)) + return false; + if (!User->getType()->isPointerTy()) continue; @@ -943,13 +953,15 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { I.replaceAllUsesWith(Offset); I.eraseFromParent(); + SmallVector<IntrinsicInst *> DeferredIntrs; + for (Value *V : WorkList) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { Value *Src0 = CI->getOperand(0); - Type *EltTy = Src0->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::getWithSamePointeeType( + cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); @@ -965,8 +977,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (isa<AddrSpaceCastInst>(V)) continue; - Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::getWithSamePointeeType( + cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. @@ -997,22 +1009,13 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; - case Intrinsic::memcpy: { - MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); - Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(), - MemCpy->getRawSource(), MemCpy->getSourceAlign(), - MemCpy->getLength(), MemCpy->isVolatile()); - Intr->eraseFromParent(); - continue; - } - case Intrinsic::memmove: { - MemMoveInst *MemMove = cast<MemMoveInst>(Intr); - Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(), - MemMove->getRawSource(), MemMove->getSourceAlign(), - MemMove->getLength(), MemMove->isVolatile()); - Intr->eraseFromParent(); + case Intrinsic::memcpy: + case Intrinsic::memmove: + // These have 2 pointer operands. In case if second pointer also needs + // to be replaced we defer processing of these intrinsics until all + // other values are processed. 
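// Said another way (descriptive note): memcpy/memmove are the only intrinsics in this
// switch whose source *and* destination may both refer to the promoted alloca,
// directly or through casts. Rebuilding them immediately could capture a second
// pointer operand that has not been rewritten yet, so they are only recorded here and
// re-emitted after the main worklist loop has finished updating every pointer value
// (see the DeferredIntrs loop further down in this hunk).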
+ DeferredIntrs.push_back(Intr); continue; - } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet( @@ -1032,11 +1035,11 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); - Type *SrcTy = Src->getType()->getPointerElementType(); - Function *ObjectSize = Intrinsic::getDeclaration(Mod, - Intrinsic::objectsize, - { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } - ); + Function *ObjectSize = Intrinsic::getDeclaration( + Mod, Intrinsic::objectsize, + {Intr->getType(), + PointerType::getWithSamePointeeType( + cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)}); CallInst *NewCall = Builder.CreateCall( ObjectSize, @@ -1050,6 +1053,27 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } + + for (IntrinsicInst *Intr : DeferredIntrs) { + Builder.SetInsertPoint(Intr); + Intrinsic::ID ID = Intr->getIntrinsicID(); + assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); + + MemTransferInst *MI = cast<MemTransferInst>(Intr); + auto *B = + Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(), + MI->getRawSource(), MI->getSourceAlign(), + MI->getLength(), MI->isVolatile()); + + for (unsigned I = 1; I != 3; ++I) { + if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) { + B->addDereferenceableAttr(I, Bytes); + } + } + + Intr->eraseFromParent(); + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp index cd71c7a16c73..0e4c26170a8f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -249,7 +249,11 @@ bool AMDGPUPropagateAttributes::process() { if (!I) continue; CallBase *CI = dyn_cast<CallBase>(I); - if (!CI) + // Only propagate attributes if F is the called function. Specifically, + // do not propagate attributes if F is passed as an argument. + // FIXME: handle bitcasted callee, e.g. 
+ // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)() + if (!CI || CI->getCalledOperand() != &F) continue; Function *Caller = CI->getCaller(); if (!Caller || !Visited.insert(CI).second) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index d644c0319286..4e12e5cd8f65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -13,7 +13,9 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -27,6 +29,126 @@ using namespace llvm; using namespace MIPatternMatch; +class AMDGPURegBankCombinerHelper { +protected: + MachineIRBuilder &B; + MachineFunction &MF; + MachineRegisterInfo &MRI; + const RegisterBankInfo &RBI; + const TargetRegisterInfo &TRI; + CombinerHelper &Helper; + +public: + AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) + : B(B), MF(B.getMF()), MRI(*B.getMRI()), + RBI(*MF.getSubtarget().getRegBankInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + + bool isVgprRegBank(Register Reg); + + struct MinMaxMedOpc { + unsigned Min, Max, Med; + }; + + struct Med3MatchInfo { + unsigned Opc; + Register Val0, Val1, Val2; + }; + + MinMaxMedOpc getMinMaxPair(unsigned Opc); + + template <class m_Cst> + bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, + Register &Val, Register &K0, Register &K1); + + bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); +}; + +bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { + return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; +} + +AMDGPURegBankCombinerHelper::MinMaxMedOpc +AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Unsupported opcode"); + case AMDGPU::G_SMAX: + case AMDGPU::G_SMIN: + return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3}; + case AMDGPU::G_UMAX: + case AMDGPU::G_UMIN: + return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + } +} + +template <class m_Cst> +bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI, + MachineRegisterInfo &MRI, + MinMaxMedOpc MMMOpc, Register &Val, + Register &K0, Register &K1) { + // 4 operand commutes of: min(max(Val, K0), K1). + // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)). + // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0). + // 4 operand commutes of: max(min(Val, K1), K0). + // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)). + // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1). 
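// A standalone check of the identity this matcher relies on (illustrative sketch only;
// the scaffolding, constants and test values below are not part of the patch): when
// K0 <= K1, both commuted forms clamp Val into [K0, K1], which is exactly the median
// of {Val, K0, K1} that v_med3 computes. The signed/unsigned ordering requirement is
// re-checked later in matchIntMinMaxToMed3.

#include <algorithm>
#include <cassert>
#include <cstdint>

static int32_t med3(int32_t A, int32_t B, int32_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  const int32_t K0 = -100, K1 = 100; // assumed clamp bounds, K0 <= K1
  for (int32_t Val : {-500, -100, -1, 0, 7, 100, 500}) {
    int32_t ClampA = std::min(std::max(Val, K0), K1); // min(max(Val, K0), K1)
    int32_t ClampB = std::max(std::min(Val, K1), K0); // max(min(Val, K1), K0)
    assert(ClampA == ClampB && ClampA == med3(Val, K0, K1));
  }
  return 0;
}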
+ return mi_match( + MI, MRI, + m_any_of( + m_CommutativeBinOp( + MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)), + m_Cst(K1)), + m_CommutativeBinOp( + MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)), + m_Cst(K0)))); +} + +bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + if (!isVgprRegBank(Dst)) + return false; + + if (MRI.getType(Dst).isVector()) + return false; + + MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val, K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed<ICstRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + const APInt &K0_Imm = getConstantIntVRegVal(K0, MRI)->getValue(); + const APInt &K1_Imm = getConstantIntVRegVal(K1, MRI)->getValue(); + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0_Imm.sgt(K1_Imm)) + return false; + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0_Imm.ugt(K1_Imm)) + return false; + + MatchInfo = {OpcodeTriple.Med, Val, K0, K1}; + return true; +} + +void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, + Med3MatchInfo &MatchInfo) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, + {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags()); + MI.eraseFromParent(); +} + +class AMDGPURegBankCombinerHelperState { +protected: + CombinerHelper &Helper; + AMDGPURegBankCombinerHelper &RegBankHelper; + +public: + AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, + AMDGPURegBankCombinerHelper &RegBankHelper) + : Helper(Helper), RegBankHelper(RegBankHelper) {} +}; #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenRegBankGICombiner.inc" @@ -62,9 +184,11 @@ bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg); + AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); + AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, + RegBankHelper); - if (Generated.tryCombineAll(Observer, MI, B, Helper)) + if (Generated.tryCombineAll(Observer, MI, B)) return true; return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 502356d4f9a4..0e4005627e02 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -443,9 +443,8 @@ static bool isScalarLoadLegal(const MachineInstr &MI) { const unsigned AS = MMO->getAddrSpace(); const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; - - // There are no extending SMRD/SMEM loads, and they require 4-byte alignment. - return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) && + // Require 4-byte alignment. + return MMO->getAlign() >= Align(4) && // Can't do a scalar atomic load. !MMO->isAtomic() && // Don't use scalar loads for volatile accesses to non-constant address @@ -591,21 +590,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } - case TargetOpcode::G_SMIN: - case TargetOpcode::G_SMAX: - case TargetOpcode::G_UMIN: - case TargetOpcode::G_UMAX: { - static const OpRegBankEntry<3> Table[2] = { - { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, - - // Scalar requires cmp+select, and extends if 16-bit. 
- // FIXME: Should there be separate costs for 32 and 16-bit - { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 } - }; - - const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } }; - return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); - } case TargetOpcode::G_UADDE: case TargetOpcode::G_USUBE: case TargetOpcode::G_SADDE: @@ -691,12 +675,13 @@ static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, static LLT getHalfSizedType(LLT Ty) { if (Ty.isVector()) { - assert(Ty.getNumElements() % 2 == 0); - return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType()); + assert(Ty.getElementCount().isKnownMultipleOf(2)); + return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2), + Ty.getElementType()); } - assert(Ty.getSizeInBits() % 2 == 0); - return LLT::scalar(Ty.getSizeInBits() / 2); + assert(Ty.getScalarSizeInBits() % 2 == 0); + return LLT::scalar(Ty.getScalarSizeInBits() / 2); } /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If @@ -1139,8 +1124,8 @@ static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { unsigned FirstPartNumElts = FirstSize / EltSize; unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; - return {LLT::scalarOrVector(FirstPartNumElts, EltTy), - LLT::scalarOrVector(RemainderElts, EltTy)}; + return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy), + LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)}; } static LLT widen96To128(LLT Ty) { @@ -1149,7 +1134,7 @@ static LLT widen96To128(LLT Ty) { LLT EltTy = Ty.getElementType(); assert(128 % EltTy.getSizeInBits() == 0); - return LLT::vector(128 / EltTy.getSizeInBits(), EltTy); + return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); } bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, @@ -1160,34 +1145,61 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, unsigned LoadSize = LoadTy.getSizeInBits(); const unsigned MaxNonSmrdLoadSize = 128; - const RegisterBank *PtrBank = - OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; - if (PtrBank == &AMDGPU::SGPRRegBank) { - // If the pointer is an SGPR, we ordinarily have nothing to do. - if (LoadSize != 96) + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::SGPRRegBank) { + // There are some special cases that we need to look at for 32 bit and 96 + // bit SGPR loads otherwise we have nothing to do. + if (LoadSize != 32 && LoadSize != 96) return false; MachineMemOperand *MMO = *MI.memoperands_begin(); + const unsigned MemSize = 8 * MMO->getSize(); + // Scalar loads of size 8 or 16 bit with proper alignment may be widened to + // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit + // scalar loads should have a load size of 32 but memory access size of less + // than 32. + if (LoadSize == 32 && + (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) + return false; + Register PtrReg = MI.getOperand(1).getReg(); - // 96-bit loads are only available for vector loads. We need to split this - // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
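// An illustrative sketch of the widening this hunk performs for extending sub-dword
// scalar loads (standalone code, not from the patch; the sample value is made up and
// a little-endian target is assumed, so the narrow load reads the low-order bytes of
// the same dword). The full 32-bit word is loaded and the high bits are then repaired
// with a sign-extend-in-register or a mask, matching what the buildSExtInReg /
// buildZExtInReg calls just below emit.

#include <cassert>
#include <cstdint>

static int32_t sextInReg(uint32_t Wide, unsigned MemSize) {
  unsigned Shift = 32 - MemSize;
  return (int32_t)(Wide << Shift) >> Shift; // replicate the loaded sign bit
}

static uint32_t zextInReg(uint32_t Wide, unsigned MemSize) {
  return MemSize == 32 ? Wide : Wide & ((1u << MemSize) - 1); // mask off high bits
}

int main() {
  const uint32_t Wide = 0x3412ff80u;      // pretend result of the widened 32-bit load
  assert(sextInReg(Wide, 8) == -128);     // 8-bit G_SEXTLOAD: low byte 0x80, sign-extended
  assert(zextInReg(Wide, 16) == 0xff80u); // 16-bit G_ZEXTLOAD: low half, zero-extended
  return 0;
}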
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); MachineIRBuilder B(MI, O); - if (MMO->getAlign() < Align(16)) { - LLT Part64, Part32; - std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); - auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); - auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); - - auto Undef = B.buildUndef(LoadTy); - auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); - B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + if (LoadSize == 32) { + // This is an extending load from a sub-dword size. Widen the memory + // access size to 4 bytes and clear the extra high bits appropriately + const LLT S32 = LLT::scalar(32); + if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { + // Must extend the sign bit into higher bits for a G_SEXTLOAD + auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); + B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); + } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { + // Must extend zero into higher bits with an AND for a G_ZEXTLOAD + auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); + B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); + } else + // We do not need to touch the higher bits for regular loads. + B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); } else { - LLT WiderTy = widen96To128(LoadTy); - auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + // 96-bit loads are only available for vector loads. We need to split this + // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). + if (MMO->getAlign() < Align(16)) { + LLT Part64, Part32; + std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); + auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); + auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); + + auto Undef = B.buildUndef(LoadTy); + auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); + B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + } else { + LLT WiderTy = widen96To128(LoadTy); + auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); + B.buildExtract(MI.getOperand(0), WideLoad, 0); + } } MI.eraseFromParent(); @@ -1345,8 +1357,8 @@ static unsigned setBufferOffsets(MachineIRBuilder &B, AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); uint32_t SOffset, ImmOffset; - if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, - &RBI.Subtarget, Alignment)) { + if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, + &RBI.Subtarget, Alignment)) { if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { VOffsetReg = Base; SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); @@ -1366,7 +1378,8 @@ static unsigned setBufferOffsets(MachineIRBuilder &B, } // Handle the variable sgpr + vgpr case. 
- if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) { + MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); + if (Add && (int)Offset >= 0) { Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); @@ -1519,8 +1532,8 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( return true; } -bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( - const OperandsMapper &OpdMapper, bool Signed) const { +bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, + bool Signed) const { MachineInstr &MI = OpdMapper.getMI(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); @@ -1532,19 +1545,69 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( const LLT S32 = LLT::scalar(32); + unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1; + Register SrcReg = MI.getOperand(FirstOpnd).getReg(); + Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg(); + Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg(); + const RegisterBank *DstBank = OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; if (DstBank == &AMDGPU::VGPRRegBank) { if (Ty == S32) return true; - // TODO: 64-bit version is scalar only, so we need to expand this. - return false; - } + // There is no 64-bit vgpr bitfield extract instructions so the operation + // is expanded to a sequence of instructions that implement the operation. + ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank); + MachineIRBuilder B(MI, ApplyBank); + + const LLT S64 = LLT::scalar(64); + // Shift the source operand so that extracted bits start at bit 0. + auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg) + : B.buildLShr(S64, SrcReg, OffsetReg); + auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset); + + // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions + // if the width is a constant. + if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) { + // Use the 32-bit bitfield extract instruction if the width is a constant. + // Depending on the width size, use either the low or high 32-bits. + auto Zero = B.buildConstant(S32, 0); + auto WidthImm = ConstWidth->Value.getZExtValue(); + if (WidthImm <= 32) { + // Use bitfield extract on the lower 32-bit source, and then sign-extend + // or clear the upper 32-bits. + auto Extract = + Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg) + : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg); + auto Extend = + Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero; + B.buildMerge(DstReg, {Extract, Extend}); + } else { + // Use bitfield extract on upper 32-bit source, and combine with lower + // 32-bit source. + auto UpperWidth = B.buildConstant(S32, WidthImm - 32); + auto Extract = + Signed + ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth) + : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth); + B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract}); + } + MI.eraseFromParent(); + return true; + } - Register SrcReg = MI.getOperand(2).getReg(); - Register OffsetReg = MI.getOperand(3).getReg(); - Register WidthReg = MI.getOperand(4).getReg(); + // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit + // operations. 
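// A standalone check of the shift expansion named in the comment above (illustrative
// only: the sample value, offsets and widths are made up, and Width is assumed to
// satisfy 1 <= Width and Offset + Width <= 64 so that no shift amount reaches 64).
// Shifting the field down, then left and back right by (64 - Width) clears the upper
// bits for the unsigned form and sign-extends from the field's top bit for the signed
// form.

#include <cassert>
#include <cstdint>

static uint64_t ubfx(uint64_t Src, unsigned Offset, unsigned Width) {
  return (Src >> Offset) << (64 - Width) >> (64 - Width);
}

static int64_t sbfx(int64_t Src, unsigned Offset, unsigned Width) {
  // Arithmetic shift down, shift left, then arithmetic shift back: the same three
  // steps the signed expansion below builds (buildAShr, buildShl, buildAShr).
  uint64_t Down = (uint64_t)(Src >> Offset);
  return (int64_t)(Down << (64 - Width)) >> (64 - Width);
}

int main() {
  const uint64_t Src = 0xfedcba9876543210ull;
  for (unsigned Offset : {0u, 4u, 17u, 32u}) {
    for (unsigned Width : {1u, 8u, 13u, 32u}) {
      // Reference: mask-based extract of Width bits starting at Offset.
      uint64_t Ref = (Src >> Offset) & ((1ull << Width) - 1);
      assert(ubfx(Src, Offset, Width) == Ref);
      // Signed form: the same field, sign-extended from bit Width - 1.
      assert(sbfx((int64_t)Src, Offset, Width) ==
             (int64_t)(Ref << (64 - Width)) >> (64 - Width));
    }
  }
  return 0;
}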
+ auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg); + auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift); + if (Signed) + B.buildAShr(S64, SignBit, ExtShift); + else + B.buildLShr(S64, SignBit, ExtShift); + MI.eraseFromParent(); + return true; + } // The scalar form packs the offset and width in a single operand. @@ -1576,32 +1639,19 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( return true; } -// FIXME: Duplicated from LegalizerHelper -static CmpInst::Predicate minMaxToCompare(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_SMIN: - return CmpInst::ICMP_SLT; - case TargetOpcode::G_SMAX: - return CmpInst::ICMP_SGT; - case TargetOpcode::G_UMIN: - return CmpInst::ICMP_ULT; - case TargetOpcode::G_UMAX: - return CmpInst::ICMP_UGT; - default: - llvm_unreachable("not in integer min/max"); - } -} - -static unsigned minMaxToExtend(unsigned Opc) { +// Return a suitable opcode for extending the operands of Opc when widening. +static unsigned getExtendOp(unsigned Opc) { switch (Opc) { + case TargetOpcode::G_ASHR: case TargetOpcode::G_SMIN: case TargetOpcode::G_SMAX: return TargetOpcode::G_SEXT; + case TargetOpcode::G_LSHR: case TargetOpcode::G_UMIN: case TargetOpcode::G_UMAX: return TargetOpcode::G_ZEXT; default: - llvm_unreachable("not in integer min/max"); + return TargetOpcode::G_ANYEXT; } } @@ -1628,30 +1678,6 @@ unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); } -static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B, - CmpInst::Predicate Pred, - Register Dst, Register Src0, - Register Src1) { - const LLT CmpType = LLT::scalar(32); - auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1); - return B.buildSelect(Dst, Cmp, Src0, Src1); -} - -// FIXME: Duplicated from LegalizerHelper, except changing the boolean type. -void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B, - MachineInstr &MI) const { - Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); - - const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); - MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1); - - Register CmpReg = Sel->getOperand(1).getReg(); - B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank); - MI.eraseFromParent(); -} - // For cases where only a single copy is inserted for matching register banks. 
// Replace the register in the instruction operand static bool substituteSimpleCopyRegs( @@ -1688,7 +1714,7 @@ Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, const LLT S32 = LLT::scalar(32); int NumElts = StoreVT.getNumElements(); - return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); + return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0); } static std::pair<Register, unsigned> @@ -1754,17 +1780,14 @@ static bool isZero(Register Reg, MachineRegisterInfo &MRI) { return mi_match(Reg, MRI, m_ICst(C)) && C == 0; } -static unsigned extractGLC(unsigned CachePolicy) { - return CachePolicy & 1; +static unsigned extractCPol(unsigned CachePolicy) { + return CachePolicy & AMDGPU::CPol::ALL; } -static unsigned extractSLC(unsigned CachePolicy) { - return (CachePolicy >> 1) & 1; +static unsigned extractSWZ(unsigned CachePolicy) { + return (CachePolicy >> 3) & 1; } -static unsigned extractDLC(unsigned CachePolicy) { - return (CachePolicy >> 2) & 1; -} MachineInstr * AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, @@ -1830,10 +1853,9 @@ AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, MIB.addUse(RSrc) .addUse(SOffset) .addImm(ImmOffset) - .addImm(extractGLC(CachePolicy)) - .addImm(extractSLC(CachePolicy)) + .addImm(extractCPol(CachePolicy)) .addImm(0) // tfe: FIXME: Remove from inst - .addImm(extractDLC(CachePolicy)) + .addImm(extractSWZ(CachePolicy)) .cloneMemRefs(MI); // FIXME: We need a way to report failure from applyMappingImpl. @@ -2006,6 +2028,22 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( return true; } +// Insert a cross regbank copy for a register if it already has a bank that +// differs from the one we want to set. +static Register constrainRegToBank(MachineRegisterInfo &MRI, + MachineIRBuilder &B, Register &Reg, + const RegisterBank &Bank) { + const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); + if (CurrBank && *CurrBank != Bank) { + Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); + MRI.setRegBank(Copy, Bank); + return Copy; + } + + MRI.setRegBank(Reg, Bank); + return Reg; +} + bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( MachineInstr &MI, MachineRegisterInfo &MRI, const OperandsMapper &OpdMapper) const { @@ -2069,17 +2107,18 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); for (unsigned L = 0; L < NumLanes; ++L) { - auto S = B.buildSelect(EltTy, Cmp, InsRegs[L], - UnmergeToEltTy.getReg(I * NumLanes + L)); + Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank); + Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L); + Op1 = constrainRegToBank(MRI, B, Op1, DstBank); - for (unsigned N : { 0, 2, 3 }) - MRI.setRegBank(S->getOperand(N).getReg(), DstBank); + Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0); + MRI.setRegBank(Select, DstBank); - Ops[I * NumLanes + L] = S->getOperand(0).getReg(); + Ops[I * NumLanes + L] = Select; } } - LLT MergeTy = LLT::vector(Ops.size(), EltTy); + LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy); if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { B.buildBuildVector(MI.getOperand(0), Ops); } else { @@ -2336,18 +2375,40 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } + case AMDGPU::G_ABS: { + Register SrcReg = MI.getOperand(1).getReg(); + const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); + + // There is no VALU abs instruction so we need to replace it with a sub and + // max combination. 
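// A minimal standalone illustration of the sub + max lowering the comment above refers
// to (sketch only; the sample values are arbitrary). With wrapping 32-bit arithmetic,
// abs(x) == smax(x, 0 - x), and INT32_MIN simply maps to itself, which is the usual
// wrap-around result of this expansion (lowerAbsToMaxNeg builds a G_SUB from zero
// followed by a G_SMAX).

#include <cassert>
#include <cstdint>

static int32_t absViaSubMax(int32_t X) {
  int32_t Neg = (int32_t)(0u - (uint32_t)X); // the G_SUB step, wrapping
  return X > Neg ? X : Neg;                  // the G_SMAX step
}

int main() {
  assert(absViaSubMax(5) == 5);
  assert(absViaSubMax(-7) == 7);
  assert(absViaSubMax(0) == 0);
  assert(absViaSubMax(INT32_MIN) == INT32_MIN); // wraps rather than overflowing
  return 0;
}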
+ if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { + MachineFunction *MF = MI.getParent()->getParent(); + ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); + MachineIRBuilder B(MI, Apply); + LegalizerHelper Helper(*MF, Apply, B); + + if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) + llvm_unreachable("lowerAbsToMaxNeg should have succeeded"); + return; + } + LLVM_FALLTHROUGH; + } case AMDGPU::G_ADD: case AMDGPU::G_SUB: case AMDGPU::G_MUL: case AMDGPU::G_SHL: case AMDGPU::G_LSHR: - case AMDGPU::G_ASHR: { + case AMDGPU::G_ASHR: + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: { Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. // Packed 16-bit operations need to be scalarized and promoted. - if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16)) + if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) break; const RegisterBank *DstBank = @@ -2365,10 +2426,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register WideSrc0Lo, WideSrc0Hi; Register WideSrc1Lo, WideSrc1Hi; + unsigned ExtendOp = getExtendOp(MI.getOpcode()); std::tie(WideSrc0Lo, WideSrc0Hi) - = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT); + = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); std::tie(WideSrc1Lo, WideSrc1Hi) - = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT); + = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); @@ -2390,73 +2452,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } - case AMDGPU::G_SMIN: - case AMDGPU::G_SMAX: - case AMDGPU::G_UMIN: - case AMDGPU::G_UMAX: { - Register DstReg = MI.getOperand(0).getReg(); - const RegisterBank *DstBank = - OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; - if (DstBank == &AMDGPU::VGPRRegBank) - break; - - MachineFunction *MF = MI.getParent()->getParent(); - MachineIRBuilder B(MI); - - // Turn scalar min/max into a compare and select. - LLT Ty = MRI.getType(DstReg); - const LLT S32 = LLT::scalar(32); - const LLT S16 = LLT::scalar(16); - const LLT V2S16 = LLT::vector(2, 16); - - if (Ty == V2S16) { - ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); - B.setChangeObserver(ApplySALU); - - // Need to widen to s32, and expand as cmp + select, and avoid producing - // illegal vector extends or unmerges that would need further - // legalization. - // - // TODO: Should we just readfirstlane? That should probably be handled - // with a UniformVGPR register bank that wouldn't need special - // consideration here. 
- - Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); - - Register WideSrc0Lo, WideSrc0Hi; - Register WideSrc1Lo, WideSrc1Hi; - - unsigned ExtendOp = minMaxToExtend(MI.getOpcode()); - - std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp); - std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp); - - Register Lo = MRI.createGenericVirtualRegister(S32); - Register Hi = MRI.createGenericVirtualRegister(S32); - const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); - buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo); - buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi); - - B.buildBuildVectorTrunc(Dst, {Lo, Hi}); - MI.eraseFromParent(); - } else if (Ty == S16) { - ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); - B.setChangeObserver(ApplySALU); - LegalizerHelper Helper(*MF, ApplySALU, B); - - // Need to widen to s32, and expand as cmp + select. - if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) - llvm_unreachable("widenScalar should have succeeded"); - - // FIXME: This is relying on widenScalar leaving MI in place. - lowerScalarMinMax(B, MI); - } else - lowerScalarMinMax(B, MI); - - return; - } case AMDGPU::G_SEXT_INREG: { SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); if (SrcRegs.empty()) @@ -2496,6 +2491,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_CTPOP: + case AMDGPU::G_BITREVERSE: case AMDGPU::G_CTLZ_ZERO_UNDEF: case AMDGPU::G_CTTZ_ZERO_UNDEF: { const RegisterBank *DstBank = @@ -2605,7 +2601,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_BUILD_VECTOR_TRUNC: { Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); - if (DstTy != LLT::vector(2, 16)) + if (DstTy != LLT::fixed_vector(2, 16)) break; assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); @@ -2737,7 +2733,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(DstTy.getSizeInBits() == 64); - LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32); auto CastSrc = B.buildBitcast(Vec32, SrcReg); auto One = B.buildConstant(S32, 1); @@ -2854,7 +2850,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(InsTy.getSizeInBits() == 64); const LLT S32 = LLT::scalar(32); - LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); + LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); MachineIRBuilder B(MI); auto CastSrc = B.buildBitcast(Vec32, SrcReg); @@ -2953,7 +2949,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( executeInWaterfallLoop(MI, MRI, {2, 5}); return; } - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); executeInWaterfallLoop(MI, MRI, {2, 5}); return; @@ -3012,10 +3010,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_sbfe: - applyMappingBFEIntrinsic(OpdMapper, true); + applyMappingBFE(OpdMapper, true); return; case Intrinsic::amdgcn_ubfe: - applyMappingBFEIntrinsic(OpdMapper, false); + applyMappingBFE(OpdMapper, false); return; case Intrinsic::amdgcn_ballot: // Use default handling and insert copy to vcc source. 
@@ -3107,6 +3105,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_DYN_STACKALLOC: applyMappingDynStackAlloc(MI, OpdMapper, MRI); return; + case AMDGPU::G_SBFX: + applyMappingBFE(OpdMapper, /*Signed*/ true); + return; + case AMDGPU::G_UBFX: + applyMappingBFE(OpdMapper, /*Signed*/ false); + return; default: break; } @@ -3579,7 +3583,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SMAX: case AMDGPU::G_UMIN: case AMDGPU::G_UMAX: + case AMDGPU::G_ABS: case AMDGPU::G_SHUFFLE_VECTOR: + case AMDGPU::G_SBFX: + case AMDGPU::G_UBFX: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); LLVM_FALLTHROUGH; @@ -3621,6 +3628,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: + case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: + case AMDGPU::G_AMDGPU_SMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { @@ -3679,7 +3688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_BUILD_VECTOR: case AMDGPU::G_BUILD_VECTOR_TRUNC: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - if (DstTy == LLT::vector(2, 16)) { + if (DstTy == LLT::fixed_vector(2, 16)) { unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); @@ -3706,10 +3715,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); break; } + case AMDGPU::G_BITREVERSE: case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: case AMDGPU::G_PTRTOINT: - case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -3919,7 +3928,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); @@ -4033,6 +4044,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_u8_f32: case Intrinsic::amdgcn_alignbit: case Intrinsic::amdgcn_alignbyte: + case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_fdot2: case Intrinsic::amdgcn_sdot2: case Intrinsic::amdgcn_udot2: @@ -4052,7 +4064,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_update_dpp: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_mov_dpp: + case Intrinsic::amdgcn_strict_wwm: case Intrinsic::amdgcn_wwm: + case Intrinsic::amdgcn_strict_wqm: case Intrinsic::amdgcn_wqm: case Intrinsic::amdgcn_softwqm: case Intrinsic::amdgcn_set_inactive: @@ -4176,7 +4190,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_i32_32x32x4i8: case Intrinsic::amdgcn_mfma_i32_32x32x8i8: case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: - case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: { + case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: + case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: + case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: + case 
Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: + case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: + case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: + case Intrinsic::amdgcn_mfma_f64_16x16x4f64: + case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src @@ -4250,6 +4271,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_csub: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { @@ -4306,6 +4332,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); break; } + case Intrinsic::amdgcn_live_mask: { + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_wqm_demote: case Intrinsic::amdgcn_kill: { OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 1c1441729e30..7e051e4a5424 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -81,10 +81,7 @@ public: MachineRegisterInfo &MRI, int RSrcIdx) const; bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const; - bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper, - bool Signed) const; - - void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const; + bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const; Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 6c70b53b23c1..50999a4802b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,16 +7,16 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024] + [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024] + [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024] >; // It is helpful to distinguish conditions from ordinary SGPRs. 
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; def AGPRRegBank : RegisterBank <"AGPR", - [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024] + [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp new file mode 100644 index 000000000000..dabb4d006d99 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -0,0 +1,460 @@ +//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass replaces all the uses of LDS within non-kernel functions by +// corresponding pointer counter-parts. +// +// The main motivation behind this pass is - to *avoid* subsequent LDS lowering +// pass from directly packing LDS (assume large LDS) into a struct type which +// would otherwise cause allocating huge memory for struct instance within every +// kernel. +// +// Brief sketch of the algorithm implemented in this pass is as below: +// +// 1. Collect all the LDS defined in the module which qualify for pointer +// replacement, say it is, LDSGlobals set. +// +// 2. Collect all the reachable callees for each kernel defined in the module, +// say it is, KernelToCallees map. +// +// 3. FOR (each global GV from LDSGlobals set) DO +// LDSUsedNonKernels = Collect all non-kernel functions which use GV. +// FOR (each kernel K in KernelToCallees map) DO +// ReachableCallees = KernelToCallees[K] +// ReachableAndLDSUsedCallees = +// SetIntersect(LDSUsedNonKernels, ReachableCallees) +// IF (ReachableAndLDSUsedCallees is not empty) THEN +// Pointer = Create a pointer to point-to GV if not created. +// Initialize Pointer to point-to GV within kernel K. +// ENDIF +// ENDFOR +// Replace all uses of GV within non kernel functions by Pointer. 
+// ENFOR +// +// LLVM IR example: +// +// Input IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// +// define internal void @f0() { +// entry: +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// call void @f0() +// ret void +// } +// +// Output IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +// +// define internal void @f0() { +// entry: +// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2 +// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16), +// i16 addrspace(3)* @lds.ptr, align 2 +// call void @f0() +// ret void +// } +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDGPULDSUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include <algorithm> +#include <vector> + +#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer" + +using namespace llvm; + +namespace { + +class ReplaceLDSUseImpl { + Module &M; + LLVMContext &Ctx; + const DataLayout &DL; + Constant *LDSMemBaseAddr; + + DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer; + DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels; + DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees; + DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers; + DenseMap<Function *, BasicBlock *> KernelToInitBB; + DenseMap<Function *, DenseMap<GlobalVariable *, Value *>> + FunctionToLDSToReplaceInst; + + // Collect LDS which requires their uses to be replaced by pointer. + std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { + // Collect LDS which requires module lowering. + std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M); + + // Remove LDS which don't qualify for replacement. + LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(), + [&](GlobalVariable *GV) { + return shouldIgnorePointerReplacement(GV); + }), + LDSGlobals.end()); + + return LDSGlobals; + } + + // Returns true if uses of given LDS global within non-kernel functions should + // be keep as it is without pointer replacement. + bool shouldIgnorePointerReplacement(GlobalVariable *GV) { + // LDS whose size is very small and doesn`t exceed pointer size is not worth + // replacing. 
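// To make the size cut-off below concrete (illustrative numbers only, using the 64K
// per-CU LDS figure quoted later in this file): every LDS offset fits in 16 bits, so
// the replacement pointer created for a variable is a 2-byte i16 in LDS, and replacing
// a variable whose own allocation is 2 bytes or less cannot save space.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t LDSBudgetBytes = 64 * 1024;      // assumed per-CU LDS size
  assert(LDSBudgetBytes - 1 <= UINT16_MAX);       // any LDS offset fits in an i16
  const uint64_t PointerBytes = sizeof(uint16_t); // size of an @*.ptr global
  assert(PointerBytes == 2);                      // so variables of <= 2 bytes are skipped
  return 0;
}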
+ if (DL.getTypeAllocSize(GV->getValueType()) <= 2) + return true; + + // LDS which is not used from non-kernel function scope or it is used from + // global scope does not qualify for replacement. + LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV); + return LDSToNonKernels[GV].empty(); + + // FIXME: When GV is used within all (or within most of the kernels), then + // it does not make sense to create a pointer for it. + } + + // Insert new global LDS pointer which points to LDS. + GlobalVariable *createLDSPointer(GlobalVariable *GV) { + // LDS pointer which points to LDS is already created? return it. + auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr)); + if (!PointerEntry.second) + return PointerEntry.first->second; + + // We need to create new LDS pointer which points to LDS. + // + // Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to + // 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address. + auto *I16Ty = Type::getInt16Ty(Ctx); + GlobalVariable *LDSPointer = new GlobalVariable( + M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), + GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS); + + LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + + // Mark that an associated LDS pointer is created for LDS. + LDSToPointer[GV] = LDSPointer; + + return LDSPointer; + } + + // Split entry basic block in such a way that only lane 0 of each wave does + // the LDS pointer initialization, and return newly created basic block. + BasicBlock *activateLaneZero(Function *K) { + // If the entry basic block of kernel K is already splitted, then return + // newly created basic block. + auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr)); + if (!BasicBlockEntry.second) + return BasicBlockEntry.first->second; + + // Split entry basic block of kernel K. + auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + Value *Mbcnt = + Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {Builder.getInt32(-1), Builder.getInt32(0)}); + Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0)); + Instruction *WB = cast<Instruction>( + Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {})); + + BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent(); + + // Mark that the entry basic block of kernel K is splitted. + KernelToInitBB[K] = NBB; + + return NBB; + } + + // Within given kernel, initialize given LDS pointer to point to given LDS. + void initializeLDSPointer(Function *K, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If LDS pointer is already initialized within K, then nothing to do. + auto PointerEntry = KernelToLDSPointers.insert( + std::make_pair(K, SmallPtrSet<GlobalVariable *, 8>())); + if (!PointerEntry.second) + if (PointerEntry.first->second.contains(LDSPointer)) + return; + + // Insert instructions at EI which initialize LDS pointer to point-to LDS + // within kernel K. + // + // That is, convert pointer type of GV to i16, and then store this converted + // i16 value within LDSPointer which is of type i16*. + auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt())); + IRBuilder<> Builder(EI); + Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)), + LDSPointer); + + // Mark that LDS pointer is initialized within kernel K. 
+ KernelToLDSPointers[K].insert(LDSPointer); + } + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevent kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) { + SmallVector<User *, 8> LDSUsers(GV->users()); + for (auto *U : LDSUsers) { + // When `U` is a constant expression, it is possible that same constant + // expression exists within multiple instructions, and within multiple + // non-kernel functions. Collect all those non-kernel functions and all + // those instructions within which `U` exist. + auto FunctionToInsts = + AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); + + for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); + FI != FE; ++FI) { + Function *F = FI->first; + auto &Insts = FI->second; + for (auto *I : Insts) { + // If `U` is a constant expression, then we need to break the + // associated instruction into a set of separate instructions by + // converting constant expressions into instructions. + SmallPtrSet<Instruction *, 8> UserInsts; + + if (U == I) { + // `U` is an instruction, conversion from constant expression to + // set of instructions is *not* required. + UserInsts.insert(I); + } else { + // `U` is a constant expression, convert it into corresponding set + // of instructions. + auto *CE = cast<ConstantExpr>(U); + convertConstantExprsToInstructions(I, CE, &UserInsts); + } + + // Go through all the user instrutions, if LDS exist within them as an + // operand, then replace it by replace instruction. + for (auto *II : UserInsts) { + auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer); + II->replaceUsesOfWith(GV, ReplaceInst); + } + } + } + } + } + + // Create a set of replacement instructions which together replace LDS within + // non-kernel function F by accessing LDS indirectly using LDS pointer. + Value *getReplacementInst(Function *F, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If the instruction which replaces LDS within F is already created, then + // return it. + auto LDSEntry = FunctionToLDSToReplaceInst.insert( + std::make_pair(F, DenseMap<GlobalVariable *, Value *>())); + if (!LDSEntry.second) { + auto ReplaceInstEntry = + LDSEntry.first->second.insert(std::make_pair(GV, nullptr)); + if (!ReplaceInstEntry.second) + return ReplaceInstEntry.first->second; + } + + // Get the instruction insertion point within the beginning of the entry + // block of current non-kernel function. + auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + // Insert required set of instructions which replace LDS within F. + auto *V = Builder.CreateBitCast( + Builder.CreateGEP( + Builder.getInt8Ty(), LDSMemBaseAddr, + Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), + GV->getType()); + + // Mark that the replacement instruction which replace LDS within F is + // created. + FunctionToLDSToReplaceInst[F][GV] = V; + + return V; + } + +public: + ReplaceLDSUseImpl(Module &M) + : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) { + LDSMemBaseAddr = Constant::getIntegerValue( + PointerType::get(Type::getInt8Ty(M.getContext()), + AMDGPUAS::LOCAL_ADDRESS), + APInt(32, 0)); + } + + // Entry-point function which interface ReplaceLDSUseImpl with outside of the + // class. + bool replaceLDSUse(); + +private: + // For a given LDS from collected LDS globals set, replace its non-kernel + // function scope uses by pointer. 
+ bool replaceLDSUse(GlobalVariable *GV); +}; + +// For a given LDS from the collected LDS globals set, replace its non-kernel +// function scope uses by a pointer. +bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { + // Holds all those non-kernel functions within which LDS is being accessed. + SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV]; + + // The LDS pointer which points to LDS and replaces all the uses of LDS. + GlobalVariable *LDSPointer = nullptr; + + // Traverse through each kernel K and, if required, initialize the + // LDS pointer to point to LDS within K. + for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; + ++KI) { + Function *K = KI->first; + SmallPtrSet<Function *, 8> Callees = KI->second; + + // Compute the reachable, LDS-using callees for kernel K. + set_intersect(Callees, LDSAccessors); + + // None of the LDS accessing non-kernel functions are reachable from + // kernel K. Hence, there is no need to initialize the LDS pointer within kernel K. + if (Callees.empty()) + continue; + + // We have found reachable, LDS-using callees for kernel K, so we need to + // initialize the LDS pointer within kernel K and replace LDS uses + // within those callees by the LDS pointer. + // + // But first check if the LDS pointer is already created; if not, create one. + LDSPointer = createLDSPointer(GV); + + // Initialize the LDS pointer to point to LDS within kernel K. + initializeLDSPointer(K, GV, LDSPointer); + } + + // We have not found reachable, LDS-using callees for any of the kernels, + // and hence we have not created an LDS pointer. + if (!LDSPointer) + return false; + + // We have created an LDS pointer for LDS, and initialized it to point to LDS + // within all relevant kernels. Now replace all the uses of LDS within + // non-kernel functions by the LDS pointer. + replaceLDSUseByPointer(GV, LDSPointer); + + return true; +} + +// Entry-point function which interfaces ReplaceLDSUseImpl with the outside of +// the class. +bool ReplaceLDSUseImpl::replaceLDSUse() { + // Collect the LDS globals whose uses need to be replaced by a pointer. + std::vector<GlobalVariable *> LDSGlobals = + collectLDSRequiringPointerReplace(); + + // No LDS to pointer-replace. Nothing to do. + if (LDSGlobals.empty()) + return false; + + // Collect the reachable callee set for each kernel defined in the module. + AMDGPU::collectReachableCallees(M, KernelToCallees); + + if (KernelToCallees.empty()) { + // Either the module does not have any kernel definitions, or none of the kernels + // has a call to non-kernel functions, or we could not resolve any of the + // call sites to proper non-kernel functions, because of situations like + // inline asm calls. Nothing to replace. + return false; + } + + // For every LDS from the collected LDS globals set, replace its non-kernel + // function scope use by a pointer.
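The set_intersect call in replaceLDSUse(GlobalVariable *) above is the per-kernel gate: the pointer is only initialized in a kernel if at least one LDS-accessing non-kernel function is reachable from it. A hedged sketch of that decision using standard containers in place of the LLVM ones:

#include <set>
#include <string>

// True if at least one non-kernel LDS accessor is reachable from the kernel.
bool kernelNeedsPointerInit(const std::set<std::string> &ReachableCallees,
                            const std::set<std::string> &LDSAccessors) {
  for (const std::string &Callee : ReachableCallees)
    if (LDSAccessors.count(Callee))
      return true;
  return false;
}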
+ bool Changed = false; + for (auto *GV : LDSGlobals) + Changed |= replaceLDSUse(GV); + + return Changed; +} + +class AMDGPUReplaceLDSUseWithPointer : public ModulePass { +public: + static char ID; + + AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) { + initializeAMDGPUReplaceLDSUseWithPointerPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + } +}; + +} // namespace + +char AMDGPUReplaceLDSUseWithPointer::ID = 0; +char &llvm::AMDGPUReplaceLDSUseWithPointerID = + AMDGPUReplaceLDSUseWithPointer::ID; + +INITIALIZE_PASS_BEGIN( + AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, + "Replace within non-kernel function use of LDS with pointer", + false /*only look at the cfg*/, false /*analysis pass*/) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END( + AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, + "Replace within non-kernel function use of LDS with pointer", + false /*only look at the cfg*/, false /*analysis pass*/) + +bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) { + ReplaceLDSUseImpl LDSUseReplacer{M}; + return LDSUseReplacer.replaceLDSUse(); +} + +ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() { + return new AMDGPUReplaceLDSUseWithPointer(); +} + +PreservedAnalyses +AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) { + ReplaceLDSUseImpl LDSUseReplacer{M}; + LDSUseReplacer.replaceLDSUse(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp new file mode 100644 index 000000000000..ef46e53b7460 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -0,0 +1,514 @@ +//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes how many registers and other resources are used by +/// functions. +/// +/// The results of this analysis are used to fill the register usage, flat +/// usage, etc. into hardware registers. +/// +/// The analysis takes callees into account. E.g. if a function A that needs 10 +/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A +/// will return 20. +/// It is assumed that an indirect call can go into any function except +/// hardware-entrypoints. Therefore the register usage of functions with +/// indirect calls is estimated as the maximum of all non-entrypoint functions +/// in the module. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPUResourceUsageAnalysis.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +using namespace llvm::AMDGPU; + +#define DEBUG_TYPE "amdgpu-resource-usage" + +char llvm::AMDGPUResourceUsageAnalysis::ID = 0; +char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID; + +// We need to tell the runtime some amount ahead of time if we don't know the +// true stack size. 
Assume a smaller number if this is only due to dynamic / +// non-entry block allocas. +static cl::opt<uint32_t> AssumedStackSizeForExternalCall( + "amdgpu-assume-external-call-stack-size", + cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden, + cl::init(16384)); + +static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects( + "amdgpu-assume-dynamic-stack-object-size", + cl::desc("Assumed extra stack use if there are any " + "variable sized objects (in bytes)"), + cl::Hidden, cl::init(4096)); + +INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE, + "Function register usage analysis", true, true) + +static const Function *getCalleeFunction(const MachineOperand &Op) { + if (Op.isImm()) { + assert(Op.getImm() == 0); + return nullptr; + } + + return cast<Function>(Op.getGlobal()); +} + +static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, + const SIInstrInfo &TII, unsigned Reg) { + for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { + if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) + return true; + } + + return false; +} + +int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs( + const GCNSubtarget &ST) const { + return NumExplicitSGPR + + IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch, + ST.getTargetID().isXnackOnOrAny()); +} + +int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( + const GCNSubtarget &ST) const { + if (ST.hasGFX90AInsts() && NumAGPR) + return alignTo(NumVGPR, 4) + NumAGPR; + return std::max(NumVGPR, NumAGPR); +} + +bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + bool HasIndirectCall = false; + + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; + + MachineModuleInfo &MMI = + getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); + MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); + + auto CI = CallGraphResourceInfo.insert( + std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); + SIFunctionResourceInfo &Info = CI.first->second; + assert(CI.second && "should only be called once per function"); + Info = analyzeResourceUsage(MF, TM); + HasIndirectCall |= Info.HasIndirectCall; + } + + if (HasIndirectCall) + propagateIndirectCallRegisterUsage(); + + return false; +} + +AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo +AMDGPUResourceUsageAnalysis::analyzeResourceUsage( + const MachineFunction &MF, const TargetMachine &TM) const { + SIFunctionResourceInfo Info; + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) || + MRI.isLiveIn(MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)); + + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. Inline assembly may + // need it though. + // + // If we only have implicit uses of flat_scr on flat instructions, it is not + // really needed. 
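getTotalNumVGPRs above combines the VGPR and AGPR counts in two different ways: when hasGFX90AInsts() holds and AGPRs are used, the AGPR block sits after the VGPRs rounded up to a multiple of 4; otherwise the larger of the two counts is reported. A plain-C++ restatement for illustration (a boolean parameter stands in for the subtarget query):

#include <algorithm>
#include <cstdint>

std::int32_t totalNumVGPRs(bool HasGFX90AInsts, std::int32_t NumVGPR,
                           std::int32_t NumAGPR) {
  if (HasGFX90AInsts && NumAGPR)
    return (NumVGPR + 3) / 4 * 4 + NumAGPR; // alignTo(NumVGPR, 4) + NumAGPR
  return std::max(NumVGPR, NumAGPR);        // otherwise take the larger count
}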
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && + (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { + Info.UsesFlatScratch = false; + } + + Info.PrivateSegmentSize = FrameInfo.getStackSize(); + + // Assume a big number if there are any unknown sized objects. + Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); + if (Info.HasDynamicallySizedStack) + Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects; + + if (MFI->isStackRealigned()) + Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); + + Info.UsesVCC = + MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); + + // If there are no calls, MachineRegisterInfo can tell us the used register + // count easily. + // A tail call isn't considered a call for MachineFrameInfo's purposes. + if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } + } + + if (ST.hasMAIInsts()) { + MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestAGPRReg = Reg; + break; + } + } + Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister + ? 0 + : TRI.getHWRegIndex(HighestAGPRReg) + 1; + } + + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } + } + + // We found the maximum register index. They start at 0, so add one to get + // the number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister + ? 0 + : TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister + ? 0 + : TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; + } + + int32_t MaxVGPR = -1; + int32_t MaxAGPR = -1; + int32_t MaxSGPR = -1; + uint64_t CalleeFrameSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: Check regmasks? Do they occur anywhere except calls? 
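The call-free path above derives the register counts directly from MachineRegisterInfo: scan each register file from the top and report the highest used index plus one. A small stand-alone sketch of that counting (a bit-vector standing in for isPhysRegUsed; illustrative only):

#include <cstddef>
#include <vector>

std::size_t countUsedRegisters(const std::vector<bool> &PhysRegUsed) {
  for (std::size_t I = PhysRegUsed.size(); I > 0; --I)
    if (PhysRegUsed[I - 1])
      return I; // highest used index, plus one
  return 0;     // no register of this class is used
}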
+ for (const MachineOperand &MO : MI.operands()) { + unsigned Width = 0; + bool IsSGPR = false; + bool IsAGPR = false; + + if (!MO.isReg()) + continue; + + Register Reg = MO.getReg(); + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SCC: + case AMDGPU::M0: + case AMDGPU::M0_LO16: + case AMDGPU::M0_HI16: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + case AMDGPU::SGPR_NULL: + case AMDGPU::MODE: + continue; + + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + llvm_unreachable("src_pops_exiting_wave_id should not be used"); + + case AMDGPU::NoRegister: + assert(MI.isDebugInstr() && + "Instruction uses invalid noreg register"); + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + case AMDGPU::VCC_LO_LO16: + case AMDGPU::VCC_LO_HI16: + case AMDGPU::VCC_HI_LO16: + case AMDGPU::VCC_HI_HI16: + Info.UsesVCC = true; + continue; + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + continue; + + case AMDGPU::XNACK_MASK: + case AMDGPU::XNACK_MASK_LO: + case AMDGPU::XNACK_MASK_HI: + llvm_unreachable("xnack_mask registers should not be used"); + + case AMDGPU::LDS_DIRECT: + llvm_unreachable("lds_direct register should not be used"); + + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("trap handler registers should not be used"); + + case AMDGPU::SRC_VCCZ: + llvm_unreachable("src_vccz register should not be used"); + + case AMDGPU::SRC_EXECZ: + llvm_unreachable("src_execz register should not be used"); + + case AMDGPU::SRC_SCC: + llvm_unreachable("src_scc register should not be used"); + + default: + break; + } + + if (AMDGPU::SReg_32RegClass.contains(Reg) || + AMDGPU::SReg_LO16RegClass.contains(Reg) || + AMDGPU::SGPR_HI16RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || + AMDGPU::VGPR_LO16RegClass.contains(Reg) || + AMDGPU::VGPR_HI16RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; + } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || + AMDGPU::AGPR_LO16RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; + } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { + IsSGPR = false; + Width = 3; + } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { + IsSGPR = true; + Width = 3; + } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; + } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 4; + } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { + IsSGPR = false; + Width = 5; + } else if 
(AMDGPU::SReg_160RegClass.contains(Reg)) { + IsSGPR = true; + Width = 5; + } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 5; + } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { + IsSGPR = false; + Width = 6; + } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { + IsSGPR = true; + Width = 6; + } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 6; + } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { + IsSGPR = false; + Width = 7; + } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { + IsSGPR = true; + Width = 7; + } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 7; + } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { + IsSGPR = false; + Width = 8; + } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 16; + } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { + IsSGPR = true; + Width = 32; + } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + Width = 32; + } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 32; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned HWReg = TRI.getHWRegIndex(Reg); + int MaxUsed = HWReg + Width - 1; + if (IsSGPR) { + MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else if (IsAGPR) { + MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; + } else { + MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; + } + } + + if (MI.isCall()) { + // Pseudo used just to encode the underlying global. Is there a better + // way to track this? + + const MachineOperand *CalleeOp = + TII->getNamedOperand(MI, AMDGPU::OpName::callee); + + const Function *Callee = getCalleeFunction(*CalleeOp); + DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I = + CallGraphResourceInfo.end(); + + // Avoid crashing on undefined behavior with an illegal call to a + // kernel. If a callsite's calling convention doesn't match the + // function's, it's undefined behavior. If the callsite calling + // convention does match, that would have errored earlier. + if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) + report_fatal_error("invalid call to entry function"); + + bool IsIndirect = !Callee || Callee->isDeclaration(); + if (!IsIndirect) + I = CallGraphResourceInfo.find(Callee); + + if (IsIndirect || I == CallGraphResourceInfo.end()) { + CalleeFrameSize = + std::max(CalleeFrameSize, + static_cast<uint64_t>(AssumedStackSizeForExternalCall)); + + // Register usage of indirect calls gets handled later + Info.UsesVCC = true; + Info.UsesFlatScratch = ST.hasFlatAddressSpace(); + Info.HasDynamicallySizedStack = true; + Info.HasIndirectCall = true; + } else { + // We force CodeGen to run in SCC order, so the callee's register + // usage etc. should be the cumulative usage of all callees. 
+ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); + MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); + CalleeFrameSize = + std::max(I->second.PrivateSegmentSize, CalleeFrameSize); + Info.UsesVCC |= I->second.UsesVCC; + Info.UsesFlatScratch |= I->second.UsesFlatScratch; + Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; + Info.HasRecursion |= I->second.HasRecursion; + Info.HasIndirectCall |= I->second.HasIndirectCall; + } + + // FIXME: Call site could have norecurse on it + if (!Callee || !Callee->doesNotRecurse()) + Info.HasRecursion = true; + } + } + } + + Info.NumExplicitSGPR = MaxSGPR + 1; + Info.NumVGPR = MaxVGPR + 1; + Info.NumAGPR = MaxAGPR + 1; + Info.PrivateSegmentSize += CalleeFrameSize; + + return Info; +} + +void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() { + // Collect the maximum number of registers from non-hardware-entrypoints. + // All these functions are potential targets for indirect calls. + int32_t NonKernelMaxSGPRs = 0; + int32_t NonKernelMaxVGPRs = 0; + int32_t NonKernelMaxAGPRs = 0; + + for (const auto &I : CallGraphResourceInfo) { + if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) { + auto &Info = I.getSecond(); + NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR); + NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR); + NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR); + } + } + + // Add register usage for functions with indirect calls. + // For calls to unknown functions, we assume the maximum register usage of + // all non-hardware-entrypoints in the current module. + for (auto &I : CallGraphResourceInfo) { + auto &Info = I.getSecond(); + if (Info.HasIndirectCall) { + Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs); + Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs); + Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs); + } + } +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h new file mode 100644 index 000000000000..832e8119e444 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -0,0 +1,79 @@ +//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes how many registers and other resources are used by +/// functions. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H + +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/ValueMap.h" + +namespace llvm { + +class GCNSubtarget; +class MachineFunction; +class TargetMachine; + +struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass { + static char ID; + +public: + // Track resource usage for callee functions. + struct SIFunctionResourceInfo { + // Track the number of explicitly used VGPRs. Special registers reserved at + // the end are tracked separately. 
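propagateIndirectCallRegisterUsage above implements the worst-case rule from the file header: an indirect call may land in any non-entrypoint function in the module, so callers with indirect calls are raised to the module-wide maximum. A compact sketch with standard containers (invented names, not the LLVM types):

#include <algorithm>
#include <map>
#include <string>

struct FnUsage {
  int NumSGPR = 0;
  int NumVGPR = 0;
  bool IsEntry = false;
  bool HasIndirectCall = false;
};

void propagateIndirectCallUsage(std::map<std::string, FnUsage> &ModuleInfo) {
  int MaxSGPR = 0, MaxVGPR = 0;
  // Module-wide maximum over all non-entrypoint functions.
  for (const auto &KV : ModuleInfo)
    if (!KV.second.IsEntry) {
      MaxSGPR = std::max(MaxSGPR, KV.second.NumSGPR);
      MaxVGPR = std::max(MaxVGPR, KV.second.NumVGPR);
    }
  // Raise every function containing an indirect call to that maximum.
  for (auto &KV : ModuleInfo)
    if (KV.second.HasIndirectCall) {
      KV.second.NumSGPR = std::max(KV.second.NumSGPR, MaxSGPR);
      KV.second.NumVGPR = std::max(KV.second.NumVGPR, MaxVGPR);
    }
}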
+ int32_t NumVGPR = 0; + int32_t NumAGPR = 0; + int32_t NumExplicitSGPR = 0; + uint64_t PrivateSegmentSize = 0; + bool UsesVCC = false; + bool UsesFlatScratch = false; + bool HasDynamicallySizedStack = false; + bool HasRecursion = false; + bool HasIndirectCall = false; + + int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; + int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; + }; + + AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {} + + bool runOnSCC(CallGraphSCC &SCC) override; + + bool doInitialization(CallGraph &CG) override { + CallGraphResourceInfo.clear(); + return CallGraphSCCPass::doInitialization(CG); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineModuleInfoWrapperPass>(); + AU.setPreservesAll(); + } + + const SIFunctionResourceInfo &getResourceInfo(const Function *F) const { + auto Info = CallGraphResourceInfo.find(F); + assert(Info != CallGraphResourceInfo.end() && + "Failed to find resource info for function"); + return Info->getSecond(); + } + +private: + SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF, + const TargetMachine &TM) const; + void propagateIndirectCallRegisterUsage(); + + DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index fd65727f04d4..afe016731395 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -199,6 +199,12 @@ def : SourceOfDivergence<int_r600_read_tidig_z>; def : SourceOfDivergence<int_amdgcn_atomic_inc>; def : SourceOfDivergence<int_amdgcn_atomic_dec>; def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; +def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>; +def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>; +def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>; +def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_ds_fadd>; def : SourceOfDivergence<int_amdgcn_ds_fmin>; def : SourceOfDivergence<int_amdgcn_ds_fmax>; @@ -226,6 +232,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; @@ -240,9 +248,12 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; +def : SourceOfDivergence<int_amdgcn_live_mask>; def : 
SourceOfDivergence<int_amdgcn_ds_swizzle>; def : SourceOfDivergence<int_amdgcn_ds_ordered_add>; def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>; @@ -274,6 +285,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>; def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>; def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>; def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>; +def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>; +def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index f1a7d7463676..0c5020dccecd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -98,12 +98,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS // Disable mutually exclusive bits. - if (FS.find_lower("+wavefrontsize") != StringRef::npos) { - if (FS.find_lower("wavefrontsize16") == StringRef::npos) + if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) { + if (FS.find_insensitive("wavefrontsize16") == StringRef::npos) FullFS += "-wavefrontsize16,"; - if (FS.find_lower("wavefrontsize32") == StringRef::npos) + if (FS.find_insensitive("wavefrontsize32") == StringRef::npos) FullFS += "-wavefrontsize32,"; - if (FS.find_lower("wavefrontsize64") == StringRef::npos) + if (FS.find_insensitive("wavefrontsize64") == StringRef::npos) FullFS += "-wavefrontsize64,"; } @@ -163,6 +163,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, WavefrontSizeLog2 = 5; HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; TargetID.setTargetIDFromFeaturesString(FS); @@ -176,6 +177,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT), + GCN3Encoding(false), Has16BitInsts(false), HasMadMixInsts(false), HasMadMacF32Insts(false), @@ -184,6 +186,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasVOP3PInsts(false), HasMulI24(true), HasMulU24(true), + HasSMulHi(false), HasInv2PiInlineImm(false), HasFminFmaxLegacy(true), EnablePromoteAlloca(false), @@ -194,7 +197,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : { } GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const GCNTargetMachine &TM) : + const GCNTargetMachine &TM) + : // clang-format off AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), TargetTriple(TT), @@ -207,6 +211,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FastFMAF32(false), FastDenormalF32(false), HalfRate64Ops(false), + FullRate64Ops(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), @@ -216,6 +221,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasApertureRegs(false), SupportsXNACK(false), EnableXNACK(false), + EnableTgSplit(false), EnableCuMode(false), TrapHandler(false), @@ 
-227,14 +233,16 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), FP64(false), - GCN3Encoding(false), CIInsts(false), GFX8Insts(false), GFX9Insts(false), + GFX90AInsts(false), GFX10Insts(false), GFX10_3Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), + NegativeScratchOffsetBug(false), + NegativeUnalignedScratchOffsetBug(false), HasSMemRealTime(false), HasIntClamp(false), HasFmaMixInsts(false), @@ -249,10 +257,15 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAOutModsVOPC(false), HasDPP(false), HasDPP8(false), + Has64BitDPP(false), + HasPackedFP32Ops(false), + HasExtendedImageInsts(false), HasR128A16(false), HasGFX10A16(false), HasG16(false), HasNSAEncoding(false), + NSAMaxSize(0), + GFX10_AEncoding(false), GFX10_BEncoding(false), HasDLInsts(false), HasDot1Insts(false), @@ -261,6 +274,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasDot4Insts(false), HasDot5Insts(false), HasDot6Insts(false), + HasDot7Insts(false), HasMAIInsts(false), HasPkFmacF16Inst(false), HasAtomicFaddInsts(false), @@ -270,6 +284,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasVscnt(false), HasGetWaveIdInst(false), HasSMemTimeInst(false), + HasShaderCyclesRegister(false), HasRegisterBanking(false), HasVOP3Literal(false), HasNoDataDepHazard(false), @@ -278,12 +293,14 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FlatGlobalInsts(false), FlatScratchInsts(false), ScalarFlatScratchInsts(false), + HasArchitectedFlatScratch(false), AddNoCarryInsts(false), HasUnpackedD16VMem(false), LDSMisalignedBug(false), HasMFMAInlineLiteralBug(false), UnalignedBufferAccess(false), UnalignedDSAccess(false), + HasPackedTID(false), ScalarizeGlobal(false), @@ -294,6 +311,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasVcmpxExecWARHazard(false), HasLdsBranchVmemWARHazard(false), HasNSAtoVMEMBug(false), + HasNSAClauseBug(false), HasOffset3fBug(false), HasFlatSegmentOffsetBug(false), HasImageStoreD16Bug(false), @@ -303,6 +321,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + // clang-format on MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); @@ -313,7 +332,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, } bool GCNSubtarget::enableFlatScratch() const { - return EnableFlatScratch && hasFlatScratchInsts(); + return flatScratchIsArchitected() || + (EnableFlatScratch && hasFlatScratchInsts()); } unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { @@ -336,6 +356,105 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { return 2; } +/// This list was mostly derived from experimentation. 
+bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::V_CVT_F16_F32_e32: + case AMDGPU::V_CVT_F16_F32_e64: + case AMDGPU::V_CVT_F16_U16_e32: + case AMDGPU::V_CVT_F16_U16_e64: + case AMDGPU::V_CVT_F16_I16_e32: + case AMDGPU::V_CVT_F16_I16_e64: + case AMDGPU::V_RCP_F16_e64: + case AMDGPU::V_RCP_F16_e32: + case AMDGPU::V_RSQ_F16_e64: + case AMDGPU::V_RSQ_F16_e32: + case AMDGPU::V_SQRT_F16_e64: + case AMDGPU::V_SQRT_F16_e32: + case AMDGPU::V_LOG_F16_e64: + case AMDGPU::V_LOG_F16_e32: + case AMDGPU::V_EXP_F16_e64: + case AMDGPU::V_EXP_F16_e32: + case AMDGPU::V_SIN_F16_e64: + case AMDGPU::V_SIN_F16_e32: + case AMDGPU::V_COS_F16_e64: + case AMDGPU::V_COS_F16_e32: + case AMDGPU::V_FLOOR_F16_e64: + case AMDGPU::V_FLOOR_F16_e32: + case AMDGPU::V_CEIL_F16_e64: + case AMDGPU::V_CEIL_F16_e32: + case AMDGPU::V_TRUNC_F16_e64: + case AMDGPU::V_TRUNC_F16_e32: + case AMDGPU::V_RNDNE_F16_e64: + case AMDGPU::V_RNDNE_F16_e32: + case AMDGPU::V_FRACT_F16_e64: + case AMDGPU::V_FRACT_F16_e32: + case AMDGPU::V_FREXP_MANT_F16_e64: + case AMDGPU::V_FREXP_MANT_F16_e32: + case AMDGPU::V_FREXP_EXP_I16_F16_e64: + case AMDGPU::V_FREXP_EXP_I16_F16_e32: + case AMDGPU::V_LDEXP_F16_e64: + case AMDGPU::V_LDEXP_F16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ADD_U16_e64: + case AMDGPU::V_ADD_U16_e32: + case AMDGPU::V_SUB_U16_e64: + case AMDGPU::V_SUB_U16_e32: + case AMDGPU::V_SUBREV_U16_e64: + case AMDGPU::V_SUBREV_U16_e32: + case AMDGPU::V_MUL_LO_U16_e64: + case AMDGPU::V_MUL_LO_U16_e32: + case AMDGPU::V_ADD_F16_e64: + case AMDGPU::V_ADD_F16_e32: + case AMDGPU::V_SUB_F16_e64: + case AMDGPU::V_SUB_F16_e32: + case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32: + case AMDGPU::V_MUL_F16_e64: + case AMDGPU::V_MUL_F16_e32: + case AMDGPU::V_MAX_F16_e64: + case AMDGPU::V_MAX_F16_e32: + case AMDGPU::V_MIN_F16_e64: + case AMDGPU::V_MIN_F16_e32: + case AMDGPU::V_MAX_U16_e64: + case AMDGPU::V_MAX_U16_e32: + case AMDGPU::V_MIN_U16_e64: + case AMDGPU::V_MIN_U16_e32: + case AMDGPU::V_MAX_I16_e64: + case AMDGPU::V_MAX_I16_e32: + case AMDGPU::V_MIN_I16_e64: + case AMDGPU::V_MIN_I16_e32: + // On gfx10, all 16-bit instructions preserve the high bits. + return getGeneration() <= AMDGPUSubtarget::GFX9; + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MADAK_F16: + case AMDGPU::V_MADMK_F16: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAMK_F16: + case AMDGPU::V_FMAAK_F16: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_DIV_FIXUP_F16_e64: + // In gfx9, the preferred handling of the unused high 16-bits changed. Most + // instructions maintain the legacy behavior of 0ing. Some instructions + // changed to preserving the high bits. 
+ return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case AMDGPU::V_MAD_MIXLO_F16: + case AMDGPU::V_MAD_MIXHI_F16: + default: + return false; + } +} + unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, const Function &F) const { if (NWaves == 1) @@ -681,12 +800,12 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves); } -unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); +unsigned +GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. - if (MFI.hasFlatScratchInit()) { + if (HasFlatScratchInit) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) @@ -698,6 +817,28 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } +unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit()); +} + +unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { + // The logic to detect if the function has + // flat scratch init is slightly different than how + // SIMachineFunctionInfo constructor derives. + // We don't use amdgpu-calls, amdgpu-stack-objects + // attributes and isAmdHsaOrMesa here as it doesn't really matter. + // TODO: Outline this derivation logic and have just + // one common function in the backend to avoid duplication. + bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv()); + bool FunctionHasFlatScratchInit = false; + if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() && + enableFlatScratch()) { + FunctionHasFlatScratchInit = true; + } + return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit); +} + unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, unsigned NumSGPRs, unsigned NumVGPRs) const { @@ -711,13 +852,11 @@ unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, return Occupancy; } -unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - +unsigned GCNSubtarget::getBaseMaxNumSGPRs( + const Function &F, std::pair<unsigned, unsigned> WavesPerEU, + unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { // Compute maximum number of SGPRs function can use using default/requested // minimum number of waves per execution unit. - std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); @@ -728,7 +867,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { F, "amdgpu-num-sgpr", MaxNumSGPRs); // Make sure requested value does not violate subtarget's specifications. 
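The Function overload of getReservedNumSGPRs above re-derives "has flat scratch init" purely from subtarget queries and the calling convention, since no SIMachineFunctionInfo exists for the function yet. Restated as a single predicate (the parameters mirror the queries used above; illustrative only):

bool functionHasFlatScratchInit(bool HasFlatAddressSpace, bool IsEntryFunction,
                                bool FlatScratchIsArchitected,
                                bool EnableFlatScratch) {
  return HasFlatAddressSpace && IsEntryFunction && !FlatScratchIsArchitected &&
         EnableFlatScratch;
}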
- if (Requested && (Requested <= getReservedNumSGPRs(MF))) + if (Requested && (Requested <= ReservedNumSGPRs)) Requested = 0; // If more SGPRs are required to support the input user/system SGPRs, @@ -738,7 +877,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { // of reserved special registers in total. Theoretically you could re-use // the last input registers for these special registers, but this would // require a lot of complexity to deal with the weird aliasing. - unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); + unsigned InputNumSGPRs = PreloadedSGPRs; if (Requested && Requested < InputNumSGPRs) Requested = InputNumSGPRs; @@ -757,17 +896,43 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { if (hasSGPRInitBug()) MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), - MaxAddressableNumSGPRs); + return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); } -unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(), + getReservedNumSGPRs(MF)); +} + +static unsigned getMaxNumPreloadedSGPRs() { + // Max number of user SGPRs + unsigned MaxUserSGPRs = 4 + // private segment buffer + 2 + // Dispatch ptr + 2 + // queue ptr + 2 + // kernel segment ptr + 2 + // dispatch ID + 2 + // flat scratch init + 2; // Implicit buffer ptr + // Max number of system SGPRs + unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX + 1 + // WorkGroupIDY + 1 + // WorkGroupIDZ + 1 + // WorkGroupInfo + 1; // private segment wave byte offset + return MaxUserSGPRs + MaxSystemSGPRs; +} + +unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { + return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(), + getReservedNumSGPRs(F)); +} +unsigned GCNSubtarget::getBaseMaxNumVGPRs( + const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const { // Compute maximum number of VGPRs function can use using default/requested // minimum number of waves per execution unit. - std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); // Check if maximum number of VGPRs was explicitly requested using @@ -776,6 +941,9 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { unsigned Requested = AMDGPU::getIntegerAttribute( F, "amdgpu-num-vgpr", MaxNumVGPRs); + if (hasGFX90AInsts()) + Requested *= 2; + // Make sure requested value is compatible with values implied by // default/requested minimum/maximum number of waves per execution unit. 
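A quick arithmetic check of getMaxNumPreloadedSGPRs above: the user-SGPR terms sum to 16 and the system-SGPR terms to 5, so the assumed worst case is 21 preloaded SGPRs (plain C++, illustrative only):

constexpr unsigned MaxUserSGPRs = 4 + 2 + 2 + 2 + 2 + 2 + 2; // = 16
constexpr unsigned MaxSystemSGPRs = 1 + 1 + 1 + 1 + 1;       // = 5
static_assert(MaxUserSGPRs + MaxSystemSGPRs == 21,
              "worst-case number of preloaded SGPRs");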
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) @@ -791,6 +959,16 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return MaxNumVGPRs; } +unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { + return getBaseMaxNumVGPRs(F, getWavesPerEU(F)); +} + +unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU()); +} + void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep) const { if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index ba3a8acae551..b160cdf3a97a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -45,6 +45,7 @@ private: Triple TargetTriple; protected: + bool GCN3Encoding; bool Has16BitInsts; bool HasMadMixInsts; bool HasMadMacF32Insts; @@ -53,6 +54,7 @@ protected: bool HasVOP3PInsts; bool HasMulI24; bool HasMulU24; + bool HasSMulHi; bool HasInv2PiInlineImm; bool HasFminFmaxLegacy; bool EnablePromoteAlloca; @@ -124,6 +126,10 @@ public: return TargetTriple.getArch() == Triple::amdgcn; } + bool isGCN3Encoding() const { + return GCN3Encoding; + } + bool has16BitInsts() const { return Has16BitInsts; } @@ -156,6 +162,10 @@ public: return HasMulU24; } + bool hasSMulHi() const { + return HasSMulHi; + } + bool hasInv2PiInlineImm() const { return HasInv2PiInlineImm; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ce7c82e2a88a..e4485f87fb79 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -32,6 +32,8 @@ #include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" @@ -52,6 +54,115 @@ using namespace llvm; +namespace { +class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { +public: + SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + +class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { +public: + VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + +static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); +} + +static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); +} + + +/// -{sgpr|vgpr}-regalloc=... command line option. +static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } + +/// A dummy default pass factory indicates whether the register allocator is +/// overridden on the command line. 
+static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; +static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; + +static SGPRRegisterRegAlloc +defaultSGPRRegAlloc("default", + "pick SGPR register allocator based on -O option", + useDefaultRegisterAllocator); + +static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, + RegisterPassParser<SGPRRegisterRegAlloc>> +SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for SGPRs")); + +static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, + RegisterPassParser<VGPRRegisterRegAlloc>> +VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for VGPRs")); + + +static void initializeDefaultSGPRRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = SGPRRegAlloc; + SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); + } +} + +static void initializeDefaultVGPRRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = VGPRRegAlloc; + VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); + } +} + +static FunctionPass *createBasicSGPRRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateSGPRs); +} + +static FunctionPass *createGreedySGPRRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateSGPRs); +} + +static FunctionPass *createFastSGPRRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateSGPRs, false); +} + +static FunctionPass *createBasicVGPRRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateVGPRs); +} + +static FunctionPass *createGreedyVGPRRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateVGPRs); +} + +static FunctionPass *createFastVGPRRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateVGPRs, true); +} + +static SGPRRegisterRegAlloc basicRegAllocSGPR( + "basic", "basic register allocator", createBasicSGPRRegisterAllocator); +static SGPRRegisterRegAlloc greedyRegAllocSGPR( + "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); + +static SGPRRegisterRegAlloc fastRegAllocSGPR( + "fast", "fast register allocator", createFastSGPRRegisterAllocator); + + +static VGPRRegisterRegAlloc basicRegAllocVGPR( + "basic", "basic register allocator", createBasicVGPRRegisterAllocator); +static VGPRRegisterRegAlloc greedyRegAllocVGPR( + "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); + +static VGPRRegisterRegAlloc fastRegAllocVGPR( + "fast", "fast register allocator", createFastVGPRRegisterAllocator); +} + + static cl::opt<bool> EnableR600StructurizeCFG( "r600-ir-structurize", cl::desc("Use StructurizeCFG IR pass"), @@ -162,6 +273,11 @@ static cl::opt<bool> EnableRegReassign( cl::init(true), cl::Hidden); +static cl::opt<bool> OptVGPRLiveRange( + "amdgpu-opt-vgpr-liverange", + cl::desc("Enable VGPR liverange optimizations for if-else structure"), + cl::init(true), cl::Hidden); + // Enable atomic optimization static cl::opt<bool> EnableAtomicOptimizations( "amdgpu-atomic-optimizations", @@ -193,6 +309,21 @@ static cl::opt<bool> EnableStructurizerWorkarounds( cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLDSReplaceWithPointer( + "amdgpu-enable-lds-replace-with-pointer", + cl::desc("Enable LDS replace with pointer pass"), 
cl::init(false), + cl::Hidden); + +static cl::opt<bool, true> EnableLowerModuleLDS( + "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), + cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), + cl::Hidden); + +static cl::opt<bool> EnablePreRAOptimizations( + "amdgpu-enable-pre-ra-optimizations", + cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -215,9 +346,11 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); + initializeSIOptimizeVGPRLiveRangePass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUFixFunctionBitcastsPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); + initializeAMDGPUAttributorPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); @@ -228,12 +361,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); + initializeAMDGPURegBankCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPULateCodeGenPreparePass(*PR); initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); + initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); + initializeAMDGPULowerModuleLDSPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); @@ -242,9 +378,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); - initializeSIRemoveShortExecBranchesPass(*PR); initializeSIPreEmitPeepholePass(*PR); - initializeSIInsertSkipsPass(*PR); + initializeSILateBranchLoweringPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIPreAllocateWWMRegsPass(*PR); @@ -256,9 +391,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); - initializeGCNRegBankReassignPass(*PR); + initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); - initializeSIAddIMGInitPass(*PR); + initializeGCNPreRAOptimizationsPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -388,6 +523,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -408,6 +544,7 @@ static bool mustPreserveGV(const GlobalValue &GV) { if (const Function *F = dyn_cast<Function>(&GV)) return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv()); + GV.removeDeadConstantUsers(); return 
!GV.use_empty(); } @@ -480,8 +617,7 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { AAM.registerFunctionAnalysis<AMDGPUAA>(); } -void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, - bool DebugPassManager) { +void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineParsingCallback( [this](StringRef PassName, ModulePassManager &PM, ArrayRef<PassBuilder::PipelineElement>) { @@ -501,6 +637,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, PM.addPass(AMDGPUAlwaysInlinePass()); return true; } + if (PassName == "amdgpu-replace-lds-use-with-pointer") { + PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); + return true; + } + if (PassName == "amdgpu-lower-module-lds") { + PM.addPass(AMDGPULowerModuleLDSPass()); + return true; + } return false; }); PB.registerPipelineParsingCallback( @@ -530,7 +674,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); return true; } - return false; }); @@ -546,16 +689,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, return false; }); - PB.registerPipelineStartEPCallback([this, DebugPassManager]( - ModulePassManager &PM, - PassBuilder::OptimizationLevel Level) { - FunctionPassManager FPM(DebugPassManager); - FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); - FPM.addPass(AMDGPUUseNativeCallsPass()); - if (EnableLibCallSimplify && Level != PassBuilder::OptimizationLevel::O0) - FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); - PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - }); + PB.registerPipelineStartEPCallback( + [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) { + FunctionPassManager FPM; + FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); + FPM.addPass(AMDGPUUseNativeCallsPass()); + if (EnableLibCallSimplify && + Level != PassBuilder::OptimizationLevel::O0) + FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + }); PB.registerPipelineEarlySimplificationEPCallback( [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) { @@ -577,12 +720,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, }); PB.registerCGSCCOptimizerLateEPCallback( - [this, DebugPassManager](CGSCCPassManager &PM, - PassBuilder::OptimizationLevel Level) { + [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) { if (Level == PassBuilder::OptimizationLevel::O0) return; - FunctionPassManager FPM(DebugPassManager); + FunctionPassManager FPM; // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. @@ -732,6 +874,9 @@ public: // anything. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); + // Garbage collection is not supported. + disablePass(&GCLoweringID); + disablePass(&ShadowStackGCLoweringID); } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { @@ -754,6 +899,19 @@ public: bool addGCPasses() override; std::unique_ptr<CSEConfigBase> getCSEConfig() const override; + + /// Check if a pass is enabled given \p Opt option. The option always + /// overrides defaults if explicitely used. Otherwise its default will + /// be used given that a pass shall work at an optimization \p Level + /// minimum. 
+ bool isPassEnabled(const cl::opt<bool> &Opt, + CodeGenOpt::Level Level = CodeGenOpt::Default) const { + if (Opt.getNumOccurrences()) + return Opt; + if (TM->getOptLevel() < Level) + return false; + return Opt; + } }; std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const { @@ -803,9 +961,18 @@ public: bool addLegalizeMachineIR() override; void addPreRegBankSelect() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; + + FunctionPass *createSGPRAllocPass(bool Optimized); + FunctionPass *createVGPRAllocPass(bool Optimized); + FunctionPass *createRegAllocPass(bool Optimized) override; + + bool addRegAssignAndRewriteFast() override; + bool addRegAssignAndRewriteOptimized() override; + void addPreRegAlloc() override; bool addPreRewrite() override; void addPostRegAlloc() override; @@ -856,9 +1023,6 @@ void AMDGPUPassConfig::addIRPasses() { // A call to propagate attributes pass in the backend in case opt was not run. addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); - addPass(createAtomicExpandPass()); - - addPass(createAMDGPULowerIntrinsicsPass()); // Function calls are not supported, so make sure we inline everything. @@ -878,14 +1042,28 @@ void AMDGPUPassConfig::addIRPasses() { // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); - if (TM.getOptLevel() > CodeGenOpt::None) { + // Can increase LDS used by kernel so runs before PromoteAlloca + if (EnableLowerModuleLDS) { + // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the + // pass "amdgpu-lower-module-lds", and also it required to be run only if + // "amdgpu-lower-module-lds" pass is enabled. + if (EnableLDSReplaceWithPointer) + addPass(createAMDGPUReplaceLDSUseWithPointerPass()); + + addPass(createAMDGPULowerModuleLDSPass()); + } + + if (TM.getOptLevel() > CodeGenOpt::None) addPass(createInferAddressSpacesPass()); + + addPass(createAtomicExpandPass()); + + if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createAMDGPUPromoteAlloca()); if (EnableSROA) addPass(createSROAPass()); - - if (EnableScalarIRPasses) + if (isPassEnabled(EnableScalarIRPasses)) addStraightLineScalarOptimizationPasses(); if (EnableAMDGPUAliasAnalysis) { @@ -896,11 +1074,11 @@ void AMDGPUPassConfig::addIRPasses() { AAR.addAAResult(WrapperPass->getResult()); })); } - } - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { - // TODO: May want to move later or split into an early and late one. - addPass(createAMDGPUCodeGenPreparePass()); + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass()); + } } TargetPassConfig::addIRPasses(); @@ -917,7 +1095,7 @@ void AMDGPUPassConfig::addIRPasses() { // %1 = shl %a, 2 // // but EarlyCSE can do neither of them. 
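The isPassEnabled helper introduced above gives an explicitly passed cl::opt flag priority, and otherwise honours the flag's default only once the optimization level reaches the pass's stated minimum. A hedged restatement of that decision in plain C++ (no cl::opt involved):

bool isPassEnabledSketch(bool FlagWasExplicitlySet, bool FlagValue,
                         int CurrentOptLevel, int MinimumOptLevel) {
  if (FlagWasExplicitlySet)
    return FlagValue;                 // explicit command-line choice wins
  if (CurrentOptLevel < MinimumOptLevel)
    return false;                     // below the pass's minimum level
  return FlagValue;                   // fall back to the flag's default
}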
- if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses) + if (isPassEnabled(EnableScalarIRPasses)) addEarlyCSEOrGVNPass(); } @@ -929,11 +1107,9 @@ void AMDGPUPassConfig::addCodeGenPrepare() { EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); - addPass(&AMDGPUPerfHintAnalysisID); - TargetPassConfig::addCodeGenPrepare(); - if (EnableLoadStoreVectorizer) + if (isPassEnabled(EnableLoadStoreVectorizer)) addPass(createLoadStoreVectorizerPass()); // LowerSwitch pass may introduce unreachable blocks that can @@ -944,7 +1120,8 @@ void AMDGPUPassConfig::addCodeGenPrepare() { } bool AMDGPUPassConfig::addPreISel() { - addPass(createFlattenCFGPass()); + if (TM->getOptLevel() > CodeGenOpt::None) + addPass(createFlattenCFGPass()); return false; } @@ -1014,13 +1191,15 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - addPass(createAMDGPULateCodeGenPreparePass()); - if (EnableAtomicOptimizations) { + if (TM->getOptLevel() > CodeGenOpt::None) + addPass(createAMDGPULateCodeGenPreparePass()); + + if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { addPass(createAMDGPUAtomicOptimizerPass()); } - // FIXME: We need to run a pass to propagate the attributes when calls are - // supported. + if (TM->getOptLevel() > CodeGenOpt::None) + addPass(createSinkingPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. @@ -1032,13 +1211,15 @@ bool GCNPassConfig::addPreISel() { } addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } - addPass(createSinkingPass()); addPass(createAMDGPUAnnotateUniformValues()); if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); } addPass(createLCSSAPass()); + if (TM->getOptLevel() > CodeGenOpt::Less) + addPass(&AMDGPUPerfHintAnalysisID); + return false; } @@ -1055,15 +1236,14 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SIFoldOperandsID); if (EnableDPPCombine) addPass(&GCNDPPCombineID); - addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); - if (EnableSDWAPeephole) { + if (isPassEnabled(EnableSDWAPeephole)) { addPass(&SIPeepholeSDWAID); addPass(&EarlyMachineLICMID); addPass(&MachineCSEID); addPass(&SIFoldOperandsID); - addPass(&DeadMachineInstructionElimID); } + addPass(&DeadMachineInstructionElimID); addPass(createSIShrinkInstructionsPass()); } @@ -1079,7 +1259,6 @@ bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(&SIFixSGPRCopiesID); addPass(createSILowerI1CopiesPass()); - addPass(createSIAddIMGInitPass()); return false; } @@ -1109,12 +1288,13 @@ bool GCNPassConfig::addRegBankSelect() { return false; } +void GCNPassConfig::addPreGlobalInstructionSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPURegBankCombiner(IsOptNone)); +} + bool GCNPassConfig::addGlobalInstructionSelect() { - addPass(new InstructionSelect()); - // TODO: Fix instruction selection to do the right thing for image - // instructions with tfe or lwe in the first place, instead of running a - // separate pass to fix them up? 
- addPass(createSIAddIMGInitPass()); + addPass(new InstructionSelect(getOptLevel())); return false; } @@ -1147,8 +1327,21 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); - insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + if (isPassEnabled(EnablePreRAOptimizations)) + insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); + + // This is not an essential optimization and it has a noticeable impact on + // compilation time, so we only enable it from O2. + if (TM->getOptLevel() > CodeGenOpt::Less) + insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); + + // FIXME: when an instruction has a Killed operand, and the instruction is + // inside a bundle, seems only the BUNDLE instruction appears as the Kills of + // the register in LiveVariables, this would trigger a failure in verifier, + // we should fix it and enable the verifier. + if (OptVGPRLiveRange) + insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false); // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. @@ -1161,10 +1354,81 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { - if (EnableRegReassign) { + if (EnableRegReassign) addPass(&GCNNSAReassignID); - addPass(&GCNRegBankReassignID); - } + return true; +} + +FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { + // Initialize the global default. + llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, + initializeDefaultSGPRRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyRegisterAllocator(onlyAllocateSGPRs); + + return createFastRegisterAllocator(onlyAllocateSGPRs, false); +} + +FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { + // Initialize the global default. + llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, + initializeDefaultVGPRRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyVGPRRegisterAllocator(); + + return createFastVGPRRegisterAllocator(); +} + +FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { + llvm_unreachable("should not be used"); +} + +static const char RegAllocOptNotSupportedMessage[] = + "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; + +bool GCNPassConfig::addRegAssignAndRewriteFast() { + if (!usingDefaultRegAlloc()) + report_fatal_error(RegAllocOptNotSupportedMessage); + + addPass(createSGPRAllocPass(false)); + + // Equivalent of PEI for SGPRs. + addPass(&SILowerSGPRSpillsID); + + addPass(createVGPRAllocPass(false)); + return true; +} + +bool GCNPassConfig::addRegAssignAndRewriteOptimized() { + if (!usingDefaultRegAlloc()) + report_fatal_error(RegAllocOptNotSupportedMessage); + + addPass(createSGPRAllocPass(true)); + + // Commit allocated register changes. This is mostly necessary because too + // many things rely on the use lists of the physical registers, such as the + // verifier. This is only necessary with allocators which use LiveIntervals, + // since FastRegAlloc does the replacments itself. 
+ addPass(createVirtRegRewriter(false)); + + // Equivalent of PEI for SGPRs. + addPass(&SILowerSGPRSpillsID); + + addPass(createVGPRAllocPass(true)); + + addPreRewrite(); + addPass(&VirtRegRewriterID); + return true; } @@ -1173,9 +1437,6 @@ void GCNPassConfig::addPostRegAlloc() { if (getOptLevel() > CodeGenOpt::None) addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); - - // Equivalent of PEI for SGPRs. - addPass(&SILowerSGPRSpillsID); } void GCNPassConfig::addPreSched2() { @@ -1185,15 +1446,18 @@ void GCNPassConfig::addPreSched2() { void GCNPassConfig::addPreEmitPass() { addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); - addPass(createSIShrinkInstructionsPass()); + + if (TM->getOptLevel() > CodeGenOpt::None) + addPass(createSIShrinkInstructionsPass()); + addPass(createSIModeRegisterPass()); if (getOptLevel() > CodeGenOpt::None) addPass(&SIInsertHardClausesID); - addPass(&SIRemoveShortExecBranchesID); - addPass(&SIInsertSkipsPassID); - addPass(&SIPreEmitPeepholeID); + addPass(&SILateBranchLoweringPassID); + if (getOptLevel() > CodeGenOpt::None) + addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there // are multiple scheduling regions in a basic block, the regions are scheduled @@ -1217,8 +1481,8 @@ yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { yaml::MachineFunctionInfo * GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - return new yaml::SIMachineFunctionInfo(*MFI, - *MF.getSubtarget().getRegisterInfo()); + return new yaml::SIMachineFunctionInfo( + *MFI, *MF.getSubtarget().getRegisterInfo(), MF); } bool GCNTargetMachine::parseMachineFunctionInfo( @@ -1229,7 +1493,8 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MachineFunction &MF = PFS.MF; SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - MFI->initializeBaseYamlFields(YamlMFI); + if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) + return true; if (MFI->Occupancy == 0) { // Fixup the subtarget dependent default value. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 95aefa23c24c..1bfe026d080c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -35,6 +35,7 @@ public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableLowerModuleLDS; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -51,8 +52,7 @@ public: void adjustPassManager(PassManagerBuilder &) override; - void registerPassBuilderCallbacks(PassBuilder &PB, - bool DebugPassManager) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; void registerDefaultAliasAnalyses(AAManager &) override; /// Get the integer value of a null pointer in the given address space. 
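The register allocation changes in AMDGPUTargetMachine.cpp above split the single allocator run into two filtered runs: SGPRs are assigned first, SILowerSGPRSpills then runs as the PEI equivalent for SGPRs, and VGPRs are assigned afterwards; an explicit -regalloc option is rejected in favor of the -sgpr-regalloc and -vgpr-regalloc options named in the fatal-error string. Each run is an ordinary allocator constructed with a register-class filter callback. A minimal sketch of that mechanism, assuming the filter simply tests for SGPR register classes (the Sketch-suffixed names are illustrative, not the exact definitions in this commit):

  #include "SIRegisterInfo.h"
  #include "llvm/CodeGen/Passes.h"

  using namespace llvm;

  // Return true only for the register classes this allocation run may assign;
  // every other class keeps its virtual registers for the later VGPR run.
  static bool onlyAllocateSGPRsSketch(const TargetRegisterInfo &TRI,
                                      const TargetRegisterClass &RC) {
    return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
  }

  // Optimized builds hand the filter to the greedy allocator; -O0 uses the
  // fast allocator with ClearVirtRegs = false so the still-unassigned VGPR
  // virtual registers survive until the second allocation run.
  static FunctionPass *createSGPRAllocSketch(bool Optimized) {
    return Optimized
               ? createGreedyRegisterAllocator(onlyAllocateSGPRsSketch)
               : createFastRegisterAllocator(onlyAllocateSGPRsSketch,
                                             /*ClearVirtRegs=*/false);
  }

The same file also introduces the isPassEnabled helper for optimization-level gating: an explicitly passed cl::opt always wins, while an untouched option only takes effect from the requested CodeGenOpt level upwards (for example, EnableAtomicOptimizations is queried with CodeGenOpt::Less, i.e. -O1 and above).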
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 7b8a79640bb2..63f449f7a726 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -39,7 +40,7 @@ static cl::opt<unsigned> UnrollThresholdLocal( static cl::opt<unsigned> UnrollThresholdIf( "amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), - cl::init(150), cl::Hidden); + cl::init(200), cl::Hidden); static cl::opt<bool> UnrollRuntimeLocal( "amdgpu-unroll-runtime-local", @@ -106,6 +107,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, UP.MaxCount = std::numeric_limits<unsigned>::max(); UP.Partial = true; + // Conditional branch in a loop back edge needs 3 additional exec + // manipulations in average. + UP.BEInsns += 3; + // TODO: Do we want runtime unrolling? // Maximum alloca size than can fit registers. Reserve 16 registers. @@ -310,8 +315,17 @@ unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { return getHardwareNumberOfRegisters(false) / NumVGPRs; } -unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const { - return 32; +TypeSize +GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { + switch (K) { + case TargetTransformInfo::RGK_Scalar: + return TypeSize::getFixed(32); + case TargetTransformInfo::RGK_FixedWidthVector: + return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32); + case TargetTransformInfo::RGK_ScalableVector: + return TypeSize::getScalable(0); + } + llvm_unreachable("Unsupported register kind"); } unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { @@ -321,7 +335,9 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { if (Opcode == Instruction::Load || Opcode == Instruction::Store) return 32 * 4 / ElemWidth; - return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1; + return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 + : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 
2 + : 1; } unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, @@ -495,14 +511,12 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } } -int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - TTI::TargetCostKind CostKind, - TTI::OperandValueKind Opd1Info, - TTI::OperandValueKind Opd2Info, - TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, - ArrayRef<const Value *> Args, - const Instruction *CxtI) { +InstructionCost GCNTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, + TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, + const Instruction *CxtI) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { // FIXME: We're having to query the throughput cost so that the basic @@ -518,7 +532,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); bool IsFloat = Ty->isFPOrFPVectorTy(); // Assume that floating point arithmetic operations cost twice as much as @@ -542,12 +556,13 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // similarly to what getCastInstrCost() does. if (auto *VTy = dyn_cast<VectorType>(Ty)) { unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); - unsigned Cost = getArithmeticInstrCost( + InstructionCost Cost = getArithmeticInstrCost( Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo, Args, CxtI); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(VTy, Args) + Num * Cost; + SmallVector<Type *> Tys(Args.size(), Ty); + return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; } // We don't know anything about this scalar instruction. @@ -555,7 +570,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, } // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); // Because we don't have any legal vector operations, but the legal types, we @@ -628,6 +643,8 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, LLVM_FALLTHROUGH; case ISD::FADD: case ISD::FSUB: + if (ST->hasPackedFP32Ops() && SLT == MVT::f32) + NElts = (NElts + 1) / 2; if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); @@ -713,8 +730,9 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { } } -int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind) { +InstructionCost +GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { if (ICA.getID() == Intrinsic::fabs) return 0; @@ -731,45 +749,34 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ICA.isTypeBasedOnly()) return getTypeBasedIntrinsicInstrCost(ICA, CostKind); - Type *RetTy = ICA.getReturnType(); - unsigned VF = ICA.getVectorFactor().getFixedValue(); unsigned RetVF = (RetTy->isVectorTy() ? 
cast<FixedVectorType>(RetTy)->getNumElements() : 1); - assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type"); const IntrinsicInst *I = ICA.getInst(); const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); FastMathFlags FMF = ICA.getFlags(); // Assume that we need to scalarize this intrinsic. - SmallVector<Type *, 4> Types; - for (const Value *Op : Args) { - Type *OpTy = Op->getType(); - assert(VF == 1 || !OpTy->isVectorTy()); - Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF)); - } - - if (VF > 1 && !RetTy->isVoidTy()) - RetTy = FixedVectorType::get(RetTy, VF); // Compute the scalarization overhead based on Args for a vector // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while // CostModel will pass a vector RetTy and VF is 1. - unsigned ScalarizationCost = std::numeric_limits<unsigned>::max(); - if (RetVF > 1 || VF > 1) { + InstructionCost ScalarizationCost = InstructionCost::getInvalid(); + if (RetVF > 1) { ScalarizationCost = 0; if (!RetTy->isVoidTy()) ScalarizationCost += getScalarizationOverhead(cast<VectorType>(RetTy), true, false); - ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); + ScalarizationCost += + getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); } - IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF, - ScalarizationCost, I); + IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I, + ScalarizationCost); return getIntrinsicInstrCost(Attrs, CostKind); } // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; @@ -779,69 +786,96 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); - if (ST->has16BitInsts() && SLT == MVT::f16) + if ((ST->has16BitInsts() && SLT == MVT::f16) || + (ST->hasPackedFP32Ops() && SLT == MVT::f32)) NElts = (NElts + 1) / 2; // TODO: Get more refined intrinsic costs? unsigned InstRate = getQuarterRateInstrCost(CostKind); - if (ICA.getID() == Intrinsic::fma) { + + switch (ICA.getID()) { + case Intrinsic::fma: InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind) : getQuarterRateInstrCost(CostKind); + break; + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16}; + if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; })) + NElts = 1; + break; } return LT.first * NElts * InstRate; } -unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode, - TTI::TargetCostKind CostKind) { - if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) - return Opcode == Instruction::PHI ? 0 : 1; - - // XXX - For some reason this isn't called for switch. +InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind, + const Instruction *I) { + assert((I == nullptr || I->getOpcode() == Opcode) && + "Opcode should reflect passed instruction."); + const bool SCost = + (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency); + const int CBrCost = SCost ? 5 : 7; switch (Opcode) { - case Instruction::Br: + case Instruction::Br: { + // Branch instruction takes about 4 slots on gfx900. + auto BI = dyn_cast_or_null<BranchInst>(I); + if (BI && BI->isUnconditional()) + return SCost ?
1 : 4; + // Suppose conditional branch takes additional 3 exec manipulations + // instructions in average. + return CBrCost; + } + case Instruction::Switch: { + auto SI = dyn_cast_or_null<SwitchInst>(I); + // Each case (including default) takes 1 cmp + 1 cbr instructions in + // average. + return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1); + } case Instruction::Ret: - return 10; - default: - return BaseT::getCFInstrCost(Opcode, CostKind); + return SCost ? 1 : 10; } + return BaseT::getCFInstrCost(Opcode, CostKind, I); } -int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwise, - TTI::TargetCostKind CostKind) { +InstructionCost +GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + Optional<FastMathFlags> FMF, + TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); + EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). - if (IsPairwise || - !ST->hasVOP3PInsts() || - OrigTy.getScalarSizeInBits() != 16) - return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind); + if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getFullRateInstrCost(); } -int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, - TTI::TargetCostKind CostKind) { +InstructionCost +GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsUnsigned, + TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). 
- if (IsPairwise || - !ST->hasVOP3PInsts() || - OrigTy.getScalarSizeInBits() != 16) - return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, - CostKind); + if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getHalfRateInstrCost(CostKind); } -int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { +InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { @@ -1096,8 +1130,10 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, } } -unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, - int Index, VectorType *SubTp) { +InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, + VectorType *VT, ArrayRef<int> Mask, + int Index, VectorType *SubTp) { + Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVOP3PInsts()) { if (cast<FixedVectorType>(VT)->getNumElements() == 2 && DL.getTypeSizeInBits(VT->getElementType()) == 16) { @@ -1115,7 +1151,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, } } - return BaseT::getShuffleCost(Kind, VT, Index, SubTp); + return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp); } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, @@ -1141,9 +1177,15 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, if (!CallerMode.isInlineCompatible(CalleeMode)) return false; + if (Callee->hasFnAttribute(Attribute::AlwaysInline) || + Callee->hasFnAttribute(Attribute::InlineHint)) + return true; + // Hack to make compile times reasonable. - if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) { - // Single BB does not increase total BB amount, thus subtract 1. + if (InlineMaxBB) { + // Single BB does not increase total BB amount. + if (Callee->size() == 1) + return true; size_t BBSize = Caller->size() + Callee->size() - 1; return BBSize <= InlineMaxBB; } @@ -1192,8 +1234,10 @@ void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, } int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const { - return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) - : getQuarterRateInstrCost(CostKind); + return ST->hasFullRate64Ops() + ? getFullRateInstrCost() + : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) @@ -1209,8 +1253,9 @@ unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const { return getHardwareNumberOfRegisters(Vec); } -unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const { - return 32; +TypeSize +R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { + return TypeSize::getFixed(32); } unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { @@ -1265,8 +1310,9 @@ unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 8; } -unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode, - TTI::TargetCostKind CostKind) { +InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind, + const Instruction *I) { if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) return Opcode == Instruction::PHI ? 
0 : 1; @@ -1276,12 +1322,12 @@ unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode, case Instruction::Ret: return 10; default: - return BaseT::getCFInstrCost(Opcode, CostKind); + return BaseT::getCFInstrCost(Opcode, CostKind, I); } } -int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { +InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index b29c94180fb8..37c0756eb7a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -25,6 +25,7 @@ namespace llvm { class AMDGPUTargetLowering; +class AMDGPUTargetMachine; class GCNSubtarget; class InstCombiner; class Loop; @@ -120,7 +121,7 @@ public: unsigned getHardwareNumberOfRegisters(bool Vector) const; unsigned getNumberOfRegisters(bool Vector) const; unsigned getNumberOfRegisters(unsigned RCID) const; - unsigned getRegisterBitWidth(bool Vector) const; + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; unsigned getMinVectorRegisterBitWidth() const; unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, @@ -152,7 +153,7 @@ public: bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; - int getArithmeticInstrCost( + InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, @@ -162,12 +163,14 @@ public: ArrayRef<const Value *> Args = ArrayRef<const Value *>(), const Instruction *CxtI = nullptr); - unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); + InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef<unsigned> Indices = {}) const; - int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index); bool isSourceOfDivergence(const Value *V) const; bool isAlwaysUniform(const Value *V) const; @@ -194,10 +197,11 @@ public: std::function<void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const; - unsigned getVectorSplitCost() { return 0; } + InstructionCost getVectorSplitCost() { return 0; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp); + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, + ArrayRef<int> Mask, int Index, + VectorType *SubTp); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; @@ -207,17 +211,15 @@ public: int getInlinerVectorBonusPercent() { return 0; } - int getArithmeticReductionCost( - unsigned Opcode, - VectorType *Ty, - bool IsPairwise, + InstructionCost getArithmeticReductionCost( + unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); - int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind); - int getMinMaxReductionCost( - VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); + InstructionCost getIntrinsicInstrCost(const 
IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + InstructionCost getMinMaxReductionCost( + VectorType *Ty, VectorType *CondTy, bool IsUnsigned, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); }; class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { @@ -242,7 +244,7 @@ public: TTI::PeelingPreferences &PP); unsigned getHardwareNumberOfRegisters(bool Vec) const; unsigned getNumberOfRegisters(bool Vec) const; - unsigned getRegisterBitWidth(bool Vector) const; + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, @@ -252,8 +254,10 @@ public: bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); - int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 84d72e1b579f..4e3d5fdc012d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring +// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring // there is at most one ret and one unreachable instruction, it ensures there is // at most one divergent exiting block. // @@ -54,6 +54,9 @@ using namespace llvm; namespace { class AMDGPUUnifyDivergentExitNodes : public FunctionPass { +private: + const TargetTransformInfo *TTI = nullptr; + public: static char ID; // Pass identification, replacement for typeid @@ -63,6 +66,9 @@ public: // We can preserve non-critical-edgeness when we unify function exit nodes void getAnalysisUsage(AnalysisUsage &AU) const override; + BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU, + ArrayRef<BasicBlock *> ReturningBlocks, + StringRef Name); bool runOnFunction(Function &F) override; }; @@ -110,12 +116,9 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ /// XXX - Is there a more efficient way to find this? 
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, BasicBlock &BB) { - SmallVector<BasicBlock *, 8> Stack; + SmallVector<BasicBlock *, 8> Stack(predecessors(&BB)); SmallPtrSet<BasicBlock *, 8> Visited; - for (BasicBlock *Pred : predecessors(&BB)) - Stack.push_back(Pred); - while (!Stack.empty()) { BasicBlock *Top = Stack.pop_back_val(); if (!DA.isUniform(Top->getTerminator())) @@ -130,49 +133,15 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, return true; } -static void removeDoneExport(Function &F) { - ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext()); - for (BasicBlock &BB : F) { - for (Instruction &I : BB) { - if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) { - if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) { - Intrin->setArgOperand(6, BoolFalse); // done - } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) { - Intrin->setArgOperand(4, BoolFalse); // done - } - } - } - } -} - -static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU, - ArrayRef<BasicBlock *> ReturningBlocks, - bool InsertExport, - const TargetTransformInfo &TTI, - StringRef Name) { +BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet( + Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks, + StringRef Name) { // Otherwise, we need to insert a new basic block into the function, add a PHI // nodes (if the function returns values), and convert all of the return // instructions into unconditional branches. BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F); IRBuilder<> B(NewRetBlock); - if (InsertExport) { - // Ensure that there's only one "done" export in the shader by removing the - // "done" bit set on the original final export. More than one "done" export - // can lead to undefined behavior. - removeDoneExport(F); - - Value *Undef = UndefValue::get(B.getFloatTy()); - B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() }, - { - B.getInt32(AMDGPU::Exp::ET_NULL), - B.getInt32(0), // enabled channels - Undef, Undef, Undef, Undef, // values - B.getTrue(), // done - B.getTrue(), // valid mask - }); - } - PHINode *PN = nullptr; if (F.getReturnType()->isVoidTy()) { B.CreateRetVoid(); @@ -180,7 +149,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU, // If the function doesn't return void... add a PHI node to the block... PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(), "UnifiedRetVal"); - assert(!InsertExport); B.CreateRet(PN); } @@ -206,7 +174,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU, for (BasicBlock *BB : ReturningBlocks) { // Cleanup possible branch to unconditional branch to the return. - simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr, + simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr, SimplifyCFGOptions().bonusInstThreshold(2)); } @@ -220,25 +188,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - // If there's only one exit, we don't need to do anything, unless this is a - // pixel shader and that exit is an infinite loop, since we still have to - // insert an export in that case. - if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS) + // If there's only one exit, we don't need to do anything. 
+ if (PDT.root_size() <= 1) return false; LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); // Loop over all of the blocks in a function, tracking all of the blocks that // return. SmallVector<BasicBlock *, 4> ReturningBlocks; - SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks; SmallVector<BasicBlock *, 4> UnreachableBlocks; // Dummy return block for infinite loop. BasicBlock *DummyReturnBB = nullptr; - bool InsertExport = false; - bool Changed = false; std::vector<DominatorTree::UpdateType> Updates; @@ -246,8 +210,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { if (isa<ReturnInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) ReturningBlocks.push_back(BB); - else - UniformlyReachedRetBlocks.push_back(BB); } else if (isa<UnreachableInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) UnreachableBlocks.push_back(BB); @@ -259,36 +221,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { "DummyReturnBlock", &F); Type *RetTy = F.getReturnType(); Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); - - // For pixel shaders, the producer guarantees that an export is - // executed before each return instruction. However, if there is an - // infinite loop and we insert a return ourselves, we need to uphold - // that guarantee by inserting a null export. This can happen e.g. in - // an infinite loop with kill instructions, which is supposed to - // terminate. However, we don't need to do this if there is a non-void - // return value, since then there is an epilog afterwards which will - // still export. - // - // Note: In the case where only some threads enter the infinite loop, - // this can result in the null export happening redundantly after the - // original exports. However, The last "real" export happens after all - // the threads that didn't enter an infinite loop converged, which - // means that the only extra threads to execute the null export are - // threads that entered the infinite loop, and they only could've - // exited through being killed which sets their exec bit to 0. - // Therefore, unless there's an actual infinite loop, which can have - // invalid results, or there's a kill after the last export, which we - // assume the frontend won't do, this export will have the same exec - // mask as the last "real" export, and therefore the valid mask will be - // overwritten with the same value and will still be correct. Also, - // even though this forces an extra unnecessary export wait, we assume - // that this happens rare enough in practice to that we don't have to - // worry about performance. - if (F.getCallingConv() == CallingConv::AMDGPU_PS && - RetTy->isVoidTy()) { - InsertExport = true; - } - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); ReturningBlocks.push_back(DummyReturnBB); } @@ -380,23 +312,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { if (ReturningBlocks.empty()) return Changed; // No blocks return - if (ReturningBlocks.size() == 1 && !InsertExport) + if (ReturningBlocks.size() == 1) return Changed; // Already has a single return block - const TargetTransformInfo &TTI - = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - - // Unify returning blocks. 
If we are going to insert the export it is also - // necessary to include blocks that are uniformly reached, because in addition - // to inserting the export the "done" bits on existing exports will be cleared - // and we do not want to end up with the normal export in a non-unified, - // uniformly reached block with the "done" bit cleared. - auto BlocksToUnify = std::move(ReturningBlocks); - if (InsertExport) { - llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks); - } - - unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI, - "UnifiedReturnBlock"); + unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock"); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index b9a8c6bd005d..56befe4ed0d0 100644 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -171,7 +171,7 @@ protected: static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { for (MachineLoop::iterator iter = LoopInfo.begin(), iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs(), 0); + (*iter)->print(dbgs()); } } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index af4a47935e3f..00032c7d4ea5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "SIRegisterInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" @@ -113,9 +114,7 @@ public: ImmTyInstOffset, ImmTyOffset0, ImmTyOffset1, - ImmTyDLC, - ImmTyGLC, - ImmTySLC, + ImmTyCPol, ImmTySWZ, ImmTyTFE, ImmTyD16, @@ -299,6 +298,8 @@ public: return isRegKind() && getReg() == AMDGPU::SGPR_NULL; } + bool isVRegWithInputMods() const; + bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; bool isSDWAFP32Operand() const; @@ -336,12 +337,7 @@ public: bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); } bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } - bool isDLC() const { return isImmTy(ImmTyDLC); } - bool isGLC() const { return isImmTy(ImmTyGLC); } - // "GLC_1" is a MatchClass of the GLC_1 operand with the default and forced - // value of the GLC operand. 
- bool isGLC_1() const { return isImmTy(ImmTyGLC); } - bool isSLC() const { return isImmTy(ImmTySLC); } + bool isCPol() const { return isImmTy(ImmTyCPol); } bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } @@ -449,6 +445,26 @@ public: return isSSrcF16(); } + bool isSSrcV2FP32() const { + llvm_unreachable("cannot happen"); + return isSSrcF32(); + } + + bool isSCSrcV2FP32() const { + llvm_unreachable("cannot happen"); + return isSCSrcF32(); + } + + bool isSSrcV2INT32() const { + llvm_unreachable("cannot happen"); + return isSSrcB32(); + } + + bool isSCSrcV2INT32() const { + llvm_unreachable("cannot happen"); + return isSCSrcB32(); + } + bool isSSrcOrLdsB32() const { return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) || isLiteralImm(MVT::i32) || isExpr(); @@ -502,6 +518,22 @@ public: return isVSrcB16() || isLiteralImm(MVT::v2i16); } + bool isVCSrcV2FP32() const { + return isVCSrcF64(); + } + + bool isVSrcV2FP32() const { + return isVSrcF64() || isLiteralImm(MVT::v2f32); + } + + bool isVCSrcV2INT32() const { + return isVCSrcB64(); + } + + bool isVSrcV2INT32() const { + return isVSrcB64() || isLiteralImm(MVT::v2i32); + } + bool isVSrcF32() const { return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr(); } @@ -542,6 +574,102 @@ public: return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64B64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); + } + + bool isVISrc_64F64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f64); + } + + bool isVISrc_64V2FP32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f32); + } + + bool isVISrc_64V2INT32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + + bool isVISrc_256B64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); + } + + bool isVISrc_256F64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64); + } + + bool isVISrc_128B16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16); + } + + bool isVISrc_128V2B16() const { + return isVISrc_128B16(); + } + + bool isVISrc_128B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i32); + } + + bool isVISrc_128F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f32); + } + + bool isVISrc_256V2FP32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + + bool isVISrc_256V2INT32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_512B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i32); + } + + bool isVISrc_512B16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i16); + } + + bool isVISrc_512V2B16() const { + return isVISrc_512B16(); + } + + bool isVISrc_512F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f32); + } + + bool isVISrc_512F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f16); + } + + bool isVISrc_512V2F16() const { + return isVISrc_512F16() || isVISrc_512B32(); + } + + bool isVISrc_1024B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i32); + } + + bool isVISrc_1024B16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i16); + } + + bool isVISrc_1024V2B16() const { + return isVISrc_1024B16(); + } + + bool isVISrc_1024F32() const { + return 
isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f32); + } + + bool isVISrc_1024F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f16); + } + + bool isVISrc_1024V2F16() const { + return isVISrc_1024F16() || isVISrc_1024B32(); + } + bool isAISrcB32() const { return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32); } @@ -566,6 +694,14 @@ public: return isAISrcF16() || isAISrcB32(); } + bool isAISrc_64B64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::i64); + } + + bool isAISrc_64F64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::f64); + } + bool isAISrc_128B32() const { return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32); } @@ -590,6 +726,22 @@ public: return isAISrc_128F16() || isAISrc_128B32(); } + bool isVISrc_128F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f16); + } + + bool isVISrc_128V2F16() const { + return isVISrc_128F16() || isVISrc_128B32(); + } + + bool isAISrc_256B64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::i64); + } + + bool isAISrc_256F64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::f64); + } + bool isAISrc_512B32() const { return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32); } @@ -837,9 +989,7 @@ public: case ImmTyInstOffset: OS << "InstOffset"; break; case ImmTyOffset0: OS << "Offset0"; break; case ImmTyOffset1: OS << "Offset1"; break; - case ImmTyDLC: OS << "DLC"; break; - case ImmTyGLC: OS << "GLC"; break; - case ImmTySLC: OS << "SLC"; break; + case ImmTyCPol: OS << "CPol"; break; case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; @@ -1021,6 +1171,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool ForcedDPP = false; bool ForcedSDWA = false; KernelScopeInfo KernelScope; + unsigned CPolSeen; /// @name Auto-generated Match Functions /// { @@ -1061,7 +1212,8 @@ private: bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); - bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + // TODO: Possibly make subtargetHasRegister const. + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo); bool ParseDirectiveAMDGPUHsaKernel(); bool ParseDirectiveISAVersion(); @@ -1105,7 +1257,7 @@ private: bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth); void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic, bool IsAtomicReturn, bool IsLds = false); + bool IsAtomic, bool IsLds = false); void cvtDSImpl(MCInst &Inst, const OperandVector &Operands, bool IsGdsHardcoded); @@ -1140,7 +1292,7 @@ public: // AsmParser::parseDirectiveSet() cannot be specialized for specific target. 
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); MCContext &Ctx = getContext(); - if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) { + if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) { MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); @@ -1157,7 +1309,7 @@ public: Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } - if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) { + if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) { initializeGprCountSymbol(IS_VGPR); initializeGprCountSymbol(IS_SGPR); } else @@ -1165,10 +1317,6 @@ public: } } - bool hasXNACK() const { - return AMDGPU::hasXNACK(getSTI()); - } - bool hasMIMG_R128() const { return AMDGPU::hasMIMG_R128(getSTI()); } @@ -1181,6 +1329,8 @@ public: return AMDGPU::hasGFX10A16(getSTI()); } + bool hasG16() const { return AMDGPU::hasG16(getSTI()); } + bool isSI() const { return AMDGPU::isSI(getSTI()); } @@ -1197,6 +1347,10 @@ public: return AMDGPU::isGFX9(getSTI()); } + bool isGFX90A() const { + return AMDGPU::isGFX90A(getSTI()); + } + bool isGFX9Plus() const { return AMDGPU::isGFX9Plus(getSTI()); } @@ -1219,6 +1373,10 @@ public: return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets]; } + bool hasArchitectedFlatScratch() const { + return getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; + } + bool hasSGPR102_SGPR103() const { return !isVI() && !isGFX9(); } @@ -1294,8 +1452,9 @@ public: bool (*ConvertResult)(int64_t&) = nullptr); OperandMatchResultTy - parseNamedBit(const char *Name, OperandVector &Operands, + parseNamedBit(StringRef Name, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseCPol(OperandVector &Operands); OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value, SMLoc &StringLoc); @@ -1379,14 +1538,19 @@ private: bool validateMIMGAddrSize(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); bool validateMIMGDim(const MCInst &Inst); - bool validateLdsDirect(const MCInst &Inst); + bool validateMIMGMSAA(const MCInst &Inst); bool validateOpSel(const MCInst &Inst); + bool validateDPP(const MCInst &Inst, const OperandVector &Operands); bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands); bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands); + bool validateAGPRLdSt(const MCInst &Inst) const; + bool validateVGPRAlign(const MCInst &Inst) const; + bool validateGWS(const MCInst &Inst, const OperandVector &Operands); bool validateDivScale(const MCInst &Inst); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); + Optional<StringRef> validateLdsDirect(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -1403,6 +1567,7 @@ private: bool isId(const AsmToken &Token, const StringRef Id) const; bool isToken(const AsmToken::TokenKind Kind) const; bool trySkipId(const StringRef Id); + bool trySkipId(const StringRef Pref, const StringRef Id); bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind); bool trySkipToken(const AsmToken::TokenKind Kind); bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); @@ -1420,6 +1585,8 @@ private: void lex(); 
public: + void onBeginOfFile() override; + OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); OperandMatchResultTy parseOptionalOpr(OperandVector &Operands); @@ -1451,16 +1618,12 @@ public: OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands); int64_t parseGPRIdxMacro(); - void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } - void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } - void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } - void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); } + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false); } + void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true); } + void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, true); } void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); - AMDGPUOperand::Ptr defaultDLC() const; - AMDGPUOperand::Ptr defaultGLC() const; - AMDGPUOperand::Ptr defaultGLC_1() const; - AMDGPUOperand::Ptr defaultSLC() const; + AMDGPUOperand::Ptr defaultCPol() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMEMOffset() const; @@ -1474,6 +1637,8 @@ public: void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3P(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); @@ -1482,6 +1647,9 @@ public: void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands); + void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands); + + bool parseDimId(unsigned &Encoding); OperandMatchResultTy parseDim(OperandVector &Operands); OperandMatchResultTy parseDPP8(OperandVector &Operands); OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); @@ -1551,11 +1719,16 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: return &APFloat::IEEEsingle(); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return &APFloat::IEEEdouble(); case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: @@ -1715,7 +1888,8 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslesly converted to f16. MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : type; + (type == MVT::v2i16)? MVT::i16 : + (type == MVT::v2f32)? 
MVT::f32 : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -1725,6 +1899,13 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } +bool AMDGPUOperand::isVRegWithInputMods() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + // GFX90A allows DPP on 64-bit operands. + (isRegClass(AMDGPU::VReg_64RegClassID) && + AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]); +} + bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg32(); @@ -1751,8 +1932,9 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { } bool AMDGPUOperand::isBoolReg() const { - return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) || - (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32()); + auto FB = AsmParser->getFeatureBits(); + return isReg() && ((FB[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) || + (FB[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32())); } uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const @@ -1806,6 +1988,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); @@ -1849,7 +2032,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision @@ -1881,6 +2068,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: if (isSafeTruncation(Val, 32) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1897,6 +2088,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); setImmKindConst(); @@ -2000,6 +2192,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 4: return AMDGPU::VReg_128RegClassID; case 5: return AMDGPU::VReg_160RegClassID; case 6: return AMDGPU::VReg_192RegClassID; + case 7: return AMDGPU::VReg_224RegClassID; case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; case 32: return AMDGPU::VReg_1024RegClassID; @@ -2022,6 +2215,7 @@ static int getRegClass(RegisterKind Is, 
unsigned RegWidth) { case 4: return AMDGPU::SGPR_128RegClassID; case 5: return AMDGPU::SGPR_160RegClassID; case 6: return AMDGPU::SGPR_192RegClassID; + case 7: return AMDGPU::SGPR_224RegClassID; case 8: return AMDGPU::SGPR_256RegClassID; case 16: return AMDGPU::SGPR_512RegClassID; } @@ -2034,6 +2228,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 4: return AMDGPU::AReg_128RegClassID; case 5: return AMDGPU::AReg_160RegClassID; case 6: return AMDGPU::AReg_192RegClassID; + case 7: return AMDGPU::AReg_224RegClassID; case 8: return AMDGPU::AReg_256RegClassID; case 16: return AMDGPU::AReg_512RegClassID; case 32: return AMDGPU::AReg_1024RegClassID; @@ -2529,7 +2724,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { return nullptr; } - if (isHsaAbiVersion3(&getSTI())) { + if (isHsaAbiVersion3Or4(&getSTI())) { if (!updateGprCountSymbols(RegKind, RegNum, RegWidth)) return nullptr; } else @@ -3200,7 +3395,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { return true; unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); - unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0; + unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) DMask = 1; @@ -3230,6 +3425,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); + int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16); assert(VAddr0Idx != -1); assert(SrsrcIdx != -1); @@ -3241,22 +3437,26 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { unsigned Dim = Inst.getOperand(DimIdx).getImm(); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); bool IsNSA = SrsrcIdx - VAddr0Idx > 1; - unsigned VAddrSize = + unsigned ActualAddrSize = IsNSA ? SrsrcIdx - VAddr0Idx : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; + bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm()); + + unsigned ExpectedAddrSize = + AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); - unsigned AddrSize = BaseOpcode->NumExtraArgs + - (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + - (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + - (BaseOpcode->LodOrClampOrMip ? 1 : 0); if (!IsNSA) { - if (AddrSize > 8) - AddrSize = 16; - else if (AddrSize > 4) - AddrSize = 8; + if (ExpectedAddrSize > 8) + ExpectedAddrSize = 16; + + // Allow oversized 8 VGPR vaddr when only 5/6/7 VGPRs are required. + // This provides backward compatibility for assembly created + // before 160b/192b/224b types were directly supported. 
+ if (ActualAddrSize == 8 && (ExpectedAddrSize >= 5 && ExpectedAddrSize <= 7)) + return true; } - return VAddrSize == AddrSize; + return ActualAddrSize == ExpectedAddrSize; } bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { @@ -3298,6 +3498,29 @@ bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) { return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8; } +bool AMDGPUAsmParser::validateMIMGMSAA(const MCInst &Inst) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + + if (!BaseOpcode->MSAA) + return true; + + int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); + assert(DimIdx != -1); + + unsigned Dim = Inst.getOperand(DimIdx).getImm(); + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); + + return DimInfo->MSAA; +} + static bool IsMovrelsSDWAOpcode(const unsigned Opcode) { switch (Opcode) { @@ -3559,7 +3782,7 @@ static bool IsRevOpcode(const unsigned Opcode) } } -bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { +Optional<StringRef> AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { using namespace SIInstrFlags; const unsigned Opcode = Inst.getOpcode(); @@ -3567,33 +3790,29 @@ bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { // lds_direct register is defined so that it can be used // with 9-bit operands only. Ignore encodings which do not accept these. - if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0) - return true; + const auto Enc = VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA; + if ((Desc.TSFlags & Enc) == 0) + return None; - const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); - const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); - const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + for (auto SrcName : {OpName::src0, OpName::src1, OpName::src2}) { + auto SrcIdx = getNamedOperandIdx(Opcode, SrcName); + if (SrcIdx == -1) + break; + const auto &Src = Inst.getOperand(SrcIdx); + if (Src.isReg() && Src.getReg() == LDS_DIRECT) { - const int SrcIndices[] = { Src1Idx, Src2Idx }; + if (isGFX90A()) + return StringRef("lds_direct is not supported on this GPU"); - // lds_direct cannot be specified as either src1 or src2. - for (int SrcIdx : SrcIndices) { - if (SrcIdx == -1) break; - const MCOperand &Src = Inst.getOperand(SrcIdx); - if (Src.isReg() && Src.getReg() == LDS_DIRECT) { - return false; + if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA)) + return StringRef("lds_direct cannot be used with this instruction"); + + if (SrcName != OpName::src0) + return StringRef("lds_direct may be used as src0 only"); } } - if (Src0Idx == -1) - return true; - - const MCOperand &Src = Inst.getOperand(Src0Idx); - if (!Src.isReg() || Src.getReg() != LDS_DIRECT) - return true; - - // lds_direct is specified as src0. Check additional limitations. - return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode); + return None; } SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const { @@ -3624,7 +3843,7 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst, // For FLAT segment the offset must be positive; // MSB is ignored and forced to zero. 
- if (TSFlags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)) { + if (TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)) { unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), true); if (!isIntN(OffsetSize, Op.getImm())) { Error(getFlatOffsetLoc(Operands), @@ -3733,6 +3952,28 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, + const OperandVector &Operands) { + const unsigned Opc = Inst.getOpcode(); + int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl); + if (DppCtrlIdx < 0) + return true; + unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); + + if (!AMDGPU::isLegal64BitDPPControl(DppCtrl)) { + // DPP64 is supported for row_newbcast only. + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx >= 0 && + getMRI()->getSubReg(Inst.getOperand(Src0Idx).getReg(), AMDGPU::sub1)) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); + Error(S, "64 bit dpp only supports row_newbcast"); + return false; + } + } + + return true; +} + // Check if VCC register matches wavefront size bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const { auto FB = getFeatureBits(); @@ -3802,18 +4043,148 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst, return true; } +// Returns -1 if not a register, 0 if VGPR and 1 if AGPR. +static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx, + const MCRegisterInfo *MRI) { + int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), NameIdx); + if (OpIdx < 0) + return -1; + + const MCOperand &Op = Inst.getOperand(OpIdx); + if (!Op.isReg()) + return -1; + + unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + auto Reg = Sub ? Sub : Op.getReg(); + const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); + return AGPR32.contains(Reg) ? 1 : 0; +} + +bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF | + SIInstrFlags::MTBUF | SIInstrFlags::MIMG | + SIInstrFlags::DS)) == 0) + return true; + + uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? 
AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; + + const MCRegisterInfo *MRI = getMRI(); + int DstAreg = IsAGPROperand(Inst, AMDGPU::OpName::vdst, MRI); + int DataAreg = IsAGPROperand(Inst, DataNameIdx, MRI); + + if ((TSFlags & SIInstrFlags::DS) && DataAreg >= 0) { + int Data2Areg = IsAGPROperand(Inst, AMDGPU::OpName::data1, MRI); + if (Data2Areg >= 0 && Data2Areg != DataAreg) + return false; + } + + auto FB = getFeatureBits(); + if (FB[AMDGPU::FeatureGFX90AInsts]) { + if (DataAreg < 0 || DstAreg < 0) + return true; + return DstAreg == DataAreg; + } + + return DstAreg < 1 && DataAreg < 1; +} + +bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { + auto FB = getFeatureBits(); + if (!FB[AMDGPU::FeatureGFX90AInsts]) + return true; + + const MCRegisterInfo *MRI = getMRI(); + const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID); + const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + const MCOperand &Op = Inst.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + if (!Sub) + continue; + + if (VGPR32.contains(Sub) && ((Sub - AMDGPU::VGPR0) & 1)) + return false; + if (AGPR32.contains(Sub) && ((Sub - AMDGPU::AGPR0) & 1)) + return false; + } + + return true; +} + +// gfx90a has an undocumented limitation: +// DS_GWS opcodes must use even aligned registers. +bool AMDGPUAsmParser::validateGWS(const MCInst &Inst, + const OperandVector &Operands) { + if (!getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) + return true; + + int Opc = Inst.getOpcode(); + if (Opc != AMDGPU::DS_GWS_INIT_vi && Opc != AMDGPU::DS_GWS_BARRIER_vi && + Opc != AMDGPU::DS_GWS_SEMA_BR_vi) + return true; + + const MCRegisterInfo *MRI = getMRI(); + const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID); + int Data0Pos = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0); + assert(Data0Pos != -1); + auto Reg = Inst.getOperand(Data0Pos).getReg(); + auto RegIdx = Reg - (VGPR32.contains(Reg) ? AMDGPU::VGPR0 : AMDGPU::AGPR0); + if (RegIdx & 1) { + SMLoc RegLoc = getRegLoc(Reg, Operands); + Error(RegLoc, "vgpr must be even aligned"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc) { - int GLCPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), - AMDGPU::OpName::glc1); - if (GLCPos != -1) { - // -1 is set by GLC_1 default operand. In all cases "glc" must be present - // in the asm string, and the default value means it is not present. 
- if (Inst.getOperand(GLCPos).getImm() == -1) { + int CPolPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::cpol); + if (CPolPos == -1) + return true; + + unsigned CPol = Inst.getOperand(CPolPos).getImm(); + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & (SIInstrFlags::SMRD)) && + (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC))) { + Error(IDLoc, "invalid cache policy for SMRD instruction"); + return false; + } + + if (isGFX90A() && (CPol & CPol::SCC)) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]); + Error(S, "scc is not supported on this GPU"); + return false; + } + + if (!(TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet))) + return true; + + if (TSFlags & SIInstrFlags::IsAtomicRet) { + if (!(TSFlags & SIInstrFlags::MIMG) && !(CPol & CPol::GLC)) { Error(IDLoc, "instruction must use glc"); return false; } + } else { + if (CPol & CPol::GLC) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("glc")]); + Error(S, "instruction must not use glc"); + return false; + } } return true; @@ -3822,9 +4193,8 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands) { - if (!validateLdsDirect(Inst)) { - Error(getRegLoc(AMDGPU::LDS_DIRECT, Operands), - "invalid use of lds_direct"); + if (auto ErrMsg = validateLdsDirect(Inst)) { + Error(getRegLoc(LDS_DIRECT, Operands), *ErrMsg); return false; } if (!validateSOPLiteral(Inst)) { @@ -3851,6 +4221,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid op_sel operand"); return false; } + if (!validateDPP(Inst, Operands)) { + return false; + } // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. if (!validateMIMGD16(Inst)) { Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands), @@ -3861,6 +4234,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, Error(IDLoc, "dim modifier is required on this GPU"); return false; } + if (!validateMIMGMSAA(Inst)) { + Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands), + "invalid dim; must be MSAA type"); + return false; + } if (!validateMIMGDataSize(Inst)) { Error(IDLoc, "image data size does not match dmask and tfe"); @@ -3893,6 +4271,26 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateMAIAccWrite(Inst, Operands)) { return false; } + if (!validateCoherencyBits(Inst, Operands, IDLoc)) { + return false; + } + + if (!validateAGPRLdSt(Inst)) { + Error(IDLoc, getFeatureBits()[AMDGPU::FeatureGFX90AInsts] + ? 
"invalid register class: data and dst should be all VGPR or AGPR" + : "invalid register class: agpr loads and stores not supported on this GPU" + ); + return false; + } + if (!validateVGPRAlign(Inst)) { + Error(IDLoc, + "invalid register class: vgpr tuples must be 64 bit aligned"); + return false; + } + if (!validateGWS(Inst, Operands)) { + return false; + } + if (!validateDivScale(Inst)) { Error(IDLoc, "ABS not allowed in VOP3B instructions"); return false; @@ -4062,21 +4460,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() { if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) return TokError("directive only supported for amdgcn architecture"); - std::string Target; - - SMLoc TargetStart = getLoc(); - if (getParser().parseEscapedString(Target)) + std::string TargetIDDirective; + SMLoc TargetStart = getTok().getLoc(); + if (getParser().parseEscapedString(TargetIDDirective)) return true; - SMRange TargetRange = SMRange(TargetStart, getLoc()); - std::string ExpectedTarget; - raw_string_ostream ExpectedTargetOS(ExpectedTarget); - IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS); + SMRange TargetRange = SMRange(TargetStart, getTok().getLoc()); + if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective) + return getParser().Error(TargetRange.Start, + (Twine(".amdgcn_target directive's target id ") + + Twine(TargetIDDirective) + + Twine(" does not match the specified target id ") + + Twine(getTargetStreamer().getTargetID()->toString())).str()); - if (Target != ExpectedTargetOS.str()) - return Error(TargetRange.Start, "target must match options", TargetRange); - - getTargetStreamer().EmitDirectiveAMDGCNTarget(Target); return false; } @@ -4143,12 +4539,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { SMRange VGPRRange; uint64_t NextFreeVGPR = 0; + uint64_t AccumOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; unsigned UserSGPRCount = 0; bool ReserveVCC = true; bool ReserveFlatScr = true; - bool ReserveXNACK = hasXNACK(); Optional<bool> EnableWavefrontSize32; while (true) { @@ -4191,7 +4587,15 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val)) return OutOfRangeError(ValRange); KD.private_segment_fixed_size = Val; + } else if (ID == ".amdhsa_kernarg_size") { + if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val)) + return OutOfRangeError(ValRange); + KD.kernarg_size = Val; } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); @@ -4222,6 +4626,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (Val) UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); @@ -4241,10 +4649,20 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { - PARSE_BITS_ENTRY( - KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, - ValRange); + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + 
"directive is not supported with architected flat scratch", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange); + } else if (ID == ".amdhsa_enable_private_segment") { + if (!hasArchitectedFlatScratch()) + return Error( + IDRange.Start, + "directive is not supported without architected flat scratch", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val, @@ -4271,6 +4689,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { } else if (ID == ".amdhsa_next_free_sgpr") { SGPRRange = ValRange; NextFreeSGPR = Val; + } else if (ID == ".amdhsa_accum_offset") { + if (!isGFX90A()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + AccumOffset = Val; } else if (ID == ".amdhsa_reserve_vcc") { if (!isUInt<1>(Val)) return OutOfRangeError(ValRange); @@ -4278,6 +4700,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { } else if (ID == ".amdhsa_reserve_flat_scratch") { if (IVersion.Major < 7) return Error(IDRange.Start, "directive requires gfx7+", IDRange); + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); if (!isUInt<1>(Val)) return OutOfRangeError(ValRange); ReserveFlatScr = Val; @@ -4286,7 +4712,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive requires gfx8+", IDRange); if (!isUInt<1>(Val)) return OutOfRangeError(ValRange); - ReserveXNACK = Val; + if (Val != getTargetStreamer().getTargetID()->isXnackOnOrAny()) + return getParser().Error(IDRange.Start, ".amdhsa_reserve_xnack_mask does not match target id", + IDRange); } else if (ID == ".amdhsa_float_round_mode_32") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange); @@ -4311,6 +4739,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive requires gfx9+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, ValRange); + } else if (ID == ".amdhsa_tg_split") { + if (!isGFX90A()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val, + ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); @@ -4372,7 +4805,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { unsigned VGPRBlocks; unsigned SGPRBlocks; if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, - ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR, + getTargetStreamer().getTargetID()->isXnackOnOrAny(), + EnableWavefrontSize32, NextFreeVGPR, VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks, SGPRBlocks)) return true; @@ -4395,9 +4829,21 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, UserSGPRCount); + if (isGFX90A()) { + if (Seen.find(".amdhsa_accum_offset") == Seen.end()) + return TokError(".amdhsa_accum_offset directive is required"); + if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3)) + return TokError("accum_offset should be in range [4..256] in " + "increments of 4"); + if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4)) + return 
TokError("accum_offset exceeds total VGPR allocation"); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + (AccumOffset / 4 - 1)); + } + getTargetStreamer().EmitAmdhsaKernelDescriptor( getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, - ReserveFlatScr, ReserveXNACK); + ReserveFlatScr); return false; } @@ -4423,9 +4869,9 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // targeted GPU. if (isToken(AsmToken::EndOfStatement)) { AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); - getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, - ISA.Stepping, - "AMD", "AMDGPU"); + getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor, + ISA.Stepping, + "AMD", "AMDGPU"); return false; } @@ -4450,8 +4896,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { if (!parseString(ArchName, "invalid arch name")) return true; - getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping, - VendorName, ArchName); + getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping, + VendorName, ArchName); return false; } @@ -4560,19 +5006,11 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() { "architectures"); } - auto ISAVersionStringFromASM = getToken().getStringContents(); + auto TargetIDDirective = getLexer().getTok().getStringContents(); + if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective) + return Error(getParser().getTok().getLoc(), "target id must match options"); - std::string ISAVersionStringFromSTI; - raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI); - IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI); - - if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) { - return Error(getLoc(), - ".amd_amdgpu_isa directive does not match triple and/or mcpu " - "arguments specified through the command line"); - } - - getTargetStreamer().EmitISAVersion(ISAVersionStreamFromSTI.str()); + getTargetStreamer().EmitISAVersion(); Lex(); return false; @@ -4582,7 +5020,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { const char *AssemblerDirectiveBegin; const char *AssemblerDirectiveEnd; std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) = - isHsaAbiVersion3(&getSTI()) + isHsaAbiVersion3Or4(&getSTI()) ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin, HSAMD::V3::AssemblerDirectiveEnd) : std::make_tuple(HSAMD::AssemblerDirectiveBegin, @@ -4599,7 +5037,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { HSAMetadataString)) return true; - if (isHsaAbiVersion3(&getSTI())) { + if (isHsaAbiVersion3Or4(&getSTI())) { if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) return Error(getLoc(), "invalid HSA metadata"); } else { @@ -4749,12 +5187,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); - if (isHsaAbiVersion3(&getSTI())) { - if (IDVal == ".amdgcn_target") - return ParseDirectiveAMDGCNTarget(); - + if (isHsaAbiVersion3Or4(&getSTI())) { if (IDVal == ".amdhsa_kernel") - return ParseDirectiveAMDHSAKernel(); + return ParseDirectiveAMDHSAKernel(); // TODO: Restructure/combine with PAL metadata directive. 
if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin) @@ -4779,6 +5214,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { return ParseDirectiveHSAMetadata(); } + if (IDVal == ".amdgcn_target") + return ParseDirectiveAMDGCNTarget(); + if (IDVal == ".amdgpu_lds") return ParseDirectiveAMDGPULDS(); @@ -4792,7 +5230,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { } bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, - unsigned RegNo) const { + unsigned RegNo) { for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true); R.isValid(); ++R) { @@ -4824,7 +5262,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: - return (isVI() || isGFX9()) && hasXNACK(); + return (isVI() || isGFX9()) && getTargetStreamer().getTargetID()->isXnackSupported(); case AMDGPU::SGPR_NULL: return isGFX10Plus(); default: @@ -4881,16 +5319,21 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, unsigned Prefix = Operands.size(); for (;;) { + auto Loc = getLoc(); ResTy = parseReg(Operands); + if (ResTy == MatchOperand_NoMatch) + Error(Loc, "expected a register"); if (ResTy != MatchOperand_Success) - return ResTy; + return MatchOperand_ParseFail; RBraceLoc = getLoc(); if (trySkipToken(AsmToken::RBrac)) break; - if (!trySkipToken(AsmToken::Comma)) + if (!skipToken(AsmToken::Comma, + "expected a comma or a closing square bracket")) { return MatchOperand_ParseFail; + } } if (Operands.size() - Prefix > 1) { @@ -4940,11 +5383,9 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, OperandMode Mode = OperandMode_Default; if (IsMIMG && isGFX10Plus() && Operands.size() == 2) Mode = OperandMode_NSA; + CPolSeen = 0; OperandMatchResultTy Res = parseOperand(Operands, Name, Mode); - // Eat the comma or space if there is one. - trySkipToken(AsmToken::Comma); - if (Res != MatchOperand_Success) { checkUnsupportedInstruction(Name, NameLoc); if (!Parser.hasPendingError()) { @@ -4959,6 +5400,9 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, } return true; } + + // Eat the comma or space if there is one. + trySkipToken(AsmToken::Comma); } return false; @@ -5043,39 +5487,27 @@ AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix, } OperandMatchResultTy -AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, +AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy) { - int64_t Bit = 0; + int64_t Bit; SMLoc S = getLoc(); - // We are at the end of the statement, and this is a default argument, so - // use a default value. 
- if (!isToken(AsmToken::EndOfStatement)) { - switch(getTokenKind()) { - case AsmToken::Identifier: { - StringRef Tok = getTokenStr(); - if (Tok == Name) { - if (Tok == "r128" && !hasMIMG_R128()) - Error(S, "r128 modifier is not supported on this GPU"); - if (Tok == "a16" && !isGFX9() && !hasGFX10A16()) - Error(S, "a16 modifier is not supported on this GPU"); - Bit = 1; - Parser.Lex(); - } else if (Tok.startswith("no") && Tok.endswith(Name)) { - Bit = 0; - Parser.Lex(); - } else { - return MatchOperand_NoMatch; - } - break; - } - default: - return MatchOperand_NoMatch; - } + if (trySkipId(Name)) { + Bit = 1; + } else if (trySkipId("no", Name)) { + Bit = 0; + } else { + return MatchOperand_NoMatch; } - if (!isGFX10Plus() && ImmTy == AMDGPUOperand::ImmTyDLC) + if (Name == "r128" && !hasMIMG_R128()) { + Error(S, "r128 modifier is not supported on this GPU"); + return MatchOperand_ParseFail; + } + if (Name == "a16" && !isGFX9() && !hasGFX10A16()) { + Error(S, "a16 modifier is not supported on this GPU"); return MatchOperand_ParseFail; + } if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) ImmTy = AMDGPUOperand::ImmTyR128A16; @@ -5084,6 +5516,62 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, return MatchOperand_Success; } +OperandMatchResultTy +AMDGPUAsmParser::parseCPol(OperandVector &Operands) { + unsigned CPolOn = 0; + unsigned CPolOff = 0; + SMLoc S = getLoc(); + + if (trySkipId("glc")) + CPolOn = AMDGPU::CPol::GLC; + else if (trySkipId("noglc")) + CPolOff = AMDGPU::CPol::GLC; + else if (trySkipId("slc")) + CPolOn = AMDGPU::CPol::SLC; + else if (trySkipId("noslc")) + CPolOff = AMDGPU::CPol::SLC; + else if (trySkipId("dlc")) + CPolOn = AMDGPU::CPol::DLC; + else if (trySkipId("nodlc")) + CPolOff = AMDGPU::CPol::DLC; + else if (trySkipId("scc")) + CPolOn = AMDGPU::CPol::SCC; + else if (trySkipId("noscc")) + CPolOff = AMDGPU::CPol::SCC; + else + return MatchOperand_NoMatch; + + if (!isGFX10Plus() && ((CPolOn | CPolOff) & AMDGPU::CPol::DLC)) { + Error(S, "dlc modifier is not supported on this GPU"); + return MatchOperand_ParseFail; + } + + if (!isGFX90A() && ((CPolOn | CPolOff) & AMDGPU::CPol::SCC)) { + Error(S, "scc modifier is not supported on this GPU"); + return MatchOperand_ParseFail; + } + + if (CPolSeen & (CPolOn | CPolOff)) { + Error(S, "duplicate cache policy modifier"); + return MatchOperand_ParseFail; + } + + CPolSeen |= (CPolOn | CPolOff); + + for (unsigned I = 1; I != Operands.size(); ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (Op.isCPol()) { + Op.setImm((Op.getImm() | CPolOn) & ~CPolOff); + return MatchOperand_Success; + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, CPolOn, S, + AMDGPUOperand::ImmTyCPol)); + + return MatchOperand_Success; +} + static void addOptionalImmOperand( MCInst& Inst, const OperandVector& Operands, AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, @@ -5757,7 +6245,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, } return false; } - if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) { + if (!isValidMsgOp(Msg.Id, Op.Id, getSTI(), Strict)) { Error(Op.Loc, "invalid operation id"); return false; } @@ -5765,7 +6253,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, Error(Stream.Loc, "message operation does not support streams"); return false; } - if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) { + if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, getSTI(), Strict)) { Error(Stream.Loc, "invalid message stream id"); return false; } @@ -5934,6 +6422,18 @@ 
AMDGPUAsmParser::trySkipId(const StringRef Id) { } bool +AMDGPUAsmParser::trySkipId(const StringRef Pref, const StringRef Id) { + if (isToken(AsmToken::Identifier)) { + StringRef Tok = getTokenStr(); + if (Tok.startswith(Pref) && Tok.drop_front(Pref.size()) == Id) { + lex(); + return true; + } + } + return false; +} + +bool AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) { if (isId(Id) && peekToken().is(Kind)) { lex(); @@ -6489,32 +6989,38 @@ AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { // mubuf //===----------------------------------------------------------------------===// -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC_1() const { - return AMDGPUOperand::CreateImm(this, -1, SMLoc(), AMDGPUOperand::ImmTyGLC); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCPol); } void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, - const OperandVector &Operands, - bool IsAtomic, - bool IsAtomicReturn, - bool IsLds) { + const OperandVector &Operands, + bool IsAtomic, + bool IsLds) { bool IsLdsOpcode = IsLds; bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; - assert(IsAtomicReturn ? IsAtomic : true); unsigned FirstOperandIdx = 1; + bool IsAtomicReturn = false; + + if (IsAtomic) { + for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (!Op.isCPol()) + continue; + IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC; + break; + } + + if (!IsAtomicReturn) { + int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode()); + if (NewOpc != -1) + Inst.setOpcode(NewOpc); + } + + IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags & + SIInstrFlags::IsAtomicRet; + } for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -6565,18 +7071,12 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); - if (!IsAtomic || IsAtomicReturn) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC, - IsAtomicReturn ? 
-1 : 0); - } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); if (!IsLdsOpcode) { // tfe is not legal with lds opcodes addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } - - if (isGFX10Plus()) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); } void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { @@ -6611,12 +7111,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); - - if (isGFX10Plus()) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); } //===----------------------------------------------------------------------===// @@ -6658,14 +7155,12 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, if (IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); - if (IsGFX10Plus) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); if (IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::tfe) != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); if (!IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); @@ -6676,6 +7171,61 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) cvtMIMG(Inst, Operands, true); } +void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + bool IsAtomicReturn = false; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (!Op.isCPol()) + continue; + IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC; + break; + } + + if (!IsAtomicReturn) { + int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode()); + if (NewOpc != -1) + Inst.setOpcode(NewOpc); + } + + IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags & + SIInstrFlags::IsAtomicRet; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + if (IsAtomicReturn 
&& i == 1) + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + if ((int)Inst.getNumOperands() <= + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset)) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); +} + void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst, const OperandVector &Operands) { for (unsigned I = 1; I < Operands.size(); ++I) { @@ -6747,17 +7297,14 @@ static bool ConvertOmodDiv(int64_t &Div) { return false; } +// Both bound_ctrl:0 and bound_ctrl:1 are encoded as 1. +// This is intentional and ensures compatibility with sp3. +// See bug 35397 for details. static bool ConvertBoundCtrl(int64_t &BoundCtrl) { - if (BoundCtrl == 0) { + if (BoundCtrl == 0 || BoundCtrl == 1) { BoundCtrl = 1; return true; } - - if (BoundCtrl == -1) { - BoundCtrl = 0; - return true; - } - return false; } @@ -6772,9 +7319,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, - {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr}, - {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"", AMDGPUOperand::ImmTyCPol, false, nullptr}, {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, @@ -6808,6 +7353,18 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"abid", AMDGPUOperand::ImmTyABID, false, nullptr} }; +void AMDGPUAsmParser::onBeginOfFile() { + if (!getParser().getStreamer().getTargetStreamer() || + getSTI().getTargetTriple().getArch() == Triple::r600) + return; + + if (!getTargetStreamer().getTargetID()) + getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString()); + + if (isHsaAbiVersion3Or4(&getSTI())) + getTargetStreamer().EmitDirectiveAMDGCNTarget(); +} + OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { OperandMatchResultTy res = parseOptionalOpr(Operands); @@ -6857,6 +7414,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) Op.ConvertResult); } else if (Op.Type == AMDGPUOperand::ImmTyDim) { res = parseDim(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyCPol) { + res = parseCPol(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); } @@ -7010,6 +7569,7 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 || Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F16_e64_vi || + Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_F32_e64_vi || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || @@ -7028,16 +7588,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { cvtVOP3(Inst, Operands, OptionalIdx); } -void AMDGPUAsmParser::cvtVOP3P(MCInst 
&Inst, - const OperandVector &Operands) { - OptionalImmIndexMap OptIdx; +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptIdx) { const int Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; - cvtVOP3(Inst, Operands, OptIdx); - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); @@ -7046,7 +7603,10 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, // FIXME: This is messy. Parse the modifiers as if it was a normal VOP3 // instruction, and then figure out where to actually put the modifiers - addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + } int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); if (OpSelHiIdx != -1) { @@ -7057,7 +7617,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); if (NegLoIdx != -1) { - assert(IsPacked); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); } @@ -7069,16 +7628,16 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers }; - int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); - - unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSel = 0; unsigned OpSelHi = 0; unsigned NegLo = 0; unsigned NegHi = 0; - if (OpSelHiIdx != -1) { + if (OpSelIdx != -1) + OpSel = Inst.getOperand(OpSelIdx).getImm(); + + if (OpSelHiIdx != -1) OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); - } if (NegLoIdx != -1) { int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); @@ -7111,6 +7670,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, } } +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptIdx; + cvtVOP3(Inst, Operands, OptIdx); + cvtVOP3P(Inst, Operands, OptIdx); +} + //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// @@ -7167,44 +7732,64 @@ bool AMDGPUOperand::isU16Imm() const { return isImm() && isUInt<16>(getImm()); } -OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) { - if (!isGFX10Plus()) - return MatchOperand_NoMatch; - - SMLoc S = getLoc(); - - if (!trySkipId("dim", AsmToken::Colon)) - return MatchOperand_NoMatch; +//===----------------------------------------------------------------------===// +// dim +//===----------------------------------------------------------------------===// - // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an - // integer. +bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) { + // We want to allow "dim:1D" etc., + // but the initial 1 is tokenized as an integer. 
std::string Token; if (isToken(AsmToken::Integer)) { SMLoc Loc = getToken().getEndLoc(); Token = std::string(getTokenStr()); lex(); if (getLoc() != Loc) - return MatchOperand_ParseFail; + return false; } - if (!isToken(AsmToken::Identifier)) - return MatchOperand_ParseFail; - Token += getTokenStr(); + + StringRef Suffix; + if (!parseId(Suffix)) + return false; + Token += Suffix; StringRef DimId = Token; if (DimId.startswith("SQ_RSRC_IMG_")) - DimId = DimId.substr(12); + DimId = DimId.drop_front(12); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId); if (!DimInfo) - return MatchOperand_ParseFail; + return false; + + Encoding = DimInfo->Encoding; + return true; +} - lex(); +OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) { + if (!isGFX10Plus()) + return MatchOperand_NoMatch; - Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S, + SMLoc S = getLoc(); + + if (!trySkipId("dim", AsmToken::Colon)) + return MatchOperand_NoMatch; + + unsigned Encoding; + SMLoc Loc = getLoc(); + if (!parseDimId(Encoding)) { + Error(Loc, "invalid dim value"); + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Encoding, S, AMDGPUOperand::ImmTyDim)); return MatchOperand_Success; } +//===----------------------------------------------------------------------===// +// dpp +//===----------------------------------------------------------------------===// + OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) { SMLoc S = getLoc(); @@ -7245,6 +7830,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) { bool AMDGPUAsmParser::isSupportedDPPCtrl(StringRef Ctrl, const OperandVector &Operands) { + if (Ctrl == "row_newbcast") + return isGFX90A(); + if (Ctrl == "row_share" || Ctrl == "row_xmask") return isGFX10Plus(); @@ -7322,6 +7910,7 @@ AMDGPUAsmParser::parseDPPCtrlSel(StringRef Ctrl) { .Case("row_ror", {DppCtrl::ROW_ROR0, 1, 15}) .Case("row_share", {DppCtrl::ROW_SHARE_FIRST, 0, 15}) .Case("row_xmask", {DppCtrl::ROW_XMASK_FIRST, 0, 15}) + .Case("row_newbcast", {DppCtrl::ROW_NEWBCAST_FIRST, 0, 15}) .Default({-1, 0, 0}); bool Valid; @@ -7400,6 +7989,9 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + bool HasModifiers = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { @@ -7426,7 +8018,8 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I if (IsDPP8) { if (Op.isDPP8()) { Op.addImmOperands(Inst, 1); - } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + } else if (HasModifiers && + isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); } else if (Op.isFI()) { Fi = Op.getImm(); @@ -7436,8 +8029,11 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I llvm_unreachable("Invalid operand type"); } } else { - if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + if (HasModifiers && + isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); + } else if (Op.isReg()) { + Op.addRegOperands(Inst, 1); } else if (Op.isDPPCtrl()) { Op.addImmOperands(Inst, 1); } else if (Op.isImm()) { @@ -7691,8 +8287,6 @@ unsigned 
AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isGDS() ? Match_Success : Match_InvalidOperand; case MCK_lds: return Operand.isLDS() ? Match_Success : Match_InvalidOperand; - case MCK_glc: - return Operand.isGLC() ? Match_Success : Match_InvalidOperand; case MCK_idxen: return Operand.isIdxen() ? Match_Success : Match_InvalidOperand; case MCK_offen: diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 5dc5481df49e..5f43aa8388ee 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -6,17 +6,12 @@ // //===----------------------------------------------------------------------===// -def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">; -def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; +def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; -def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">; -def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; -def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; - def BUFAddrKind { int Offset = 0; int OffEn = 1; @@ -105,6 +100,8 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_slc = 1; bits<1> has_tfe = 1; bits<4> elements = 0; + bits<1> has_sccb = 1; + bits<1> sccb_value = 0; } class MTBUF_Real <MTBUF_Pseudo ps> : @@ -113,6 +110,10 @@ class MTBUF_Real <MTBUF_Pseudo ps> : let isPseudo = 0; let isCodeGenOnly = 0; + let VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + // copy relevant pseudo op flags let UseNamedOperandTable = ps.UseNamedOperandTable; let SubtargetPredicate = ps.SubtargetPredicate; @@ -120,39 +121,47 @@ class MTBUF_Real <MTBUF_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; bits<12> offset; - bits<1> glc; - bits<1> dlc; + bits<5> cpol; bits<7> format; bits<8> vaddr; - bits<8> vdata; + bits<10> vdata; bits<7> srsrc; - bits<1> slc; bits<1> tfe; bits<8> soffset; bits<4> dfmt = format{3-0}; bits<3> nfmt = format{6-4}; + + // GFX90A+ only: instruction uses AccVGPR for data + // Bit superceedes tfe. 
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } class getMTBUFInsDA<list<RegisterClass> vdataList, list<RegisterClass> vaddrList=[]> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), + offset:$offset, FORMAT:$format, CPol:$cpol, TFE:$tfe, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) + offset:$offset, FORMAT:$format, CPol:$cpol, TFE:$tfe, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), - (ins vdataClass:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), - (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) + (ins vdata_op:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, + TFE:$tfe, SWZ:$swz), + (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, + TFE:$tfe, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -202,9 +211,9 @@ class MTBUF_Load_Pseudo <string opName, // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo<opName, - (outs vdataClass:$vdata), + (outs getLdStRegisterOperand<vdataClass>.ret:$vdata), getMTBUFIns<addrKindCopy>.ret, - " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; @@ -217,17 +226,11 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, int elems, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { - def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, - [(set load_vt:$vdata, - (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, - i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, - MTBUFAddr64Table<0, NAME>; + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>, + MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, - [(set load_vt:$vdata, - (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, - i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, - MTBUFAddr64Table<1, NAME>; + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>, + MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; @@ -252,7 +255,7 @@ class MTBUF_Store_Pseudo <string opName, : MTBUF_Pseudo<opName, (outs), getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, - " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; @@ -265,16 +268,10 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, 
int elems, ValueType store_vt = i32, SDPatternOperator st = null_frag> { - def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, - [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, - [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; @@ -341,6 +338,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_slc = 1; bits<1> has_tfe = 1; bits<4> elements = 0; + bits<1> has_sccb = 1; + bits<1> sccb_value = 0; + bits<1> IsBufferInv = 0; } class MUBUF_Real <MUBUF_Pseudo ps> : @@ -349,6 +349,10 @@ class MUBUF_Real <MUBUF_Pseudo ps> : let isPseudo = 0; let isCodeGenOnly = 0; + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; @@ -357,16 +361,23 @@ class MUBUF_Real <MUBUF_Pseudo ps> : let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; bits<12> offset; - bits<1> glc; - bits<1> dlc; + bits<5> cpol; bits<8> vaddr; - bits<8> vdata; + bits<10> vdata; bits<7> srsrc; - bits<1> slc; bits<1> tfe; bits<8> soffset; + + // GFX90A+ only: instruction uses AccVGPR for data + // Bit superceedes tfe. + bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } @@ -380,7 +391,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : let mayLoad = 0; let mayStore = 0; - // Set everything to 0. + let IsBufferInv = 1; + // Set everything else to 0. 
let offen = 0; let idxen = 0; let addr64 = 0; @@ -395,6 +407,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : let has_offset = 0; let has_slc = 0; let has_tfe = 0; + let has_sccb = 0; + let sccb_value = 0; } class getMUBUFInsDA<list<RegisterClass> vdataList, @@ -402,33 +416,31 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, bit isLds = 0> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, GLC:$glc, SLC:$slc), + offset:$offset, CPol_0:$cpol), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, GLC:$glc, SLC:$slc) + offset:$offset, CPol_0:$cpol) ); dag InsData = !if(!empty(vaddrList), - (ins vdataClass:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), - (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc) + (ins vdata_op:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol) ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) + !if(isLds, (ins SWZ_0:$swz), (ins TFE_0:$tfe, SWZ_0:$swz)) ); } class getMUBUFElements<ValueType vt> { - // eq does not support ValueType for some reason. - string vtAsStr = !cast<string>(vt); - int ret = - !if(!eq(vtAsStr, "f16"), 1, - !if(!eq(vtAsStr, "v2f16"), 2, - !if(!eq(vtAsStr, "v3f16"), 3, - !if(!eq(vtAsStr, "v4f16"), 4, + !if(!eq(vt, f16), 1, + !if(!eq(vt, v2f16), 2, + !if(!eq(vt, v3f16), 3, + !if(!eq(vt, v4f16), 4, !if(!eq(vt.Size, 32), 1, !if(!eq(vt.Size, 64), 2, !if(!eq(vt.Size, 96), 3, @@ -482,13 +494,15 @@ class MUBUF_Load_Pseudo <string opName, bit isLds = 0, list<dag> pattern=[], // Workaround bug bz30254 - int addrKindCopy = addrKind> + int addrKindCopy = addrKind, + RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret, + RegisterOperand vdata_op = getLdStRegisterOperand<vdata_rc>.ret> : MUBUF_Pseudo<opName, - (outs getVregSrcForVT<vdata_vt>.ret:$vdata), + (outs vdata_op:$vdata), !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, - !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))), - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc$swz", + !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" # + !if(isLds, " lds", "$tfe") # "$swz", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -506,15 +520,15 @@ class MUBUF_Load_Pseudo <string opName, } class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset)) >; class MUBUF_Addr64_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFAddr64 
v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset)) >; multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -531,7 +545,7 @@ multiclass MUBUF_Pseudo_Loads<string opName, bit TiedDest = 0, bit isLds = 0> { - defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt); + defvar legal_load_vt = !if(!eq(load_vt, v3f16), v4f16, load_vt); def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>, MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; @@ -567,7 +581,7 @@ class MUBUF_Store_Pseudo <string opName, : MUBUF_Pseudo<opName, (outs), getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret, - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; @@ -581,16 +595,16 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32, SDPatternOperator st = null_frag> { - defvar legal_store_vt = !if(!eq(!cast<string>(store_vt), !cast<string>(v3f16)), v4f16, store_vt); + defvar legal_store_vt = !if(!eq(store_vt, v3f16), v4f16, store_vt); def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, [(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, + i16:$offset))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, [(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, + i16:$offset))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>; @@ -608,8 +622,8 @@ multiclass MUBUF_Pseudo_Stores<string opName, class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), - (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz), - " $srsrc, $soffset$offset lds$glc$slc$swz"> { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz), + " $srsrc, $soffset$offset lds$cpol$swz"> { let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; @@ -626,18 +640,19 @@ class MUBUF_Pseudo_Store_Lds<string opName> class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, list<RegisterClass> vaddrList=[]> { RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; dag ret = !if(vdata_in, !if(!empty(vaddrList), - (ins vdataClass:$vdata_in, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc), - (ins vdataClass:$vdata_in, vaddrClass:$vaddr, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc) + (ins vdata_op:$vdata_in, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_GLC1:$cpol), + (ins vdata_op:$vdata_in, vaddrClass:$vaddr, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_GLC1:$cpol) ), !if(!empty(vaddrList), - (ins vdataClass:$vdata, - SReg_128:$srsrc, 
SCSrc_b32:$soffset, offset:$offset, SLC:$slc), - (ins vdataClass:$vdata, vaddrClass:$vaddr, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc) + (ins vdata_op:$vdata, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, vaddrClass:$vaddr, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol) )); } @@ -678,7 +693,9 @@ class MUBUF_Atomic_Pseudo<string opName, let has_glc = 0; let has_dlc = 0; let has_tfe = 0; + let has_sccb = 1; let maybeAtomic = 1; + let AsmMatchConverter = "cvtMubufAtomic"; } class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, @@ -690,13 +707,14 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, : MUBUF_Atomic_Pseudo<opName, addrKindCopy, (outs), getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret, - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc", + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol", pattern>, AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let glc_value = 0; let dlc_value = 0; - let AsmMatchConverter = "cvtMubufAtomic"; + let sccb_value = 0; + let IsAtomicNoRet = 1; } class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, @@ -704,19 +722,21 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass> + RegisterClass vdataClassCopy = vdataClass, + RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : MUBUF_Atomic_Pseudo<opName, addrKindCopy, - (outs vdataClassCopy:$vdata), + (outs vdata_op:$vdata), getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret, - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc1$slc", + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol", pattern>, AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> { let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret; let glc_value = 1; let dlc_value = 0; + let sccb_value = 0; + let IsAtomicRet = 1; let Constraints = "$vdata = $vdata_in"; let DisableEncoding = "$vdata_in"; - let AsmMatchConverter = "cvtMubufAtomicReturn"; } multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, @@ -751,15 +771,15 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, let FPAtomic = isFP in def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set vdataType:$vdata, - (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), + (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vdataType:$vdata_in))]>, MUBUFAddr64Table <0, NAME # "_RTN">; let FPAtomic = isFP in def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set vdataType:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), - vdataType:$vdata_in))]>, + (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + vdataType:$vdata_in))]>, MUBUFAddr64Table <1, NAME # "_RTN">; let FPAtomic = isFP in @@ -1106,6 +1126,15 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32 >; + +let OtherPredicates = [isGFX90APlus] in { +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN < + "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 +>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < + 
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32 +>; +} } // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// @@ -1154,6 +1183,17 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", } // End let SubtargetPredicate = isGFX7Plus +let SubtargetPredicate = isGFX90APlus in { + def BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> { + } + def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> { + } + + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; +} // End SubtargetPredicate = isGFX90APlus + let SubtargetPredicate = isGFX10Plus in { def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; @@ -1169,30 +1209,27 @@ let SubtargetPredicate = isGFX10Plus in { multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_load<name, memoryVt>); + defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_load<name, memoryVt>); def : GCNPat< (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1201,8 +1238,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; } @@ -1255,32 +1291,27 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">; multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_store<name, memoryVt>); + defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>); def : GCNPat< (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0), 
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_glc $auxiliary), - (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (as_i16timm $offset), (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_glc $auxiliary), - (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (as_i16timm $offset), (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1289,9 +1320,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) getVregSrcForVT<vt>.ret:$vdata, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), - (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_cpol $auxiliary), + 0, (extract_swz $auxiliary)) >; } @@ -1351,7 +1381,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, timm:$offset, timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (set_glc $cachepolicy)) >; def : GCNPat< @@ -1359,7 +1389,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, timm:$offset, timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (set_glc $cachepolicy)) >; def : GCNPat< @@ -1367,7 +1397,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (set_glc $cachepolicy)) >; def : GCNPat< @@ -1377,7 +1407,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_slc $cachepolicy)) + (set_glc $cachepolicy)) >; } @@ -1425,7 +1455,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm 
$offset), $cachepolicy) >; def : GCNPat< @@ -1433,7 +1463,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), $cachepolicy) >; def : GCNPat< @@ -1441,7 +1471,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), $cachepolicy) >; def : GCNPat< @@ -1451,7 +1481,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy) >; } @@ -1460,15 +1490,24 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; } +let SubtargetPredicate = isGFX90APlus in { + defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">; + defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; + + defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f64, "BUFFER_ATOMIC_ADD_F64">; + defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_MIN_F64">; + defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_MAX_F64">; +} // End SubtargetPredicate = isGFX90APlus + def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_slc $cachepolicy)), sub0) + (set_glc $cachepolicy)), VReg_64)), sub0) >; def : GCNPat< @@ -1476,10 +1515,11 @@ def : GCNPat< i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (set_glc $cachepolicy)), VReg_64)), sub0) >; @@ -1488,10 +1528,11 @@ def : GCNPat< i32:$data, i32:$cmp, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (set_glc $cachepolicy)), VReg_64)), sub0) >; @@ -1500,32 +1541,32 @@ def : GCNPat< i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, i32:$voffset, 
i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (set_glc $cachepolicy)), VReg_64)), sub0) >; class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) + i16:$offset))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset) >; multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_ld> { def : GCNPat < - (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) + (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset) >; def : GCNPat < - (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) + (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset)) >; } @@ -1545,9 +1586,8 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag ld> { def : GCNPat < - (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), - (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) + (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $srsrc, $soffset, $offset) >; } @@ -1570,12 +1610,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) >; } @@ -1585,12 +1625,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen, ValueType vt, PatFrag ld_frag> { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, $in) >; } @@ -1635,14 +1675,13 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In ValueType vt, PatFrag atomic_st> { // Store follows atomic op convention so address is first def : GCNPat < - (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) + (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, 
i32:$soffset, i16:$offset), vt:$val), + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset) >; def : GCNPat < - (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) + (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset)) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1655,9 +1694,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag st> { def : GCNPat < - (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)), - (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) + (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset) >; } @@ -1671,13 +1709,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0) >; } @@ -1716,15 +1754,14 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_load<name, memoryVt>); + defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_load<name, memoryVt>); def : GCNPat< (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1732,8 +1769,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1741,8 +1777,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, timm:$format, timm:$auxiliary, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1752,8 +1787,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) 
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; } @@ -1784,15 +1818,14 @@ let SubtargetPredicate = HasPackedD16VMem in { multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { - defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_store<name, memoryVt>); + defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_store<name, memoryVt>); def : GCNPat< (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1800,8 +1833,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, timm:$format, timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1809,8 +1841,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; def : GCNPat< @@ -1820,8 +1851,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, getVregSrcForVT<vt>.ret:$vdata, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_cpol $auxiliary), 0, (extract_swz $auxiliary)) >; } @@ -1863,21 +1893,21 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; - let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{16} = ps.lds; let Inst{24-18} = op; let Inst{31-26} = 0x38; let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); - let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> { - let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value); + let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); let Inst{25} = op{7}; } @@ -1891,13 +1921,6 @@ class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> : //===----------------------------------------------------------------------===// let 
AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { - multiclass MUBUF_Real_gfx10_with_name<bits<8> op, string opName, - string asmName> { - def _gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(opName)> { - MUBUF_Pseudo ps = !cast<MUBUF_Pseudo>(opName); - let AsmString = asmName # ps.AsmOperands; - } - } multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> { def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; @@ -1929,16 +1952,33 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> { def _BOTHEN_RTN_gfx10 : - MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>, + AtomicNoRet<NAME # "_BOTHEN_gfx10", 1>; def _IDXEN_RTN_gfx10 : - MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>, + AtomicNoRet<NAME # "_IDXEN_gfx10", 1>; def _OFFEN_RTN_gfx10 : - MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>, + AtomicNoRet<NAME # "_OFFEN_gfx10", 1>; def _OFFSET_RTN_gfx10 : - MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>, + AtomicNoRet<NAME # "_OFFSET_gfx10", 1>; } multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> : - MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>; + MUBUF_Real_Atomics_RTN_gfx10<op> { + def _BOTHEN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + AtomicNoRet<NAME # "_BOTHEN_gfx10", 0>; + def _IDXEN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + AtomicNoRet<NAME # "_IDXEN_gfx10", 0>; + def _OFFEN_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + AtomicNoRet<NAME # "_OFFEN_gfx10", 0>; + def _OFFSET_gfx10 : + MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + AtomicNoRet<NAME # "_OFFSET_gfx10", 0>; + } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; @@ -2018,18 +2058,38 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">; } - multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> : - MUBUF_Real_AllAddr_gfx6_gfx7<op> { + multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> { + def _ADDR64_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>, + AtomicNoRet<NAME # "_ADDR64_gfx6_gfx7", 0>; + def _BOTHEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + AtomicNoRet<NAME # "_BOTHEN_gfx6_gfx7", 0>; + def _IDXEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + AtomicNoRet<NAME # "_IDXEN_gfx6_gfx7", 0>; + def _OFFEN_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + AtomicNoRet<NAME # "_OFFEN_gfx6_gfx7", 0>; + def _OFFSET_gfx6_gfx7 : + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + AtomicNoRet<NAME # "_OFFSET_gfx6_gfx7", 0>; + def _ADDR64_RTN_gfx6_gfx7 : - MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>; + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>, + AtomicNoRet<NAME # "_ADDR64_gfx6_gfx7", 1>; def _BOTHEN_RTN_gfx6_gfx7 : - MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; + 
MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>, + AtomicNoRet<NAME # "_BOTHEN_gfx6_gfx7", 1>; def _IDXEN_RTN_gfx6_gfx7 : - MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>, + AtomicNoRet<NAME # "_IDXEN_gfx6_gfx7", 1>; def _OFFEN_RTN_gfx6_gfx7 : - MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>, + AtomicNoRet<NAME # "_OFFEN_gfx6_gfx7", 1>; def _OFFSET_RTN_gfx6_gfx7 : - MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>, + AtomicNoRet<NAME # "_OFFSET_gfx6_gfx7", 1>; } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" @@ -2118,13 +2178,13 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> : let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; - let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{18-16} = op; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); - let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } @@ -2135,7 +2195,7 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> : class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> : Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.GFX10> { - let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value); + let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); let Inst{25-19} = format; let Inst{53} = op{3}; } @@ -2204,33 +2264,58 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>; // GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// -class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> : +class MUBUF_Real_Base_vi <bits<7> op, MUBUF_Pseudo ps, int Enc, + bit has_sccb = ps.has_sccb> : MUBUF_Real<ps>, Enc64, - SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { - let AssemblerPredicate = isGFX8GFX9; - let DecoderNamespace = "GFX8"; + SIMCInstr<ps.PseudoInstr, Enc>, + AtomicNoRet<!subst("_RTN","",NAME), !if(ps.IsAtomicNoRet, 0, + !if(ps.IsAtomicRet, 1, ?))> { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; - let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); + let Inst{15} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccb_value); let Inst{16} = ps.lds; - let Inst{17} = !if(ps.has_slc, slc, ?); + let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); - let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } +class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps, bit has_sccb = ps.has_sccb> : + MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.VI, has_sccb> { + let AssemblerPredicate = isGFX8GFX9NotGFX90A; + let DecoderNamespace = "GFX8"; + + let Inst{55} = !if(ps.has_tfe, tfe, ?); +} + +class MUBUF_Real_gfx90a <bits<7> op, MUBUF_Pseudo ps, + bit has_sccb = ps.has_sccb> : + MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.GFX90A, has_sccb> { + let AssemblerPredicate = isGFX90APlus; + let DecoderNamespace = "GFX90A"; + let AsmString = ps.Mnemonic # !subst("$sccb", !if(has_sccb, "$sccb",""), + !subst("$tfe", "", ps.AsmOperands)); + + let Inst{55} = acc; +} + +multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps> { + def _vi : MUBUF_Real_vi<op, ps>; + def _gfx90a : MUBUF_Real_gfx90a<op, ps, !and(ps.has_sccb,!not(ps.FPAtomic))>; +} + multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { - def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; - def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; - def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; - def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; + defm _OFFSET : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + defm _OFFEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + defm _IDXEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + defm _BOTHEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; } multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> { @@ -2252,6 +2337,24 @@ multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> { MUBUFLdsTable<1, NAME # "_IDXEN_vi">; def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, MUBUFLdsTable<1, NAME # "_BOTHEN_vi">; + + def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">; + def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">; + def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">; + def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # 
"_BOTHEN_gfx90a">; + + def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">; + def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">; + def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">; + def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">; } class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> : @@ -2264,13 +2367,13 @@ class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> : let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; - let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{16} = ps.lds; - let Inst{17} = !if(ps.has_slc, slc, ?); + let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); @@ -2285,10 +2388,10 @@ multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> { multiclass MUBUF_Real_Atomic_vi<bits<7> op> : MUBUF_Real_AllAddr_vi<op> { - def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; - def _OFFEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; - def _IDXEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; - def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; + defm _OFFSET_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + defm _OFFEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + defm _IDXEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + defm _BOTHEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>; @@ -2374,46 +2477,79 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>; -def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>; +defm BUFFER_STORE_LDS_DWORD : MUBUF_Real_vi_gfx90a <0x3d, BUFFER_STORE_LDS_DWORD>; +let AssemblerPredicate = isGFX8GFX9 in { def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; +} // End AssemblerPredicate = isGFX8GFX9 let SubtargetPredicate = HasAtomicFaddInsts in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>; -defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>; +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; } // End SubtargetPredicate = HasAtomicFaddInsts -class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : +let SubtargetPredicate = isGFX90APlus in { + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_vi<0x51>; +} // End SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus + 
+def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> { +} +def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>; + +class MTBUF_Real_Base_vi <bits<4> op, MTBUF_Pseudo ps, int Enc> : MTBUF_Real<ps>, Enc64, - SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { - let AssemblerPredicate = isGFX8GFX9; - let DecoderNamespace = "GFX8"; + SIMCInstr<ps.PseudoInstr, Enc> { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; - let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{18-15} = op; let Inst{22-19} = dfmt; let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); - let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{53} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccb_value); + let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } +class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : + MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.VI> { + let AssemblerPredicate = isGFX8GFX9NotGFX90A; + let DecoderNamespace = "GFX8"; + + let Inst{55} = !if(ps.has_tfe, tfe, ?); +} + +class MTBUF_Real_gfx90a <bits<4> op, MTBUF_Pseudo ps> : + MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.GFX90A> { + let AssemblerPredicate = isGFX90APlus; + let DecoderNamespace = "GFX90A"; + let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands); + + let Inst{55} = acc; +} + +multiclass MTBUF_Real_vi_gfx90a<bits<4> op, MTBUF_Pseudo ps> { + def _vi : MTBUF_Real_vi<op, ps>; + def _gfx90a : MTBUF_Real_gfx90a<op, ps>; +} + multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { - def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; - def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; - def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; - def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; + defm _OFFSET : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + defm _OFFEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + defm _IDXEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + defm _BOTHEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; } class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> : @@ -2426,15 +2562,15 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> : let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; - let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{18-15} = op; let Inst{22-19} = dfmt; let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); - let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } @@ -2478,7 +2614,10 @@ let SubtargetPredicate = HasPackedD16VMem in { def MUBUFInfoTable : GenericTable { let FilterClass = "MUBUF_Pseudo"; let CppTypeName = "MUBUFInfo"; - let Fields = ["Opcode", 
"BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; + let Fields = [ + "Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset", + "IsBufferInv" + ]; let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getMUBUFOpcodeHelper"; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 328c81005df4..ad9528ece7d0 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -52,32 +52,41 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]); } -class DS_Real <DS_Pseudo ds> : - InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # ds.AsmOperands, []>, +class DS_Real <DS_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, Enc64 { let isPseudo = 0; let isCodeGenOnly = 0; + let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; // copy relevant pseudo op flags - let SubtargetPredicate = ds.SubtargetPredicate; - let OtherPredicates = ds.OtherPredicates; - let AsmMatchConverter = ds.AsmMatchConverter; + let SubtargetPredicate = ps.SubtargetPredicate; + let OtherPredicates = ps.OtherPredicates; + let AsmMatchConverter = ps.AsmMatchConverter; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; // encoding fields - bits<8> vdst; + bits<10> vdst; bits<1> gds; bits<8> addr; - bits<8> data0; - bits<8> data1; + bits<10> data0; + bits<10> data1; bits<8> offset0; bits<8> offset1; bits<16> offset; - let offset0 = !if(ds.has_offset, offset{7-0}, ?); - let offset1 = !if(ds.has_offset, offset{15-8}, ?); + let offset0 = !if(ps.has_offset, offset{7-0}, ?); + let offset1 = !if(ps.has_offset, offset{15-8}, ?); + + bits<1> acc = !if(ps.has_vdst, vdst{9}, + !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0)); } @@ -86,7 +95,7 @@ class DS_Real <DS_Pseudo ds> : class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), - (ins rc:$data0, offset:$offset, gds:$gds), + (ins getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds), " $data0$offset$gds"> { let has_addr = 0; @@ -97,11 +106,12 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), + (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds), " $addr, $data0$offset$gds"> { let has_data1 = 0; let has_vdst = 0; + let IsAtomicNoRet = 1; } multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { @@ -114,13 +124,22 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { } } -class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32> +multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> { + let has_m0_read = 0 in { + def "" : DS_1A1D_NORET<opName, rc>, + AtomicNoRet<opName, 0>; + } +} + +class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32, + RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds), + (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, offset:$offset, gds:$gds), " $addr, $data0, $data1$offset$gds"> { let has_vdst = 0; + let IsAtomicNoRet = 1; } multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc 
= VGPR_32> { @@ -133,10 +152,11 @@ multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { } } -class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32> +class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32, + RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, rc:$data0, rc:$data1, + (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, offset0:$offset0, offset1:$offset1, gds:$gds), " $addr, $data0, $data1$offset0$offset1$gds"> { @@ -153,14 +173,16 @@ multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> { } } -class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32> +class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32, + RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> : DS_Pseudo<opName, - (outs rc:$vdst), - (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), + (outs data_op:$vdst), + (ins VGPR_32:$addr, data_op:$data0, offset:$offset, gds:$gds), " $vdst, $addr, $data0$offset$gds"> { let hasPostISelHook = 1; let has_data1 = 0; + let IsAtomicRet = 1; } multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32, @@ -175,15 +197,27 @@ multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32, } } +multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32, + string NoRetOp = ""> { + let has_m0_read = 0 in { + def "" : DS_1A1D_RET<opName, rc>, + AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp), + !if(!eq(NoRetOp, ""), 0, 1)>; + } +} + class DS_1A2D_RET<string opName, RegisterClass rc = VGPR_32, - RegisterClass src = rc> + RegisterClass src = rc, + RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret, + RegisterOperand src_op = getLdStRegisterOperand<src>.ret> : DS_Pseudo<opName, - (outs rc:$vdst), - (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds), + (outs dst_op:$vdst), + (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset:$offset, gds:$gds), " $vdst, $addr, $data0, $data1$offset$gds"> { let hasPostISelHook = 1; + let IsAtomicRet = 1; } multiclass DS_1A2D_RET_mc<string opName, @@ -201,10 +235,12 @@ multiclass DS_1A2D_RET_mc<string opName, class DS_1A2D_Off8_RET<string opName, RegisterClass rc = VGPR_32, - RegisterClass src = rc> + RegisterClass src = rc, + RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret, + RegisterOperand src_op = getLdStRegisterOperand<src>.ret> : DS_Pseudo<opName, - (outs rc:$vdst), - (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds), + (outs dst_op:$vdst), + (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset0:$offset0, offset1:$offset1, gds:$gds), " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> { let has_offset = 0; @@ -224,11 +260,12 @@ multiclass DS_1A2D_Off8_RET_mc<string opName, } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset> +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset, + RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> : DS_Pseudo<opName, - (outs rc:$vdst), + (outs data_op:$vdst), !if(HasTiedOutput, - (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in), + (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in), (ins VGPR_32:$addr, ofs:$offset, gds:$gds)), " $vdst, $addr$offset$gds"> { let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); @@ -250,7 +287,7 @@ class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> : class DS_1A_Off8_RET <string 
opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, - (outs rc:$vdst), + (outs getLdStRegisterOperand<rc>.ret:$vdst), (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), " $vdst, $addr$offset0$offset1$gds"> { @@ -269,7 +306,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> { } class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, - (outs VGPR_32:$vdst), + (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst), (ins VGPR_32:$addr, offset:$offset), " $vdst, $addr$offset gds"> { @@ -281,7 +318,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, } class DS_0A_RET <string opName> : DS_Pseudo<opName, - (outs VGPR_32:$vdst), + (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst), (ins offset:$offset, gds:$gds), " $vdst$offset$gds"> { @@ -336,7 +373,8 @@ class DS_GWS_0D <string opName> class DS_GWS_1D <string opName> : DS_GWS<opName, - (ins VGPR_32:$data0, offset:$offset), " $data0$offset gds"> { + (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, offset:$offset), + " $data0$offset gds"> { let has_gws_data0 = 1; let hasSideEffects = 1; @@ -360,10 +398,11 @@ class DS_VOID <string opName> : DS_Pseudo<opName, let has_gds = 0; } -class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> +class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag, + RegisterOperand data_op = getLdStRegisterOperand<VGPR_32>.ret> : DS_Pseudo<opName, - (outs VGPR_32:$vdst), - (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset), + (outs data_op:$vdst), + (ins VGPR_32:$addr, data_op:$data0, offset:$offset), " $vdst, $addr, $data0$offset", [(set i32:$vdst, (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > { @@ -420,6 +459,11 @@ def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">; } // End mayLoad = 0 +let SubtargetPredicate = isGFX90APlus in { + defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>; + defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">; +} // End SubtargetPredicate = isGFX90APlus + defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">; @@ -674,38 +718,6 @@ defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">; defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">; defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">; -let AddedComplexity = 100 in { - -foreach vt = VReg_64.RegTypes in { -defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">; -} - -let SubtargetPredicate = isGFX7Plus in { - -foreach vt = VReg_96.RegTypes in { -defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">; -} - -foreach vt = VReg_128.RegTypes in { -defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">; -} - -let SubtargetPredicate = HasUnalignedAccessMode in { - -foreach vt = VReg_96.RegTypes in { -defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">; -} - -foreach vt = VReg_128.RegTypes in { -defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">; -} - -} // End SubtargetPredicate = HasUnalignedAccessMode - -} // End SubtargetPredicate = isGFX7Plus - -} // End AddedComplexity = 100 - let OtherPredicates = [D16PreservesUnusedBits] in { def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>; def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>; @@ -829,31 +841,38 @@ foreach vt = VReg_128.RegTypes in { defm : DS128Bit8ByteAlignedPat_mc<vt>; } +// Prefer ds_read over ds_read2 and ds_write over ds_write2, all other things +// being equal, 
because it has a larger immediate offset range. let AddedComplexity = 100 in { foreach vt = VReg_64.RegTypes in { +defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">; defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">; } let SubtargetPredicate = isGFX7Plus in { foreach vt = VReg_96.RegTypes in { +defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">; defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">; } foreach vt = VReg_128.RegTypes in { +defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">; defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">; } let SubtargetPredicate = HasUnalignedAccessMode in { +// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice +// for unaligned accesses? foreach vt = VReg_96.RegTypes in { +defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">; defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">; } -foreach vt = VReg_128.RegTypes in { -defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">; -} +// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned +// accesses. } // End SubtargetPredicate = HasUnalignedAccessMode @@ -938,6 +957,10 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">; defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">; +let SubtargetPredicate = isGFX90APlus in { +def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; +} + def : Pat < (SIds_ordered_count i32:$value, i16:$offset), (DS_ORDERED_COUNT $value, (as_i16imm $offset)) @@ -959,10 +982,10 @@ class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> : let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue); let Inst{25-18} = op; let Inst{31-26} = 0x36; - let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0)); - let Inst{47-40} = !if(ps.has_data0, data0, 0); - let Inst{55-48} = !if(ps.has_data1, data1, 0); - let Inst{63-56} = !if(ps.has_vdst, vdst, 0); + let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0)); + let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0); + let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0); + let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); } //===----------------------------------------------------------------------===// @@ -1166,22 +1189,23 @@ defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>; // GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// -class DS_Real_vi <bits<8> op, DS_Pseudo ds> : - DS_Real <ds>, - SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> { +class DS_Real_vi <bits<8> op, DS_Pseudo ps> : + DS_Real <ps>, + SIMCInstr <ps.Mnemonic, SIEncodingFamily.VI> { let AssemblerPredicate = isGFX8GFX9; let DecoderNamespace = "GFX8"; // encoding - let Inst{7-0} = !if(ds.has_offset0, offset0, 0); - let Inst{15-8} = !if(ds.has_offset1, offset1, 0); - let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue); + let Inst{7-0} = !if(ps.has_offset0, offset0, 0); + let Inst{15-8} = !if(ps.has_offset1, offset1, 0); + let Inst{16} = !if(ps.has_gds, gds, ps.gdsValue); let Inst{24-17} = op; + let Inst{25} = acc; let Inst{31-26} = 0x36; // ds prefix - let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0)); - let Inst{47-40} = !if(ds.has_data0, data0, 0); - let Inst{55-48} = !if(ds.has_data1, data1, 0); - let Inst{63-56} = !if(ds.has_vdst, vdst, 0); + let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0)); + let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0); + let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0); + let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); } def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>; @@ -1344,3 +1368,8 @@ def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>; def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; + +let SubtargetPredicate = isGFX90APlus in { + def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; + def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; +} // End SubtargetPredicate = isGFX90APlus diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 8061c6c509e0..fe62b8590fa0 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -123,6 +123,7 @@ DECODE_OPERAND_REG(VReg_96) DECODE_OPERAND_REG(VReg_128) DECODE_OPERAND_REG(VReg_256) DECODE_OPERAND_REG(VReg_512) +DECODE_OPERAND_REG(VReg_1024) DECODE_OPERAND_REG(SReg_32) DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) @@ -135,7 +136,9 @@ DECODE_OPERAND_REG(SReg_256) DECODE_OPERAND_REG(SReg_512) DECODE_OPERAND_REG(AGPR_32) +DECODE_OPERAND_REG(AReg_64) DECODE_OPERAND_REG(AReg_128) +DECODE_OPERAND_REG(AReg_256) DECODE_OPERAND_REG(AReg_512) DECODE_OPERAND_REG(AReg_1024) DECODE_OPERAND_REG(AV_32) @@ -157,6 +160,14 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm)); +} + static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -173,6 +184,14 @@ static DecodeStatus decodeOperand_VS_32(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); } +static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512)); +} + static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -181,6 +200,14 @@ static DecodeStatus 
decodeOperand_AReg_128(MCInst &Inst, return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); } +static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512)); +} + static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -197,6 +224,127 @@ static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); } +static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm)); +} + +static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm)); +} + +static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm)); +} + +static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm)); +} + +static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm)); +} + +static bool IsAGPROperand(const MCInst &Inst, int OpIdx, + const MCRegisterInfo *MRI) { + if (OpIdx < 0) + return false; + + const MCOperand &Op = Inst.getOperand(OpIdx); + if (!Op.isReg()) + return false; + + unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + auto Reg = Sub ? Sub : Op.getReg(); + return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255; +} + +static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, + unsigned Imm, + AMDGPUDisassembler::OpWidthTy Opw, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + if (!DAsm->isGFX90A()) { + Imm &= 511; + } else { + // If atomic has both vdata and vdst their register classes are tied. + // The bit is decoded along with the vdst, first operand. We need to + // change register class to AGPR if vdst was AGPR. + // If a DS instruction has both data0 and data1 their register classes + // are also tied. + unsigned Opc = Inst.getOpcode(); + uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags; + uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? 
AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; + const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo(); + int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx); + if ((int)Inst.getNumOperands() == DataIdx) { + int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (IsAGPROperand(Inst, DstIdx, MRI)) + Imm |= 512; + } + + if (TSFlags & SIInstrFlags::DS) { + int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); + if ((int)Inst.getNumOperands() == Data2Idx && + IsAGPROperand(Inst, DataIdx, MRI)) + Imm |= 512; + } + } + return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); +} + +static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW32, Decoder); +} + +static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW64, Decoder); +} + +static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW96, Decoder); +} + +static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW128, Decoder); +} + static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -250,6 +398,9 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, return MCDisassembler::Fail; } +// The disassembler is greedy, so we need to check FI operand value to +// not parse a dpp if the correct literal is not set. 
For dpp16 the +// autogenerated decoder checks the dpp literal static bool isValidDPP8(const MCInst &MI) { using namespace llvm::AMDGPU::DPP; int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi); @@ -341,6 +492,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) { + Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address); + if (Res) + break; + } + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address); if (Res) break; @@ -351,6 +508,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; + + if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) { + Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address); + if (Res) + break; + } + Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address); if (Res) break; @@ -369,6 +533,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 || MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || @@ -379,9 +544,44 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MUBUF | SIInstrFlags::FLAT)) && - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::glc1) != -1) { - insertNamedMCOperand(MI, MCOperand::createImm(1), AMDGPU::OpName::glc1); + (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) { + int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::cpol); + if (CPolPos != -1) { + unsigned CPol = + (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ? + AMDGPU::CPol::GLC : 0; + if (MI.getNumOperands() <= (unsigned)CPolPos) { + insertNamedMCOperand(MI, MCOperand::createImm(CPol), + AMDGPU::OpName::cpol); + } else if (CPol) { + MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol); + } + } + } + + if (Res && (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && + (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) { + // GFX90A lost TFE, its place is occupied by ACC. 
+ int TFEOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); + if (TFEOpIdx != -1) { + auto TFEIter = MI.begin(); + std::advance(TFEIter, TFEOpIdx); + MI.insert(TFEIter, MCOperand::createImm(0)); + } + } + + if (Res && (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) { + int SWZOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); + if (SWZOpIdx != -1) { + auto SWZIter = MI.begin(); + std::advance(SWZIter, SWZOpIdx); + MI.insert(SWZIter, MCOperand::createImm(0)); + } } if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) { @@ -453,6 +653,8 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { return MCDisassembler::Success; } +// We must check FI == literal to reject not genuine dpp8 insts, and we must +// first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); @@ -513,21 +715,21 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { unsigned DimIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim); + int A16Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16); const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); const AMDGPU::MIMGDimInfo *Dim = AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm()); + const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm()); + + AddrSize = + AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI)); - AddrSize = BaseOpcode->NumExtraArgs + - (BaseOpcode->Gradients ? Dim->NumGradients : 0) + - (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + - (BaseOpcode->LodOrClampOrMip ? 1 : 0); IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA; if (!IsNSA) { if (AddrSize > 8) AddrSize = 16; - else if (AddrSize > 4) - AddrSize = 8; } else { if (AddrSize > Info->VAddrDwords) { // The NSA encoding does not contain enough operands for the combination @@ -545,7 +747,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { DstSize = (DstSize + 1) / 2; } - if (MI.getOperand(TFEIdx).getImm()) + if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm()) DstSize += 1; if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) @@ -701,6 +903,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const { return decodeSrcOp(OPWV216, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const { + return decodeSrcOp(OPWV232, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. 
Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra @@ -718,10 +924,18 @@ MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const { return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255); } +MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255); +} + MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const { return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255); } +MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255); +} + MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const { return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255); } @@ -758,6 +972,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const { return createRegOperand(AMDGPU::VReg_512RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_1024RegClassID, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { // table-gen generated disassembler doesn't care about operand types // leaving only registry class so SSrc_32 operand turns into SReg_32 @@ -914,8 +1132,10 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { case OPW128: // splat constants case OPW512: case OPW1024: + case OPWV232: return MCOperand::createImm(getInlineImmVal32(Imm)); case OPW64: + case OPW256: return MCOperand::createImm(getInlineImmVal64(Imm)); case OPW16: case OPWV216: @@ -935,8 +1155,14 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { case OPW16: case OPWV216: return VGPR_32RegClassID; - case OPW64: return VReg_64RegClassID; + case OPW64: + case OPWV232: return VReg_64RegClassID; + case OPW96: return VReg_96RegClassID; case OPW128: return VReg_128RegClassID; + case OPW160: return VReg_160RegClassID; + case OPW256: return VReg_256RegClassID; + case OPW512: return VReg_512RegClassID; + case OPW1024: return VReg_1024RegClassID; } } @@ -950,8 +1176,11 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const { case OPW16: case OPWV216: return AGPR_32RegClassID; - case OPW64: return AReg_64RegClassID; + case OPW64: + case OPWV232: return AReg_64RegClassID; + case OPW96: return AReg_96RegClassID; case OPW128: return AReg_128RegClassID; + case OPW160: return AReg_160RegClassID; case OPW256: return AReg_256RegClassID; case OPW512: return AReg_512RegClassID; case OPW1024: return AReg_1024RegClassID; @@ -969,8 +1198,11 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { case OPW16: case OPWV216: return SGPR_32RegClassID; - case OPW64: return SGPR_64RegClassID; + case OPW64: + case OPWV232: return SGPR_64RegClassID; + case OPW96: return SGPR_96RegClassID; case OPW128: return SGPR_128RegClassID; + case OPW160: return SGPR_160RegClassID; case OPW256: return SGPR_256RegClassID; case OPW512: return SGPR_512RegClassID; } @@ -986,7 +1218,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { case OPW16: case OPWV216: return TTMP_32RegClassID; - case OPW64: return TTMP_64RegClassID; + case OPW64: + case OPWV232: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; case OPW256: return TTMP_256RegClassID; case OPW512: return TTMP_512RegClassID; @@ -1040,6 +1273,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c case 
OPWV216: return decodeSpecialReg32(Val); case OPW64: + case OPWV232: return decodeSpecialReg64(Val); default: llvm_unreachable("unexpected immediate type"); @@ -1209,6 +1443,10 @@ bool AMDGPUDisassembler::isVI() const { bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); } +bool AMDGPUDisassembler::isGFX90A() const { + return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; +} + bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); } bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); } @@ -1217,6 +1455,10 @@ bool AMDGPUDisassembler::isGFX10Plus() const { return AMDGPU::isGFX10Plus(STI); } +bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { + return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; +} + //===----------------------------------------------------------------------===// // AMDGPU specific symbol handling //===----------------------------------------------------------------------===// @@ -1276,7 +1518,8 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( AMDGPU::IsaInfo::getSGPREncodingGranule(&STI); KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; - KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; + if (!hasArchitectedFlatScratch()) + KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n"; @@ -1327,9 +1570,12 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2( uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { using namespace amdhsa; StringRef Indent = "\t"; - PRINT_DIRECTIVE( - ".amdhsa_system_sgpr_private_segment_wavefront_offset", - COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); + if (hasArchitectedFlatScratch()) + PRINT_DIRECTIVE(".amdhsa_enable_private_segment", + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); + else + PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset", + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x", COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y", @@ -1387,7 +1633,6 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( uint16_t TwoByteBuffer = 0; uint32_t FourByteBuffer = 0; - uint64_t EightByteBuffer = 0; StringRef ReservedBytes; StringRef Indent = "\t"; @@ -1408,11 +1653,19 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( << FourByteBuffer << '\n'; return MCDisassembler::Success; + case amdhsa::KERNARG_SIZE_OFFSET: + FourByteBuffer = DE.getU32(Cursor); + KdStream << Indent << ".amdhsa_kernarg_size " + << FourByteBuffer << '\n'; + return MCDisassembler::Success; + case amdhsa::RESERVED0_OFFSET: - // 8 reserved bytes, must be 0. - EightByteBuffer = DE.getU64(Cursor); - if (EightByteBuffer) { - return MCDisassembler::Fail; + // 4 reserved bytes, must be 0. 
+ ReservedBytes = DE.getBytes(Cursor, 4); + for (int I = 0; I < 4; ++I) { + if (ReservedBytes[I] != 0) { + return MCDisassembler::Fail; + } } return MCDisassembler::Success; @@ -1463,8 +1716,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( using namespace amdhsa; TwoByteBuffer = DE.getU16(Cursor); - PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", - KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + if (!hasArchitectedFlatScratch()) + PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr", KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr", @@ -1473,8 +1727,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id", KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); - PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", - KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (!hasArchitectedFlatScratch()) + PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -1589,6 +1844,8 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, Inst.addOperand(MCOperand::createExpr(Add)); return true; } + // Add to list of referenced addresses, so caller can synthesize a label. + ReferencedAddresses.push_back(static_cast<uint64_t>(Value)); return false; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 714dabbc5184..dc879ec5ad88 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -99,12 +99,14 @@ public: MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; + MCOperand decodeOperand_VSrcV232(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; MCOperand decodeOperand_VReg_128(unsigned Val) const; MCOperand decodeOperand_VReg_256(unsigned Val) const; MCOperand decodeOperand_VReg_512(unsigned Val) const; + MCOperand decodeOperand_VReg_1024(unsigned Val) const; MCOperand decodeOperand_SReg_32(unsigned Val) const; MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; @@ -117,7 +119,9 @@ public: MCOperand decodeOperand_SReg_512(unsigned Val) const; MCOperand decodeOperand_AGPR_32(unsigned Val) const; + MCOperand decodeOperand_AReg_64(unsigned Val) const; MCOperand decodeOperand_AReg_128(unsigned Val) const; + MCOperand decodeOperand_AReg_256(unsigned Val) const; MCOperand decodeOperand_AReg_512(unsigned Val) const; MCOperand decodeOperand_AReg_1024(unsigned Val) const; MCOperand decodeOperand_AV_32(unsigned Val) const; @@ -126,12 +130,15 @@ public: enum OpWidthTy { OPW32, OPW64, + OPW96, OPW128, + OPW160, OPW256, OPW512, OPW1024, OPW16, OPWV216, + OPWV232, OPW_LAST_, OPW_FIRST_ = OPW32 }; @@ -159,11 +166,16 @@ public: int getTTmpIdx(unsigned Val) const; + const MCInstrInfo *getMCII() const { return MCII.get(); } + bool isVI() const; bool isGFX9() const; + bool isGFX90A() const; bool isGFX9Plus() const; bool isGFX10() const; bool isGFX10Plus() const; + + bool hasArchitectedFlatScratch() const; 
}; //===----------------------------------------------------------------------===// @@ -173,6 +185,7 @@ public: class AMDGPUSymbolizer : public MCSymbolizer { private: void *DisInfo; + std::vector<uint64_t> ReferencedAddresses; public: AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo, @@ -187,6 +200,10 @@ public: void tryAddingPcLoadReferenceComment(raw_ostream &cStream, int64_t Value, uint64_t Address) override; + + ArrayRef<uint64_t> getReferencedAddresses() const override { + return ReferencedAddresses; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 8d3e138ba56a..596c3d7baea0 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -532,7 +532,10 @@ def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", def : UMad24Pat<MULADD_UINT24_eg>; def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; -def : FSHRPattern <BIT_ALIGN_INT_eg>; +def : AMDGPUPat < + (fshr i32:$src0, i32:$src1, i32:$src2), + (BIT_ALIGN_INT_eg $src0, $src1, $src2) +>; def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 57a355a55a02..90f26e514f54 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>; -def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>; -def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>; +def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; +def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; +def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; @@ -54,6 +54,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, bits<1> glcValue = 0; bits<1> has_dlc = 1; bits<1> dlcValue = 0; + bits<1> has_sccb = 1; + bits<1> sccbValue = 0; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -67,9 +69,9 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, let VM_CNT = 1; let LGKM_CNT = !not(!or(is_flat_global, is_flat_scratch)); - let IsFlatGlobal = is_flat_global; + let FlatGlobal = is_flat_global; - let IsFlatScratch = is_flat_scratch; + let FlatScratch = is_flat_scratch; } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -79,22 +81,29 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : let isPseudo = 0; let isCodeGenOnly = 0; + let FLAT = 1; + // copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AsmMatchConverter = ps.AsmMatchConverter; - let OtherPredicates = ps.OtherPredicates; - let TSFlags = ps.TSFlags; + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let OtherPredicates = ps.OtherPredicates; + let TSFlags = ps.TSFlags; let UseNamedOperandTable = 
ps.UseNamedOperandTable; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; + let VM_CNT = ps.VM_CNT; + let LGKM_CNT = ps.LGKM_CNT; // encoding fields bits<8> vaddr; - bits<8> vdata; + bits<10> vdata; bits<7> saddr; - bits<8> vdst; + bits<10> vdst; - bits<1> slc; - bits<1> glc; - bits<1> dlc; + bits<5> cpol; // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? @@ -106,7 +115,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat accesses. bits<13> offset; - bits<1> nv = 0; // XXX - What does this actually do? + // GFX90A+ only: instruction uses AccVGPR for data + bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0)); // We don't use tfe right now, and it was removed in gfx9. bits<1> tfe = 0; @@ -116,17 +126,17 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : let Inst{13} = lds; let Inst{15-14} = seg; - let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); - let Inst{17} = slc; + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue); + let Inst{17} = cpol{CPolBit.SLC}; let Inst{24-18} = op; let Inst{31-26} = 0x37; // Encoding. let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_data, vdata, ?); + let Inst{47-40} = !if(ps.has_data, vdata{7-0}, ?); let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0); // 54-48 is reserved. - let Inst{55} = nv; // nv on GFX9+, TFE before. - let Inst{63-56} = !if(ps.has_vdst, vdst, ?); + let Inst{55} = acc; // nv on GFX9+, TFE before. AccVGPR for data on GFX90A. + let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?); } class GlobalSaddrTable <bit is_saddr, string Name = ""> { @@ -139,9 +149,10 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { // saddr is 32-bit (which isn't handled here yet). class FLAT_Load_Pseudo <string opName, RegisterClass regClass, bit HasTiedOutput = 0, - bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< + bit HasSaddr = 0, bit EnableSaddr = 0, + RegisterOperand vdata_op = getLdStRegisterOperand<regClass>.ret> : FLAT_Pseudo< opName, - (outs regClass:$vdst), + (outs vdata_op:$vdst), !con( !con( !if(EnableSaddr, @@ -149,9 +160,9 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, (ins VReg_64:$vaddr)), (ins flat_offset:$offset)), // FIXME: Operands with default values do not work with following non-optional operands. 
- !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in), - (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))), - " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { + !if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in), + (ins CPol_0:$cpol))), + " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> { let has_data = 0; let mayLoad = 1; let has_saddr = HasSaddr; @@ -169,10 +180,10 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, (outs), !con( !if(EnableSaddr, - (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64:$saddr), - (ins VReg_64:$vaddr, vdataClass:$vdata)), - (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc)), - " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { + (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64:$saddr), + (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -196,9 +207,9 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass, opName, (outs regClass:$vdst), !con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)), - (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc), + (ins flat_offset:$offset, CPol_0:$cpol), !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), - " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let is_flat_global = 1; let has_data = 0; let mayLoad = 1; @@ -234,8 +245,8 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass, opName, (outs), !con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)), - (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), - " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + (ins flat_offset:$offset, CPol:$cpol)), + " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let is_flat_global = 1; let mayLoad = 0; let mayStore = 1; @@ -266,16 +277,16 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo< opName, - (outs regClass:$vdst), + (outs getLdStRegisterOperand<regClass>.ret:$vdst), !con( !if(EnableSaddr, (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), !if(EnableVaddr, (ins VGPR_32:$vaddr, flat_offset:$offset), (ins flat_offset:$offset))), - !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in), - (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))), - " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in), + (ins CPol_0:$cpol))), + " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let has_data = 0; let mayLoad = 1; let has_saddr = 1; @@ -289,15 +300,16 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, } class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0, - bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo< + bit EnableVaddr = !not(EnableSaddr), + RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo< opName, (outs), !if(EnableSaddr, - (ins 
vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc), + (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), !if(EnableVaddr, - (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc), - (ins vdataClass:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))), - " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol))), + " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -344,7 +356,10 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins, let has_dlc = 0; let dlcValue = 0; let has_vdst = 0; + let has_sccb = 1; + let sccbValue = 0; let maybeAtomic = 1; + let IsAtomicNoRet = 1; } class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins, @@ -354,6 +369,9 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins, let has_vdst = 1; let glcValue = 1; let dlcValue = 0; + let sccbValue = 0; + let IsAtomicNoRet = 0; + let IsAtomicRet = 1; let PseudoInstr = NAME # "_RTN"; } @@ -364,11 +382,12 @@ multiclass FLAT_Atomic_Pseudo< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret> { + bit isFP = isFloatType<data_vt>.ret, + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc), - " $vaddr, $vdata$offset$slc">, + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 0> { let PseudoInstr = NAME; @@ -377,11 +396,11 @@ multiclass FLAT_Atomic_Pseudo< } def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc), - " $vdst, $vaddr, $vdata$offset$glc1$slc", + (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata$offset$cpol", [(set vt:$vdst, - (atomic (FLATOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>, + (atomic (FlatOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>, GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1>{ let FPAtomic = isFP; @@ -396,12 +415,13 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret> { + bit isFP = isFloatType<data_vt>.ret, + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc), - " $vaddr, $vdata, off$offset$slc">, + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata, off$offset$cpol">, GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 0> { let has_saddr = 1; @@ -411,8 +431,8 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc), - " $vaddr, $vdata, $saddr$offset$slc">, + (ins VGPR_32:$vaddr, 
data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName>, AtomicNoRet <opName#"_saddr", 0> { let has_saddr = 1; @@ -429,14 +449,16 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret> { + bit isFP = isFloatType<data_vt>.ret, + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, + RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc), - " $vdst, $vaddr, $vdata, off$offset$glc1$slc", + (outs vdst_op:$vdst), + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, off$offset$cpol", [(set vt:$vdst, - (atomic (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$vdata))]>, + (atomic (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>, GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1> { let has_saddr = 1; @@ -444,9 +466,9 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< } def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_rc:$vdst), - (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc), - " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc">, + (outs vdst_op:$vdst), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet <opName#"_saddr", 1> { let has_saddr = 1; @@ -605,6 +627,15 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isGFX7GFX10 +let SubtargetPredicate = isGFX90APlus in { + defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; +} // End SubtargetPredicate = isGFX90APlus + defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; @@ -777,6 +808,15 @@ let OtherPredicates = [HasAtomicFaddInsts] in { "global_atomic_pk_add_f16", VGPR_32, v2f16 >; } // End OtherPredicates = [HasAtomicFaddInsts] + +let OtherPredicates = [isGFX90APlus] in { + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < + "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd + >; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd + >; +} // End OtherPredicates = [isGFX90APlus] } // End is_flat_global = 1 //===----------------------------------------------------------------------===// @@ -785,33 
+825,33 @@ let OtherPredicates = [HasAtomicFaddInsts] in { // Patterns for global loads with no offset. class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (FLATOffset i64:$vaddr, i16:$offset))), + (vt (node (FlatOffset i64:$vaddr, i16:$offset))), (inst $vaddr, $offset) >; class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in), - (inst $vaddr, $offset, 0, 0, 0, $in) + (node (FlatOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in), + (inst $vaddr, $offset, 0, $in) >; class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset), vt:$in), - (inst $vaddr, $offset, 0, 0, 0, $in) + (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in), + (inst $vaddr, $offset, 0, $in) >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, 0, 0, 0, $in) + (inst $saddr, $voffset, $offset, 0, $in) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset))), + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset))), (inst $vaddr, $offset) >; class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))), - (inst $saddr, $voffset, $offset, 0, 0, 0) + (inst $saddr, $voffset, $offset, 0) >; class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, @@ -839,19 +879,19 @@ class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, >; class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset)), + (node vt:$data, (FlatOffset i64:$vaddr, i16:$offset)), (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset)), + (node vt:$data, (GlobalOffset i64:$vaddr, i16:$offset)), (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. - (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data), + (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data), (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; @@ -859,29 +899,29 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. 
- (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data), + (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data), (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (vt (node (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (inst $vaddr, $data, $offset) >; class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data), + (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data), (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (FLATOffsetSigned i64:$vaddr, i16:$offset), vt:$data), + (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data), (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$data)), + (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (inst $vaddr, $data, $offset) >; @@ -892,7 +932,7 @@ class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in), - (inst $vaddr, $offset, 0, 0, 0, $in) + (inst $vaddr, $offset, 0, $in) >; class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -907,7 +947,7 @@ class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)), - (inst $saddr, $offset, 0, 0, 0, $in) + (inst $saddr, $offset, 0, $in) >; class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, @@ -1202,6 +1242,17 @@ defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_glo defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>; } +let OtherPredicates = [isGFX90APlus] in { +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>; +def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_64, f64>; +def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_64, f64>; +def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_64, f64>; +} + } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { @@ -1337,16 +1388,21 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2 // VI //===----------------------------------------------------------------------===// -class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> : +class FLAT_Real_vi <bits<7> op, 
FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> : FLAT_Real <op, ps>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { let AssemblerPredicate = isGFX8GFX9; let DecoderNamespace = "GFX8"; + + let Inst{25} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccbValue); + let AsmString = ps.Mnemonic # + !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands); } -multiclass FLAT_Real_AllAddr_vi<bits<7> op> { - def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)>; - def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; +multiclass FLAT_Real_AllAddr_vi<bits<7> op, + bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>; + def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>; } def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; @@ -1374,15 +1430,17 @@ def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>; def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>; def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>; -multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> { - def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; - def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; +multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps, + bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>; + def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>; } -multiclass FLAT_Global_Real_Atomics_vi<bits<7> op> : - FLAT_Real_AllAddr_vi<op> { - def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>; - def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>; +multiclass FLAT_Global_Real_Atomics_vi<bits<7> op, + bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> : + FLAT_Real_AllAddr_vi<op, has_sccb> { + def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>; + def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>; } @@ -1489,6 +1547,19 @@ defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; +let SubtargetPredicate = HasAtomicFaddInsts in { +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; +} + +let SubtargetPredicate = isGFX90AOnly in { + defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64, 0>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64, 0>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64, 0>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f, 0>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50, 0>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>; +} // End SubtargetPredicate = isGFX90AOnly //===----------------------------------------------------------------------===// // GFX10. 
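[Editor's note] Throughout these FLAT hunks (and the earlier buffer/SMEM ones) the separate glc/slc/dlc modifiers, plus the new GFX90A scc bit, are collapsed into a single cache-policy operand, and the real encodings pick out individual bits with cpol{CPolBit.GLC}, cpol{CPolBit.SLC}, cpol{CPolBit.DLC}, and cpol{CPolBit.SCC}. A minimal C++ sketch of that packing, using illustrative bit positions and a hypothetical printCPol helper rather than the target's actual AMDGPU::CPol definitions:

#include <cstdint>
#include <string>

// Illustrative bit layout for a packed cache-policy operand. The real values
// live in the target's own headers; these positions are assumptions made only
// for this sketch.
enum : uint64_t {
  CPOL_GLC = 1ull << 0, // globally coherent
  CPOL_SLC = 1ull << 1, // system-level coherent
  CPOL_DLC = 1ull << 2, // device coherent (GFX10)
  CPOL_SCC = 1ull << 4  // system cache coherence (GFX90A)
};

// Hypothetical printer: emit the assembler modifiers carried by one packed
// operand, the way the old code printed "$glc$slc$dlc" from three operands.
static std::string printCPol(uint64_t CPol) {
  std::string S;
  if (CPol & CPOL_GLC) S += " glc";
  if (CPol & CPOL_SLC) S += " slc";
  if (CPol & CPOL_DLC) S += " dlc";
  if (CPol & CPOL_SCC) S += " scc";
  return S;
}

Packing the flags this way also lets a return-type atomic force GLC with a single OR on the operand value, as the disassembler hunk earlier in this patch does, instead of patching three separate operands.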
@@ -1500,7 +1571,7 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : let DecoderNamespace = "GFX10"; let Inst{11-0} = offset{11-0}; - let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue); + let Inst{12} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue); let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d); let Inst{55} = 0; } @@ -1695,10 +1766,3 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>; - -let SubtargetPredicate = HasAtomicFaddInsts in { - -defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>; - -} // End SubtargetPredicate = HasAtomicFaddInsts diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index e4eacd101ce8..2bf365168048 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -54,21 +54,20 @@ namespace { class GCNDPPCombine : public MachineFunctionPass { MachineRegisterInfo *MRI; const SIInstrInfo *TII; + const GCNSubtarget *ST; using RegSubRegPair = TargetInstrInfo::RegSubRegPair; MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; - MachineInstr *createDPPInst(MachineInstr &OrigMI, - MachineInstr &MovMI, + MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, - MachineOperand *OldOpnd, - bool CombBCZ) const; + MachineOperand *OldOpnd, bool CombBCZ, + bool IsShrinkable) const; - MachineInstr *createDPPInst(MachineInstr &OrigMI, - MachineInstr &MovMI, - RegSubRegPair CombOldVGPR, - bool CombBCZ) const; + MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, + RegSubRegPair CombOldVGPR, bool CombBCZ, + bool IsShrinkable) const; bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, @@ -99,7 +98,8 @@ public: } private: - int getDPPOp(unsigned Op) const; + int getDPPOp(unsigned Op, bool IsShrinkable) const; + bool isShrinkable(MachineInstr &MI) const; }; } // end anonymous namespace @@ -114,11 +114,40 @@ FunctionPass *llvm::createGCNDPPCombinePass() { return new GCNDPPCombine(); } -int GCNDPPCombine::getDPPOp(unsigned Op) const { +bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const { + unsigned Op = MI.getOpcode(); + if (!TII->isVOP3(Op)) { + return false; + } + if (!TII->hasVALU32BitEncoding(Op)) { + LLVM_DEBUG(dbgs() << " Inst hasn't e32 equivalent\n"); + return false; + } + if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + // Give up if there are any uses of the carry-out from instructions like + // V_ADD_CO_U32. The shrunken form of the instruction would write it to vcc + // instead of to a virtual register. 
+ if (!MRI->use_nodbg_empty(SDst->getReg())) + return false; + } + // check if other than abs|neg modifiers are set (opsel for example) + const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); + if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) || + !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) || + !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) || + !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) { + LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n"); + return false; + } + return true; +} + +int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { auto DPP32 = AMDGPU::getDPPOp32(Op); - if (DPP32 == -1) { + if (IsShrinkable) { + assert(DPP32 == -1); auto E32 = AMDGPU::getVOPe32(Op); - DPP32 = (E32 == -1)? -1 : AMDGPU::getDPPOp32(E32); + DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32); } return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32; } @@ -137,7 +166,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { case AMDGPU::IMPLICIT_DEF: return nullptr; case AMDGPU::COPY: - case AMDGPU::V_MOV_B32_e32: { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B64_PSEUDO: { auto &Op1 = Def->getOperand(1); if (Op1.isImm()) return &Op1; @@ -150,11 +180,13 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, - bool CombBCZ) const { - assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + bool CombBCZ, + bool IsShrinkable) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); auto OrigOp = OrigMI.getOpcode(); - auto DPPOp = getDPPOp(OrigOp); + auto DPPOp = getDPPOp(OrigOp, IsShrinkable); if (DPPOp == -1) { LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); return nullptr; @@ -174,7 +206,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { assert(OldIdx == NumOperands); - assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + assert(isOfRegClass( + CombOldVGPR, + *MRI->getRegClass( + TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()), + *MRI)); auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI); DPPInst.addReg(CombOldVGPR.Reg, Def ? 
0 : RegState::Undef, CombOldVGPR.SubReg); @@ -308,11 +344,9 @@ static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) { return false; } -MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, - MachineInstr &MovMI, - RegSubRegPair CombOldVGPR, - MachineOperand *OldOpndValue, - bool CombBCZ) const { +MachineInstr *GCNDPPCombine::createDPPInst( + MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, + MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const { assert(CombOldVGPR.Reg); if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) { auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); @@ -325,12 +359,14 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, return nullptr; } CombOldVGPR = getRegSubRegPair(*Src1); - if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) { - LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n"); + auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); + const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg()); + if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) { + LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n"); return nullptr; } } - return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ); + return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable); } // returns true if MI doesn't have OpndName immediate operand or the @@ -346,7 +382,8 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, } bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { - assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); @@ -362,6 +399,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } + if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); + assert(DppCtrl && DppCtrl->isImm()); + if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) { + LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" + " control value\n"); + // Let it split, then control may become legal. 
+ return false; + } + } + auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); assert(RowMaskOpnd && RowMaskOpnd->isImm()); auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); @@ -430,8 +478,9 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { auto CombOldVGPR = getRegSubRegPair(*OldOpnd); // try to reuse previous old reg if its undefined (IMPLICIT_DEF) if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef + const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg); CombOldVGPR = RegSubRegPair( - MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); + MRI->createVirtualRegister(RC)); auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg); DPPMIs.push_back(UndefInst.getInstr()); @@ -482,21 +531,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { continue; } - if (TII->isVOP3(OrigOp)) { - if (!TII->hasVALU32BitEncoding(OrigOp)) { - LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); - break; - } - // check if other than abs|neg modifiers are set (opsel for example) - const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); - if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) || - !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) || - !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) || - !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) { - LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n"); - break; - } - } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) { + bool IsShrinkable = isShrinkable(OrigMI); + if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); break; } @@ -521,7 +557,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (Use == Src0) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, - OldOpndValue, CombBCZ)) { + OldOpndValue, CombBCZ, IsShrinkable)) { DPPMIs.push_back(DPPInst); Rollback = false; } @@ -532,8 +568,9 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { BB->insert(OrigMI, NewMI); if (TII->commuteInstruction(*NewMI)) { LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); - if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR, - OldOpndValue, CombBCZ)) { + if (auto *DPPInst = + createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ, + IsShrinkable)) { DPPMIs.push_back(DPPInst); Rollback = false; } @@ -566,12 +603,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { - auto &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasDPP() || skipFunction(MF.getFunction())) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasDPP() || skipFunction(MF.getFunction())) return false; MRI = &MF.getRegInfo(); - TII = ST.getInstrInfo(); + TII = ST->getInstrInfo(); bool Changed = false; for (auto &MBB : MF) { @@ -581,12 +618,17 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { Changed = true; ++NumDPPMovsCombined; } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { - auto Split = TII->expandMovDPP64(MI); - for (auto M : { Split.first, Split.second }) { - if (combineDPPMov(*M)) - ++NumDPPMovsCombined; + if (ST->has64BitDPP() && combineDPPMov(MI)) { + Changed = true; + ++NumDPPMovsCombined; + } else { + auto Split = TII->expandMovDPP64(MI); + for (auto M : { Split.first, Split.second }) { + if (M 
&& combineDPPMov(*M)) + ++NumDPPMovsCombined; + } + Changed = true; } - Changed = true; } } } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index ed1dc77bd545..bc2fb1e9770c 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -23,6 +23,9 @@ using namespace llvm; // Hazard Recoginizer Implementation //===----------------------------------------------------------------------===// +static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, + const GCNSubtarget &ST); + GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), @@ -32,8 +35,9 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { - MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; + MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5; TSchedModel.init(&ST); + RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); } void GCNHazardRecognizer::Reset() { @@ -87,6 +91,25 @@ static bool isSMovRel(unsigned Opcode) { } } +static bool isDGEMM(unsigned Opcode) { + return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || + Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64; +} + +static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + + if (!SIInstrInfo::isMAI(MI) || + isDGEMM(Opcode) || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) + return false; + + return true; +} + static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI) { if (TII.isAlwaysGDS(MI.getOpcode())) @@ -138,12 +161,6 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return HazardType; - // FIXME: Should flat be considered vmem? - if ((SIInstrInfo::isVMEM(*MI) || - SIInstrInfo::isFLAT(*MI)) - && checkVMEMHazards(MI) > 0) - return HazardType; - if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) return HazardType; @@ -153,6 +170,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (ST.hasNoDataDepHazard()) return NoHazard; + // FIXME: Should flat be considered vmem? 
+ if ((SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI)) + && checkVMEMHazards(MI) > 0) + return HazardType; + if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) return HazardType; @@ -165,6 +188,11 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) return HazardType; + if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || + SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) + return HazardType; + if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) return HazardType; @@ -251,9 +279,6 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (SIInstrInfo::isSMRD(*MI)) return std::max(WaitStates, checkSMRDHazards(MI)); - if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) - WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); - if (ST.hasNSAtoVMEMBug()) WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); @@ -262,6 +287,9 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (ST.hasNoDataDepHazard()) return WaitStates; + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) + WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + if (SIInstrInfo::isVALU(*MI)) WaitStates = std::max(WaitStates, checkVALUHazards(MI)); @@ -274,6 +302,11 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (isRWLane(MI->getOpcode())) WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || + SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) + WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); + if (MI->isInlineAsm()) return std::max(WaitStates, checkInlineAsmHazards(MI)); @@ -319,8 +352,7 @@ void GCNHazardRecognizer::AdvanceCycle() { // Do not track non-instructions which do not affect the wait states. // If included, these instructions can lead to buffer overflow such that // detectable hazards are missed. - if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() || - CurrCycleInstr->isKill()) { + if (CurrCycleInstr->isMetaInstruction()) { CurrCycleInstr = nullptr; return; } @@ -359,23 +391,22 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn; +typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn; // Returns a minimum wait states since \p I walking all predecessors. // Only scans until \p IsExpired does not return true. // Can only be run in a hazard recognizer mode. static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, - MachineBasicBlock *MBB, - MachineBasicBlock::reverse_instr_iterator I, - int WaitStates, - IsExpiredFn IsExpired, + const MachineBasicBlock *MBB, + MachineBasicBlock::const_reverse_instr_iterator I, + int WaitStates, IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited) { for (auto E = MBB->instr_rend(); I != E; ++I) { // Don't add WaitStates for parent BUNDLE instructions. 
if (I->isBundle()) continue; - if (IsHazard(&*I)) + if (IsHazard(*I)) return WaitStates; if (I->isInlineAsm() || I->isMetaInstruction()) @@ -383,12 +414,11 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, WaitStates += SIInstrInfo::getNumWaitStates(*I); - if (IsExpired(&*I, WaitStates)) + if (IsExpired(*I, WaitStates)) return std::numeric_limits<int>::max(); } - int MinWaitStates = WaitStates; - bool Found = false; + int MinWaitStates = std::numeric_limits<int>::max(); for (MachineBasicBlock *Pred : MBB->predecessors()) { if (!Visited.insert(Pred).second) continue; @@ -396,25 +426,14 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates, IsExpired, Visited); - if (W == std::numeric_limits<int>::max()) - continue; - - MinWaitStates = Found ? std::min(MinWaitStates, W) : W; - if (IsExpired(nullptr, MinWaitStates)) - return MinWaitStates; - - Found = true; + MinWaitStates = std::min(MinWaitStates, W); } - if (Found) - return MinWaitStates; - - return std::numeric_limits<int>::max(); + return MinWaitStates; } static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, - MachineInstr *MI, - IsExpiredFn IsExpired) { + const MachineInstr *MI, IsExpiredFn IsExpired) { DenseSet<const MachineBasicBlock *> Visited; return getWaitStatesSince(IsHazard, MI->getParent(), std::next(MI->getReverseIterator()), @@ -423,7 +442,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { if (IsHazardRecognizerMode) { - auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) { + auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { return WaitStates >= Limit; }; return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); @@ -432,7 +451,7 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { int WaitStates = 0; for (MachineInstr *MI : EmittedInstrs) { if (MI) { - if (IsHazard(MI)) + if (IsHazard(*MI)) return WaitStates; if (MI->isInlineAsm()) @@ -451,8 +470,8 @@ int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, int Limit) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { - return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI); + auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { + return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); }; return getWaitStatesSince(IsHazardFn, Limit); @@ -460,8 +479,8 @@ int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit) { - auto IsHazardFn = [IsHazard] (MachineInstr *MI) { - return isSSetReg(MI->getOpcode()) && IsHazard(MI); + auto IsHazardFn = [IsHazard](const MachineInstr &MI) { + return isSSetReg(MI.getOpcode()) && IsHazard(MI); }; return getWaitStatesSince(IsHazardFn, Limit); @@ -560,8 +579,12 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { // A read of an SGPR by SMRD instruction requires 4 wait states when the // SGPR was written by a VALU instruction. 
int SmrdSgprWaitStates = 4; - auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; - auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); }; + auto IsHazardDefFn = [this](const MachineInstr &MI) { + return TII.isVALU(MI); + }; + auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { + return TII.isSALU(MI); + }; bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); @@ -601,9 +624,11 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { // A read of an SGPR by a VMEM instruction requires 5 wait states when the // SGPR was written by a VALU Instruction. const int VmemSgprWaitStates = 5; - auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; + auto IsHazardDefFn = [this](const MachineInstr &MI) { + return TII.isVALU(MI); + }; for (const MachineOperand &Use : VMEM->uses()) { - if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = @@ -622,15 +647,18 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { int DppVgprWaitStates = 2; int DppExecWaitStates = 5; int WaitStatesNeeded = 0; - auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + auto IsHazardDefFn = [TII](const MachineInstr &MI) { + return TII->isVALU(MI); + }; for (const MachineOperand &Use : DPP->uses()) { if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = - DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), - [](MachineInstr *) { return true; }, - DppVgprWaitStates); + DppVgprWaitStates - getWaitStatesSinceDef( + Use.getReg(), + [](const MachineInstr &) { return true; }, + DppVgprWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } @@ -648,7 +676,9 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { // v_div_fmas requires 4 wait states after a write to vcc from a VALU // instruction. 
const int DivFMasWaitStates = 4; - auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + auto IsHazardDefFn = [TII](const MachineInstr &MI) { + return TII->isVALU(MI); + }; int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn, DivFMasWaitStates); @@ -660,8 +690,8 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr); const int GetRegWaitStates = 2; - auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) { - return GetRegHWReg == getHWReg(TII, *MI); + auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) { + return GetRegHWReg == getHWReg(TII, MI); }; int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates); @@ -673,8 +703,8 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { unsigned HWReg = getHWReg(TII, *SetRegInstr); const int SetRegWaitStates = ST.getSetRegWaitStates(); - auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) { - return HWReg == getHWReg(TII, *MI); + auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) { + return HWReg == getHWReg(TII, MI); }; int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates); return SetRegWaitStates - WaitStatesNeeded; @@ -739,13 +769,13 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, const int VALUWaitStates = 1; int WaitStatesNeeded = 0; - if (!TRI->isVGPR(MRI, Def.getReg())) + if (!TRI->isVectorRegister(MRI, Def.getReg())) return WaitStatesNeeded; Register Reg = Def.getReg(); - auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { - int DataIdx = createsVALUHazard(*MI); + auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) { + int DataIdx = createsVALUHazard(MI); return DataIdx >= 0 && - TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); + TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg); }; int WaitStatesNeededForDef = VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates); @@ -808,9 +838,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { return 0; Register LaneSelectReg = LaneSelectOp->getReg(); - auto IsHazardFn = [TII] (MachineInstr *MI) { - return TII->isVALU(*MI); - }; + auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; const int RWLaneWaitStates = 4; int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn, @@ -826,8 +854,8 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { const int RFEWaitStates = 1; - auto IsHazardFn = [TII] (MachineInstr *MI) { - return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS; + auto IsHazardFn = [TII](const MachineInstr &MI) { + return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS; }; int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates); return RFEWaitStates - WaitStatesNeeded; @@ -836,9 +864,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); const int SMovRelWaitStates = 1; - auto IsHazardFn = [TII] (MachineInstr *MI) { - return TII->isSALU(*MI); - }; + auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, SMovRelWaitStates); } @@ -856,18 +882,12 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { return false; const SIInstrInfo *TII = ST.getInstrInfo(); - auto IsHazardFn = [TII] (MachineInstr *MI) { - return TII->isVOPC(*MI); - }; 
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); }; - auto IsExpiredFn = [] (MachineInstr *MI, int) { - if (!MI) - return false; - unsigned Opc = MI->getOpcode(); - return SIInstrInfo::isVALU(*MI) && - Opc != AMDGPU::V_NOP_e32 && - Opc != AMDGPU::V_NOP_e64 && - Opc != AMDGPU::V_NOP_sdwa; + auto IsExpiredFn = [](const MachineInstr &MI, int) { + unsigned Opc = MI.getOpcode(); + return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && + Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; }; if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == @@ -900,13 +920,14 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [TRI, MI] (MachineInstr *I) { - if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) && - !SIInstrInfo::isFLAT(*I)) + auto IsHazardFn = [TRI, MI](const MachineInstr &I) { + if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) && + !SIInstrInfo::isFLAT(I)) return false; for (const MachineOperand &Def : MI->defs()) { - MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI); + const MachineOperand *Op = + I.findRegisterUseOperand(Def.getReg(), false, TRI); if (!Op) continue; return true; @@ -914,12 +935,12 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { return false; }; - auto IsExpiredFn = [](MachineInstr *MI, int) { - return MI && (SIInstrInfo::isVALU(*MI) || - (MI->getOpcode() == AMDGPU::S_WAITCNT && - !MI->getOperand(0).getImm()) || - (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - MI->getOperand(0).getImm() == 0xffe3)); + auto IsExpiredFn = [](const MachineInstr &MI, int) { + return SIInstrInfo::isVALU(MI) || + (MI.getOpcode() == AMDGPU::S_WAITCNT && + !MI.getOperand(0).getImm()) || + (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + MI.getOperand(0).getImm() == 0xffe3); }; if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == @@ -968,43 +989,41 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { return false; const Register SDSTReg = SDST->getReg(); - auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) { - return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI); + auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) { + return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI); }; - auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) { - if (MI) { - if (TII->isSALU(*MI)) { - switch (MI->getOpcode()) { - case AMDGPU::S_SETVSKIP: - case AMDGPU::S_VERSION: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - // These instructions cannot not mitigate the hazard. + auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) { + if (TII->isSALU(MI)) { + switch (MI.getOpcode()) { + case AMDGPU::S_SETVSKIP: + case AMDGPU::S_VERSION: + case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_VMCNT: + case AMDGPU::S_WAITCNT_EXPCNT: + // These instructions cannot not mitigate the hazard. + return false; + case AMDGPU::S_WAITCNT_LGKMCNT: + // Reducing lgkmcnt count to 0 always mitigates the hazard. + return (MI.getOperand(1).getImm() == 0) && + (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL); + case AMDGPU::S_WAITCNT: { + const int64_t Imm = MI.getOperand(0).getImm(); + AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); + return (Decoded.LgkmCnt == 0); + } + default: + // SOPP instructions cannot mitigate the hazard. 
+ if (TII->isSOPP(MI)) return false; - case AMDGPU::S_WAITCNT_LGKMCNT: - // Reducing lgkmcnt count to 0 always mitigates the hazard. - return (MI->getOperand(1).getImm() == 0) && - (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - case AMDGPU::S_WAITCNT: { - const int64_t Imm = MI->getOperand(0).getImm(); - AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); - return (Decoded.LgkmCnt == 0); - } - default: - // SOPP instructions cannot mitigate the hazard. - if (TII->isSOPP(*MI)) - return false; - // At this point the SALU can be assumed to mitigate the hazard - // because either: - // (a) it is independent of the at risk SMEM (breaking chain), - // or - // (b) it is dependent on the SMEM, in which case an appropriate - // s_waitcnt lgkmcnt _must_ exist between it and the at risk - // SMEM instruction. - return true; - } + // At this point the SALU can be assumed to mitigate the hazard + // because either: + // (a) it is independent of the at risk SMEM (breaking chain), + // or + // (b) it is dependent on the SMEM, in which case an appropriate + // s_waitcnt lgkmcnt _must_ exist between it and the at risk + // SMEM instruction. + return true; } } return false; @@ -1028,25 +1047,23 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { if (!MI->modifiesRegister(AMDGPU::EXEC, TRI)) return false; - auto IsHazardFn = [TRI] (MachineInstr *I) { - if (SIInstrInfo::isVALU(*I)) + auto IsHazardFn = [TRI](const MachineInstr &I) { + if (SIInstrInfo::isVALU(I)) return false; - return I->readsRegister(AMDGPU::EXEC, TRI); + return I.readsRegister(AMDGPU::EXEC, TRI); }; const SIInstrInfo *TII = ST.getInstrInfo(); - auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) { - if (!MI) - return false; - if (SIInstrInfo::isVALU(*MI)) { - if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst)) + auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { + if (SIInstrInfo::isVALU(MI)) { + if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) return true; - for (auto MO : MI->implicit_operands()) + for (auto MO : MI.implicit_operands()) if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) return true; } - if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe) + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe) return true; return false; }; @@ -1061,52 +1078,71 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { return true; } -bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { +static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, + const GCNSubtarget &ST) { if (!ST.hasLdsBranchVmemWARHazard()) return false; - auto IsHazardInst = [] (const MachineInstr *MI) { - if (SIInstrInfo::isDS(*MI)) + // Check if the necessary condition for the hazard is met: both LDS and VMEM + // instructions need to appear in the same function. 
+ bool HasLds = false; + bool HasVmem = false; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + HasLds |= SIInstrInfo::isDS(MI); + HasVmem |= + SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); + if (HasLds && HasVmem) + return true; + } + } + return false; +} + +bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { + if (!RunLdsBranchVmemWARHazardFixup) + return false; + + assert(ST.hasLdsBranchVmemWARHazard()); + + auto IsHazardInst = [](const MachineInstr &MI) { + if (SIInstrInfo::isDS(MI)) return 1; - if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI)) + if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) return 2; return 0; }; - auto InstType = IsHazardInst(MI); + auto InstType = IsHazardInst(*MI); if (!InstType) return false; - auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) { - return I && (IsHazardInst(I) || - (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - I->getOperand(0).getReg() == AMDGPU::SGPR_NULL && - !I->getOperand(1).getImm())); + auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { + return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && + !I.getOperand(1).getImm()); }; - auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) { - if (!I->isBranch()) + auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { + if (!I.isBranch()) return false; - auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) { + auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { auto InstType2 = IsHazardInst(I); return InstType2 && InstType != InstType2; }; - auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) { - if (!I) - return false; - + auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { auto InstType2 = IsHazardInst(I); if (InstType == InstType2) return true; - return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - I->getOperand(0).getReg() == AMDGPU::SGPR_NULL && - !I->getOperand(1).getImm(); + return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && + !I.getOperand(1).getImm(); }; - return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) != + return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != std::numeric_limits<int>::max(); }; @@ -1137,12 +1173,12 @@ int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { if (!Offset || (Offset->getImm() & 6) == 0) return 0; - auto IsHazardFn = [TII] (MachineInstr *I) { - if (!SIInstrInfo::isMIMG(*I)) + auto IsHazardFn = [TII](const MachineInstr &I) { + if (!SIInstrInfo::isMIMG(I)) return false; - const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode()); + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode()); return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && - TII->getInstSizeInBytes(*I) >= 16; + TII->getInstSizeInBytes(I) >= 16; }; return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1); @@ -1154,17 +1190,17 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) return 0; - auto IsHazardFn = [] (MachineInstr *I) { - if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I)) + auto IsHazardFn = [](const MachineInstr &I) { + if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I)) return false; - return SIInstrInfo::isFPAtomic(*I); + return SIInstrInfo::isFPAtomic(I); }; - auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) { - if (WaitStates >= 
3 || SIInstrInfo::isVALU(*MI)) + auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { + if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) return true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::S_WAITCNT: case AMDGPU::S_WAITCNT_VSCNT: case AMDGPU::S_WAITCNT_VMCNT: @@ -1179,7 +1215,6 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { return false; }; - return FPAtomicToDenormModeWaitStates - ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn); } @@ -1187,11 +1222,15 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { assert(SIInstrInfo::isMAI(*MI)); + return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); +} + +int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { int WaitStatesNeeded = 0; unsigned Opc = MI->getOpcode(); - auto IsVALUFn = [] (MachineInstr *MI) { - return SIInstrInfo::isVALU(*MI); + auto IsVALUFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI); }; if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write @@ -1220,10 +1259,10 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { } } - auto IsMFMAFn = [] (MachineInstr *MI) { - return SIInstrInfo::isMAI(*MI) && - MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + auto IsMFMAFn = [](const MachineInstr &MI) { + return SIInstrInfo::isMAI(MI) && + MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; }; for (const MachineOperand &Op : MI->explicit_operands()) { @@ -1245,15 +1284,15 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { Register Reg = Op.getReg(); unsigned HazardDefLatency = 0; - auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this] - (MachineInstr *MI) { + auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, + this](const MachineInstr &MI) { if (!IsMFMAFn(MI)) return false; - Register DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (DstReg == Reg) return false; - HazardDefLatency = std::max(HazardDefLatency, - TSchedModel.computeInstrLatency(MI)); + HazardDefLatency = + std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); return TRI.regsOverlap(DstReg, Reg); }; @@ -1292,10 +1331,10 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { if (WaitStatesNeeded == MaxWaitStates) return WaitStatesNeeded; // Early exit. 
- auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) + auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { + if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) return false; - Register DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); return TRI.regsOverlap(Reg, DstReg); }; @@ -1324,13 +1363,13 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { Register DstReg = MI->getOperand(0).getReg(); unsigned HazardDefLatency = 0; - auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this] - (MachineInstr *MI) { + auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, + this](const MachineInstr &MI) { if (!IsMFMAFn(MI)) return false; - Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); - HazardDefLatency = std::max(HazardDefLatency, - TSchedModel.computeInstrLatency(MI)); + Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); + HazardDefLatency = + std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI)); return TRI.regsOverlap(Reg, DstReg); }; @@ -1353,14 +1392,171 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { return WaitStatesNeeded; } +int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { + int WaitStatesNeeded = 0; + unsigned Opc = MI->getOpcode(); + + auto IsMFMAFn = [](const MachineInstr &MI) { + return SIInstrInfo::isMAI(MI) && + MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + }; + + auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI); + }; + + auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI); + }; + + if (!IsMFMAFn(*MI)) + return WaitStatesNeeded; + + const int VALUWritesExecWaitStates = 4; + int WaitStatesNeededForUse = VALUWritesExecWaitStates - + getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, + VALUWritesExecWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + + // Loop for both DGEMM and S/HGEMM 2nd instruction. 
+ for (const MachineOperand &Use : MI->explicit_uses()) { + const int LegacyVALUNotDotWritesVGPRWaitStates = 2; + const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; + const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; + const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; + const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; + const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; + const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; + const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; + const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; + const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; + const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; + const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; + const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; + const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; + const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; + const int MaxWaitStates = 19; + + if (!Use.isReg()) + continue; + unsigned Reg = Use.getReg(); + bool FullReg; + const MachineInstr *MI1; + + auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1, + this](const MachineInstr &MI) { + if (!IsMFMAFn(MI)) + return false; + if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI)) + return false; + Register DstReg = MI.getOperand(0).getReg(); + FullReg = (DstReg == Reg); + MI1 = &MI; + return TRI.regsOverlap(DstReg, Reg); + }; + + WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - + getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn, + MaxWaitStates); + if (NumWaitStates == std::numeric_limits<int>::max()) + continue; + + int OpNo = MI->getOperandNo(&Use); + unsigned Opc1 = MI1->getOpcode(); + int NeedWaitStates = 0; + if (OpNo == SrcCIdx) { + if (!isDGEMM(Opc) && isDGEMM(Opc1)) { + NeedWaitStates = 0; + } else if (FullReg) { + if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || + Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && + (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || + Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) + NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; + } else { + switch (Opc1) { + case AMDGPU::V_MFMA_F64_16X16X4F64_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + if (!isXDL(ST, *MI)) + NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; + break; + case AMDGPU::V_MFMA_F64_4X4X4F64_e64: + case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: + if (!isXDL(ST, *MI)) + NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; + break; + default: + switch (TSchedModel.computeInstrLatency(MI1)) { + case 2: + NeedWaitStates = isDGEMM(Opc) + ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; + break; + case 8: + NeedWaitStates = isDGEMM(Opc) + ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = isDGEMM(Opc) + ? 
SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; + } + } + } + } else { + switch (Opc1) { + case AMDGPU::V_MFMA_F64_16X16X4F64_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; + break; + case AMDGPU::V_MFMA_F64_4X4X4F64_e64: + case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: + NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; + break; + default: + switch (TSchedModel.computeInstrLatency(MI1)) { + case 2: + NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; + break; + case 8: + NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; + } + } + } + if (WaitStatesNeeded >= NeedWaitStates) + continue; + + WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + + return WaitStatesNeeded; +} + int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { - if (!ST.hasMAIInsts()) + // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() + if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) return 0; int WaitStatesNeeded = 0; - auto IsAccVgprReadFn = [] (MachineInstr *MI) { - return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; + auto IsAccVgprReadFn = [](const MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; }; for (const MachineOperand &Op : MI->explicit_uses()) { @@ -1380,12 +1576,12 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { if (WaitStatesNeeded == MaxWaitStates) return WaitStatesNeeded; // Early exit.
- auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && - MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) + auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { + if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) return false; - auto IsVALUFn = [] (MachineInstr *MI) { - return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI); + auto IsVALUFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); }; return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) < std::numeric_limits<int>::max(); @@ -1399,22 +1595,252 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { return WaitStatesNeeded; } +int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { + if (!ST.hasGFX90AInsts()) + return 0; + + auto IsMFMAFn = [](const MachineInstr &MI) -> bool { + return SIInstrInfo::isMAI(MI) && + MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + }; + + auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { + return isDGEMM(MI.getOpcode()); + }; + + // This is checked in checkMAIHazards90A() + if (IsMFMAFn(*MI)) + return 0; + + int WaitStatesNeeded = 0; + + bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI) || + SIInstrInfo::isDS(*MI) || + SIInstrInfo::isEXP(*MI); + bool IsVALU = SIInstrInfo::isVALU(*MI); + + const MachineInstr *MFMA = nullptr; + unsigned Reg; + auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA, + this](const MachineInstr &MI) { + if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) + return false; + if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI)) + return false; + MFMA = &MI; + return true; + }; + + const MachineInstr *DOT = nullptr; + auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) { + if (!SIInstrInfo::isDOT(MI) || + !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) + return false; + DOT = &MI; + return true; + }; + + int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src2); + + if (IsMemOrExport || IsVALU) { + const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; + const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; + const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; + const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; + const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; + const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; + const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; + const int DotWriteSameDotReadSrcAB = 3; + const int DotWriteDifferentVALURead = 3; + const int MaxWaitStates = 19; + + for (const MachineOperand &Use : MI->explicit_uses()) { + if (!Use.isReg()) + continue; + Reg = Use.getReg(); + + DOT = nullptr; + int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, + MaxWaitStates); + if (DOT) { + int NeedWaitStates = 0; + if (DOT->getOpcode() == MI->getOpcode()) { + if (&Use - &MI->getOperand(0) != SrcCIdx) + NeedWaitStates = DotWriteSameDotReadSrcAB; + } else { + NeedWaitStates = DotWriteDifferentVALURead; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + MFMA = nullptr; + WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, + MaxWaitStates); + if (!MFMA) + continue; + + unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); + int NeedWaitStates = 
MaxWaitStates; + switch (HazardDefLatency) { + case 2: + NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; + break; + case 4: + assert(isDGEMM(MFMA->getOpcode())); + NeedWaitStates = + IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates + : DMFMA4x4WriteVgprVALUReadWaitStates; + break; + case 8: + NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = + isDGEMM(MFMA->getOpcode()) + ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates + : DMFMA16x16WriteVgprVALUReadWaitStates + : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + } + + unsigned Opc = MI->getOpcode(); + const int DMFMAToFMA64WaitStates = 2; + if ((Opc == AMDGPU::V_FMA_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_dpp) && + WaitStatesNeeded < DMFMAToFMA64WaitStates) { + int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - + getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + if (!IsVALU && !IsMemOrExport) + return WaitStatesNeeded; + + for (const MachineOperand &Def : MI->defs()) { + const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; + const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; + const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; + const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; + const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; + const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; + const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; + const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; + const int DotWriteDifferentVALUWrite = 3; + const int MaxWaitStates = 19; + const int MaxWarWaitStates = 15; + + Reg = Def.getReg(); + + DOT = nullptr; + int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, + MaxWaitStates); + if (DOT && DOT->getOpcode() != MI->getOpcode()) + WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - + WaitStatesSinceDef); + + MFMA = nullptr; + WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, + MaxWaitStates); + if (MFMA) { + int NeedWaitStates = MaxWaitStates; + switch (TSchedModel.computeInstrLatency(MFMA)) { + case 2: + NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; + break; + case 4: + assert(isDGEMM(MFMA->getOpcode())); + NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; + break; + case 8: + NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = isDGEMM(MFMA->getOpcode()) + ? 
DMFMA16x16WriteVgprVALUWriteWaitStates + : SMFMA32x32WriteVgprVALUWawWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + + auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA, + this](const MachineInstr &MI) { + if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) || + !MI.readsRegister(Reg, &TRI)) + return false; + + const MachineOperand *SrcC = + TII.getNamedOperand(MI, AMDGPU::OpName::src2); + assert(SrcC); + if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) + return false; + + MFMA = &MI; + return true; + }; + + MFMA = nullptr; + int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, + MaxWarWaitStates); + if (!MFMA) + continue; + + unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); + int NeedWaitStates = MaxWaitStates; + switch (HazardDefLatency) { + case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; + break; + case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { if (!SU->isInstr()) return false; - MachineInstr *MAI = nullptr; - auto IsMFMAFn = [&MAI] (MachineInstr *MI) { + const MachineInstr *MAI = nullptr; + auto IsMFMAFn = [&MAI](const MachineInstr &MI) { MAI = nullptr; - if (SIInstrInfo::isMAI(*MI) && - MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) - MAI = MI; + if (SIInstrInfo::isMAI(MI) && + MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) + MAI = &MI; return MAI != nullptr; }; MachineInstr *MI = SU->getInstr(); - if (IsMFMAFn(MI)) { + if (IsMFMAFn(*MI)) { int W = getWaitStatesSince(IsMFMAFn, 16); if (MAI) return W < (int)TSchedModel.computeInstrLatency(MAI); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 447ca828ae64..162121c2c525 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -32,7 +32,7 @@ class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { public: - typedef function_ref<bool(MachineInstr *)> IsHazardFn; + typedef function_ref<bool(const MachineInstr &)> IsHazardFn; private: // Distinguish if we are called from scheduler or hazard recognizer @@ -48,6 +48,7 @@ private: const SIInstrInfo &TII; const SIRegisterInfo &TRI; TargetSchedModel TSchedModel; + bool RunLdsBranchVmemWARHazardFixup; /// RegUnits of uses in the current soft memory clause. 
BitVector ClauseUses; @@ -94,6 +95,9 @@ private: bool fixLdsBranchVmemWARHazard(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); + int checkMAIHazards908(MachineInstr *MI); + int checkMAIHazards90A(MachineInstr *MI); + int checkMAIVALUHazards(MachineInstr *MI); int checkMAILdStHazards(MachineInstr *MI); public: diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index fc7105bc15a7..9f98f9ada802 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -190,6 +190,14 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) return NSA_Status::FIXED; + // InlineSpiller does not call LRM::assign() after an LI split leaving + // it in an inconsistent state, so we cannot call LRM::unassign(). + // See llvm bug #48911. + // Skip reassign if a register has originated from such split. + // FIXME: Remove the workaround when bug #48911 is fixed. + if (VRM->getPreSplitReg(Reg)) + return NSA_Status::FIXED; + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp new file mode 100644 index 000000000000..a51399d7da5f --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -0,0 +1,162 @@ +//===-- GCNPreRAOptimizations.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass combines split register tuple initialization into a single pseudo: +/// +/// undef %0.sub1:sreg_64 = S_MOV_B32 1 +/// %0.sub0:sreg_64 = S_MOV_B32 2 +/// => +/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x200000001 +/// +/// This is to allow rematerialization of a value instead of spilling. It is +/// supposed to be done after register coalescer to allow it to do its job and +/// before actual register allocation to allow rematerialization. +/// +/// Right now the pass only handles 64 bit SGPRs with immediate initializers, +/// although the same shall be possible with other register classes and +/// instructions if necessary.
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-pre-ra-optimizations" + +namespace { + +class GCNPreRAOptimizations : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + bool processReg(Register Reg); + +public: + static char ID; + + GCNPreRAOptimizations() : MachineFunctionPass(ID) { + initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU Pre-RA optimizations"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE, + "AMDGPU Pre-RA optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE, "Pre-RA optimizations", + false, false) + +char GCNPreRAOptimizations::ID = 0; + +char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID; + +FunctionPass *llvm::createGCNPreRAOptimizationsPass() { + return new GCNPreRAOptimizations(); +} + +bool GCNPreRAOptimizations::processReg(Register Reg) { + MachineInstr *Def0 = nullptr; + MachineInstr *Def1 = nullptr; + uint64_t Init = 0; + + for (MachineInstr &I : MRI->def_instructions(Reg)) { + if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg || + !I.getOperand(1).isImm() || I.getNumOperands() != 2) + return false; + + switch (I.getOperand(0).getSubReg()) { + default: + return false; + case AMDGPU::sub0: + if (Def0) + return false; + Def0 = &I; + Init |= I.getOperand(1).getImm() & 0xffffffff; + break; + case AMDGPU::sub1: + if (Def1) + return false; + Def1 = &I; + Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32; + break; + } + } + + if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent()) + return false; + + LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1 + << " =>\n"); + + if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1), + LIS->getInstructionIndex(*Def0))) + std::swap(Def0, Def1); + + LIS->RemoveMachineInstrFromMaps(*Def0); + LIS->RemoveMachineInstrFromMaps(*Def1); + auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(), + TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg) + .addImm(Init); + + Def0->eraseFromParent(); + Def1->eraseFromParent(); + LIS->InsertMachineInstrInMaps(*NewI); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + + LLVM_DEBUG(dbgs() << " " << *NewI); + + return true; +} + +bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + LIS = &getAnalysis<LiveIntervals>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + bool Changed = false; + + for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + if 
(RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) + continue; + Changed |= processReg(Reg); + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 7447ec2db188..3a68ed1934e1 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -184,6 +184,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel, FeatureISAVersion9_0_9.Features >; +def : ProcessorModel<"gfx90a", SIDPFullSpeedModel, + FeatureISAVersion9_0_A.Features +>; + def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; @@ -204,6 +208,10 @@ def : ProcessorModel<"gfx1012", GFX10SpeedModel, FeatureISAVersion10_1_2.Features >; +def : ProcessorModel<"gfx1013", GFX10SpeedModel, + FeatureISAVersion10_1_3.Features +>; + def : ProcessorModel<"gfx1030", GFX10SpeedModel, FeatureISAVersion10_3_0.Features >; @@ -219,3 +227,11 @@ def : ProcessorModel<"gfx1032", GFX10SpeedModel, def : ProcessorModel<"gfx1033", GFX10SpeedModel, FeatureISAVersion10_3_0.Features >; + +def : ProcessorModel<"gfx1034", GFX10SpeedModel, + FeatureISAVersion10_3_0.Features +>; + +def : ProcessorModel<"gfx1035", GFX10SpeedModel, + FeatureISAVersion10_3_0.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp deleted file mode 100644 index a12e9ab03e1d..000000000000 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ /dev/null @@ -1,862 +0,0 @@ -//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Try to reassign registers on GFX10+ to reduce register bank -/// conflicts. -/// -/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in -/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to -/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, -/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. -/// -/// The shader can read one dword from each of these banks once per cycle. -/// If an instruction has to read more register operands from the same bank -/// an additional cycle is needed. HW attempts to pre-load registers through -/// input operand gathering, but a stall cycle may occur if that fails. For -/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, -/// potentially incuring 2 stall cycles. -/// -/// The pass tries to reassign registers to reduce bank conflicts. -/// -/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so -/// that 4 has to be subtracted from an SGPR bank number to get the real value. -/// This also corresponds to bit numbers in bank masks used in the pass. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", - cl::desc("Verify stall cycles in the regbanks reassign pass"), - cl::value_desc("0|1|2"), - cl::init(0), cl::Hidden); - -#define DEBUG_TYPE "amdgpu-regbanks-reassign" - -#define NUM_VGPR_BANKS 4 -#define NUM_SGPR_BANKS 8 -#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) -#define SGPR_BANK_OFFSET NUM_VGPR_BANKS -#define VGPR_BANK_MASK 0xf -#define SGPR_BANK_MASK 0xff0 -#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) - -STATISTIC(NumStallsDetected, - "Number of operand read stalls detected"); -STATISTIC(NumStallsRecovered, - "Number of operand read stalls recovered"); - -namespace { - -class GCNRegBankReassign : public MachineFunctionPass { - - class OperandMask { - public: - OperandMask(unsigned r, unsigned s, unsigned m) - : Reg(r), SubReg(s), Mask(m) {} - Register Reg; - unsigned SubReg; - unsigned Mask; - }; - - class Candidate { - public: - Candidate(MachineInstr *mi, Register reg, unsigned subreg, - unsigned freebanks) - : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(const GCNRegBankReassign *P) const { - MI->dump(); - dbgs() << P->printReg(Reg) << " to banks "; - dumpFreeBanks(FreeBanks); - dbgs() << '\n'; - } -#endif - - MachineInstr *MI; - Register Reg; - unsigned SubReg; - unsigned FreeBanks; - }; - - class CandidateList : public std::map<unsigned, std::list<Candidate>> { - public: - void push(unsigned Weight, const Candidate&& C) { - operator[](Weight).push_front(C); - } - - Candidate &back() { - return rbegin()->second.back(); - } - - void pop_back() { - rbegin()->second.pop_back(); - if (rbegin()->second.empty()) - erase(rbegin()->first); - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(const GCNRegBankReassign *P) const { - dbgs() << "\nCandidates:\n\n"; - for (auto &B : *this) { - dbgs() << " Weight " << B.first << ":\n"; - for (auto &C : B.second) - C.dump(P); - } - dbgs() << "\n\n"; - } -#endif - }; - -public: - static char ID; - -public: - GCNRegBankReassign() : MachineFunctionPass(ID) { - initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "GCN RegBank Reassign"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineLoopInfo>(); - AU.addRequired<LiveIntervals>(); - AU.addRequired<VirtRegMap>(); - AU.addRequired<LiveRegMatrix>(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - const GCNSubtarget *ST; - - const MachineRegisterInfo *MRI; - - const SIRegisterInfo *TRI; - - MachineLoopInfo *MLI; - - VirtRegMap *VRM; - - LiveRegMatrix *LRM; - - LiveIntervals *LIS; - - unsigned MaxNumVGPRs; - - unsigned MaxNumSGPRs; - - BitVector RegsUsed; - - SmallVector<OperandMask, 8> OperandMasks; - - CandidateList Candidates; - - const MCPhysReg *CSRegs; - - // Returns bank for a phys reg. 
- unsigned getPhysRegBank(Register Reg, unsigned SubReg) const; - - // Return a bit set for each register bank used. 4 banks for VGPRs and - // 8 banks for SGPRs. - // Registers already processed and recorded in RegsUsed are excluded. - // If Bank is not -1 assume Reg:SubReg to belong to that Bank. - uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank); - - // Analyze one instruction returning the number of stalls and a mask of the - // banks used by all operands. - // If Reg and Bank are provided, assume all uses of Reg will be replaced with - // a register chosen from Bank. - std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, - Register Reg = Register(), - unsigned SubReg = 0, int Bank = -1); - - // Return true if register is regular VGPR or SGPR or their tuples. - // Returns false for special registers like m0, vcc etc. - bool isReassignable(Register Reg) const; - - // Check if registers' defs are old and may be pre-loaded. - // Returns 0 if both registers are old enough, 1 or 2 if one or both - // registers will not likely be pre-loaded. - unsigned getOperandGatherWeight(const MachineInstr& MI, - Register Reg1, - Register Reg2, - unsigned StallCycles) const; - - - // Find all bank bits in UsedBanks where Mask can be relocated to. - unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; - - // Find all bank bits in UsedBanks where Mask can be relocated to. - // Bank is relative to the register and not its subregister component. - // Returns 0 is a register is not reassignable. - unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask, - unsigned UsedBanks) const; - - // Add cadidate instruction to the work list. - void collectCandidates(MachineInstr& MI, unsigned UsedBanks, - unsigned StallCycles); - - // Collect cadidate instructions across function. Returns a number stall - // cycles detected. Only counts stalls if Collect is false. - unsigned collectCandidates(MachineFunction &MF, bool Collect = true); - - // Remove all candidates that read specified register. - void removeCandidates(Register Reg); - - // Compute stalls within the uses of SrcReg replaced by a register from - // Bank. If Bank is -1 does not perform substitution. If Collect is set - // candidates are collected and added to work list. - unsigned computeStallCycles(Register SrcReg, - Register Reg = Register(), - unsigned SubReg = 0, int Bank = -1, - bool Collect = false); - - // Search for a register in Bank unused within LI. - // Returns phys reg or NoRegister. - MCRegister scavengeReg(LiveInterval &LI, unsigned Bank, - unsigned SubReg) const; - - // Try to reassign candidate. Returns number or stall cycles saved. - unsigned tryReassign(Candidate &C); - - bool verifyCycles(MachineFunction &MF, - unsigned OriginalCycles, unsigned CyclesSaved); - - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -public: - Printable printReg(Register Reg, unsigned SubReg = 0) const { - return Printable([Reg, SubReg, this](raw_ostream &OS) { - if (Reg.isPhysical()) { - OS << llvm::printReg(Reg, TRI); - return; - } - if (!VRM->isAssignedReg(Reg)) - OS << "<unassigned> " << llvm::printReg(Reg, TRI); - else - OS << llvm::printReg(Reg, TRI) << '(' - << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; - if (SubReg) - OS << ':' << TRI->getSubRegIndexName(SubReg); - }); - } - - static Printable printBank(unsigned Bank) { - return Printable([Bank](raw_ostream &OS) { - OS << ((Bank >= SGPR_BANK_OFFSET) ? 
Bank - SGPR_BANK_OFFSET : Bank); - }); - } - - static void dumpFreeBanks(unsigned FreeBanks) { - for (unsigned L = 0; L < NUM_BANKS; ++L) - if (FreeBanks & (1 << L)) - dbgs() << printBank(L) << ' '; - } -#endif -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", - false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) -INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", - false, false) - - -char GCNRegBankReassign::ID = 0; - -char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; - -unsigned GCNRegBankReassign::getPhysRegBank(Register Reg, - unsigned SubReg) const { - assert(Reg.isPhysical()); - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC); - if (Size == 16) - Reg = TRI->get32BitRegister(Reg); - else if (Size > 32) { - if (SubReg) { - const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); - Reg = TRI->getSubReg(Reg, SubReg); - if (TRI->getRegSizeInBits(*SubRC) > 32) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } else { - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } - } - - if (TRI->hasVGPRs(RC)) { - unsigned RegNo = Reg - AMDGPU::VGPR0; - return RegNo % NUM_VGPR_BANKS; - } - - unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; - return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; -} - -uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg, - int Bank) { - if (Reg.isVirtual()) { - if (!VRM->isAssignedReg(Reg)) - return 0; - - Reg = VRM->getPhys(Reg); - if (!Reg) - return 0; - if (SubReg) - Reg = TRI->getSubReg(Reg, SubReg); - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC); - - if (Size == 16) { - Reg = TRI->get32BitRegister(Reg); - Size = 1; - } else { - Size /= 32; - if (Size > 1) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } - - if (TRI->hasVGPRs(RC)) { - // VGPRs have 4 banks assigned in a round-robin fashion. - unsigned RegNo = Reg - AMDGPU::VGPR0; - uint32_t Mask = maskTrailingOnes<uint32_t>(Size); - unsigned Used = 0; - // Bitmask lacks an extract method - for (unsigned I = 0; I < Size; ++I) - if (RegsUsed.test(RegNo + I)) - Used |= 1 << I; - RegsUsed.set(RegNo, RegNo + Size); - Mask &= ~Used; - Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank); - return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; - } - - // SGPRs have 8 banks holding 2 consequitive registers each. - unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; - unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); - if (RegNo + StartBit >= RegsUsed.size()) - return 0; - - if (Size > 1) - Size /= 2; - unsigned Mask = (1 << Size) - 1; - unsigned Used = 0; - for (unsigned I = 0; I < Size; ++I) - if (RegsUsed.test(StartBit + RegNo + I)) - Used |= 1 << I; - RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size); - Mask &= ~Used; - Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS - : unsigned(Bank - SGPR_BANK_OFFSET); - Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; - // Reserve 4 bank ids for VGPRs. 
- return Mask << SGPR_BANK_OFFSET; -} - -std::pair<unsigned, unsigned> -GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg, - unsigned SubReg, int Bank) { - unsigned StallCycles = 0; - unsigned UsedBanks = 0; - - if (MI.isDebugValue()) - return std::make_pair(StallCycles, UsedBanks); - - RegsUsed.reset(); - OperandMasks.clear(); - for (const auto& Op : MI.explicit_uses()) { - // Undef can be assigned to any register, so two vregs can be assigned - // the same phys reg within the same instruction. - if (!Op.isReg() || Op.isUndef()) - continue; - - const Register R = Op.getReg(); - const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R); - - // Do not compute stalls for AGPRs - if (TRI->hasAGPRs(RC)) - continue; - - // Do not compute stalls if sub-register covers all banks - if (Op.getSubReg()) { - LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); - if (TRI->hasVGPRs(RC)) { - if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) - continue; - } else { - if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) - continue; - } - } - - unsigned ShiftedBank = Bank; - - if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) { - unsigned RegOffset = - TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); - unsigned Offset = TRI->getChannelFromSubReg( - Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0); - if (Bank < NUM_VGPR_BANKS) { - unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); - ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; - } else if (Bank >= SGPR_BANK_OFFSET) { - unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); - ShiftedBank = SGPR_BANK_OFFSET + - (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; - } - } - - uint32_t Mask = getRegBankMask(R, Op.getSubReg(), - (Reg == R) ? ShiftedBank : -1); - StallCycles += countPopulation(UsedBanks & Mask); - UsedBanks |= Mask; - OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); - } - - return std::make_pair(StallCycles, UsedBanks); -} - -unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, - Register Reg1, - Register Reg2, - unsigned StallCycles) const -{ - unsigned Defs = 0; - MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); - MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); - for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { - if (MI.isDebugInstr()) - continue; - --Def; - if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) - continue; - if (Def->modifiesRegister(Reg1, TRI)) - Defs |= 1; - if (Def->modifiesRegister(Reg2, TRI)) - Defs |= 2; - } - return countPopulation(Defs); -} - -bool GCNRegBankReassign::isReassignable(Register Reg) const { - if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) - return false; - - const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); - - Register PhysReg = VRM->getPhys(Reg); - - if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) - return false; - - for (auto U : MRI->use_nodbg_operands(Reg)) { - if (U.isImplicit()) - return false; - const MachineInstr *UseInst = U.getParent(); - if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) - return false; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // TODO: Support 16 bit registers. Those needs to be moved with their - // parent VGPR_32 and potentially a sibling 16 bit sub-register. 
- if (Size < 32) - return false; - - if (TRI->hasVGPRs(RC)) - return true; - - if (Size == 16) - return AMDGPU::SGPR_LO16RegClass.contains(PhysReg); - - if (Size > 32) - PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); - - return AMDGPU::SGPR_32RegClass.contains(PhysReg); -} - -unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, - unsigned UsedBanks) const { - unsigned Size = countPopulation(Mask); - unsigned FreeBanks = 0; - unsigned Bank = findFirstSet(Mask); - - UsedBanks &= ~Mask; - - // Find free VGPR banks - if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { - for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { - if (Bank == I) - continue; - unsigned NewMask = ((1 << Size) - 1) << I; - NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; - if (!(UsedBanks & NewMask)) - FreeBanks |= 1 << I; - } - return FreeBanks; - } - - // Find free SGPR banks - // SGPR tuples must be aligned, so step is size in banks it - // crosses. - Bank -= SGPR_BANK_OFFSET; - for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { - if (Bank == I) - continue; - unsigned NewMask = ((1 << Size) - 1) << I; - NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; - if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) - FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; - } - - return FreeBanks; -} - -unsigned GCNRegBankReassign::getFreeBanks(Register Reg, - unsigned SubReg, - unsigned Mask, - unsigned UsedBanks) const { - if (!isReassignable(Reg)) - return 0; - - unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); - - unsigned Offset = TRI->getChannelFromSubReg(SubReg); - if (Offset && (Mask & VGPR_BANK_MASK)) { - unsigned Shift = Offset; - if (Shift >= NUM_VGPR_BANKS) - return 0; - unsigned VB = FreeBanks & VGPR_BANK_MASK; - FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & - VGPR_BANK_MASK; - } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) { - unsigned Shift = Offset >> 1; - if (Shift >= NUM_SGPR_BANKS) - return 0; - unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; - FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & - SGPR_BANK_SHIFTED_MASK; - FreeBanks <<= SGPR_BANK_OFFSET; - } - - LLVM_DEBUG(if (FreeBanks) { - dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) - << " to banks: "; dumpFreeBanks(FreeBanks); - dbgs() << '\n'; }); - - return FreeBanks; -} - -void GCNRegBankReassign::collectCandidates(MachineInstr& MI, - unsigned UsedBanks, - unsigned StallCycles) { - LLVM_DEBUG(MI.dump()); - - if (!StallCycles) - return; - - LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); - - for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { - for (unsigned J = I + 1; J != E; ++J) { - if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) - continue; - - Register Reg1 = OperandMasks[I].Reg; - Register Reg2 = OperandMasks[J].Reg; - unsigned SubReg1 = OperandMasks[I].SubReg; - unsigned SubReg2 = OperandMasks[J].SubReg; - unsigned Mask1 = OperandMasks[I].Mask; - unsigned Mask2 = OperandMasks[J].Mask; - unsigned Size1 = countPopulation(Mask1); - unsigned Size2 = countPopulation(Mask2); - - LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << - " and " << printReg(Reg2, SubReg2) << '\n'); - - unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); - Weight += MLI->getLoopDepth(MI.getParent()) * 10; - - LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); - - unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); - unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); 
- if (FreeBanks1) - Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0), - Candidate(&MI, Reg1, SubReg1, FreeBanks1)); - if (FreeBanks2) - Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0), - Candidate(&MI, Reg2, SubReg2, FreeBanks2)); - } - } -} - -unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg, - unsigned SubReg, int Bank, - bool Collect) { - unsigned TotalStallCycles = 0; - SmallSet<const MachineInstr *, 16> Visited; - - for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { - if (MI.isBundle()) - continue; - if (!Visited.insert(&MI).second) - continue; - unsigned StallCycles; - unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); - TotalStallCycles += StallCycles; - if (Collect) - collectCandidates(MI, UsedBanks, StallCycles); - } - - return TotalStallCycles; -} - -MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, - unsigned SubReg) const { - const TargetRegisterClass *RC = MRI->getRegClass(LI.reg()); - unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs - : MaxNumSGPRs; - unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 - : AMDGPU::SGPR0); - - for (MCRegister Reg : RC->getRegisters()) { - // Check occupancy limit. - if (TRI->isSubRegisterEq(Reg, MaxReg)) - break; - - if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) - continue; - - for (unsigned I = 0; CSRegs[I]; ++I) - if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && - !LRM->isPhysRegUsed(CSRegs[I])) - return MCRegister::from(AMDGPU::NoRegister); - - LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); - - if (!LRM->checkInterference(LI, Reg)) - return Reg; - } - - return MCRegister::from(AMDGPU::NoRegister); -} - -unsigned GCNRegBankReassign::tryReassign(Candidate &C) { - if (!LIS->hasInterval(C.Reg)) - return 0; - - LiveInterval &LI = LIS->getInterval(C.Reg); - LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); - LI.dump()); - - // For each candidate bank walk all instructions in the range of live - // interval and check if replacing the register with one belonging to - // the candidate bank reduces conflicts. - - unsigned OrigStalls = computeStallCycles(C.Reg); - LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); - if (!OrigStalls) - return 0; - - struct BankStall { - BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; - bool operator<(const BankStall &RHS) const { - if (Stalls == RHS.Stalls) - return Bank < RHS.Bank; - return Stalls > RHS.Stalls; - } - unsigned Bank; - unsigned Stalls; - }; - SmallVector<BankStall, 8> BankStalls; - - for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { - if (C.FreeBanks & (1 << Bank)) { - LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); - unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); - if (Stalls < OrigStalls) { - LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " - << Stalls << '\n'); - BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); - } - } - } - llvm::sort(BankStalls); - - MCRegister OrigReg = VRM->getPhys(C.Reg); - LRM->unassign(LI); - while (!BankStalls.empty()) { - BankStall BS = BankStalls.pop_back_val(); - MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg); - if (Reg == AMDGPU::NoRegister) { - LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) - << '\n'); - continue; - } - LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) - << (LRM->isPhysRegUsed(Reg) ? 
"" : " (new)") - << " in bank " << printBank(BS.Bank) << '\n'); - - LRM->assign(LI, Reg); - - LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); - - return OrigStalls - BS.Stalls; - } - LRM->assign(LI, OrigReg); - - return 0; -} - -unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, - bool Collect) { - unsigned TotalStallCycles = 0; - - for (MachineBasicBlock &MBB : MF) { - - LLVM_DEBUG(if (Collect) { - if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); - else dbgs() << MBB.getName(); dbgs() << ":\n"; - }); - - for (MachineInstr &MI : MBB.instrs()) { - if (MI.isBundle()) - continue; // we analyze the instructions inside the bundle individually - - unsigned StallCycles; - unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI); - - if (Collect) - collectCandidates(MI, UsedBanks, StallCycles); - - TotalStallCycles += StallCycles; - } - - LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); - } - - return TotalStallCycles; -} - -void GCNRegBankReassign::removeCandidates(Register Reg) { - typename CandidateList::iterator Next; - for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) { - Next = std::next(I); - I->second.remove_if([Reg, this](const Candidate& C) { - return C.MI->readsRegister(Reg, TRI); - }); - if (I->second.empty()) - Candidates.erase(I); - } -} - -bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, - unsigned OriginalCycles, - unsigned CyclesSaved) { - unsigned StallCycles = collectCandidates(MF, false); - LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles - << " stall cycles left\n"); - return StallCycles + CyclesSaved == OriginalCycles; -} - -bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget<GCNSubtarget>(); - if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) - return false; - - MRI = &MF.getRegInfo(); - TRI = ST->getRegisterInfo(); - MLI = &getAnalysis<MachineLoopInfo>(); - VRM = &getAnalysis<VirtRegMap>(); - LRM = &getAnalysis<LiveRegMatrix>(); - LIS = &getAnalysis<LiveIntervals>(); - - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned Occupancy = MFI->getOccupancy(); - MaxNumVGPRs = ST->getMaxNumVGPRs(MF); - MaxNumSGPRs = ST->getMaxNumSGPRs(MF); - MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); - MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); - - CSRegs = MRI->getCalleeSavedRegs(); - unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() + - // Not a tight bound - AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1; - RegsUsed.resize(NumRegBanks); - - LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() - << '\n'); - - unsigned StallCycles = collectCandidates(MF); - NumStallsDetected += StallCycles; - - LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " - "function " << MF.getName() << '\n'); - - LLVM_DEBUG(Candidates.dump(this)); - - unsigned CyclesSaved = 0; - while (!Candidates.empty()) { - Candidate C = Candidates.back(); - unsigned LocalCyclesSaved = tryReassign(C); - CyclesSaved += LocalCyclesSaved; - - if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) - report_fatal_error("RegBank reassign stall cycles verification failed."); - - Candidates.pop_back(); - if (LocalCyclesSaved) { - removeCandidates(C.Reg); - computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); - - LLVM_DEBUG(Candidates.dump(this)); - } - } - NumStallsRecovered += CyclesSaved; - - 
LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved - << " cycles saved in function " << MF.getName() << '\n'); - - Candidates.clear(); - - if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) - report_fatal_error("RegBank reassign stall cycles verification failed."); - - RegsUsed.clear(); - - return CyclesSaved > 0; -} diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index aeec3e886327..3456f9a6156c 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -125,12 +125,14 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(getSGPRNum())); - const auto VGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(getVGPRNum())); + const auto VGPROcc = + std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()))); const auto OtherSGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); - const auto OtherVGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGPRNum())); + const auto OtherVGPROcc = + std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()))); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -161,7 +163,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, } } return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()): - (getVGPRNum() < O.getVGPRNum()); + (getVGPRNum(ST.hasGFX90AInsts()) < + O.getVGPRNum(ST.hasGFX90AInsts())); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -169,7 +172,9 @@ LLVM_DUMP_METHOD void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { OS << "VGPRs: " << Value[VGPR32] << ' '; OS << "AGPRs: " << Value[AGPR32]; - if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; + if (ST) OS << "(O" + << ST->getOccupancyWithNumVGPRs(getVGPRNum(ST->hasGFX90AInsts())) + << ')'; OS << ", SGPRs: " << getSGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; OS << ", LVGPR WT: " << getVGPRTuplesWeight() @@ -384,6 +389,7 @@ bool GCNDownwardRPTracker::advanceBeforeNext() { void GCNDownwardRPTracker::advanceToNext() { LastTrackedMI = &*NextMI++; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); // Add new registers or mask bits. for (const auto &MO : LastTrackedMI->operands()) { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ba8c85aa502b..257561cb8430 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -42,12 +42,19 @@ struct GCNRegPressure { clear(); } - bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; } + bool empty() const { return getSGPRNum() == 0 && getVGPRNum(false) == 0; } void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } unsigned getSGPRNum() const { return Value[SGPR32]; } - unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); } + unsigned getVGPRNum(bool UnifiedVGPRFile) const { + if (UnifiedVGPRFile) { + return Value[AGPR32] ? 
alignTo(Value[VGPR32], 4) + Value[AGPR32] + : Value[VGPR32] + Value[AGPR32]; + } + return std::max(Value[VGPR32], Value[AGPR32]); + } + unsigned getAGPRNum() const { return Value[AGPR32]; } unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE], Value[AGPR_TUPLE]); } @@ -55,7 +62,7 @@ struct GCNRegPressure { unsigned getOccupancy(const GCNSubtarget &ST) const { return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), - ST.getOccupancyWithNumVGPRs(getVGPRNum())); + ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()))); } void inc(unsigned Reg, @@ -160,7 +167,7 @@ class GCNDownwardRPTracker : public GCNRPTracker { public: GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} - const MachineBasicBlock::const_iterator getNext() const { return NextMI; } + MachineBasicBlock::const_iterator getNext() const { return NextMI; } // Reset tracker to the point before the MI // filling live regs upon this point using LIS. diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 6e2550298dc6..0212b8e17641 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -20,7 +20,8 @@ using namespace llvm; GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : - GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { } + GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false), + HasExcessPressure(false), MF(nullptr) { } void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -103,11 +104,13 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // marked as RegExcess in tryCandidate() when they are compared with // instructions that increase the register pressure. if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { + HasExcessPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); } if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { + HasExcessPressure = true; Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); } @@ -121,6 +124,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; if (SGPRDelta >= 0 || VGPRDelta >= 0) { + HasExcessPressure = true; if (SGPRDelta > VGPRDelta) { Cand.RPDelta.CriticalMax = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); @@ -279,6 +283,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { if (SU->isBottomReady()) Bot.removeReady(SU); + if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) { + for (SDep &Dep : SU->Preds) { + if (Dep.isCluster()) { + HasClusteredNodes = true; + break; + } + } + } + LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); return SU; @@ -320,22 +333,30 @@ void GCNScheduleDAGMILive::schedule() { PressureBefore.print(dbgs())); } + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + // Set HasClusteredNodes to true for late stages where we have already + // collected it. That way pickNode() will not scan SDep's when not needed. 
+ S.HasClusteredNodes = Stage > InitialSchedule; + S.HasExcessPressure = false; ScheduleDAGMILive::schedule(); Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); RescheduleRegions[RegionIdx] = false; + if (Stage == InitialSchedule && S.HasClusteredNodes) + RegionsWithClusters[RegionIdx] = true; + if (S.HasExcessPressure) + RegionsWithHighRP[RegionIdx] = true; if (!LIS) return; // Check the results of scheduling. - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; auto PressureAfter = getRealRegPressure(); LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { + PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { Pressure[RegionIdx] = PressureAfter; LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; @@ -366,9 +387,12 @@ void GCNScheduleDAGMILive::schedule() { unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum() > MaxVGPRs || - PressureAfter.getSGPRNum() > MaxSGPRs) + if (PressureAfter.getVGPRNum(false) > MaxVGPRs || + PressureAfter.getAGPRNum() > MaxVGPRs || + PressureAfter.getSGPRNum() > MaxSGPRs) { RescheduleRegions[RegionIdx] = true; + RegionsWithHighRP[RegionIdx] = true; + } if (WavesAfter >= MinOccupancy) { if (Stage == UnclusteredReschedule && @@ -378,6 +402,9 @@ void GCNScheduleDAGMILive::schedule() { PressureAfter.less(ST, PressureBefore) || !RescheduleRegions[RegionIdx]) { Pressure[RegionIdx] = PressureAfter; + if (!RegionsWithClusters[RegionIdx] && + (Stage + 1) == UnclusteredReschedule) + RescheduleRegions[RegionIdx] = false; return; } else { LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); @@ -385,7 +412,8 @@ void GCNScheduleDAGMILive::schedule() { } LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); - RescheduleRegions[RegionIdx] = true; + RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] || + (Stage + 1) != UnclusteredReschedule; RegionEnd = RegionBegin; for (MachineInstr *MI : Unsched) { if (MI->isDebugInstr()) @@ -460,7 +488,9 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { I = Rgn.first; auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); auto LRS = BBLiveInMap.lookup(NonDbgMI); +#ifdef EXPENSIVE_CHECKS assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); +#endif RPTracker.reset(*I, &LRS); } @@ -516,7 +546,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() { LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); RescheduleRegions.resize(Regions.size()); + RegionsWithClusters.resize(Regions.size()); + RegionsWithHighRP.resize(Regions.size()); RescheduleRegions.set(); + RegionsWithClusters.reset(); + RegionsWithHighRP.reset(); if (!Regions.empty()) BBLiveInMap = getBBLiveInMap(); @@ -561,7 +595,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() { SavedMutations.swap(Mutations); for (auto Region : Regions) { - if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) { + if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) || + (Stage == ClusteredLowOccupancyReschedule && + !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) { + ++RegionIdx; continue; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 2d81d9977c31..15eba3f5eac0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ 
b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -50,6 +50,14 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { unsigned TargetOccupancy; + // schedule() have seen a clustered memory operation. Set it to false + // before a region scheduling to know if the region had such clusters. + bool HasClusteredNodes; + + // schedule() have seen a an excess register pressure and had to track + // register pressure for actual scheduling heuristics. + bool HasExcessPressure; + MachineFunction *MF; public: @@ -96,6 +104,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // or we generally desire to reschedule it. BitVector RescheduleRegions; + // Record regions which use clustered loads/stores. + BitVector RegionsWithClusters; + + // Record regions with high register pressure. + BitVector RegionsWithHighRP; + // Region live-in cache. SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7a7178126444..bd0c40081c01 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -41,24 +41,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, using AMDGPUSubtarget::getMaxWavesPerEU; public: - enum TrapHandlerAbi { - TrapHandlerAbiNone = 0, - TrapHandlerAbiHsa = 1 + // Following 2 enums are documented at: + // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi + enum class TrapHandlerAbi { + NONE = 0x00, + AMDHSA = 0x01, }; - enum TrapID { - TrapIDHardwareReserved = 0, - TrapIDHSADebugTrap = 1, - TrapIDLLVMTrap = 2, - TrapIDLLVMDebugTrap = 3, - TrapIDDebugBreakpoint = 7, - TrapIDDebugReserved8 = 8, - TrapIDDebugReservedFE = 0xfe, - TrapIDDebugReservedFF = 0xff - }; - - enum TrapRegValues { - LLVMTrapHandlerRegValue = 1 + enum class TrapID { + LLVMAMDHSATrap = 0x02, + LLVMAMDHSADebugTrap = 0x03, }; private: @@ -82,6 +74,7 @@ protected: bool FastFMAF32; bool FastDenormalF32; bool HalfRate64Ops; + bool FullRate64Ops; // Dynamically set bits that enable features. bool FlatForGlobal; @@ -95,6 +88,7 @@ protected: // for XNACK. 
bool EnableXNACK; + bool EnableTgSplit; bool EnableCuMode; bool TrapHandler; @@ -110,14 +104,17 @@ protected: bool FP64; bool FMA; bool MIMG_R128; - bool GCN3Encoding; + bool IsGCN; bool CIInsts; bool GFX8Insts; bool GFX9Insts; + bool GFX90AInsts; bool GFX10Insts; bool GFX10_3Insts; bool GFX7GFX8GFX9Insts; bool SGPRInitBug; + bool NegativeScratchOffsetBug; + bool NegativeUnalignedScratchOffsetBug; bool HasSMemRealTime; bool HasIntClamp; bool HasFmaMixInsts; @@ -132,10 +129,15 @@ protected: bool HasSDWAOutModsVOPC; bool HasDPP; bool HasDPP8; + bool Has64BitDPP; + bool HasPackedFP32Ops; + bool HasExtendedImageInsts; bool HasR128A16; bool HasGFX10A16; bool HasG16; bool HasNSAEncoding; + unsigned NSAMaxSize; + bool GFX10_AEncoding; bool GFX10_BEncoding; bool HasDLInsts; bool HasDot1Insts; @@ -144,6 +146,7 @@ protected: bool HasDot4Insts; bool HasDot5Insts; bool HasDot6Insts; + bool HasDot7Insts; bool HasMAIInsts; bool HasPkFmacF16Inst; bool HasAtomicFaddInsts; @@ -157,6 +160,7 @@ protected: bool HasVscnt; bool HasGetWaveIdInst; bool HasSMemTimeInst; + bool HasShaderCyclesRegister; bool HasRegisterBanking; bool HasVOP3Literal; bool HasNoDataDepHazard; @@ -165,12 +169,19 @@ protected: bool FlatGlobalInsts; bool FlatScratchInsts; bool ScalarFlatScratchInsts; + bool HasArchitectedFlatScratch; bool AddNoCarryInsts; bool HasUnpackedD16VMem; + bool R600ALUInst; + bool CaymanISA; + bool CFALUBug; bool LDSMisalignedBug; bool HasMFMAInlineLiteralBug; + bool HasVertexCache; + short TexVTXClauseSize; bool UnalignedBufferAccess; bool UnalignedDSAccess; + bool HasPackedTID; bool ScalarizeGlobal; bool HasVcmpxPermlaneHazard; @@ -180,6 +191,7 @@ protected: bool HasVcmpxExecWARHazard; bool HasLdsBranchVmemWARHazard; bool HasNSAtoVMEMBug; + bool HasNSAClauseBug; bool HasOffset3fBug; bool HasFlatSegmentOffsetBug; bool HasImageStoreD16Bug; @@ -241,6 +253,10 @@ public: return RegBankInfo.get(); } + const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { + return TargetID; + } + // Nothing implemented, just prevent crashes on use. const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; @@ -271,6 +287,11 @@ public: unsigned getConstantBusLimit(unsigned Opcode) const; + /// Returns if the result of this instruction with a 16-bit result returned in + /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve + /// the original value. + bool zeroesHigh16BitsOfDest(unsigned Opcode) const; + bool hasIntClamp() const { return HasIntClamp; } @@ -295,6 +316,10 @@ public: return HalfRate64Ops; } + bool hasFullRate64Ops() const { + return FullRate64Ops; + } + bool hasAddr64() const { return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } @@ -370,7 +395,12 @@ public: } TrapHandlerAbi getTrapHandlerAbi() const { - return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; + return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; + } + + bool supportsGetDoorbellID() const { + // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. + return getGeneration() >= GFX9; } /// True if the offset field of DS instructions works as expected. 
On SI, the @@ -510,6 +540,10 @@ public: return TargetID.isXnackOnOrAny(); } + bool isTgSplitEnabled() const { + return EnableTgSplit; + } + bool isCuModeEnabled() const { return EnableCuMode; } @@ -666,6 +700,10 @@ public: return HasDot6Insts; } + bool hasDot7Insts() const { + return HasDot7Insts; + } + bool hasMAIInsts() const { return HasMAIInsts; } @@ -694,6 +732,10 @@ public: return HasSMemTimeInst; } + bool hasShaderCyclesRegister() const { + return HasShaderCyclesRegister; + } + bool hasRegisterBanking() const { return HasRegisterBanking; } @@ -780,6 +822,9 @@ public: return GFX8Insts; } + /// \returns true if the subtarget has the v_permlanex16_b32 instruction. + bool hasPermLaneX16() const { return getGeneration() >= GFX10; } + bool hasDPP() const { return HasDPP; } @@ -796,6 +841,22 @@ public: return HasDPP8; } + bool has64BitDPP() const { + return Has64BitDPP; + } + + bool hasPackedFP32Ops() const { + return HasPackedFP32Ops; + } + + bool hasFmaakFmamkF32Insts() const { + return getGeneration() >= GFX10; + } + + bool hasExtendedImageInsts() const { + return HasExtendedImageInsts; + } + bool hasR128A16() const { return HasR128A16; } @@ -818,6 +879,12 @@ public: bool hasNSAEncoding() const { return HasNSAEncoding; } + unsigned getNSAMaxSize() const { return NSAMaxSize; } + + bool hasGFX10_AEncoding() const { + return GFX10_AEncoding; + } + bool hasGFX10_BEncoding() const { return GFX10_BEncoding; } @@ -840,6 +907,12 @@ public: return SGPRInitBug; } + bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } + + bool hasNegativeUnalignedScratchOffsetBug() const { + return NegativeUnalignedScratchOffsetBug; + } + bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; } @@ -894,8 +967,17 @@ public: return HasNSAtoVMEMBug; } + bool hasNSAClauseBug() const { return HasNSAClauseBug; } + bool hasHardClauses() const { return getGeneration() >= GFX10; } + bool hasGFX90AInsts() const { return GFX90AInsts; } + + /// Return if operations acting on VGPR tuples require even alignment. + bool needsAlignedVGPRs() const { return GFX90AInsts; } + + bool hasPackedTID() const { return HasPackedTID; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -917,6 +999,10 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } + /// \returns true if the flat_scratch register is initialized by the HW. + /// In this case it is readonly. + bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } + /// \returns true if the machine has merged shaders in which s0-s7 are /// reserved by the hardware and user SGPRs start at s8 bool hasMergedShaders() const { @@ -955,9 +1041,24 @@ public: return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); } - /// \returns Reserved number of SGPRs for given function \p MF. + /// \returns Reserved number of SGPRs. This is common + /// utility function called by MachineFunction and + /// Function variants of getReservedNumSGPRs. + unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const; + /// \returns Reserved number of SGPRs for given machine function \p MF. unsigned getReservedNumSGPRs(const MachineFunction &MF) const; + /// \returns Reserved number of SGPRs for given function \p F. + unsigned getReservedNumSGPRs(const Function &F) const; + + /// \returns max num SGPRs. This is the common utility + /// function called by MachineFunction and Function + /// variants of getMaxNumSGPRs. 
+ unsigned getBaseMaxNumSGPRs(const Function &F, + std::pair<unsigned, unsigned> WavesPerEU, + unsigned PreloadedSGPRs, + unsigned ReservedNumSGPRs) const; + /// \returns Maximum number of SGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of SGPRs explicitly /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. @@ -968,6 +1069,16 @@ public: /// unit requirement. unsigned getMaxNumSGPRs(const MachineFunction &MF) const; + /// \returns Maximum number of SGPRs that meets number of waves per execution + /// unit requirement for function \p F, or number of SGPRs explicitly + /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. + unsigned getMaxNumSGPRs(const Function &F) const; + /// \returns VGPR allocation granularity supported by the subtarget. unsigned getVGPRAllocGranule() const { return AMDGPU::IsaInfo::getVGPRAllocGranule(this); @@ -1000,6 +1111,20 @@ public: return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); } + /// \returns max num VGPRs. This is the common utility function + /// called by MachineFunction and Function variants of getMaxNumVGPRs. + unsigned getBaseMaxNumVGPRs(const Function &F, + std::pair<unsigned, unsigned> WavesPerEU) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution + /// unit requirement for function \p F, or number of VGPRs explicitly + /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. + unsigned getMaxNumVGPRs(const Function &F) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 
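Earlier in this diff, GCNRegPressure.h changes getVGPRNum() to take a UnifiedVGPRFile flag, and the callers shown there pass ST.hasGFX90AInsts(), the accessor this GCNSubtarget.h hunk adds: on gfx90a a wave's VGPRs and AGPRs are carved out of one unified register file, with the architectural VGPR block rounded up to a multiple of 4 (alignTo(..., 4)) before the AGPRs are placed after it, while older targets keep separate files so only the larger of the two counts drives occupancy. A minimal standalone sketch of that accounting follows; the function names are invented for illustration and are not part of the patch.

#include <algorithm>
#include <cassert>

// Round N up to the next multiple of 4, like alignTo(N, 4) in the patch.
unsigned alignTo4(unsigned N) { return (N + 3) / 4 * 4; }

// VGPR pressure as counted by the new getVGPRNum(bool UnifiedVGPRFile).
unsigned vgprPressure(unsigned NumVGPRs, unsigned NumAGPRs,
                      bool UnifiedVGPRFile) {
  if (UnifiedVGPRFile)
    return NumAGPRs ? alignTo4(NumVGPRs) + NumAGPRs : NumVGPRs;
  return std::max(NumVGPRs, NumAGPRs);
}

int main() {
  // Separate files (pre-gfx90a): 10 VGPRs and 6 AGPRs cost max(10, 6) = 10.
  assert(vgprPressure(10, 6, /*UnifiedVGPRFile=*/false) == 10);
  // Unified file (gfx90a): the VGPR block pads to 12, then the 6 AGPRs
  // follow, for 18 registers taken from the shared file.
  assert(vgprPressure(10, 6, /*UnifiedVGPRFile=*/true) == 18);
  return 0;
}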
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 426648d19d55..bb2c298c2850 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -80,9 +80,12 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const auto *SymA = Target.getSymA(); assert(SymA); - Ctx.reportError(Fixup.getLoc(), - Twine("undefined label '") + SymA->getSymbol().getName() + "'"); - return ELF::R_AMDGPU_NONE; + if (SymA->getSymbol().isUndefined()) { + Ctx.reportError(Fixup.getLoc(), Twine("undefined label '") + + SymA->getSymbol().getName() + "'"); + return ELF::R_AMDGPU_NONE; + } + return ELF::R_AMDGPU_REL16; } llvm_unreachable("unhandled relocation type"); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index fbf7dc2a72db..9ba0ffbced3d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -10,6 +10,7 @@ #include "AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCExpr.h" @@ -146,7 +147,7 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, const MCInstrDesc &Desc = MII.get(MI->getOpcode()); bool IsFlatSeg = !(Desc.TSFlags & - (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)); + (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)); if (IsFlatSeg) { // Unsigned offset printU16ImmDecOperand(MI, OpNo, O); @@ -201,20 +202,19 @@ void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "gds"); } -void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - if (AMDGPU::isGFX10Plus(STI)) - printNamedBit(MI, OpNo, O, "dlc"); -} - -void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "glc"); -} - -void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "slc"); +void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm(); + if (Imm & CPol::GLC) + O << " glc"; + if (Imm & CPol::SLC) + O << " slc"; + if ((Imm & CPol::DLC) && AMDGPU::isGFX10Plus(STI)) + O << " dlc"; + if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI)) + O << " scc"; + if (Imm & ~CPol::ALL) + O << " /* unexpected cache policy bit */"; } void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, @@ -362,22 +362,30 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Opcode = MI->getOpcode(); + auto Flags = MII.get(Opcode).TSFlags; + if (OpNo == 0) { - if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) - O << "_e64 "; - else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) - O << "_dpp "; - else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA) - O << "_sdwa "; - else - O << "_e32 "; + if (Flags & SIInstrFlags::VOP3) { + if 
(!getVOP3IsSingle(Opcode)) + O << "_e64"; + } else if (Flags & SIInstrFlags::DPP) { + O << "_dpp"; + } else if (Flags & SIInstrFlags::SDWA) { + O << "_sdwa"; + } else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) || + ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) { + O << "_e32"; + } + O << " "; } printOperand(MI, OpNo, STI, O); // Print default vcc/vcc_lo operand. - switch (MI->getOpcode()) { + switch (Opcode) { default: break; case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10: @@ -601,6 +609,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: case MCOI::OPERAND_IMMEDIATE: printImmediate32(Op.getImm(), STI, O); break; @@ -608,6 +620,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: printImmediate64(Op.getImm(), STI, O); break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: @@ -656,18 +669,19 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, // custom printer. llvm_unreachable("unexpected immediate operand type"); } - } else if (Op.isFPImm()) { + } else if (Op.isDFPImm()) { + double Value = bit_cast<double>(Op.getDFPImm()); // We special case 0.0 because otherwise it will be printed as an integer. - if (Op.getFPImm() == 0.0) + if (Value == 0.0) O << "0.0"; else { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); int RCID = Desc.OpInfo[OpNo].RegClass; unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID)); if (RCBits == 32) - printImmediate32(FloatToBits(Op.getFPImm()), STI, O); + printImmediate32(FloatToBits(Value), STI, O); else if (RCBits == 64) - printImmediate64(DoubleToBits(Op.getFPImm()), STI, O); + printImmediate64(DoubleToBits(Value), STI, O); else llvm_unreachable("Invalid register class size"); } @@ -727,7 +741,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, if (OpNo + 1 < MI->getNumOperands() && (InputModifiers & SISrcMods::ABS) == 0) { const MCOperand &Op = MI->getOperand(OpNo + 1); - NegMnemo = Op.isImm() || Op.isFPImm(); + NegMnemo = Op.isImm() || Op.isDFPImm(); } if (NegMnemo) { O << "neg("; @@ -793,7 +807,16 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, using namespace AMDGPU::DPP; unsigned Imm = MI->getOperand(OpNo).getImm(); - if (Imm <= DppCtrl::QUAD_PERM_LAST) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + + if (Src0Idx >= 0 && + Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID && + !AMDGPU::isLegal64BitDPPControl(Imm)) { + O << " /* 64 bit dpp only supports row_newbcast */"; + return; + } else if (Imm <= DppCtrl::QUAD_PERM_LAST) { O << "quad_perm:["; O << formatDec(Imm & 0x3) << ','; O << formatDec((Imm & 0xc) >> 2) << ','; @@ -853,11 +876,15 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, O << "row_bcast:31"; } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) && (Imm <= DppCtrl::ROW_SHARE_LAST)) { - if (!AMDGPU::isGFX10Plus(STI)) { - O << "/* row_share is not supported on ASICs earlier than GFX10 */"; + if (AMDGPU::isGFX90A(STI)) { + O << "row_newbcast:"; + 
} else if (AMDGPU::isGFX10Plus(STI)) { + O << "row_share:"; + } else { + O << " /* row_newbcast/row_share is not supported on ASICs earlier " + "than GFX90A/GFX10 */"; return; } - O << "row_share:"; printU4ImmDecOperand(MI, OpNo, O); } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) && (Imm <= DppCtrl::ROW_XMASK_LAST)) { @@ -891,7 +918,7 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); if (Imm) { - O << " bound_ctrl:0"; // XXX - this syntax is used in sp3 + O << " bound_ctrl:1"; } } @@ -1236,8 +1263,8 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, decodeMsg(Imm16, MsgId, OpId, StreamId); if (isValidMsgId(MsgId, STI) && - isValidMsgOp(MsgId, OpId) && - isValidMsgStream(MsgId, OpId, StreamId)) { + isValidMsgOp(MsgId, OpId, STI) && + isValidMsgStream(MsgId, OpId, StreamId, STI)) { O << "sendmsg(" << getMsgName(MsgId); if (msgRequiresOp(MsgId)) { O << ", " << getMsgOpName(MsgId, OpId); @@ -1560,12 +1587,12 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } else if (Op.isImm()) { O << Op.getImm(); - } else if (Op.isFPImm()) { + } else if (Op.isDFPImm()) { // We special case 0.0 because otherwise it will be printed as an integer. - if (Op.getFPImm() == 0.0) + if (Op.getDFPImm() == 0.0) O << "0.0"; else { - O << Op.getFPImm(); + O << bit_cast<double>(Op.getDFPImm()); } } else if (Op.isExpr()) { const MCExpr *Exp = Op.getExpr(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 8d13aa682211..3cb4fcb28cb0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -68,12 +68,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); + void printCPol(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 1836237c8df5..5c728bd86817 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -42,6 +42,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, HasNoDeadStrip = true; //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; + UsesCFIForDebug = true; DwarfRegNumForCFI = true; UseIntegratedAssembler = false; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index f0eb11b70c97..9a9a2c973f44 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -31,6 +31,20 @@ using namespace llvm::AMDGPU; // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// +static void 
convertIsaVersionV2(uint32_t &Major, uint32_t &Minor, + uint32_t &Stepping, bool Sramecc, bool Xnack) { + if (Major == 9 && Minor == 0) { + switch (Stepping) { + case 0: + case 2: + case 4: + case 6: + if (Xnack) + Stepping++; + } + } +} + bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { HSAMD::Metadata HSAMetadata; if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) @@ -86,14 +100,18 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013: AK = GK_GFX1013; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031: AK = GK_GFX1031; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032: AK = GK_GFX1032; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: AK = GK_GFX1034; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: AK = GK_GFX1035; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } @@ -145,14 +163,18 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908; case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; + case GK_GFX1013: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013; case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030; case GK_GFX1031: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031; case GK_GFX1032: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032; case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033; + case GK_GFX1034: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034; + case GK_GFX1035: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -180,8 +202,8 @@ void AMDGPUTargetAsmStreamer::finish() { getPALMetadata()->reset(); } -void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) { - OS << "\t.amdgcn_target \"" << Target << "\"\n"; +void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() { + OS << "\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n"; } void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion( @@ -191,15 +213,14 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion( } void -AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, - uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName) { - OS << "\t.hsa_code_object_isa " << - Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) << - ",\"" << VendorName << "\",\"" << ArchName << "\"\n"; - +AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, + uint32_t Minor, + uint32_t Stepping, + StringRef VendorName, + StringRef ArchName) { + 
convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny()); + OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << "," + << Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n"; } void @@ -225,8 +246,8 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, << Alignment.value() << '\n'; } -bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) { - OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n"; +bool AMDGPUTargetAsmStreamer::EmitISAVersion() { + OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n"; return true; } @@ -258,17 +279,32 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { +bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; - OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; - OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n'; + const uint32_t Encoded_s_nop = 0xbf800000; + uint32_t Encoded_pad = Encoded_s_code_end; + + // Instruction cache line size in bytes. + const unsigned Log2CacheLineSize = 6; + const unsigned CacheLineSize = 1u << Log2CacheLineSize; + + // Extra padding amount in bytes to support prefetch mode 3. + unsigned FillSize = 3 * CacheLineSize; + + if (AMDGPU::isGFX90A(STI)) { + Encoded_pad = Encoded_s_nop; + FillSize = 16 * CacheLineSize; + } + + OS << "\t.p2alignl " << Log2CacheLineSize << ", " << Encoded_pad << '\n'; + OS << "\t.fill " << (FillSize / 4) << ", 4, " << Encoded_pad << '\n'; return true; } void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) { + bool ReserveVCC, bool ReserveFlatScr) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); OS << "\t.amdhsa_kernel " << KernelName << '\n'; @@ -281,10 +317,13 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( << '\n'; OS << "\t\t.amdhsa_private_segment_fixed_size " << KD.private_segment_fixed_size << '\n'; + OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n'; - PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + if (!hasArchitectedFlatScratch(STI)) + PRINT_FIELD( + OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); @@ -297,9 +336,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (!hasArchitectedFlatScratch(STI)) + PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -307,10 +347,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( 
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); - PRINT_FIELD( - OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); + PRINT_FIELD(OS, + (hasArchitectedFlatScratch(STI) + ? ".amdhsa_enable_private_segment" + : ".amdhsa_system_sgpr_private_segment_wavefront_offset"), + KD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); @@ -331,12 +373,30 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; + if (AMDGPU::isGFX90A(STI)) + OS << "\t\t.amdhsa_accum_offset " << + (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4 + << '\n'; + if (!ReserveVCC) OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; - if (IVersion.Major >= 7 && !ReserveFlatScr) + if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI)) OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; - if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI)) - OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n'; + + if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) { + switch (*HsaAbiVer) { + default: + break; + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + break; + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + if (getTargetID()->isXnackSupported()) + OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n'; + break; + } + } PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD, compute_pgm_rsrc1, @@ -360,6 +420,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + if (AMDGPU::isGFX90A(STI)) + PRINT_FIELD(OS, ".amdhsa_tg_split", KD, + compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT); if (IVersion.Major >= 10) { PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, compute_pgm_rsrc1, @@ -405,23 +469,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI) - : AMDGPUTargetStreamer(S), Streamer(S), Os(STI.getTargetTriple().getOS()) { - MCAssembler &MCA = getStreamer().getAssembler(); - unsigned EFlags = MCA.getELFHeaderEFlags(); - - EFlags &= ~ELF::EF_AMDGPU_MACH; - EFlags |= getElfMach(STI.getCPU()); - - EFlags &= ~ELF::EF_AMDGPU_XNACK; - if (AMDGPU::hasXNACK(STI)) - EFlags |= ELF::EF_AMDGPU_XNACK; - - EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC; - if (AMDGPU::hasSRAMECC(STI)) - EFlags |= ELF::EF_AMDGPU_SRAM_ECC; - - MCA.setELFHeaderEFlags(EFlags); -} + : AMDGPUTargetStreamer(S), STI(STI), Streamer(S) {} MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast<MCELFStreamer &>(Streamer); @@ -431,6 +479,9 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { // We use it for emitting the accumulated PAL metadata as a .note record. // The PAL metadata is reset after it is emitted. 
void AMDGPUTargetELFStreamer::finish() { + MCAssembler &MCA = getStreamer().getAssembler(); + MCA.setELFHeaderEFlags(getEFlags()); + std::string Blob; const char *Vendor = getPALMetadata()->getVendor(); unsigned Type = getPALMetadata()->getType(); @@ -456,7 +507,7 @@ void AMDGPUTargetELFStreamer::EmitNote( unsigned NoteFlags = 0; // TODO Apparently, this is currently needed for OpenCL as mentioned in // https://reviews.llvm.org/D74995 - if (Os == Triple::AMDHSA) + if (STI.getTargetTriple().getOS() == Triple::AMDHSA) NoteFlags = ELF::SHF_ALLOC; S.PushSection(); @@ -472,24 +523,150 @@ void AMDGPUTargetELFStreamer::EmitNote( S.PopSection(); } -void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {} +unsigned AMDGPUTargetELFStreamer::getEFlags() { + switch (STI.getTargetTriple().getArch()) { + default: + llvm_unreachable("Unsupported Arch"); + case Triple::r600: + return getEFlagsR600(); + case Triple::amdgcn: + return getEFlagsAMDGCN(); + } +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsR600() { + assert(STI.getTargetTriple().getArch() == Triple::r600); + + return getElfMach(STI.getCPU()); +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsAMDGCN() { + assert(STI.getTargetTriple().getArch() == Triple::amdgcn); + + switch (STI.getTargetTriple().getOS()) { + default: + // TODO: Why are some tests have "mingw" listed as OS? + // llvm_unreachable("Unsupported OS"); + case Triple::UnknownOS: + return getEFlagsUnknownOS(); + case Triple::AMDHSA: + return getEFlagsAMDHSA(); + case Triple::AMDPAL: + return getEFlagsAMDPAL(); + case Triple::Mesa3D: + return getEFlagsMesa3D(); + } +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() { + // TODO: Why are some tests have "mingw" listed as OS? + // assert(STI.getTargetTriple().getOS() == Triple::UnknownOS); + + return getEFlagsV3(); +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() { + assert(STI.getTargetTriple().getOS() == Triple::AMDHSA); + + if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) { + switch (*HsaAbiVer) { + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + return getEFlagsV3(); + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + return getEFlagsV4(); + } + } + + llvm_unreachable("HSA OS ABI Version identification must be defined"); +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() { + assert(STI.getTargetTriple().getOS() == Triple::AMDPAL); + + return getEFlagsV3(); +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsMesa3D() { + assert(STI.getTargetTriple().getOS() == Triple::Mesa3D); + + return getEFlagsV3(); +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsV3() { + unsigned EFlagsV3 = 0; + + // mach. + EFlagsV3 |= getElfMach(STI.getCPU()); + + // xnack. + if (getTargetID()->isXnackOnOrAny()) + EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_XNACK_V3; + // sramecc. + if (getTargetID()->isSramEccOnOrAny()) + EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_V3; + + return EFlagsV3; +} + +unsigned AMDGPUTargetELFStreamer::getEFlagsV4() { + unsigned EFlagsV4 = 0; + + // mach. + EFlagsV4 |= getElfMach(STI.getCPU()); + + // xnack. 
+ switch (getTargetID()->getXnackSetting()) { + case AMDGPU::IsaInfo::TargetIDSetting::Unsupported: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4; + break; + case AMDGPU::IsaInfo::TargetIDSetting::Any: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4; + break; + case AMDGPU::IsaInfo::TargetIDSetting::Off: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4; + break; + case AMDGPU::IsaInfo::TargetIDSetting::On: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4; + break; + } + // sramecc. + switch (getTargetID()->getSramEccSetting()) { + case AMDGPU::IsaInfo::TargetIDSetting::Unsupported: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4; + break; + case AMDGPU::IsaInfo::TargetIDSetting::Any: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ANY_V4; + break; + case AMDGPU::IsaInfo::TargetIDSetting::Off: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4; + break; + case AMDGPU::IsaInfo::TargetIDSetting::On: + EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ON_V4; + break; + } + + return EFlagsV4; +} + +void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {} void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( uint32_t Major, uint32_t Minor) { EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()), - ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { + ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { OS.emitInt32(Major); OS.emitInt32(Minor); }); } void -AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, - uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName) { +AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, + uint32_t Minor, + uint32_t Stepping, + StringRef VendorName, + StringRef ArchName) { uint16_t VendorNameSize = VendorName.size() + 1; uint16_t ArchNameSize = ArchName.size() + 1; @@ -497,8 +674,9 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + VendorNameSize + ArchNameSize; + convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny()); EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()), - ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) { + ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) { OS.emitInt16(VendorNameSize); OS.emitInt16(ArchNameSize); OS.emitInt32(Major); @@ -546,7 +724,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, SymbolELF->setSize(MCConstantExpr::create(Size, getContext())); } -bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { +bool AMDGPUTargetELFStreamer::EmitISAVersion() { // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. 
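getEFlagsV4 above folds the TargetID's XNACK and SRAMECC settings into the ELF header e_flags on top of the EF_AMDGPU_MACH_* selector, one UNSUPPORTED/ANY/OFF/ON encoding per feature. A rough standalone sketch of that composition; the bit positions below are placeholders standing in for the EF_AMDGPU_FEATURE_{XNACK,SRAMECC}_*_V4 constants in llvm/BinaryFormat/ELF.h:

#include <cstdint>
#include <cstdio>

// Stand-in for AMDGPU::IsaInfo::TargetIDSetting.
enum class Setting { Unsupported, Any, Off, On };

// Placeholder bit positions; the real values come from ELF.h.
static uint32_t xnackBitsV4(Setting S) {
  return static_cast<uint32_t>(S) << 8;  // EF_AMDGPU_FEATURE_XNACK_*_V4
}
static uint32_t sramEccBitsV4(Setting S) {
  return static_cast<uint32_t>(S) << 10; // EF_AMDGPU_FEATURE_SRAMECC_*_V4
}

// V4 e_flags = mach selector | XNACK setting | SRAMECC setting.
static uint32_t buildEFlagsV4(uint32_t MachBits, Setting Xnack, Setting SramEcc) {
  return MachBits | xnackBitsV4(Xnack) | sramEccBitsV4(SramEcc);
}

int main() {
  // e.g. some EF_AMDGPU_MACH_* value with XNACK explicitly on and SRAMECC "any".
  std::printf("e_flags = %#x\n",
              (unsigned)buildEFlagsV4(/*MachBits=*/0x3f, Setting::On, Setting::Any));
  return 0;
}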
auto &Context = getContext(); @@ -556,10 +734,10 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { MCSymbolRefExpr::create(DescEnd, Context), MCSymbolRefExpr::create(DescBegin, Context), Context); - EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA, + EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_ISA_NAME, [&](MCELFStreamer &OS) { OS.emitLabel(DescBegin); - OS.emitBytes(IsaVersionString); + OS.emitBytes(getTargetID()->toString()); OS.emitLabel(DescEnd); }); return true; @@ -607,7 +785,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( MCSymbolRefExpr::create(DescEnd, Context), MCSymbolRefExpr::create(DescBegin, Context), Context); - EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA, + EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA, [&](MCELFStreamer &OS) { OS.emitLabel(DescBegin); OS.emitBytes(HSAMetadataString); @@ -616,14 +794,28 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( return true; } -bool AMDGPUTargetELFStreamer::EmitCodeEnd() { +bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; + const uint32_t Encoded_s_nop = 0xbf800000; + uint32_t Encoded_pad = Encoded_s_code_end; + + // Instruction cache line size in bytes. + const unsigned Log2CacheLineSize = 6; + const unsigned CacheLineSize = 1u << Log2CacheLineSize; + + // Extra padding amount in bytes to support prefetch mode 3. + unsigned FillSize = 3 * CacheLineSize; + + if (AMDGPU::isGFX90A(STI)) { + Encoded_pad = Encoded_s_nop; + FillSize = 16 * CacheLineSize; + } MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.emitValueToAlignment(64, Encoded_s_code_end, 4); - for (unsigned I = 0; I < 48; ++I) - OS.emitInt32(Encoded_s_code_end); + OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4); + for (unsigned I = 0; I < FillSize; I += 4) + OS.emitInt32(Encoded_pad); OS.PopSection(); return true; } @@ -631,8 +823,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - bool ReserveXNACK) { + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); @@ -659,8 +850,11 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( Streamer.emitLabel(KernelDescriptorSymbol); Streamer.emitInt32(KernelDescriptor.group_segment_fixed_size); Streamer.emitInt32(KernelDescriptor.private_segment_fixed_size); + Streamer.emitInt32(KernelDescriptor.kernarg_size); + for (uint8_t Res : KernelDescriptor.reserved0) Streamer.emitInt8(Res); + // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. 
The // expression being created is: // (start of kernel code) - (start of kernel descriptor) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 1ad64532931c..cef34a5e5a59 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUPALMetadata.h" #include "llvm/MC/MCStreamer.h" @@ -23,6 +24,7 @@ class MCSymbol; class MDNode; class Module; class Type; +class formatted_raw_ostream; namespace AMDGPU { namespace HSAMD { @@ -38,6 +40,9 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { AMDGPUPALMetadata PALMetadata; protected: + // TODO: Move HSAMetadataStream to AMDGPUTargetStreamer. + Optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID; + MCContext &getContext() const { return Streamer.getContext(); } public: @@ -45,15 +50,15 @@ public: AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; } - virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0; + virtual void EmitDirectiveAMDGCNTarget() = 0; virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) = 0; - virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName) = 0; + virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, + uint32_t Stepping, + StringRef VendorName, + StringRef ArchName) = 0; virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; @@ -63,7 +68,7 @@ public: Align Alignment) = 0; /// \returns True on success, false on failure. - virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; + virtual bool EmitISAVersion() = 0; /// \returns True on success, false on failure. virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString); @@ -84,16 +89,32 @@ public: virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; /// \returns True on success, false on failure. 
- virtual bool EmitCodeEnd() = 0; + virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) = 0; virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - bool ReserveXNACK) = 0; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) = 0; static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); + + const Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() const { + return TargetID; + } + Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() { + return TargetID; + } + void initializeTargetID(const MCSubtargetInfo &STI) { + assert(TargetID == None && "TargetID can only be initialized once"); + TargetID.emplace(STI); + } + void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) { + initializeTargetID(STI); + + assert(getTargetID() != None && "TargetID is None"); + getTargetID()->setTargetIDFromFeaturesString(FeatureString); + } }; class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { @@ -103,14 +124,14 @@ public: void finish() override; - void EmitDirectiveAMDGCNTarget(StringRef Target) override; + void EmitDirectiveAMDGCNTarget() override; void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) override; - void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, - uint32_t Stepping, StringRef VendorName, - StringRef ArchName) override; + void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, + uint32_t Stepping, StringRef VendorName, + StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; @@ -119,7 +140,7 @@ public: void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; /// \returns True on success, false on failure. - bool EmitISAVersion(StringRef IsaVersionString) override; + bool EmitISAVersion() override; /// \returns True on success, false on failure. bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; @@ -128,22 +149,34 @@ public: bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. 
- bool EmitCodeEnd() override; + bool EmitCodeEnd(const MCSubtargetInfo &STI) override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - bool ReserveXNACK) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { + const MCSubtargetInfo &STI; MCStreamer &Streamer; - Triple::OSType Os; void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType, function_ref<void(MCELFStreamer &)> EmitDesc); + unsigned getEFlags(); + + unsigned getEFlagsR600(); + unsigned getEFlagsAMDGCN(); + + unsigned getEFlagsUnknownOS(); + unsigned getEFlagsAMDHSA(); + unsigned getEFlagsAMDPAL(); + unsigned getEFlagsMesa3D(); + + unsigned getEFlagsV3(); + unsigned getEFlagsV4(); + public: AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); @@ -151,14 +184,14 @@ public: void finish() override; - void EmitDirectiveAMDGCNTarget(StringRef Target) override; + void EmitDirectiveAMDGCNTarget() override; void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) override; - void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor, - uint32_t Stepping, StringRef VendorName, - StringRef ArchName) override; + void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, + uint32_t Stepping, StringRef VendorName, + StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; @@ -167,7 +200,7 @@ public: void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; /// \returns True on success, false on failure. - bool EmitISAVersion(StringRef IsaVersionString) override; + bool EmitISAVersion() override; /// \returns True on success, false on failure. bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; @@ -176,13 +209,12 @@ public: bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. 
- bool EmitCodeEnd() override; + bool EmitCodeEnd(const MCSubtargetInfo &STI) override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - bool ReserveXNACK) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; }; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 1a1ffcda3b4e..dbce4b2e872c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -71,6 +71,9 @@ public: unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + +private: + uint64_t getImplicitOpSelHiEncoding(int Opcode) const; }; } // end anonymous namespace @@ -219,7 +222,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, Imm = C->getValue(); } else { - assert(!MO.isFPImm()); + assert(!MO.isDFPImm()); if (!MO.isImm()) return ~0; @@ -234,12 +237,17 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: return getLit32Encoding(static_cast<uint32_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return getLit64Encoding(static_cast<uint64_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT16: @@ -274,16 +282,40 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, } } +uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { + using namespace AMDGPU::VOP3PEncoding; + using namespace AMDGPU::OpName; + + if (AMDGPU::getNamedOperandIdx(Opcode, op_sel_hi) != -1) { + if (AMDGPU::getNamedOperandIdx(Opcode, src2) != -1) + return 0; + if (AMDGPU::getNamedOperandIdx(Opcode, src1) != -1) + return OP_SEL_HI_2; + if (AMDGPU::getNamedOperandIdx(Opcode, src0) != -1) + return OP_SEL_HI_1 | OP_SEL_HI_2; + } + return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; +} + void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { verifyInstructionPredicates(MI, computeAvailableFeatures(STI.getFeatureBits())); + int Opcode = MI.getOpcode(); uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + const MCInstrDesc &Desc = MCII.get(Opcode); unsigned bytes = Desc.getSize(); + // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. + // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel. 
+ if ((Desc.TSFlags & SIInstrFlags::VOP3P) || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) { + Encoding |= getImplicitOpSelHiEncoding(Opcode); + } + for (unsigned i = 0; i < bytes; i++) { OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); } @@ -431,6 +463,7 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 54c8cdf196ac..bacb790aac62 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -11,12 +11,14 @@ // // - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8) // - MIMGEncGfx8: encoding introduced with gfx8 for atomics -// - MIMGEncGfx10Default: gfx default (non-NSA) encoding +// - MIMGEncGfx90a: encoding for gfx90a for atomics +// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding // - MIMGEncGfx10NSA: gfx10 NSA encoding class MIMGEncoding; def MIMGEncGfx6 : MIMGEncoding; def MIMGEncGfx8 : MIMGEncoding; +def MIMGEncGfx90a : MIMGEncoding; def MIMGEncGfx10Default : MIMGEncoding; def MIMGEncGfx10NSA : MIMGEncoding; @@ -39,6 +41,8 @@ class MIMGBaseOpcode : PredicateControl { bit Coordinates = 1; bit LodOrClampOrMip = 0; bit HasD16 = 0; + bit IsAtomicRet = 0; + bit MSAA = 0; } def MIMGBaseOpcode : GenericEnum { @@ -50,7 +54,7 @@ def MIMGBaseOpcodesTable : GenericTable { let CppTypeName = "MIMGBaseOpcodeInfo"; let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", - "LodOrClampOrMip", "HasD16"]; + "LodOrClampOrMip", "HasD16", "MSAA"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; let PrimaryKey = ["BaseOpcode"]; @@ -64,7 +68,7 @@ def MIMGDim : GenericEnum { def MIMGDimInfoTable : GenericTable { let FilterClass = "AMDGPUDimProps"; let CppTypeName = "MIMGDimInfo"; - let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"]; + let Fields = ["Dim", "NumCoords", "NumGradients", "MSAA", "DA", "Encoding", "AsmSuffix"]; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Dim"]; @@ -81,9 +85,17 @@ def getMIMGDimInfoByAsmSuffix : SearchIndex { let Key = ["AsmSuffix"]; } -class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> { - field bits<8> SI_GFX10 = si_gfx10; - field bits<8> VI = vi; +def MIMG { + int NOP = -1; +} + +class mimgopc <int base, int vi = base, int si = base> { + field bits<8> BASE = base; // Opcode for all but atomics + field bits<8> VI = vi; // VI is only used for atomic instructions + field bits<8> SI = si; // SI is only used for atomic instructions + bit HAS_BASE = !ne(base, MIMG.NOP); + bit HAS_VI = !ne(vi, MIMG.NOP); + bit HAS_SI = !ne(si, MIMG.NOP); } class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> { @@ -198,14 +210,24 @@ class MIMGNSAHelper<int num_addrs> { // Base class of all pre-gfx10 MIMG instructions. 
class MIMG_gfx6789<bits<8> op, dag outs, string dns = ""> : MIMG<outs, dns>, MIMGe_gfx6789<op> { - let SubtargetPredicate = isGFX6GFX7GFX8GFX9; - let AssemblerPredicate = isGFX6GFX7GFX8GFX9; + let SubtargetPredicate = isGFX6GFX7GFX8GFX9NotGFX90A; + let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx6; let d16 = !if(BaseOpcode.HasD16, ?, 0); } +class MIMG_gfx90a<bits<8> op, dag outs, string dns = ""> + : MIMG<outs, dns>, MIMGe_gfx90a<op> { + let SubtargetPredicate = isGFX90APlus; + let AssemblerPredicate = isGFX90APlus; + + let MIMGEncoding = MIMGEncGfx90a; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); +} + // Base class of all non-NSA gfx10 MIMG instructions. class MIMG_gfx10<int op, dag outs, string dns = ""> : MIMG<outs, dns>, MIMGe_gfx10<op> { @@ -218,8 +240,8 @@ class MIMG_gfx10<int op, dag outs, string dns = ""> let nsa = 0; } -// Base class for all NSA MIMG instructions. Note that 1-dword addresses always -// use non-NSA variants. +// Base class for all NSA MIMG instructions. +// Note that 1-dword addresses always use non-NSA variants. class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns=""> : MIMG<outs, dns>, MIMGe_gfx10<op> { let SubtargetPredicate = isGFX10Plus; @@ -235,169 +257,229 @@ class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns=""> let nsa = nsah.NSA; } -class MIMG_NoSampler_Helper <bits<8> op, string asm, +class MIMG_NoSampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> { + : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_NoSampler_gfx10<int op, string opcode, +class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm, + RegisterClass dst_rc, + RegisterClass addr_rc, + string dns=""> + : MIMG_gfx90a <op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> - : MIMG_gfx10<op, (outs DataRC:$vdata), dns> { + : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, - Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_NoSampler_nsa_gfx10<int op, string opcode, +class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int 
num_addrs, string dns=""> - : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> { + : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, - Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } -multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm, +multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, RegisterClass dst_rc, - bit enableDisasm> { + bit enableDisasm, + bit ExtendedImageInst = 1> { let ssamp = 0 in { let VAddrDwords = 1 in { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; - def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; + if op.HAS_BASE then { + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + if !not(ExtendedImageInst) then + def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "GFX90A", "")>; + def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } } let VAddrDwords = 2 in { - def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; - def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>; - def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>; + if op.HAS_BASE then { + def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; + if !not(ExtendedImageInst) then + def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>; + def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>; + def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>; + } } let VAddrDwords = 3 in { - def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; - def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>; - def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>; + if op.HAS_BASE then { + def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; + if !not(ExtendedImageInst) then + def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>; + def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>; + def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>; + } } let VAddrDwords = 4 in { - def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; - def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>; - def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4, - !if(enableDisasm, "AMDGPU", "")>; + if op.HAS_BASE then { + def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; + if !not(ExtendedImageInst) then + def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>; + def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>; + def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } } } } -multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0, - bit isResInfo = 0> { +multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0, + bit isResInfo = 0, + bit msaa = 0> { def "" : MIMGBaseOpcode { let Coordinates 
= !not(isResInfo); let LodOrClampOrMip = mip; let HasD16 = has_d16; + let MSAA = msaa; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), mayLoad = !not(isResInfo) in { let VDataDwords = 1 in - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>; let VDataDwords = 2 in - defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>; + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>; let VDataDwords = 3 in - defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>; let VDataDwords = 4 in - defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>; let VDataDwords = 5 in - defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>; + defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>; } } -class MIMG_Store_Helper <bits<8> op, string asm, +class MIMG_Store_Helper <mimgopc op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns = ""> - : MIMG_gfx6789<op, (outs), dns> { + : MIMG_gfx6789<op.BASE, (outs), dns> { let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_Store_gfx10<int op, string opcode, +class MIMG_Store_Helper_gfx90a <mimgopc op, string asm, + RegisterClass data_rc, + RegisterClass addr_rc, + string dns = ""> + : MIMG_gfx90a<op.BASE, (outs), dns> { + let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata, + addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> - : MIMG_gfx10<op, (outs), dns> { + : MIMG_gfx10<op.BASE, (outs), dns> { let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, - DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, - GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_Store_nsa_gfx10<int op, string opcode, +class MIMG_Store_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, string dns=""> - : MIMG_nsa_gfx10<op, (outs), num_addrs, dns> { + : MIMG_nsa_gfx10<op.BASE, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, - Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), 
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } -multiclass MIMG_Store_Addr_Helper <int op, string asm, +multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, RegisterClass data_rc, bit enableDisasm> { let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0, DisableWQM = 1, ssamp = 0 in { let VAddrDwords = 1 in { - def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; - def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; + if op.HAS_BASE then { + def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "GFX90A", "")>; + def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } } let VAddrDwords = 2 in { - def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; - def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; - def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; + if op.HAS_BASE then { + def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; + def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>; + def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; + def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; + } } let VAddrDwords = 3 in { - def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; - def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; - def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; + if op.HAS_BASE then { + def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; + def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>; + def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; + def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; + } } let VAddrDwords = 4 in { - def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; - def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; - def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, - !if(enableDisasm, "AMDGPU", "")>; + if op.HAS_BASE then { + def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; + def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>; + def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; + def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } } } } -multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> { +multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { def "" : MIMGBaseOpcode { let Store = 1; let LodOrClampOrMip = mip; @@ -425,43 +507,63 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc, let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"; + let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; +} + +class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, string dns=""> + : 
MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata, + addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, LWE:$lwe, DA:$da); + let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } -class MIMG_Atomic_si<mimg op, string asm, RegisterClass data_rc, +class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc, RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI_GFX10, asm, data_rc, addr_rc, + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } -class MIMG_Atomic_vi<mimg op, string asm, RegisterClass data_rc, +class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc, RegisterClass addr_rc, bit enableDasm = 0> : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { - let AssemblerPredicate = isGFX8GFX9; + let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } -class MIMG_Atomic_gfx10<mimg op, string opcode, +class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + let AssemblerPredicate = isGFX90APlus; + let MIMGEncoding = MIMGEncGfx90a; +} + +class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, bit enableDisasm = 0> - : MIMG_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), + : MIMG_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, - DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, - GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"; + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe); + let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } -class MIMG_Atomic_nsa_gfx10<mimg op, string opcode, +class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, bit enableDisasm = 0> - : MIMG_nsa_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), num_addrs, + : MIMG_nsa_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -469,95 +571,137 @@ class MIMG_Atomic_nsa_gfx10<mimg op, string opcode, let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, - Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe)); - let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"; + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } -multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm, +multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterClass data_rc, - bit enableDasm = 0> { + bit enableDasm = 0, + bit 
isFP = 0> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - ssamp = 0 in { + ssamp = 0, FPAtomic = isFP in { let VAddrDwords = 1 in { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + if op.HAS_SI then { + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + } + if op.HAS_VI then { + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + } + if op.HAS_BASE then { + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + } } let VAddrDwords = 2 in { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + if op.HAS_SI then { + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + } + if op.HAS_VI then { + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>; + } + if op.HAS_BASE then { + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + } } let VAddrDwords = 3 in { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + if op.HAS_SI then { + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + } + if op.HAS_VI then { + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>; + } + if op.HAS_BASE then { + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + } } let VAddrDwords = 4 in { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + if op.HAS_SI then { + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + } + if op.HAS_VI then { + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>; + } + if op.HAS_BASE then { + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + } } } } -multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics - def "" : MIMGBaseOpcode { - let Atomic = 1; - let AtomicX2 = isCmpSwap; - } +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> { // 64-bit atomics + let IsAtomicRet = 1 in { + def "" : MIMGBaseOpcode { + let Atomic = 1; + let AtomicX2 = isCmpSwap; + } - let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { - // _V* variants have different dst size, but the size is encoded implicitly, - // using dmask and tfe. Only 32-bit variant is registered with disassembler. 
- // Other variants are reconstructed by disassembler using dmask and tfe. - let VDataDwords = !if(isCmpSwap, 2, 1) in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>; - let VDataDwords = !if(isCmpSwap, 4, 2) in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>; - } + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { + // _V* variants have different dst size, but the size is encoded implicitly, + // using dmask and tfe. Only 32-bit variant is registered with disassembler. + // Other variants are reconstructed by disassembler using dmask and tfe. + let VDataDwords = !if(isCmpSwap, 2, 1) in + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>; + let VDataDwords = !if(isCmpSwap, 4, 2) in + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>; + } + } // End IsAtomicRet = 1 } -class MIMG_Sampler_Helper <bits<8> op, string asm, RegisterClass dst_rc, +class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> - : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> { + : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da" + let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$cpol$r128$tfe$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_Sampler_gfx10<int op, string opcode, +class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc, + RegisterClass src_rc, string dns=""> + : MIMG_gfx90a<op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { + let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$cpol$r128$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> - : MIMG_gfx10<op, (outs DataRC:$vdata), dns> { + : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, - DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, - GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm" - #"$dlc$glc$slc$r128$a16$tfe$lwe" + #"$cpol$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMG_Sampler_nsa_gfx10<int op, string opcode, +class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, string dns=""> - : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> { + : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, - Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + Dim:$dim, UNorm:$unorm, 
CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm" - #"$dlc$glc$slc$r128$a16$tfe$lwe" + #"$cpol$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -569,8 +713,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> { !if(!eq(NumWords, 2), VReg_64, !if(!eq(NumWords, 3), VReg_96, !if(!eq(NumWords, 4), VReg_128, + !if(!eq(NumWords, 5), VReg_160, + !if(!eq(NumWords, 6), VReg_192, + !if(!eq(NumWords, 7), VReg_224, !if(!le(NumWords, 8), VReg_256, - !if(!le(NumWords, 16), VReg_512, ?))))))); + !if(!le(NumWords, 16), VReg_512, ?)))))))))); // Whether the instruction variant with this vaddr size should be enabled for // the auto-generated disassembler. @@ -588,9 +735,9 @@ class isRangeInList<int min, int max, list<int> lst> { bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max)))); } -class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> { - list<MIMGAddrSize> List = lst; - int Min = min; +class MIMGAddrSizes_dw_range<list<int> range> { + int Min = !head(range); + int Max = !if(!empty(!tail(range)), Min, !head(!tail(range))); } class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { @@ -600,8 +747,8 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { list<int> AllNumAddrWords = !foreach(dw, !if(sample.Gradients, !if(!eq(sample.LodOrClamp, ""), - [2, 3, 4, 5, 6, 7, 9], - [2, 3, 4, 5, 7, 8, 10]), + [2, 3, 4, 5, 6, 7, 8, 9], + [2, 3, 4, 5, 6, 7, 8, 9, 10]), !if(!eq(sample.LodOrClamp, ""), [1, 2, 3], [1, 2, 3, 4])), @@ -611,12 +758,17 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { // required numbers of address words. The disassembler defaults to the // smallest register class. list<MIMGAddrSize> MachineInstrs = - !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw, - !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret, - MIMGAddrSizes_tmp< - !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]), - !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords - lhs)).List; + !foldl([]<MIMGAddrSize>, + !foreach(range, + // V4 is generated for V3 and V4 + // V8 is generated for V5 through V8 + // V16 is generated for V9 through V16 + [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]], + MIMGAddrSizes_dw_range<range>), + lhs, dw, + !if(isRangeInList<dw.Min, dw.Max, AllNumAddrWords>.ret, + !listconcat(lhs, [MIMGAddrSize<dw.Max, !empty(lhs)>]), + lhs)); // For NSA, generate machine instructions for all possible numbers of words // except 1 (which is already covered by the non-NSA case). 
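With the rewritten address-size ranges above, the non-NSA sampler variants now cover every address size up to 8 dwords exactly (including the new 160/192/224-bit classes), 3-dword addresses additionally keep a 4-dword variant for codegen, and 9 through 16 dwords still share the 512-bit class. A rough sketch of the smallest variant implied by that reading; it is an illustration of the ranges, not a helper from the patch:

#include <cstdio>

// Smallest non-NSA vaddr variant for a required number of address dwords,
// per the [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]] ranges above:
// exact variants up to 8 dwords, then everything rounds up to _V16.
static unsigned smallestVAddrVariant(unsigned NumAddrDwords) {
  if (NumAddrDwords <= 8)
    return NumAddrDwords; // _V1.._V8 (VGPR_32 .. VReg_256)
  return 16;              // _V16 (VReg_512) covers 9..16 address dwords
}

int main() {
  for (unsigned N : {1u, 3u, 5u, 7u, 9u, 12u, 16u})
    std::printf("%2u address dwords -> _V%u\n", N, smallestVAddrVariant(N));
  return 0;
}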
@@ -632,25 +784,34 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { lhs)))); } -multiclass MIMG_Sampler_Src_Helper <bits<8> op, string asm, +multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, AMDGPUSampleVariant sample, RegisterClass dst_rc, - bit enableDisasm = 0> { + bit enableDisasm = 0, + bit ExtendedImageInst = 1> { foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in { let VAddrDwords = addr.NumWords in { - def _V # addr.NumWords - : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, - !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; - def _V # addr.NumWords # _gfx10 - : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass, - !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + if op.HAS_BASE then { + def _V # addr.NumWords + : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + if !not(ExtendedImageInst) then + def _V # addr.NumWords # _gfx90a + : MIMG_Sampler_gfx90a <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "GFX90A", "")>; + def _V # addr.NumWords # _gfx10 + : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } } foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in { let VAddrDwords = addr.NumWords in { - def _V # addr.NumWords # _nsa_gfx10 - : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords, - !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + if op.HAS_BASE then { + def _V # addr.NumWords # _nsa_gfx10 + : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } } } @@ -663,9 +824,10 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> let LodOrClampOrMip = !ne(sample.LodOrClamp, ""); } -multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, +multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0, bit isG16 = 0, bit isGetLod = 0, - string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> { + string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""), + bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> { def "" : MIMG_Sampler_BaseOpcode<sample> { let HasD16 = !not(isGetLod); let G16 = isG16; @@ -674,22 +836,22 @@ multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, mayLoad = !not(isGetLod) in { let VDataDwords = 1 in - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>; + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst>; let VDataDwords = 2 in - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst>; let VDataDwords = 3 in - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst>; let VDataDwords = 4 in - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst>; let VDataDwords = 5 in - defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>; + defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst>; } } -multiclass MIMG_Sampler_WQM <bits<8> op, AMDGPUSampleVariant sample> +multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample> : MIMG_Sampler<op, sample, 1>; 
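In MIMG_Sampler above, ExtendedImageInst defaults to !ne(sample.LowerCaseMod, ""), and MIMG_Sampler_Src_Helper only defines the _gfx90a encodings when that bit is clear, so only the plain (unsuffixed) sample forms get a gfx90a variant. A small sketch of that predicate, with illustrative function names:

#include <cstdio>
#include <string>

// A sample opcode counts as an "extended" image instruction when its
// sample-variant suffix is non-empty; only non-extended forms get _gfx90a defs.
static bool isExtendedImageInst(const std::string &LowerCaseMod) {
  return !LowerCaseMod.empty();
}

static bool definesGfx90aVariant(const std::string &LowerCaseMod) {
  return !isExtendedImageInst(LowerCaseMod);
}

int main() {
  std::printf("image_sample      -> gfx90a encoding: %d\n", definesGfx90aVariant(""));
  std::printf("image_sample_d_cl -> gfx90a encoding: %d\n", definesGfx90aVariant("_d_cl"));
  return 0;
}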
-multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, +multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0, string asm = "image_gather4"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode<sample> { let HasD16 = 1; @@ -697,7 +859,7 @@ multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, - Gather4 = 1, hasPostISelHook = 0 in { + Gather4 = 1 in { let VDataDwords = 2 in defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ let VDataDwords = 4 in @@ -707,11 +869,11 @@ multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, } } -multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample> +multiclass MIMG_Gather_WQM <mimgopc op, AMDGPUSampleVariant sample> : MIMG_Gather<op, sample, 1>; -class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A16> - : MIMG_gfx10<op, (outs VReg_128:$vdata), "AMDGPU"> { +class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit A16> + : MIMG_gfx10<op.BASE, (outs VReg_128:$vdata), "AMDGPU"> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), !if(A16, (ins GFX10A16:$a16), (ins))); @@ -720,25 +882,23 @@ class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A let nsa = 0; } -class MIMG_IntersectRay_nsa_gfx10<int op, string opcode, int num_addrs, bit A16> - : MIMG_nsa_gfx10<op, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> { +class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit A16> + : MIMG_nsa_gfx10<op.BASE, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> { let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc), !if(A16, (ins GFX10A16:$a16), (ins))); let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", ""); } -multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> { +multiclass MIMG_IntersectRay<mimgopc op, string opcode, int num_addrs, bit A16> { def "" : MIMGBaseOpcode; - let SubtargetPredicate = HasGFX10_BEncoding, - AssemblerPredicate = HasGFX10_BEncoding, + let SubtargetPredicate = HasGFX10_AEncoding, + AssemblerPredicate = HasGFX10_AEncoding, AsmMatchConverter = !if(A16, "cvtIntersectRay", ""), dmask = 0xf, unorm = 1, d16 = 0, - glc = 0, - slc = 0, - dlc = 0, + cpol = 0, tfe = 0, lwe = 0, r128 = 1, @@ -762,131 +922,133 @@ multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> { //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>; -defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>; -defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>; -defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>; -defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>; -defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>; -defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>; -defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>; -defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>; - -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>; - 
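The opcode lists in this region are being rewritten against the mimgopc class introduced earlier in this file (the multiclasses above now take a mimgopc): one record carrying the base, VI and SI opcodes, with MIMG.NOP (-1) marking encodings an instruction lacks so the HAS_* bits can gate which defs get emitted. A C++ analog of that record, for illustration only; the sample values are hypothetical:

#include <cstdio>

// C++ analog of the TableGen mimgopc class: vi and si default to base,
// and -1 (NOP) means "no opcode in that encoding".
struct MimgOpc {
  static constexpr int NOP = -1;
  int Base; // opcode for everything except atomics
  int VI;   // atomics only
  int SI;   // atomics only

  constexpr MimgOpc(int B, int V, int S) : Base(B), VI(V), SI(S) {}
  constexpr MimgOpc(int B, int V) : MimgOpc(B, V, B) {}
  constexpr explicit MimgOpc(int B) : MimgOpc(B, B, B) {}

  constexpr bool hasBase() const { return Base != NOP; }
  constexpr bool hasVI() const { return VI != NOP; }
  constexpr bool hasSI() const { return SI != NOP; }
};

int main() {
  constexpr MimgOpc Load(0x00);                            // same opcode in all encodings
  constexpr MimgOpc Atomic(0x10, 0x11, MimgOpc::NOP);      // hypothetical: no SI encoding
  std::printf("load:   base=%d has_si=%d\n", Load.Base, Load.hasSI());
  std::printf("atomic: vi=%d   has_si=%d\n", Atomic.VI, Atomic.hasSI());
  return 0;
}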
-defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>; -defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; -defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI -defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">; -defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">; -defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">; -defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">; -defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">; -defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; -defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; -defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; -defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; -//let FPAtomic = 1 in { -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI -//} // End let FPAtomic = 1 -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>; -defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>; -defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; -defm 
IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; - -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; -defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 
0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>; +defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00>, "image_load", 1>; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01>, "image_load_mip", 1, 1>; +defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02>, "image_load_pck", 0>; +defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03>, "image_load_pck_sgn", 0>; +defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04>, "image_load_mip_pck", 0, 1>; +defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05>, "image_load_mip_pck_sgn", 0, 1>; +defm IMAGE_STORE : MIMG_Store <mimgopc<0x08>, "image_store", 1>; +defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x09>, "image_store_mip", 1, 1>; +defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x0a>, "image_store_pck", 0>; +defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x0b>, "image_store_mip_pck", 0, 1>; + +defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x0e>, "image_get_resinfo", 0, 1, 1>; + +defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0f, 0x10, 0x0f>, "image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x11, 0x12, 0x11>, "image_atomic_add">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x12, 0x13, 0x12>, "image_atomic_sub">; +defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">; +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x14>, "image_atomic_smin">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x15>, "image_atomic_umin">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x16>, "image_atomic_smax">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x17>, "image_atomic_umax">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x18>, "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x19>, "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x1a>, "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x1b>, "image_atomic_inc">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec">; +defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 0, 1>; +defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; +defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; + +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x20>, AMDGPUSample>; +let OtherPredicates = [HasExtendedImageInsts] in { +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x21>, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x22>, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x23>, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0xa2>, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0xa3>, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x24>, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x25>, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x26>, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x27>, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C 
: MIMG_Sampler_WQM <mimgopc<0x28>, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x29>, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x2a>, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x2b>, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0xaa>, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0xab>, AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x2c>, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x2d>, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x2e>, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x2f>, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x30>, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x31>, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x32>, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x33>, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0xb2>, AMDGPUSample_d_o, 0, 1>; +defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xb3>, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x34>, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x35>, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x36>, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x37>, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x38>, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x39>, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x3a>, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x3b>, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0xba>, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x3c>, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x3e>, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x3d>, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x3f>, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x40>, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x41>, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x44>, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x45>, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x46>, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x47>, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x48>, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x49>, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x4c>, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x4d>, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x4e>, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x4f>, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x50>, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<0x51>, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<0x54>, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<0x55>, 
AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<0x56>, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x57>, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<0x58>, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<0x59>, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<0x5c>, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<0x5d>, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<0x5e>, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x5f>, AMDGPUSample_c_lz_o>; +//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x61>, ?>; + +defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<0x68>, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<0x69>, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<0x6a>, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<0x6b>, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<0x6c>, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<0x6d>, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<0x6e>, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<0x6f>, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<0xe8>, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<0xe9>, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<0xea>, AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<0xeb>, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<0xec>, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xed>, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<0xee>, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>; +} // End OtherPredicates = [HasExtendedImageInsts] //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; -let SubtargetPredicate = HasGFX10_BEncoding in -defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>; +let SubtargetPredicate = HasGFX10_AEncoding in +defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>; -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 11, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 8, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 12, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 9, 1>; +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 11, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 8, 1>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 12, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 9, 1>; /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ diff --git 
a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index c0120903396c..002ef1801448 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -451,9 +451,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SHL_PARTS: case ISD::SRA_PARTS: - case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); + case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); case ISD::FCOS: @@ -765,78 +765,11 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { DAG.getConstantFP(numbers::pif, DL, MVT::f32)); } -SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); - Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); - HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); - SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); - - SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); - SDValue LoBig = Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. 
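The comment that closes just above belongs to the removed LowerSRXParts (its body resumes below): shifting the overflow bits in two steps avoids an undefined 32-bit shift when the shift amount is zero. A self-contained model of the same SRL_PARTS expansion, built only from 32-bit operations, which is roughly the expansion that the target-independent expandShiftParts call replacing these functions provides:

#include <cstdint>
#include <cstdio>

// 64-bit logical shift right assembled from 32-bit pieces, following the
// structure of the removed LowerSRXParts (small-shift vs. big-shift select).
uint64_t srl64_from_parts(uint32_t Lo, uint32_t Hi, uint32_t Shift) {
  const uint32_t Width = 32, Width1 = 31;
  uint32_t ResLo, ResHi;
  if (Shift < Width) {
    // Shift the overflow bits by (Width1 - Shift) and then by 1 more, so that
    // Shift == 0 never produces a shift by 32 (the "dance around Width1").
    uint32_t Overflow = (Hi << (Width1 - Shift)) << 1;
    ResHi = Hi >> Shift;
    ResLo = (Lo >> Shift) | Overflow;
  } else {
    ResHi = 0;
    ResLo = Hi >> (Shift - Width);
  }
  return (uint64_t(ResHi) << 32) | ResLo;
}

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  for (uint32_t S : {0u, 1u, 31u, 32u, 63u})
    std::printf("shift %2u ok: %d\n", S,
                srl64_from_parts(uint32_t(X), uint32_t(X >> 32), S) == (X >> S));
}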
- - SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); - Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); - SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); - LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); - - SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); - SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +SDValue R600TargetLowering::LowerShiftParts(SDValue Op, + SelectionDAG &DAG) const { + SDValue Lo, Hi; + expandShiftParts(Op.getNode(), Lo, Hi, DAG); + return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); } SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, @@ -1239,7 +1172,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Align Alignment = StoreNode->getAlign(); if (Alignment < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), + !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment, StoreNode->getMemOperand()->getFlags(), nullptr)) { return expandUnalignedStore(StoreNode, DAG); @@ -1640,7 +1573,7 @@ bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, } bool R600TargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1655,7 +1588,7 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses( if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32) && Align % 4 == 0; + return VT.bitsGT(MVT::i32) && Alignment >= Align(4); } static SDValue CompactSwizzlableVector( diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index b560da8e91d9..920cf3cd97ef 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -50,10 +50,19 @@ public: const SelectionDAG &DAG) const override; bool allowsMisalignedMemoryAccesses( - EVT VT, unsigned AS, unsigned Align, + EVT VT, unsigned AS, Align Alignment, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *IsFast = nullptr) const override; + virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT, + bool LegalOperations) const override { + // R600 has "custom" lowering for truncating stores despite not supporting + // those instructions. If we allow that custom lowering in the DAG combiner + // then all truncates are merged into truncating stores, giving worse code + // generation. This hook prevents the DAG combiner performing that combine. 
+ return isTruncStoreLegal(ValVT, MemVT); + } + private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the @@ -85,8 +94,7 @@ private: SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, unsigned mainop, unsigned ovf) const; diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index 5fd912e0fb39..8f1a069c232d 100644 --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -301,7 +301,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { } } SmallVector<ReturnInst*, 8> Returns; - CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + CloneFunctionInto(NewF, F, VMap, CloneFunctionChangeType::LocalChangesOnly, + Returns); // Build new MDNode. SmallVector<Metadata *, 6> KernelMDArgs; diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp deleted file mode 100644 index 3b753cb66ead..000000000000 --- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ /dev/null @@ -1,169 +0,0 @@ -//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Any MIMG instructions that use tfe or lwe require an initialization of the -/// result register that will be written in the case of a memory access failure -/// The required code is also added to tie this init code to the result of the -/// img instruction -/// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/CodeGen/MachineFunctionPass.h" - -#define DEBUG_TYPE "si-img-init" - -using namespace llvm; - -namespace { - -class SIAddIMGInit : public MachineFunctionPass { -public: - static char ID; - -public: - SIAddIMGInit() : MachineFunctionPass(ID) { - initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. 
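The file deleted starting here zero-initialized the result registers of MIMG loads that set TFE or LWE, so the dwords the hardware writes on a faulted access have defined contents. How many dwords get initialized, computed in the runOnMachineFunction body below, follows from the dmask popcount, the Gather4 special case, and whether packed D16 is in use. A standalone sketch of that sizing arithmetic (the helper name and plain parameters are stand-ins for the MachineOperand queries in the pass):

#include <bitset>
#include <cstdio>

// Mirrors the sizing logic of the deleted SIAddIMGInit pass: number of result
// dwords to zero-initialize for a MIMG load with TFE/LWE set.
unsigned numInitDwords(unsigned DMask, bool Gather4, bool D16, bool PackedD16) {
  // Gather4 always returns four channels regardless of dmask.
  unsigned ActiveLanes = Gather4 ? 4 : std::bitset<4>(DMask).count();
  // Packed D16 halves the data dwords (rounding up); +1 is the TFE/LWE dword.
  return (D16 && PackedD16) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}

int main() {
  // image_load, dmask = 0b0111, no D16: three data dwords plus the error dword.
  std::printf("%u\n", numInitDwords(0x7, false, false, false)); // 4
  // gather4 with packed D16: four halves pack into two dwords, plus the error dword.
  std::printf("%u\n", numInitDwords(0x1, true, true, true));    // 3
}

With PRTStrictNull (the default, per the deleted code below) all of these dwords are zeroed; otherwise only the trailing error dword is.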
- -INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false) - -char SIAddIMGInit::ID = 0; - -char &llvm::SIAddIMGInitID = SIAddIMGInit::ID; - -FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); } - -bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo *RI = ST.getRegisterInfo(); - bool Changed = false; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - auto Opcode = MI.getOpcode(); - if (TII->isMIMG(Opcode) && !MI.mayStore()) { - MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); - MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); - MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); - - if (!TFE && !LWE) // intersect_ray - continue; - - unsigned TFEVal = TFE->getImm(); - unsigned LWEVal = LWE->getImm(); - unsigned D16Val = D16 ? D16->getImm() : 0; - - if (TFEVal || LWEVal) { - // At least one of TFE or LWE are non-zero - // We have to insert a suitable initialization of the result value and - // tie this to the dest of the image instruction. - - const DebugLoc &DL = MI.getDebugLoc(); - - int DstIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); - - // Calculate which dword we have to initialize to 0. - MachineOperand *MO_Dmask = - TII->getNamedOperand(MI, AMDGPU::OpName::dmask); - - // check that dmask operand is found. - assert(MO_Dmask && "Expected dmask operand in instruction"); - - unsigned dmask = MO_Dmask->getImm(); - // Determine the number of active lanes taking into account the - // Gather4 special case - unsigned ActiveLanes = - TII->isGather4(Opcode) ? 4 : countPopulation(dmask); - - bool Packed = !ST.hasUnpackedD16VMem(); - - unsigned InitIdx = - D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; - - // Abandon attempt if the dst size isn't large enough - // - this is in fact an error but this is picked up elsewhere and - // reported correctly. - uint32_t DstSize = - RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; - if (DstSize < InitIdx) - continue; - - // Create a register for the intialization value. - Register PrevDst = - MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); - unsigned NewDst = 0; // Final initialized value will be in here - - // If PRTStrictNull feature is enabled (the default) then initialize - // all the result registers to 0, otherwise just the error indication - // register (VGPRn+1) - unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1; - unsigned CurrIdx = ST.usePRTStrictNull() ? 
0 : (InitIdx - 1); - - if (DstSize == 1) { - // In this case we can just initialize the result directly - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst) - .addImm(0); - NewDst = PrevDst; - } else { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); - for (; SizeLeft; SizeLeft--, CurrIdx++) { - NewDst = - MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); - // Initialize dword - Register SubReg = - MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) - .addImm(0); - // Insert into the super-reg - BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) - .addReg(PrevDst) - .addReg(SubReg) - .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx)); - - PrevDst = NewDst; - } - } - - // Add as an implicit operand - MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit); - - // Tie the just added implicit operand to the dst - MI.tieOperands(DstIdx, MI.getNumOperands() - 1); - - Changed = true; - } - } - } - } - - return Changed; -} diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 625749deb3a8..397b2f873515 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -71,6 +71,8 @@ class SIAnnotateControlFlow : public FunctionPass { bool isElse(PHINode *Phi); + bool hasKill(const BasicBlock *BB); + void eraseIfUnused(PHINode *Phi); void openIf(BranchInst *Term); @@ -98,6 +100,7 @@ public: AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<TargetPassConfig>(); FunctionPass::getAnalysisUsage(AU); @@ -181,6 +184,15 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { return true; } +bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) { + for (const Instruction &I : *BB) { + if (const CallInst *CI = dyn_cast<CallInst>(&I)) + if (CI->getIntrinsicID() == Intrinsic::amdgcn_kill) + return true; + } + return false; +} + // Erase "Phi" if it is not used any more void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { if (RecursivelyDeleteDeadPHINode(Phi)) { @@ -339,7 +351,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (isTopOfStack(BB)) { PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); - if (Phi && Phi->getParent() == BB && isElse(Phi)) { + if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) { insertElse(Term); eraseIfUnused(Phi); continue; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index c83802b323c3..d3c0d792804d 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -91,7 +91,7 @@ enum : uint64_t { D16Buf = UINT64_C(1) << 50, // FLAT instruction accesses FLAT_GLBL segment. - IsFlatGlobal = UINT64_C(1) << 51, + FlatGlobal = UINT64_C(1) << 51, // Uses floating point double precision rounding mode FPDPRounding = UINT64_C(1) << 52, @@ -106,7 +106,13 @@ enum : uint64_t { IsDOT = UINT64_C(1) << 55, // FLAT instruction accesses FLAT_SCRATCH segment. - IsFlatScratch = UINT64_C(1) << 56 + FlatScratch = UINT64_C(1) << 56, + + // Atomic without return. + IsAtomicNoRet = UINT64_C(1) << 57, + + // Atomic with return. + IsAtomicRet = UINT64_C(1) << 58 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. 
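Alongside the rename of the FLAT segment flags, the SIDefines.h hunk above appends two new instruction flag bits for atomics, IsAtomicNoRet (bit 57) and IsAtomicRet (bit 58). A trivial standalone sketch of how such 64-bit flag masks are combined and tested; the bit positions are copied from the hunk, everything else is illustrative only:

#include <cstdint>
#include <cstdio>

// Bit positions copied from the SIDefines.h hunk above; the real definitions
// are the enum in that file and are tested through each instruction's TSFlags.
enum : uint64_t {
  FlatGlobal    = UINT64_C(1) << 51,
  FlatScratch   = UINT64_C(1) << 56,
  IsAtomicNoRet = UINT64_C(1) << 57,
  IsAtomicRet   = UINT64_C(1) << 58,
};

bool isAtomic(uint64_t TSFlags) {
  // Either atomic variant marks the instruction as an atomic.
  return TSFlags & (IsAtomicNoRet | IsAtomicRet);
}

int main() {
  uint64_t Flags = FlatGlobal | IsAtomicRet; // e.g. a returning global atomic
  std::printf("atomic: %d, scratch flat: %d\n",
              isAtomic(Flags), (Flags & FlatScratch) != 0);
}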
@@ -136,6 +142,8 @@ namespace AMDGPU { OPERAND_REG_IMM_FP16, OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_V2INT32, + OPERAND_REG_IMM_V2FP32, /// Operands with register or inline constant OPERAND_REG_INLINE_C_INT16, @@ -144,25 +152,30 @@ namespace AMDGPU { OPERAND_REG_INLINE_C_FP16, OPERAND_REG_INLINE_C_FP32, OPERAND_REG_INLINE_C_FP64, - OPERAND_REG_INLINE_C_V2FP16, OPERAND_REG_INLINE_C_V2INT16, + OPERAND_REG_INLINE_C_V2FP16, + OPERAND_REG_INLINE_C_V2INT32, + OPERAND_REG_INLINE_C_V2FP32, /// Operands with an AccVGPR register or inline constant OPERAND_REG_INLINE_AC_INT16, OPERAND_REG_INLINE_AC_INT32, OPERAND_REG_INLINE_AC_FP16, OPERAND_REG_INLINE_AC_FP32, - OPERAND_REG_INLINE_AC_V2FP16, + OPERAND_REG_INLINE_AC_FP64, OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_INLINE_AC_V2FP16, + OPERAND_REG_INLINE_AC_V2INT32, + OPERAND_REG_INLINE_AC_V2FP32, OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, - OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32, OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, - OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32, OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16, - OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -263,15 +276,33 @@ enum : unsigned { } // namespace AMDGPU namespace AMDGPU { +namespace CPol { + +enum CPol { + GLC = 1, + SLC = 2, + DLC = 4, + SCC = 16, + ALL = GLC | SLC | DLC | SCC +}; + +} // namespace CPol + namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns. enum Id { // Message ID, width(4) [3:0]. ID_UNKNOWN_ = -1, ID_INTERRUPT = 1, - ID_GS, - ID_GS_DONE, - ID_GS_ALLOC_REQ = 9, - ID_GET_DOORBELL = 10, + ID_GS = 2, + ID_GS_DONE = 3, + ID_SAVEWAVE = 4, // added in GFX8 + ID_STALL_WAVE_GEN = 5, // added in GFX9 + ID_HALT_WAVES = 6, // added in GFX9 + ID_ORDERED_PS_DONE = 7, // added in GFX9 + ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 + ID_GS_ALLOC_REQ = 9, // added in GFX9 + ID_GET_DOORBELL = 10, // added in GFX9 + ID_GET_DDID = 11, // added in GFX10 ID_SYSMSG = 15, ID_GAPS_LAST_, // Indicate that sequence has gaps. ID_GAPS_FIRST_ = ID_INTERRUPT, @@ -289,16 +320,16 @@ enum Op { // Both GS and SYS operation IDs. 
OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_), // GS operations are encoded in bits 5:4 OP_GS_NOP = 0, - OP_GS_CUT, - OP_GS_EMIT, - OP_GS_EMIT_CUT, + OP_GS_CUT = 1, + OP_GS_EMIT = 2, + OP_GS_EMIT_CUT = 3, OP_GS_LAST_, OP_GS_FIRST_ = OP_GS_NOP, // SYS operations are encoded in bits 6:4 OP_SYS_ECC_ERR_INTERRUPT = 1, - OP_SYS_REG_RD, - OP_SYS_HOST_TRAP_ACK, - OP_SYS_TTRACE_PC, + OP_SYS_REG_RD = 2, + OP_SYS_HOST_TRAP_ACK = 3, + OP_SYS_TTRACE_PC = 4, OP_SYS_LAST_, OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT, }; @@ -640,6 +671,7 @@ enum SDWA9EncValues : unsigned { namespace DPP { +// clang-format off enum DppCtrl : unsigned { QUAD_PERM_FIRST = 0, QUAD_PERM_ID = 0xE4, // identity permutation @@ -674,12 +706,17 @@ enum DppCtrl : unsigned { BCAST31 = 0x143, DPP_UNUSED8_FIRST = 0x144, DPP_UNUSED8_LAST = 0x14F, + ROW_NEWBCAST_FIRST= 0x150, + ROW_NEWBCAST_LAST = 0x15F, + ROW_SHARE0 = 0x150, ROW_SHARE_FIRST = 0x150, ROW_SHARE_LAST = 0x15F, + ROW_XMASK0 = 0x160, ROW_XMASK_FIRST = 0x160, ROW_XMASK_LAST = 0x16F, DPP_LAST = ROW_XMASK_LAST }; +// clang-format on enum DppFiMode { DPP_FI_0 = 0, @@ -716,6 +753,17 @@ enum Target : unsigned { }; } // namespace Exp + +namespace VOP3PEncoding { + +enum OpSel : uint64_t { + OP_SEL_HI_0 = UINT64_C(1) << 59, + OP_SEL_HI_1 = UINT64_C(1) << 60, + OP_SEL_HI_2 = UINT64_C(1) << 14, +}; + +} // namespace VOP3PEncoding + } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 34f59bf34dd5..d5c56bf2a321 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -581,8 +581,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::STRICT_WQM: case AMDGPU::SOFT_WQM: - case AMDGPU::WWM: { + case AMDGPU::STRICT_WWM: { Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d5fa9afded27..ad910522ba90 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -90,6 +90,8 @@ public: SmallVectorImpl<FoldCandidate> &FoldList, SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; + bool tryFoldCndMask(MachineInstr &MI) const; + bool tryFoldZeroHighBits(MachineInstr &MI) const; void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; const MachineOperand *isClamp(const MachineInstr &MI) const; @@ -97,6 +99,9 @@ public: std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const; bool tryFoldOMod(MachineInstr &MI); + bool tryFoldRegSequence(MachineInstr &MI); + bool tryFoldLCSSAPhi(MachineInstr &MI); + bool tryFoldLoad(MachineInstr &MI); public: SIFoldOperands() : MachineFunctionPass(ID) { @@ -135,6 +140,8 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: return AMDGPU::V_FMA_LEGACY_F32_e64; + case AMDGPU::V_FMAC_F64_e64: + return AMDGPU::V_FMA_F64_e64; } return AMDGPU::INSTRUCTION_LIST_END; } @@ -332,8 +339,8 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, if (Fold.UseMI == MI && Fold.UseOpNo == OpNo) return; LLVM_DEBUG(dbgs() << "Append " << (Commuted ? 
"commuted" : "normal") - << " operand " << OpNo << "\n " << *MI << '\n'); - FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); + << " operand " << OpNo << "\n " << *MI); + FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp); } static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, @@ -484,37 +491,37 @@ static bool isUseSafeToFold(const SIInstrInfo *TII, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } -// Find a def of the UseReg, check if it is a reg_seqence and find initializers +// Find a def of the UseReg, check if it is a reg_sequence and find initializers // for each subreg, tracking it to foldable inline immediate if possible. // Returns true on success. static bool getRegSeqInit( SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs, Register UseReg, uint8_t OpTy, const SIInstrInfo *TII, const MachineRegisterInfo &MRI) { - MachineInstr *Def = MRI.getUniqueVRegDef(UseReg); + MachineInstr *Def = MRI.getVRegDef(UseReg); if (!Def || !Def->isRegSequence()) return false; for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) { MachineOperand *Sub = &Def->getOperand(I); - assert (Sub->isReg()); + assert(Sub->isReg()); - for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg()); - SubDef && Sub->isReg() && !Sub->getSubReg() && - TII->isFoldableCopy(*SubDef); - SubDef = MRI.getUniqueVRegDef(Sub->getReg())) { + for (MachineInstr *SubDef = MRI.getVRegDef(Sub->getReg()); + SubDef && Sub->isReg() && Sub->getReg().isVirtual() && + !Sub->getSubReg() && TII->isFoldableCopy(*SubDef); + SubDef = MRI.getVRegDef(Sub->getReg())) { MachineOperand *Op = &SubDef->getOperand(1); if (Op->isImm()) { if (TII->isInlineConstant(*Op, OpTy)) Sub = Op; break; } - if (!Op->isReg()) + if (!Op->isReg() || Op->getReg().isPhysical()) break; Sub = Op; } - Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm())); + Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm()); } return true; @@ -531,8 +538,10 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, return false; uint8_t OpTy = OpInfo[UseOpIdx].OperandType; - if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || - OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) + if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || + OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) && + (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST || + OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST)) return false; if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && @@ -548,12 +557,23 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, if (!UseReg.isVirtual()) return false; - if (llvm::any_of(FoldList, [UseMI](const FoldCandidate &FC) { - return FC.UseMI == UseMI; - })) + if (isUseMIInFoldList(FoldList, UseMI)) return false; MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo(); + + // Maybe it is just a COPY of an immediate itself. 
+ MachineInstr *Def = MRI.getVRegDef(UseReg); + MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) { + MachineOperand &DefOp = Def->getOperand(1); + if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) && + TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { + UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm()); + return true; + } + } + SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs; if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI)) return false; @@ -605,22 +625,17 @@ void SIFoldOperands::foldOperand( Register RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); - MachineRegisterInfo::use_nodbg_iterator Next; - for (MachineRegisterInfo::use_nodbg_iterator - RSUse = MRI->use_nodbg_begin(RegSeqDstReg), RSE = MRI->use_nodbg_end(); - RSUse != RSE; RSUse = Next) { - Next = std::next(RSUse); - - MachineInstr *RSUseMI = RSUse->getParent(); + for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) { + MachineInstr *RSUseMI = RSUse.getParent(); if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI, - RSUse.getOperandNo(), FoldList)) + RSUseMI->getOperandNo(&RSUse), FoldList)) continue; - if (RSUse->getSubReg() != RegSeqDstSubReg) + if (RSUse.getSubReg() != RegSeqDstSubReg) continue; - foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList, CopiesToReplace); } @@ -680,19 +695,15 @@ void SIFoldOperands::foldOperand( const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg); if (!DestReg.isPhysical()) { if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { - MachineRegisterInfo::use_nodbg_iterator NextUse; SmallVector<FoldCandidate, 4> CopyUses; - for (MachineRegisterInfo::use_nodbg_iterator Use = MRI->use_nodbg_begin(DestReg), - E = MRI->use_nodbg_end(); - Use != E; Use = NextUse) { - NextUse = std::next(Use); + for (auto &Use : MRI->use_nodbg_operands(DestReg)) { // There's no point trying to fold into an implicit operand. 
- if (Use->isImplicit()) + if (Use.isImplicit()) continue; - FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(), - &UseMI->getOperand(1)); - CopyUses.push_back(FC); + CopyUses.emplace_back(Use.getParent(), + Use.getParent()->getOperandNo(&Use), + &UseMI->getOperand(1)); } for (auto &F : CopyUses) { foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace); @@ -728,8 +739,7 @@ void SIFoldOperands::foldOperand( if (UseMI->isCopy() && OpToFold.isReg() && UseMI->getOperand(0).getReg().isVirtual() && !UseMI->getOperand(1).getSubReg()) { - LLVM_DEBUG(dbgs() << "Folding " << OpToFold - << "\n into " << *UseMI << '\n'); + LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI); unsigned Size = TII->getOpSize(*UseMI, 1); Register UseReg = OpToFold.getReg(); UseMI->getOperand(1).setReg(UseReg); @@ -813,7 +823,7 @@ void SIFoldOperands::foldOperand( B.addImm(Defs[I].second); } - LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n'); + LLVM_DEBUG(dbgs() << "Folded " << *UseMI); return; } @@ -825,6 +835,10 @@ void SIFoldOperands::foldOperand( else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64)); + else if (ST->hasGFX90AInsts() && + TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32)); return; } @@ -1033,14 +1047,19 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, // Try to simplify operations with a constant that may appear after instruction // selection. // TODO: See if a frame index with a fixed offset can fold. -static bool tryConstantFoldOp(MachineRegisterInfo &MRI, - const SIInstrInfo *TII, - MachineInstr *MI, - MachineOperand *ImmOp) { +static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, + MachineInstr *MI) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || - Opc == AMDGPU::S_NOT_B32) { - MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + + if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || + Opc == AMDGPU::S_NOT_B32) && + Src0->isImm()) { + MI->getOperand(1).ChangeToImmediate(~Src0->getImm()); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1048,9 +1067,6 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); if (!Src0->isImm() && !Src1->isImm()) @@ -1134,35 +1150,61 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, } // Try to fold an instruction into a simpler one -static bool tryFoldInst(const SIInstrInfo *TII, - MachineInstr *MI) { - unsigned Opc = MI->getOpcode(); +bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); + if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 && + Opc != AMDGPU::V_CNDMASK_B64_PSEUDO) + return false; - if (Opc == AMDGPU::V_CNDMASK_B32_e32 || - Opc == 
AMDGPU::V_CNDMASK_B32_e64 || - Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) { - const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); - const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); - int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); - int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); - if (Src1->isIdenticalTo(*Src0) && - (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) && - (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) { - LLVM_DEBUG(dbgs() << "Folded " << *MI << " into "); - auto &NewDesc = - TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false)); - int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (Src2Idx != -1) - MI->RemoveOperand(Src2Idx); - MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); - if (Src1ModIdx != -1) - MI->RemoveOperand(Src1ModIdx); - if (Src0ModIdx != -1) - MI->RemoveOperand(Src0ModIdx); - mutateCopyOp(*MI, NewDesc); - LLVM_DEBUG(dbgs() << *MI << '\n'); - return true; - } + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (!Src1->isIdenticalTo(*Src0)) { + auto *Src0Imm = getImmOrMaterializedImm(*MRI, *Src0); + auto *Src1Imm = getImmOrMaterializedImm(*MRI, *Src1); + if (!Src1Imm->isIdenticalTo(*Src0Imm)) + return false; + } + + int Src1ModIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); + int Src0ModIdx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) || + (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0)) + return false; + + LLVM_DEBUG(dbgs() << "Folded " << MI << " into "); + auto &NewDesc = + TII->get(Src0->isReg() ? 
(unsigned)AMDGPU::COPY : getMovOpc(false)); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx != -1) + MI.RemoveOperand(Src2Idx); + MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); + if (Src1ModIdx != -1) + MI.RemoveOperand(Src1ModIdx); + if (Src0ModIdx != -1) + MI.RemoveOperand(Src0ModIdx); + mutateCopyOp(MI, NewDesc); + LLVM_DEBUG(dbgs() << MI); + return true; +} + +bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const { + if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 && + MI.getOpcode() != AMDGPU::V_AND_B32_e32) + return false; + + MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1)); + if (!Src0->isImm() || Src0->getImm() != 0xffff) + return false; + + Register Src1 = MI.getOperand(2).getReg(); + MachineInstr *SrcDef = MRI->getVRegDef(Src1); + if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) { + Register Dst = MI.getOperand(0).getReg(); + MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; } return false; @@ -1177,20 +1219,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector<FoldCandidate, 4> FoldList; MachineOperand &Dst = MI.getOperand(0); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - MachineRegisterInfo::use_nodbg_iterator NextUse; - for (MachineRegisterInfo::use_nodbg_iterator - Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end(); - Use != E; Use = NextUse) { - NextUse = std::next(Use); - MachineInstr *UseMI = Use->getParent(); - unsigned OpNo = Use.getOperandNo(); - + if (OpToFold.isImm()) { + for (auto &UseMI : + make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) { // Folding the immediate may reveal operations that can be constant // folded or replaced with a copy. This can happen for example after // frame indices are lowered to constants or from splitting 64-bit @@ -1199,18 +1230,21 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // We may also encounter cases where one or both operands are // immediates materialized into a register, which would ordinarily not // be folded due to multiple uses or operand constraints. + if (tryConstantFoldOp(*MRI, TII, &UseMI)) + LLVM_DEBUG(dbgs() << "Constant folded " << UseMI); + } + } - if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n'); + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); + if (FoldingImm) { + unsigned NumLiteralUses = 0; + MachineOperand *NonInlineUse = nullptr; + int NonInlineUseOpNo = -1; - // Some constant folding cases change the same immediate's use to a new - // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user - // again. The same constant folded instruction could also have a second - // use operand. - NextUse = MRI->use_nodbg_begin(Dst.getReg()); - FoldList.clear(); - continue; - } + for (auto &Use : + make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { + MachineInstr *UseMI = Use.getParent(); + unsigned OpNo = UseMI->getOperandNo(&Use); // Try to fold any inline immediate uses, and then only fold other // constants if they have one use. 
@@ -1230,11 +1264,10 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, - CopiesToReplace); + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); } else { if (++NumLiteralUses == 1) { - NonInlineUse = &*Use; + NonInlineUse = &Use; NonInlineUseOpNo = OpNo; } } @@ -1246,16 +1279,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } } else { // Folding register. - SmallVector <MachineRegisterInfo::use_nodbg_iterator, 4> UsesToProcess; - for (MachineRegisterInfo::use_nodbg_iterator - Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end(); - Use != E; ++Use) { - UsesToProcess.push_back(Use); - } + SmallVector <MachineOperand *, 4> UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) + UsesToProcess.push_back(&Use); for (auto U : UsesToProcess) { MachineInstr *UseMI = U->getParent(); - foldOperand(OpToFold, UseMI, U.getOperandNo(), + foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList, CopiesToReplace); } } @@ -1265,11 +1295,8 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); - SmallPtrSet<MachineInstr *, 16> Folded; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.OpToFold); - if (Folded.count(Fold.UseMI)) - continue; if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) { Register Reg = Fold.OpToFold->getReg(); MachineInstr *DefMI = Fold.OpToFold->getParent(); @@ -1288,9 +1315,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast<int>(Fold.UseOpNo) << " of " - << *Fold.UseMI << '\n'); - if (tryFoldInst(TII, Fold.UseMI)) - Folded.insert(Fold.UseMI); + << *Fold.UseMI); } else if (Fold.isCommuted()) { // Restoring instruction's original operand order if fold has failed. TII->commuteInstruction(*Fold.UseMI, false); @@ -1341,23 +1366,10 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { } } -// We obviously have multiple uses in a clamp since the register is used twice -// in the same instruction. -static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) { - int Count = 0; - for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end(); - I != E; ++I) { - if (++Count > 1) - return false; - } - - return true; -} - // FIXME: Clamp for v_mad_mixhi_f16 handled during isel. bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { const MachineOperand *ClampSrc = isClamp(MI); - if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) + if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg())) return false; MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); @@ -1370,8 +1382,7 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { if (!DefClamp) return false; - LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def - << '\n'); + LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def); // Clamp is applied after omod, so it is OK if omod is set. 
DefClamp->setImm(1); @@ -1382,6 +1393,18 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { static int getOModValue(unsigned Opc, int64_t Val) { switch (Opc) { + case AMDGPU::V_MUL_F64_e64: { + switch (Val) { + case 0x3fe0000000000000: // 0.5 + return SIOutMods::DIV2; + case 0x4000000000000000: // 2.0 + return SIOutMods::MUL2; + case 0x4010000000000000: // 4.0 + return SIOutMods::MUL4; + default: + return SIOutMods::NONE; + } + } case AMDGPU::V_MUL_F32_e64: { switch (static_cast<uint32_t>(Val)) { case 0x3f000000: // 0.5 @@ -1418,11 +1441,13 @@ std::pair<const MachineOperand *, int> SIFoldOperands::isOMod(const MachineInstr &MI) const { unsigned Op = MI.getOpcode(); switch (Op) { + case AMDGPU::V_MUL_F64_e64: case AMDGPU::V_MUL_F32_e64: case AMDGPU::V_MUL_F16_e64: { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) || - (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals)) + ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) && + MFI->getMode().FP64FP16OutputDenormals)) return std::make_pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1448,11 +1473,13 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { return std::make_pair(RegOp, OMod); } + case AMDGPU::V_ADD_F64_e64: case AMDGPU::V_ADD_F32_e64: case AMDGPU::V_ADD_F16_e64: { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) || - (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals)) + ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) && + MFI->getMode().FP64FP16OutputDenormals)) return std::make_pair(nullptr, SIOutMods::NONE); // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x @@ -1481,7 +1508,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { std::tie(RegOp, OMod) = isOMod(MI); if (OMod == SIOutMods::NONE || !RegOp->isReg() || RegOp->getSubReg() != AMDGPU::NoSubRegister || - !hasOneNonDBGUseInst(*MRI, RegOp->getReg())) + !MRI->hasOneNonDBGUser(RegOp->getReg())) return false; MachineInstr *Def = MRI->getVRegDef(RegOp->getReg()); @@ -1494,7 +1521,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) return false; - LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); + LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def); DefOMod->setImm(OMod); MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); @@ -1502,6 +1529,198 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { return true; } +// Try to fold a reg_sequence with vgpr output and agpr inputs into an +// instruction which can take an agpr. So far that means a store. 
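Earlier in this hunk, getOModValue gains V_MUL_F64_e64 cases that match the multiplier by its raw IEEE-754 bit pattern. A quick standalone check that the three constants really decode to the output-modifier values 0.5, 2.0 and 4.0:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode the double-precision bit patterns used by the new V_MUL_F64_e64
// cases in getOModValue above.
double fromBits(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  std::printf("DIV2 -> %g, MUL2 -> %g, MUL4 -> %g\n",
              fromBits(0x3fe0000000000000),   // 0.5
              fromBits(0x4000000000000000),   // 2.0
              fromBits(0x4010000000000000));  // 4.0
}

The comment directly above introduces tryFoldRegSequence, whose body follows: a REG_SEQUENCE that produces a VGPR from AGPR inputs (or from plain COPYs of AGPRs) can be rebuilt in the AGPR register class when its single user, so far only a store, accepts an AGPR operand. A toy model of the input classification step, with hypothetical types standing in for the MachineRegisterInfo queries:

#include <cstdio>
#include <optional>
#include <vector>

// Toy classification mirroring the loop over getRegSeqInit results below:
// every REG_SEQUENCE input must be an AGPR, or a plain COPY from one, before
// the sequence can be rewritten into the AGPR register class.
enum class Bank { VGPR, AGPR };

struct Value {
  Bank bank;
  const Value *copiedFrom = nullptr; // non-null if this value is a COPY
};

std::optional<std::vector<const Value *>>
collectAGPRSources(const std::vector<const Value *> &Inputs) {
  std::vector<const Value *> Srcs;
  for (const Value *V : Inputs) {
    if (V->bank == Bank::AGPR)
      Srcs.push_back(V);                                  // already an AGPR
    else if (V->copiedFrom && V->copiedFrom->bank == Bank::AGPR)
      Srcs.push_back(V->copiedFrom);                      // COPY from an AGPR
    else
      return std::nullopt;                                // fold not possible
  }
  return Srcs;
}

int main() {
  Value A{Bank::AGPR};
  Value CopyOfA{Bank::VGPR, &A};
  Value PlainVGPR{Bank::VGPR};
  std::printf("foldable: %d\n", collectAGPRSources({&A, &CopyOfA}).has_value());
  std::printf("foldable: %d\n", collectAGPRSources({&A, &PlainVGPR}).has_value());
}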
+bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { + assert(MI.isRegSequence()); + auto Reg = MI.getOperand(0).getReg(); + + if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return false; + + SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs; + if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI)) + return false; + + for (auto &Def : Defs) { + const auto *Op = Def.first; + if (!Op->isReg()) + return false; + if (TRI->isAGPR(*MRI, Op->getReg())) + continue; + // Maybe this is a COPY from AREG + const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg()); + if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg()) + return false; + if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg())) + return false; + } + + MachineOperand *Op = &*MRI->use_nodbg_begin(Reg); + MachineInstr *UseMI = Op->getParent(); + while (UseMI->isCopy() && !Op->getSubReg()) { + Reg = UseMI->getOperand(0).getReg(); + if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg)) + return false; + Op = &*MRI->use_nodbg_begin(Reg); + UseMI = Op->getParent(); + } + + if (Op->getSubReg()) + return false; + + unsigned OpIdx = Op - &UseMI->getOperand(0); + const MCInstrDesc &InstDesc = UseMI->getDesc(); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + switch (OpInfo.RegClass) { + case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH; + case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH; + case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH; + case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH; + case AMDGPU::AV_160RegClassID: + break; + default: + return false; + } + + const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)); + auto Dst = MRI->createVirtualRegister(NewDstRC); + auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::REG_SEQUENCE), Dst); + + for (unsigned I = 0; I < Defs.size(); ++I) { + MachineOperand *Def = Defs[I].first; + Def->setIsKill(false); + if (TRI->isAGPR(*MRI, Def->getReg())) { + RS.add(*Def); + } else { // This is a copy + MachineInstr *SubDef = MRI->getVRegDef(Def->getReg()); + SubDef->getOperand(1).setIsKill(false); + RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg()); + } + RS.addImm(Defs[I].second); + } + + Op->setReg(Dst); + if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) { + Op->setReg(Reg); + RS->eraseFromParent(); + return false; + } + + LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI); + + // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users, + // in which case we can erase them all later in runOnMachineFunction. + if (MRI->use_nodbg_empty(MI.getOperand(0).getReg())) + MI.eraseFromParentAndMarkDBGValuesForRemoval(); + return true; +} + +// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI. +// This should allow folding of an AGPR into a consumer which may support it. 
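tryFoldRegSequence above only commits the rewrite after asking the target whether the AGPR operand is actually legal for the consuming instruction; otherwise it undoes its own work. The probe-and-rollback core, condensed for illustration:

// Speculatively point the use at the new AGPR REG_SEQUENCE result, then ask
// the target; if the operand is rejected, restore the VGPR and erase the
// REG_SEQUENCE that was just built.
Op->setReg(Dst);
if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
  Op->setReg(Reg);
  RS->eraseFromParent();
  return false;
}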
+// I.e.: +// +// loop: // loop: +// %1:vreg = COPY %0:areg // exit: +// exit: => // %1:areg = PHI %0:areg, %loop +// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg +bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) { + assert(PHI.isPHI()); + + if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI + return false; + + Register PhiIn = PHI.getOperand(1).getReg(); + Register PhiOut = PHI.getOperand(0).getReg(); + if (PHI.getOperand(1).getSubReg() || + !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut)) + return false; + + // A single use should not matter for correctness, but if it has another use + // inside the loop we may perform copy twice in a worst case. + if (!MRI->hasOneNonDBGUse(PhiIn)) + return false; + + MachineInstr *Copy = MRI->getVRegDef(PhiIn); + if (!Copy || !Copy->isCopy()) + return false; + + Register CopyIn = Copy->getOperand(1).getReg(); + if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg()) + return false; + + const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn); + Register NewReg = MRI->createVirtualRegister(ARC); + PHI.getOperand(1).setReg(CopyIn); + PHI.getOperand(0).setReg(NewReg); + + MachineBasicBlock *MBB = PHI.getParent(); + BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(), + TII->get(AMDGPU::COPY), PhiOut) + .addReg(NewReg, RegState::Kill); + Copy->eraseFromParent(); // We know this copy had a single use. + + LLVM_DEBUG(dbgs() << "Folded " << PHI); + + return true; +} + +// Attempt to convert VGPR load to an AGPR load. +bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) { + assert(MI.mayLoad()); + if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1) + return false; + + MachineOperand &Def = MI.getOperand(0); + if (!Def.isDef()) + return false; + + Register DefReg = Def.getReg(); + + if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg)) + return false; + + SmallVector<const MachineInstr*, 8> Users; + SmallVector<Register, 8> MoveRegs; + for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) { + Users.push_back(&I); + } + if (Users.empty()) + return false; + + // Check that all uses a copy to an agpr or a reg_sequence producing an agpr. 
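tryFoldLCSSAPhi above relies on the MIR operand layout of PHI nodes to detect the single-input case: operand 0 is the def and each incoming value contributes a (register, basic block) pair, so one predecessor means exactly three explicit operands. A minimal helper expressing the same test (illustrative only):

// %dst = PHI %val0, %bb0, %val1, %bb1, ...
// A single-input (LCSSA) PHI therefore has exactly three explicit operands:
// the def, one incoming value and one incoming block.
static bool isSingleInputPhi(const MachineInstr &PHI) {
  return PHI.isPHI() && PHI.getNumExplicitOperands() == 3;
}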
+ while (!Users.empty()) { + const MachineInstr *I = Users.pop_back_val(); + if (!I->isCopy() && !I->isRegSequence()) + return false; + Register DstReg = I->getOperand(0).getReg(); + if (TRI->isAGPR(*MRI, DstReg)) + continue; + MoveRegs.push_back(DstReg); + for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) { + Users.push_back(&U); + } + } + + const TargetRegisterClass *RC = MRI->getRegClass(DefReg); + MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC)); + if (!TII->isOperandLegal(MI, 0, &Def)) { + MRI->setRegClass(DefReg, RC); + return false; + } + + while (!MoveRegs.empty()) { + Register Reg = MoveRegs.pop_back_val(); + MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg))); + } + + LLVM_DEBUG(dbgs() << "Folded " << MI); + + return true; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -1520,14 +1739,21 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { bool HasNSZ = MFI->hasNoSignedZerosFPMath(); for (MachineBasicBlock *MBB : depth_first(&MF)) { - MachineBasicBlock::iterator I, Next; - MachineOperand *CurrentKnownM0Val = nullptr; - for (I = MBB->begin(); I != MBB->end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; + for (auto &MI : make_early_inc_range(*MBB)) { + tryFoldCndMask(MI); + + if (tryFoldZeroHighBits(MI)) + continue; - tryFoldInst(TII, &MI); + if (MI.isRegSequence() && tryFoldRegSequence(MI)) + continue; + + if (MI.isPHI() && tryFoldLCSSAPhi(MI)) + continue; + + if (MI.mayLoad() && tryFoldLoad(MI)) + continue; if (!TII->isFoldableCopy(MI)) { // Saw an unknown clobber of m0, so we no longer know what it is. @@ -1575,11 +1801,31 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // %3 = COPY %vgpr0; VGPR_32:%3 // ... // %vgpr0 = V_MOV_B32_e32 1, implicit %exec - MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && !Dst.getReg().isVirtual()) + if (!MI.getOperand(0).getReg().isVirtual()) continue; foldInstOperand(MI, OpToFold); + + // If we managed to fold all uses of this copy then we might as well + // delete it now. + // The only reason we need to follow chains of copies here is that + // tryFoldRegSequence looks forward through copies before folding a + // REG_SEQUENCE into its eventual users. + auto *InstToErase = &MI; + while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { + auto &SrcOp = InstToErase->getOperand(1); + auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register(); + InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase = nullptr; + if (!SrcReg || SrcReg.isPhysical()) + break; + InstToErase = MRI->getVRegDef(SrcReg); + if (!InstToErase || !TII->isFoldableCopy(*InstToErase)) + break; + } + if (InstToErase && InstToErase->isRegSequence() && + MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) + InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); } } return true; diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index a12e013b4fe6..80ee7a00252a 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// This pass creates bundles of SMEM and VMEM instructions forming memory -/// clauses if XNACK is enabled. Def operands of clauses are marked as early -/// clobber to make sure we will not override any source within a clause. 
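The rewritten main loop of SIFoldOperands::runOnMachineFunction above swaps the manual I/Next bookkeeping for llvm::make_early_inc_range, which advances the iterator before the body executes. That is what makes it safe for the tryFold* helpers to erase the current instruction; a condensed sketch:

// The range pre-computes the next iterator, so erasing MI inside the body
// does not invalidate the loop.
for (MachineInstr &MI : make_early_inc_range(*MBB)) {
  if (MI.isRegSequence() && tryFoldRegSequence(MI))
    continue; // MI may already have been erased at this point
  // ... further folds ...
}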
+/// \file This pass extends the live ranges of registers used as pointers in +/// sequences of adjacent SMEM and VMEM instructions if XNACK is enabled. A +/// load that would overwrite a pointer would require breaking the soft clause. +/// Artificially extend the live ranges of the pointer operands by adding +/// implicit-def early-clobber operands throughout the soft clause. /// //===----------------------------------------------------------------------===// @@ -59,10 +60,8 @@ public: } private: - template <typename Callable> - void forAllLanes(Register Reg, LaneBitmask LaneMask, Callable Func) const; - - bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; + bool canBundle(const MachineInstr &MI, const RegUse &Defs, + const RegUse &Uses) const; bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT); void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, @@ -106,12 +105,12 @@ static bool isSMEMClauseInst(const MachineInstr &MI) { // There no sense to create store clauses, they do not define anything, // thus there is nothing to set early-clobber. static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { - if (MI.isDebugValue() || MI.isBundled()) + assert(!MI.isDebugInstr() && "debug instructions should not reach here"); + if (MI.isBundled()) return false; if (!MI.mayLoad() || MI.mayStore()) return false; - if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 || - AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1) + if (SIInstrInfo::isAtomic(MI)) return false; if (IsVMEMClause && !isVMEMClauseInst(MI)) return false; @@ -148,63 +147,10 @@ static unsigned getMopState(const MachineOperand &MO) { return S; } -template <typename Callable> -void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask, - Callable Func) const { - if (LaneMask.all() || Reg.isPhysical() || - LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { - Func(0); - return; - } - - const TargetRegisterClass *RC = MRI->getRegClass(Reg); - unsigned E = TRI->getNumSubRegIndices(); - SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs; - for (unsigned Idx = 1; Idx < E; ++Idx) { - // Is this index even compatible with the given class? - if (TRI->getSubClassWithSubReg(RC, Idx) != RC) - continue; - LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == LaneMask) { - Func(Idx); - return; - } - - if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) - continue; - - CoveringSubregs.push_back(Idx); - } - - llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) { - LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A); - LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B); - unsigned NA = MaskA.getNumLanes(); - unsigned NB = MaskB.getNumLanes(); - if (NA != NB) - return NA > NB; - return MaskA.getHighestLane() > MaskB.getHighestLane(); - }); - - for (unsigned Idx : CoveringSubregs) { - LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); - if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) - continue; - - Func(Idx); - LaneMask &= ~SubRegMask; - if (LaneMask.none()) - return; - } - - llvm_unreachable("Failed to find all subregs to cover lane mask"); -} - // Returns false if there is a use of a def already in the map. // In this case we must break the clause. 
-bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, - RegUse &Defs, RegUse &Uses) const { +bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs, + const RegUse &Uses) const { // Check interference with defs. for (const MachineOperand &MO : MI.operands()) { // TODO: Prologue/Epilogue Insertion pass does not process bundled @@ -221,7 +167,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (MO.isTied()) return false; - RegUse &Map = MO.isDef() ? Uses : Defs; + const RegUse &Map = MO.isDef() ? Uses : Defs; auto Conflict = Map.find(Reg); if (Conflict == Map.end()) continue; @@ -249,9 +195,19 @@ bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI, RPT.advanceToNext(); GCNRegPressure MaxPressure = RPT.moveMaxPressure(); unsigned Occupancy = MaxPressure.getOccupancy(*ST); + + // Don't push over half the register budget. We don't want to introduce + // spilling just to form a soft clause. + // + // FIXME: This pressure check is fundamentally broken. First, this is checking + // the global pressure, not the pressure at this specific point in the + // program. Second, it's not accounting for the increased liveness of the use + // operands due to the early clobber we will introduce. Third, the pressure + // tracking does not account for the alignment requirements for SGPRs, or the + // fragmentation of registers the allocator will need to satisfy. if (Occupancy >= MFI->getMinAllowedOccupancy() && - MaxPressure.getVGPRNum() <= MaxVGPRs && - MaxPressure.getSGPRNum() <= MaxSGPRs) { + MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 && + MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; return true; } @@ -328,6 +284,9 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; Next = std::next(I); + if (MI.isMetaInstruction()) + continue; + bool IsVMEM = isVMEMClauseInst(MI); if (!isValidClauseInst(MI, IsVMEM)) @@ -347,8 +306,13 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { continue; } + MachineBasicBlock::iterator LastClauseInst = Next; unsigned Length = 1; for ( ; Next != E && Length < FuncMaxClause; ++Next) { + // Debug instructions should not change the kill insertion. + if (Next->isMetaInstruction()) + continue; + if (!isValidClauseInst(*Next, IsVMEM)) break; @@ -358,6 +322,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { if (!processRegUses(*Next, Defs, Uses, RPT)) break; + LastClauseInst = Next; ++Length; } if (Length < 2) { @@ -368,36 +333,74 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { Changed = true; MFI->limitOccupancy(LastRecordedOccupancy); - auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE)); - Ind->insertMachineInstrInMaps(*B); + assert(!LastClauseInst->isMetaInstruction()); - // Restore the state after processing the bundle. - RPT.reset(*B, &LiveRegsCopy); + SlotIndex ClauseLiveInIdx = LIS->getInstructionIndex(MI); + SlotIndex ClauseLiveOutIdx = + LIS->getInstructionIndex(*LastClauseInst).getNextIndex(); - for (auto BI = I; BI != Next; ++BI) { - BI->bundleWithPred(); - Ind->removeSingleMachineInstrFromMaps(*BI); + // Track the last inserted kill. + MachineInstrBuilder Kill; - for (MachineOperand &MO : BI->defs()) - if (MO.readsReg()) - MO.setIsInternalRead(true); - } + // Insert one kill per register, with operands covering all necessary + // subregisters. 
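One detail worth calling out in the checkPressure hunk above: the clause is only allowed to grow while the tracked pressure stays within half of the register budget, a deliberately conservative bound given the weaknesses listed in the FIXME. The gate itself, reduced to plain arithmetic (names here are illustrative, not the pass's API):

// Illustrative restatement of the half-budget gate used by checkPressure.
static bool clauseMayGrow(unsigned Occupancy, unsigned MinAllowedOccupancy,
                          unsigned VGPRPressure, unsigned SGPRPressure,
                          unsigned MaxVGPRs, unsigned MaxSGPRs) {
  return Occupancy >= MinAllowedOccupancy &&
         VGPRPressure <= MaxVGPRs / 2 &&
         SGPRPressure <= MaxSGPRs / 2;
}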
+ for (auto &&R : Uses) { + Register Reg = R.first; + if (Reg.isPhysical()) + continue; - for (auto &&R : Defs) { - forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { - unsigned S = R.second.first | RegState::EarlyClobber; - if (!SubReg) - S &= ~(RegState::Undef | RegState::Dead); - B.addDef(R.first, S, SubReg); - }); + // Collect the register operands we should extend the live ranges of. + SmallVector<std::tuple<unsigned, unsigned>> KillOps; + const LiveInterval &LI = LIS->getInterval(R.first); + + if (!LI.hasSubRanges()) { + if (!LI.liveAt(ClauseLiveOutIdx)) { + KillOps.emplace_back(R.second.first | RegState::Kill, + AMDGPU::NoSubRegister); + } + } else { + LaneBitmask KilledMask; + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (SR.liveAt(ClauseLiveInIdx) && !SR.liveAt(ClauseLiveOutIdx)) + KilledMask |= SR.LaneMask; + } + + if (KilledMask.none()) + continue; + + SmallVector<unsigned> KilledIndexes; + bool Success = TRI->getCoveringSubRegIndexes( + *MRI, MRI->getRegClass(Reg), KilledMask, KilledIndexes); + (void)Success; + assert(Success && "Failed to find subregister mask to cover lanes"); + for (unsigned SubReg : KilledIndexes) { + KillOps.emplace_back(R.second.first | RegState::Kill, SubReg); + } + } + + if (KillOps.empty()) + continue; + + // We only want to extend the live ranges of used registers. If they + // already have existing uses beyond the bundle, we don't need the kill. + // + // It's possible all of the use registers were already live past the + // bundle. + Kill = BuildMI(*MI.getParent(), std::next(LastClauseInst), + DebugLoc(), TII->get(AMDGPU::KILL)); + for (auto &Op : KillOps) + Kill.addUse(Reg, std::get<0>(Op), std::get<1>(Op)); + Ind->insertMachineInstrInMaps(*Kill); } - for (auto &&R : Uses) { - forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { - B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg); - }); + if (!Kill) { + RPT.reset(MI, &LiveRegsCopy); + continue; } + // Restore the state after processing the end of the bundle. + RPT.reset(*Kill, &LiveRegsCopy); + for (auto &&R : Defs) { Register Reg = R.first; Uses.erase(Reg); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0398d27756db..c9883d38e08c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -20,18 +20,16 @@ using namespace llvm; #define DEBUG_TYPE "frame-info" - -// Find a scratch register that we can use at the start of the prologue to -// re-align the stack pointer. We avoid using callee-save registers since they -// may appear to be free when this is called from canUseAsPrologue (during -// shrink wrapping), but then no longer be free when this is called from -// emitPrologue. -// -// FIXME: This is a bit conservative, since in the above case we could use one -// of the callee-save registers as a scratch temp to re-align the stack pointer, -// but we would then have to make sure that we were in fact saving at least one -// callee-save register in the prologue, which is additional complexity that -// doesn't seem worth the benefit. +static cl::opt<bool> EnableSpillVGPRToAGPR( + "amdgpu-spill-vgpr-to-agpr", + cl::desc("Enable spilling VGPRs to AGPRs"), + cl::ReallyHidden, + cl::init(true)); + +// Find a scratch register that we can use in the prologue. 
We avoid using +// callee-save registers since they may appear to be free when this is called +// from canUseAsPrologue (during shrink wrapping), but then no longer be free +// when this is called from emitPrologue. static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, const TargetRegisterClass &RC, @@ -55,12 +53,6 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, } } - // If we require an unused register, this is used in contexts where failure is - // an option and has an alternative plan. In other contexts, this must - // succeed0. - if (!Unused) - report_fatal_error("failed to find free scratch register"); - return MCRegister(); } @@ -72,10 +64,8 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); -#ifndef NDEBUG const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); -#endif // We need to save and restore the current FP/BP. @@ -105,7 +95,7 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, TargetStackID::SGPRSpill); - if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { // 3: There's no free lane to spill, and no free register to save FP/BP, // so we're forced to spill another VGPR to use for the spill. FrameIndex = NewFI; @@ -131,166 +121,45 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, // We need to specially emit stack operations here because a different frame // register is used than in the rest of the function, as getFrameRegister would // use. -static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs, +static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &FuncInfo, + LivePhysRegs &LiveRegs, MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const SIInstrInfo *TII, Register SpillReg, - Register ScratchRsrcReg, Register SPReg, int FI) { - MachineFunction *MF = MBB.getParent(); - MachineFrameInfo &MFI = MF->getFrameInfo(); + MachineBasicBlock::iterator I, Register SpillReg, + int FI) { + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; - int64_t Offset = MFI.getObjectOffset(FI); - - MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, - MFI.getObjectAlign(FI)); - - if (ST.enableFlatScratch()) { - if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) - .addReg(SpillReg, RegState::Kill) - .addReg(SPReg) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // dlc - .addMemOperand(MMO); - return; - } - } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) - .addReg(SpillReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); - return; - } - - // Don't clobber the TmpVGPR if we also need a scratch reg for the stack - // offset in the spill. 
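findScratchNonCalleeSaveRegister above no longer calls report_fatal_error itself; it returns an empty MCRegister and leaves the decision to the caller, since some call sites can tolerate failure. The caller-side pattern, as it appears in the later prologue and epilogue hunks:

// An empty MCRegister converts to false; callers that genuinely need a
// register turn the failure into a fatal error themselves.
MCRegister TmpVGPR = findScratchNonCalleeSaveRegister(
    MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
  report_fatal_error("failed to find free scratch register");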
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), + FrameInfo.getObjectAlign(FI)); LiveRegs.addReg(SpillReg); - - if (ST.enableFlatScratch()) { - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); - - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(SPReg) - .addImm(Offset); - - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) - .addReg(SpillReg, RegState::Kill) - .addReg(OffsetReg, RegState::Kill) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // dlc - .addMemOperand(MMO); - } else { - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); - - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) - .addImm(Offset); - - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) - .addReg(SpillReg, RegState::Kill) - .addReg(OffsetReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); - } - + TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true, + FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, + &LiveRegs); LiveRegs.removeReg(SpillReg); } -static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const SIInstrInfo *TII, Register SpillReg, - Register ScratchRsrcReg, Register SPReg, int FI) { - MachineFunction *MF = MBB.getParent(); - MachineFrameInfo &MFI = MF->getFrameInfo(); - int64_t Offset = MFI.getObjectOffset(FI); - - MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, - MFI.getObjectAlign(FI)); - - if (ST.enableFlatScratch()) { - if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { - BuildMI(MBB, I, DebugLoc(), - TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg) - .addReg(SPReg) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // dlc - .addMemOperand(MMO); - return; - } - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); - - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(SPReg) - .addImm(Offset); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), - SpillReg) - .addReg(OffsetReg, RegState::Kill) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // dlc - .addMemOperand(MMO); - return; - } - - if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { - BuildMI(MBB, I, DebugLoc(), - TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); - return; - } +static void buildEpilogRestore(const GCNSubtarget &ST, + const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &FuncInfo, + LivePhysRegs &LiveRegs, MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, Register SpillReg, + int FI) { + unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); - - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) - .addImm(Offset); - - BuildMI(MBB, I, DebugLoc(), - TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg) - .addReg(OffsetReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), + FrameInfo.getObjectAlign(FI)); + TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false, + FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr, + &LiveRegs); } static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -384,8 +253,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit) .addReg(FlatScrInit) .addImm(EncodedOffset) // offset - .addImm(0) // glc - .addImm(0) // dlc + .addImm(0) // cpol .addMemOperand(MMO); // Mask the offset in [47:0] of the descriptor @@ -445,9 +313,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( // Add wave offset in bytes to private base offset. // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) - .addReg(FlatScrInitLo) - .addReg(ScratchWaveOffsetReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); // Convert offset to 256-byte units. 
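After this rework both helpers above funnel through SIRegisterInfo::buildSpillLoadStore; only the opcode pair and the boolean on the value register differ between the prologue store and the epilogue reload. Condensed from the two hunks (the ValueIsKill annotation is an assumption about the boolean's meaning, it is not spelled out in the patch):

unsigned StoreOp = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
unsigned LoadOp  = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

// Prologue: store the spill register relative to the stack pointer.
TRI.buildSpillLoadStore(MBB, I, StoreOp, FI, SpillReg, /*ValueIsKill=*/true,
                        FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                        &LiveRegs);
// Epilogue: reload it; the value register is defined here, not killed.
TRI.buildSpillLoadStore(MBB, I, LoadOp, FI, SpillReg, /*ValueIsKill=*/false,
                        FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
                        &LiveRegs);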
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) @@ -545,6 +413,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); assert(MFI->isEntryFunction()); @@ -622,7 +491,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) - .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST)); + .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); } if (hasFP(MF)) { @@ -631,12 +500,18 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - if (MFI->hasFlatScratchInit() || ScratchRsrcReg) { + bool NeedsFlatScratchInit = + MFI->hasFlatScratchInit() && + (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || + (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + + if ((NeedsFlatScratchInit || ScratchRsrcReg) && + !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } - if (MFI->hasFlatScratchInit()) { + if (NeedsFlatScratchInit) { emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } @@ -663,6 +538,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( // The pointer to the GIT is formed from the offset passed in and either // the amdgpu-git-ptr-high function attribute or the top part of the PC Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); buildGitPtr(MBB, I, DL, TII, Rsrc01); @@ -681,10 +557,23 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) .addImm(EncodedOffset) // offset - .addImm(0) // glc - .addImm(0) // dlc + .addImm(0) // cpol .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); + + // The driver will always set the SRD for wave 64 (bits 118:117 of + // descriptor / bits 22:21 of third sub-reg will be 0b11) + // If the shader is actually wave32 we have to modify the const_index_stride + // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The + // reason the driver does this is that there can be cases where it presents + // 2 shaders with different wave size (e.g. VsFs). 
+ // TODO: convert to using SCRATCH instructions or multiple SRD buffers + if (ST.isWave32()) { + const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32); + BuildMI(MBB, I, DL, SBitsetB32, Rsrc03) + .addImm(21) + .addReg(Rsrc03); + } } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -716,8 +605,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset - .addImm(0) // glc - .addImm(0) // dlc + .addImm(0) // cpol .addMemOperand(MMO) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); @@ -785,11 +673,28 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::SGPRSpill: return true; case TargetStackID::ScalableVector: + case TargetStackID::WasmLocal: return false; } llvm_unreachable("Invalid TargetStackID::Value"); } +static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo *FuncInfo, + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsProlog) { + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + if (IsProlog) { + LiveRegs.addLiveIns(MBB); + } else { + // In epilog. + LiveRegs.addLiveOuts(MBB); + LiveRegs.stepBackward(*MBBI); + } + } +} + // Activate all lanes, returns saved exec. static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, MachineFunction &MF, @@ -804,28 +709,14 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); DebugLoc DL; - if (LiveRegs.empty()) { - if (IsProlog) { - LiveRegs.init(TRI); - LiveRegs.addLiveIns(MBB); - if (FuncInfo->SGPRForFPSaveRestoreCopy) - LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); - - if (FuncInfo->SGPRForBPSaveRestoreCopy) - LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy); - } else { - // In epilog. - LiveRegs.init(*ST.getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - LiveRegs.stepBackward(*MBBI); - } - } + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); ScratchExecCopy = findScratchNonCalleeSaveRegister( MRI, LiveRegs, *TRI.getWaveMaskRegClass()); + if (!ScratchExecCopy) + report_fatal_error("failed to find free scratch register"); - if (!IsProlog) - LiveRegs.removeReg(ScratchExecCopy); + LiveRegs.addReg(ScratchExecCopy); const unsigned OrSaveExec = ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; @@ -834,6 +725,13 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, return ScratchExecCopy; } +// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. +// Otherwise we are spilling to memory. +static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -865,126 +763,93 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // turn on all lanes before doing the spill to memory. Register ScratchExecCopy; - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the FP to memory. 
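The wave32 fix near the top of the hunk above works purely at the bit level: const_index_stride lives in bits 22:21 of the fourth resource word, the driver programs it as 0b11 (wave64), and S_BITSET0_B32 with immediate 21 clears the low bit of that field to give 0b10 (stride 32). A standalone check of the arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Rsrc3 = 0x3u << 21;            // const_index_stride = 0b11 (wave64)
  Rsrc3 &= ~(1u << 21);                   // effect of s_bitset0_b32 rsrc3, 21
  assert(((Rsrc3 >> 21) & 0x3u) == 0x2u); // now 0b10, i.e. stride 32
  return 0;
}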
- if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the BP to memory. - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - // Emit the copy if we need an FP, and are using a free SGPR to save it. - if (FuncInfo->SGPRForFPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) - .addReg(FramePtrReg) - .setMIFlag(MachineInstr::FrameSetup); - } - - // Emit the copy if we need a BP, and are using a free SGPR to save it. - if (FuncInfo->SGPRForBPSaveRestoreCopy) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), - FuncInfo->SGPRForBPSaveRestoreCopy) - .addReg(BasePtrReg) - .setMIFlag(MachineInstr::FrameSetup); - } + Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; - // If a copy has been emitted for FP and/or BP, Make the SGPRs - // used in the copy instructions live throughout the function. - SmallVector<MCPhysReg, 2> TempSGPRs; - if (FuncInfo->SGPRForFPSaveRestoreCopy) - TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); - - if (FuncInfo->SGPRForBPSaveRestoreCopy) - TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); + // VGPRs used for SGPR->VGPR spills + for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : + FuncInfo->getSGPRSpillVGPRs()) { + if (!Reg.FI) + continue; - if (!TempSGPRs.empty()) { - for (MachineBasicBlock &MBB : MF) { - for (MCPhysReg Reg : TempSGPRs) - MBB.addLiveIn(Reg); + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, + /*IsProlog*/ true); - MBB.sortUniqueLiveIns(); - } + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, + *Reg.FI); } - for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg - : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) + // VGPRs used for Whole Wave Mode + for (const auto &Reg : FuncInfo->WWMReservedRegs) { + auto VGPR = Reg.first; + auto FI = Reg.second; + if (!FI) continue; if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + ScratchExecCopy = + buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); - buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), - StackPtrReg, - Reg.FI.getValue()); + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); } - if (HasFPSaveIndex && SpillFPToMemory) { - assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue())); + if (ScratchExecCopy) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + LiveRegs.addReg(ScratchExecCopy); + } - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { + const int FramePtrFI = *FPSaveIndex; + assert(!MFI.isDeadObjectIndex(FramePtrFI)); + + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); - buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - FuncInfo->FramePointerSaveIndex.getValue()); + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, + FramePtrFI); } - if (HasBPSaveIndex && SpillBPToMemory) { - assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex)); + if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; + assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(BasePtrReg); - buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - *FuncInfo->BasePointerSaveIndex); - } - - if (ScratchExecCopy) { - // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy, RegState::Kill); - LiveRegs.addReg(ScratchExecCopy); + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, + BasePtrFI); } // In this case, spill the FP to a reserved VGPR. - if (HasFPSaveIndex && !SpillFPToMemory) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); - assert(!MFI.isDeadObjectIndex(FI)); + if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { + const int FramePtrFI = *FPSaveIndex; + assert(!MFI.isDeadObjectIndex(FramePtrFI)); - assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = - FuncInfo->getSGPRToVGPRSpills(FI); + FuncInfo->getSGPRToVGPRSpills(FramePtrFI); assert(Spill.size() == 1); // Save FP before setting it up. - // FIXME: This should respect spillSGPRToVGPR; BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) .addReg(FramePtrReg) .addImm(Spill[0].Lane) @@ -992,8 +857,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } // In this case, spill the BP to a reserved VGPR. 
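When the FP or BP save slot carries the SGPRSpill stack ID, the value is parked in a single lane of a reserved VGPR instead of going to scratch memory. The save half is the V_WRITELANE_B32 just above; the matching restore, taken from the epilogue hunks further down, reads the same lane back:

// Save: write the 32-bit frame pointer into one lane of the reserved VGPR.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
    .addReg(FramePtrReg)
    .addImm(Spill[0].Lane)
    .addReg(Spill[0].VGPR, RegState::Undef);

// Restore: read the lane back into the frame pointer SGPR.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg)
    .addReg(Spill[0].VGPR)
    .addImm(Spill[0].Lane);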
- if (HasBPSaveIndex && !SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); @@ -1002,14 +867,51 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(Spill.size() == 1); // Save BP before setting it up. - // FIXME: This should respect spillSGPRToVGPR; BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) .addReg(BasePtrReg) .addImm(Spill[0].Lane) .addReg(Spill[0].VGPR, RegState::Undef); } - if (TRI.needsStackRealignment(MF)) { + // Emit the copy if we need an FP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForFPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), + FuncInfo->SGPRForFPSaveRestoreCopy) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Emit the copy if we need a BP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForBPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), + FuncInfo->SGPRForBPSaveRestoreCopy) + .addReg(BasePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + // If a copy has been emitted for FP and/or BP, Make the SGPRs + // used in the copy instructions live throughout the function. + SmallVector<MCPhysReg, 2> TempSGPRs; + if (FuncInfo->SGPRForFPSaveRestoreCopy) + TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); + + if (FuncInfo->SGPRForBPSaveRestoreCopy) + TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); + + if (!TempSGPRs.empty()) { + for (MachineBasicBlock &MBB : MF) { + for (MCPhysReg Reg : TempSGPRs) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + } + if (!LiveRegs.empty()) { + LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); + LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); + } + } + + if (TRI.hasStackRealignment(MF)) { HasFP = true; const unsigned Alignment = MFI.getMaxAlign().value(); @@ -1017,23 +919,16 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (LiveRegs.empty()) { LiveRegs.init(TRI); LiveRegs.addLiveIns(MBB); - LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); - LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); } - Register ScratchSPReg = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); - assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy && - ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy); - - // s_add_u32 tmp_reg, s32, NumBytes - // s_and_b32 s32, tmp_reg, 0b111...0000 - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) + // s_add_i32 s33, s32, NumBytes + // s_and_b32 s33, s33, 0b111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) .addReg(StackPtrReg) .addImm((Alignment - 1) * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) - .addReg(ScratchSPReg, RegState::Kill) + .addReg(FramePtrReg, RegState::Kill) .addImm(-Alignment * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); @@ -1054,7 +949,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP && RoundedSize != 0) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) .addReg(StackPtrReg) .addImm(RoundedSize * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); @@ 
-1101,58 +996,47 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, const Register BasePtrReg = TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex; if (RoundedSize != 0 && hasFP(MF)) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * getScratchScaleFactor(ST)) - .setMIFlag(MachineInstr::FrameDestroy); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST))) + .setMIFlag(MachineInstr::FrameDestroy); } if (FuncInfo->SGPRForFPSaveRestoreCopy) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) - .setMIFlag(MachineInstr::FrameSetup); + .setMIFlag(MachineInstr::FrameDestroy); } if (FuncInfo->SGPRForBPSaveRestoreCopy) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) - .setMIFlag(MachineInstr::FrameSetup); + .setMIFlag(MachineInstr::FrameDestroy); } - Register ScratchExecCopy; - if (HasFPSaveIndex) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); - assert(!MFI.isDeadObjectIndex(FI)); - if (SpillFPToMemory) { - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); - - MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( + if (FPSaveIndex) { + const int FramePtrFI = *FPSaveIndex; + assert(!MFI.isDeadObjectIndex(FramePtrFI)); + if (spilledToMemory(MF, FramePtrFI)) { + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, + FramePtrFI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) - .addReg(TempVGPR, RegState::Kill); + .addReg(TmpVGPR, RegState::Kill); } else { // Reload from VGPR spill. 
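The stack adjustments in this hunk (and in eliminateCallFramePseudoInstr further down) switch from S_SUB_U32 to a single S_ADD_I32 with a negated immediate; for the 32-bit result the two are interchangeable because the addition wraps modulo 2^32, and the SCC output is not consumed here. A small standalone check of the arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t SP = 0x2000;
  uint32_t Bytes = 0x100;
  uint32_t Sub = SP - Bytes;                                    // s_sub_u32
  uint32_t Add =
      SP + static_cast<uint32_t>(-static_cast<int32_t>(Bytes)); // s_add_i32 -Bytes
  assert(Sub == Add);
  return 0;
}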
- assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = - FuncInfo->getSGPRToVGPRSpills(FI); + FuncInfo->getSGPRToVGPRSpills(FramePtrFI); assert(Spill.size() == 1); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) .addReg(Spill[0].VGPR) @@ -1160,19 +1044,20 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, } } - if (HasBPSaveIndex) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (SpillBPToMemory) { - if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); + if (spilledToMemory(MF, BasePtrFI)) { + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); - MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR, + BasePtrFI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) - .addReg(TempVGPR, RegState::Kill); + .addReg(TmpVGPR, RegState::Kill); } else { // Reload from VGPR spill. assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); @@ -1185,17 +1070,31 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, } } - for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : + Register ScratchExecCopy; + for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) + if (!Reg.FI) + continue; + + if (!ScratchExecCopy) + ScratchExecCopy = + buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); + + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR, + *Reg.FI); + } + + for (const auto &Reg : FuncInfo->WWMReservedRegs) { + auto VGPR = Reg.first; + auto FI = Reg.second; + if (!FI) continue; if (!ScratchExecCopy) - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); + ScratchExecCopy = + buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); - buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, - Reg.FI.getValue()); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI); } if (ScratchExecCopy) { @@ -1240,9 +1139,73 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFrameInfo &MFI = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() + && EnableSpillVGPRToAGPR; + + if (SpillVGPRToAGPR) { + // To track the spill frame indices handled in this pass. 
+ BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + + bool SeenDbgInstr = false; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (MI.isDebugInstr()) + SeenDbgInstr = true; + + if (TII->isVGPRSpill(MI)) { + // Try to eliminate stack used by VGPR spills before frame + // finalization. + unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr); + int FI = MI.getOperand(FIOp).getIndex(); + Register VReg = + TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); + if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, + TRI->isAGPR(MRI, VReg))) { + // FIXME: change to enterBasicBlockEnd() + RS->enterBasicBlock(MBB); + TRI->eliminateFrameIndex(MI, 0, FIOp, RS); + SpillFIs.set(FI); + continue; + } + } + } + } + + for (MachineBasicBlock &MBB : MF) { + for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) + MBB.addLiveIn(Reg); + + for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + + if (!SpillFIs.empty() && SeenDbgInstr) { + // FIXME: The dead frame indices are replaced with a null register from + // the debug value instructions. We should instead, update it with the + // correct register value. But not sure the register value alone is + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue() && MI.getOperand(0).isFI() && + SpillFIs[MI.getOperand(0).getIndex()]) { + MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + MI.getOperand(0).setIsDebug(); + } + } + } + } + } + FuncInfo->removeDeadFrameIndices(MFI); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); @@ -1253,16 +1216,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!allStackObjectsAreDead(MFI)) { assert(RS && "RegScavenger required if spilling"); - if (FuncInfo->isEntryFunction()) { - int ScavengeFI = MFI.CreateFixedObject( - TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); - RS->addScavengingFrameIndex(ScavengeFI); - } else { - int ScavengeFI = MFI.CreateStackObject( - TRI->getSpillSize(AMDGPU::SGPR_32RegClass), - TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false); - RS->addScavengingFrameIndex(ScavengeFI); - } + // Add an emergency spill slot + RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); } } @@ -1280,7 +1235,13 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, const SIRegisterInfo *TRI = ST.getRegisterInfo(); // Ignore the SGPRs the default implementation found. - SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); + + // Do not save AGPRs prior to GFX90A because there was no easy way to do so. + // In gfx908 there was do AGPR loads and stores and thus spilling also + // require a temporary VGPR. + if (!ST.hasGFX90AInsts()) + SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict @@ -1335,7 +1296,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, SavedRegs.reset(MFI->getStackPtrOffsetReg()); const BitVector AllSavedRegs = SavedRegs; - SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); + SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 
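One consequence of folding VGPR spills into AGPRs, handled in the hunk above: any DBG_VALUE that still refers to the eliminated spill slot would describe a stale frame index, so it is neutralised by swapping the operand for a null register (the FIXME notes that recovering the real location is left for later). The relevant snippet, condensed:

// Drop the location of debug values whose frame index was eliminated by the
// VGPR-to-AGPR spill folding, rather than emitting a stale frame index.
if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
    SpillFIs[MI.getOperand(0).getIndex()]) {
  MI.getOperand(0).ChangeToRegister(Register(), /*isDef=*/false);
  MI.getOperand(0).setIsDebug();
}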
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; @@ -1409,10 +1370,12 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); Register SPReg = MFI->getStackPtrOffsetReg(); - unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; - BuildMI(MBB, I, DL, TII->get(Op), SPReg) - .addReg(SPReg) - .addImm(Amount * getScratchScaleFactor(ST)); + Amount *= getScratchScaleFactor(ST); + if (IsDestroy) + Amount = -Amount; + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg) + .addReg(SPReg) + .addImm(Amount); } else if (CalleePopAmount != 0) { llvm_unreachable("is this used?"); } @@ -1450,8 +1413,9 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const { } return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || - MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) || - MF.getTarget().Options.DisableFramePointerElim(MF); + MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( + MF) || + MF.getTarget().Options.DisableFramePointerElim(MF); } // This is essentially a reduced version of hasFP for entry functions. Since the diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 839437b5e3f8..d98acfc6c532 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -19,11 +19,13 @@ #include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" @@ -80,36 +82,49 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + + addRegisterClass(MVT::f64, V64RegClass); + addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + + addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); + addRegisterClass(MVT::v7f32, 
TRI->getVGPRClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); @@ -123,7 +138,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -139,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v3i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v5i32, Custom); + setOperationAction(ISD::LOAD, MVT::v6i32, Custom); + setOperationAction(ISD::LOAD, MVT::v7i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); setOperationAction(ISD::LOAD, MVT::i1, Custom); @@ -148,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v3i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v5i32, Custom); + setOperationAction(ISD::STORE, MVT::v6i32, Custom); + setOperationAction(ISD::STORE, MVT::v7i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); @@ -170,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); @@ -197,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); + setOperationAction(ISD::FP_ROUND, 
MVT::v7f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); @@ -239,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // with > 4 elements. for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, + MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { @@ -249,10 +279,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::BITCAST: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: case ISD::SCALAR_TO_VECTOR: break; + case ISD::INSERT_SUBVECTOR: case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); break; @@ -284,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } + for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); + } + for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); @@ -336,17 +380,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Avoid stack access for these. // TODO: Generalize to more vector types. + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); @@ -362,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); - // Deal with vec5 vector operations when widened to vec8. + // Deal with vec5/6/7 vector operations when widened to vec8. 
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); @@ -384,6 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. @@ -525,8 +571,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); // F16 - Constant Actions. setOperationAction(ISD::ConstantFP, MVT::f16, Legal); @@ -718,6 +764,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + + if (Subtarget->hasPackedFP32Ops()) { + setOperationAction(ISD::FADD, MVT::v2f32, Legal); + setOperationAction(ISD::FMUL, MVT::v2f32, Legal); + setOperationAction(ISD::FMA, MVT::v2f32, Legal); + setOperationAction(ISD::FNEG, MVT::v2f32, Legal); + + for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } } setOperationAction(ISD::FNEG, MVT::v4f16, Custom); @@ -1128,17 +1187,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1150,6 +1198,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MODereferenceable; return true; } + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1191,6 +1255,9 @@ bool 
SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -1210,9 +1277,9 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { } return AM.Scale == 0 && - (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, - /*Signed=*/false)); + (AM.BaseOffs == 0 || + Subtarget->getInstrInfo()->isLegalFLATOffset( + AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); } bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { @@ -1220,7 +1287,7 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { return AM.Scale == 0 && (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, - /*Signed=*/true)); + SIInstrFlags::FlatGlobal)); if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { // Assume the we will use FLAT for all global memory accesses @@ -1385,10 +1452,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return true; } + // Either the alignment requirements are "enabled", or there is an + // unaligned LDS access related hardware bug even though the alignment + // requirements are "disabled". In either case, we need to check for proper + // alignment. + // if (Size == 64) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. + // An 8 byte access via ds_read/write_b64 requires 8-byte alignment, but we + // can do a 4 byte aligned, 8 byte access in a single operation using + // ds_read2/write2_b32 with adjacent offsets. bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; @@ -1396,22 +1468,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return AlignedBy4; } if (Size == 96) { - // ds_read/write_b96 require 16-byte alignment on gfx8 and older. - bool Aligned = Alignment >= Align(16); + // A 12 byte access via ds_read/write_b96 requires 16-byte alignment on + // gfx8 and older. + bool AlignedBy16 = Alignment >= Align(16); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy16; - return Aligned; + return AlignedBy16; } if (Size == 128) { - // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we - // can do a 8 byte aligned, 16 byte access in a single operation using - // ds_read2/write2_b64. - bool Aligned = Alignment >= Align(8); + // A 16 byte access via ds_read/write_b128 requires 16-byte alignment on + // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a + // single operation using ds_read2/write2_b64. 
+ bool AlignedBy8 = Alignment >= Align(8); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy8; - return Aligned; + return AlignedBy8; } } @@ -1467,8 +1540,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( } bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Alignment, - MachineMemOperand::Flags Flags, bool *IsFast) const { + EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, + bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1482,7 +1555,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, - Align(Alignment), Flags, IsFast); + Alignment, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1535,8 +1608,8 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - int NumElts = VT.getVectorNumElements(); - if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && + VT.getScalarType().bitsLE(MVT::i16)) return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1799,23 +1872,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + unsigned Mask = (Subtarget->hasPackedTID() && + Info.hasWorkItemIDY()) ? 0x3ff : ~0u; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); } if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 10)); + } else { + unsigned Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } } if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 20)); + } else { + unsigned Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } } } @@ -1865,12 +1952,32 @@ static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, return ArgDescriptor::createRegister(Reg); } -static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { - return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); +// If this has a fixed position, we still should allocate the register in the +// CCInfo state. Technically we could get away with this for values passed +// outside of the normal argument range. 
+static void allocateFixedSGPRInputImpl(CCState &CCInfo, + const TargetRegisterClass *RC, + MCRegister Reg) { + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, RC); +} + +static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { + if (Arg) { + allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, + Arg.getRegister()); + } else + Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); } -static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { - return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); +static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { + if (Arg) { + allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, + Arg.getRegister()); + } else + Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } /// Allocate implicit function VGPR arguments at the end of allocated user @@ -1919,29 +2026,29 @@ void SITargetLowering::allocateSpecialInputSGPRs( // TODO: Unify handling with private memory pointers. if (Info.hasDispatchPtr()) - ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); if (Info.hasQueuePtr()) - ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. if (Info.hasImplicitArgPtr()) - ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); if (Info.hasDispatchID()) - ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. if (Info.hasWorkGroupIDX()) - ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); if (Info.hasWorkGroupIDY()) - ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); if (Info.hasWorkGroupIDZ()) - ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } // Allocate special inputs passed in user SGPRs. @@ -2203,6 +2310,8 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getEntryNode(); } + Info->allocateModuleLDSGlobal(Fn.getParent()); + SmallVector<ISD::InputArg, 16> Splits; SmallVector<CCValAssign, 16> ArgLocs; BitVector Skipped(Ins.size()); @@ -2767,6 +2876,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) { static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: + case CallingConv::AMDGPU_Gfx: return true; default: return canGuaranteeTCO(CC); @@ -2781,6 +2891,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization( if (!mayTailCallThisCC(CalleeCC)) return false; + // For a divergent call target, we need to do a waterfall loop over the + // possible callees which precludes us from using a simple jump. 
+ if (Callee->isDivergent()) + return false; + MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); @@ -2888,12 +3003,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (!CLI.CB) report_fatal_error("unsupported libcall legalization"); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) { - return lowerUnhandledCall(CLI, InVals, - "unsupported indirect call to function "); - } - if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { return lowerUnhandledCall(CLI, InVals, "unsupported required tail call to function "); @@ -3054,7 +3163,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // locations, which are supposed to be immutable? Chain = addTokenForArgument(Chain, DAG, MFI, FI); } else { - DstAddr = PtrOff; + // Stores to the argument stack area are relative to the stack pointer. + SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), + MVT::i32); + DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff); DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); Alignment = commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); @@ -4150,11 +4262,35 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } case AMDGPU::DS_GWS_INIT: - case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_BARRIER: + if (Subtarget->needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + Register DataReg = Op->getReg(); + bool IsAGPR = TRI->isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg, 0, Op->getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op->setReg(NewVR); + Op->setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); + } + LLVM_FALLTHROUGH; + case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: - case AMDGPU::DS_GWS_BARRIER: // A s_waitcnt 0 is required to be the instruction immediately following. 
if (getSubtarget()->hasGWSAutoReplay()) { bundleInstWithWaitcnt(MI); @@ -4360,7 +4496,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4381,7 +4518,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4456,6 +4594,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerFMINNUM_FMAXNUM(Op, DAG); case ISD::FMA: return splitTernaryVectorOp(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -5092,12 +5233,35 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->isTrapHandlerEnabled() || + Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) + return lowerTrapEndpgm(Op, DAG); + + if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) { + switch (*HsaAbiVer) { + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + return lowerTrapHsaQueuePtr(Op, DAG); + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + return Subtarget->supportsGetDoorbellID() ? 
+ lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); + } + } + + llvm_unreachable("Unknown trap handler"); +} + +SDValue SITargetLowering::lowerTrapEndpgm( + SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); +} - if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !Subtarget->isTrapHandlerEnabled()) - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); +SDValue SITargetLowering::lowerTrapHsaQueuePtr( + SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -5108,22 +5272,37 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue()); + + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); SDValue Ops[] = { ToReg, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01, ToReg.getValue(1) }; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } +SDValue SITargetLowering::lowerTrapHsa( + SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); + + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(TrapID, SL, MVT::i16) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); +} + SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); - if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !Subtarget->isTrapHandlerEnabled()) { + if (!Subtarget->isTrapHandlerEnabled() || + Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), @@ -5133,9 +5312,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { return Chain; } + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); SDValue Ops[] = { Chain, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) + DAG.getTargetConstant(TrapID, SL, MVT::i16) }; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } @@ -5666,23 +5846,10 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef<SDValue> Elts) { assert(!Elts.empty()); MVT Type; - unsigned NumElts; - - if (Elts.size() == 1) { - Type = MVT::f32; - NumElts = 1; - } else if (Elts.size() == 2) { - Type = MVT::v2f32; - NumElts = 2; - } else if (Elts.size() == 3) { - Type = MVT::v3f32; - NumElts = 3; - } else if (Elts.size() <= 4) { - Type = MVT::v4f32; - NumElts = 4; - } else if (Elts.size() <= 8) { - Type = MVT::v8f32; - NumElts = 8; + unsigned NumElts = Elts.size(); + + if (NumElts <= 8) { + Type = MVT::getVectorVT(MVT::f32, NumElts); } else { assert(Elts.size() <= 16); Type = MVT::v16f32; @@ -5704,28 +5871,6 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, return DAG.getBuildVector(Type, DL, VecElts); } -static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, - SDValue *GLC, SDValue *SLC, SDValue *DLC) { - auto CachePolicyConst = 
cast<ConstantSDNode>(CachePolicy.getNode()); - - uint64_t Value = CachePolicyConst->getZExtValue(); - SDLoc DL(CachePolicy); - if (GLC) { - *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x1; - } - if (SLC) { - *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x2; - } - if (DLC) { - *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x4; - } - - return Value == 0; -} - static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts) { EVT SrcVT = Src.getValueType(); @@ -5752,7 +5897,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, ArrayRef<EVT> ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, - const SDLoc &DL, LLVMContext &Context) { + const SDLoc &DL) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; @@ -5835,11 +5980,11 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, return Value == 0; } -static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, - MVT PackVectorVT, - SmallVectorImpl<SDValue> &PackedAddrs, - unsigned DimIdx, unsigned EndIdx, - unsigned NumGradients) { +static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, + MVT PackVectorVT, + SmallVectorImpl<SDValue> &PackedAddrs, + unsigned DimIdx, unsigned EndIdx, + unsigned NumGradients) { SDLoc DL(Op); for (unsigned I = DimIdx; I < EndIdx; I++) { SDValue Addr = Op.getOperand(I); @@ -5994,56 +6139,64 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); MVT VAddrScalarVT = VAddrVT.getScalarType(); - MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); VAddrScalarVT = VAddrVT.getScalarType(); + MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? 
MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; - if (IsA16 || IsG16) { - if (IsA16) { - if (!ST->hasA16()) { - LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit addresses\n"); - return Op; - } - if (!IsG16) { - LLVM_DEBUG( - dbgs() << "Failed to lower image intrinsic: 16 bit addresses " - "need 16 bit derivatives but got 32 bit derivatives\n"); - return Op; - } - } else if (!ST->hasG16()) { + + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { + // 16 bit gradients are supported, but are tied to the A16 control + // so both gradients and addresses must be 16 bit + LLVM_DEBUG( + dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "require 16 bit args for both gradients and addresses"); + return Op; + } + + if (IsA16) { + if (!ST->hasA16()) { LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit derivatives\n"); + "support 16 bit addresses\n"); return Op; } + } - if (BaseOpcode->Gradients && !IsA16) { - if (!ST->hasG16()) { - LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit derivatives\n"); - return Op; - } - // Activate g16 - const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = - AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); - IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 - } + // We've dealt with incorrect input so we know that if IsA16, IsG16 + // are set then we have to compress/pack operands (either address, + // gradient or both) + // In the case where a16 and gradients are tied (no G16 support) then we + // have already verified that both IsA16 and IsG16 are true + if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { + // Activate g16 + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 + } - // Don't compress addresses for G16 - const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); - packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, - ArgOffset + Intr->GradientStart, PackEndIdx, - Intr->NumGradients); + // Add gradients (packed or unpacked) + if (IsG16) { + // Pack the gradients + // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); + packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs, + ArgOffset + Intr->GradientStart, + ArgOffset + Intr->CoordStart, Intr->NumGradients); + } else { + for (unsigned I = ArgOffset + Intr->GradientStart; + I < ArgOffset + Intr->CoordStart; I++) + VAddrs.push_back(Op.getOperand(I)); + } - if (!IsA16) { - // Add uncompressed address - for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) - VAddrs.push_back(Op.getOperand(I)); - } + // Add addresses (packed or unpacked) + if (IsA16) { + packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs, + ArgOffset + Intr->CoordStart, VAddrEnd, + 0 /* No gradients */); } else { - for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++) + // Add uncompressed address + for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) VAddrs.push_back(Op.getOperand(I)); } @@ -6058,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. 
- bool UseNSA = - ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3; + bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && + VAddrs.size() >= 3 && + VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); SDValue VAddr; if (!UseNSA) VAddr = getBuildDwordsVector(DAG, DL, VAddrs); @@ -6120,19 +6274,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - SDValue GLC; - SDValue SLC; - SDValue DLC; - if (BaseOpcode->Atomic) { - GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), - DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr)) - return Op; - } else { - if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), - DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr)) - return Op; - } + unsigned CPol = cast<ConstantSDNode>( + Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue(); + if (BaseOpcode->Atomic) + CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + if (CPol & ~AMDGPU::CPol::ALL) + return Op; SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) @@ -6148,16 +6295,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (IsGFX10Plus) Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); - if (IsGFX10Plus) - Ops.push_back(DLC); - Ops.push_back(GLC); - Ops.push_back(SLC); + Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); if (IsGFX10Plus) Ops.push_back(IsA16 ? True : False); - Ops.push_back(TFE); - Ops.push_back(LWE); + if (!Subtarget->hasGFX90AInsts()) { + Ops.push_back(TFE); //tfe + } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) { + report_fatal_error("TFE is not supported on this GPU"); + } + Ops.push_back(LWE); // lwe if (!IsGFX10Plus) Ops.push_back(DimInfo->DA ? 
True : False); if (BaseOpcode->HasD16) @@ -6175,7 +6323,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op, : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); } else { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (Subtarget->hasGFX90AInsts()) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + report_fatal_error( + "requested image instruction is not supported on this GPU"); + } + if (Opcode == -1 && + Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) @@ -6194,15 +6350,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); - } else if (!BaseOpcode->Store) { - return constructRetValue(DAG, NewNode, - OrigResultTypes, IsTexFail, - Subtarget->hasUnpackedD16VMem(), IsD16, - DMaskLanes, NumVDataDwords, DL, - *DAG.getContext()); } - - return SDValue(NewNode, 0); + if (BaseOpcode->Store) + return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, + OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, + DMaskLanes, NumVDataDwords, DL); } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, @@ -6448,11 +6602,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); - SDValue GLC; - SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); - if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, - IsGFX10Plus ? &DLC : nullptr)) + unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + if (CPol & ~AMDGPU::CPol::ALL) return Op; return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); @@ -6607,6 +6758,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_alignbit: return DAG.getNode(ISD::FSHR, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_perm: + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { Module *M = const_cast<Module *>(MF.getFunction().getParent()); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); @@ -6626,28 +6780,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } -// This function computes an appropriate offset to pass to -// MachineMemOperand::setOffset() based on the offset inputs to -// an intrinsic. If any of the offsets are non-contstant or -// if VIndex is non-zero then this function returns 0. Otherwise, -// it returns the sum of VOffset, SOffset, and Offset. -static unsigned getBufferOffsetForMMO(SDValue VOffset, - SDValue SOffset, - SDValue Offset, - SDValue VIndex = SDValue()) { - +/// Update \p MMO based on the offset inputs to an intrinsic. 
+static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset, + SDValue SOffset, SDValue Offset, + SDValue VIndex = SDValue()) { if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || - !isa<ConstantSDNode>(Offset)) - return 0; + !isa<ConstantSDNode>(Offset)) { + // The combined offset is not known to be constant, so we cannot represent + // it in the MMO. Give up. + MMO->setValue((Value *)nullptr); + return; + } - if (VIndex) { - if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue()) - return 0; + if (VIndex && (!isa<ConstantSDNode>(VIndex) || + !cast<ConstantSDNode>(VIndex)->isNullValue())) { + // The strided index component of the address is not known to be zero, so we + // cannot represent it in the MMO. Give up. + MMO->setValue((Value *)nullptr); + return; } - return cast<ConstantSDNode>(VOffset)->getSExtValue() + - cast<ConstantSDNode>(SOffset)->getSExtValue() + - cast<ConstantSDNode>(Offset)->getSExtValue(); + MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() + + cast<ConstantSDNode>(SOffset)->getSExtValue() + + cast<ConstantSDNode>(Offset)->getSExtValue()); } SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, @@ -6670,13 +6825,21 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, M->getMemOperand()); } +// Return a value to use for the idxen operand by examining the vindex operand. +static unsigned getIdxEn(SDValue VIndex) { + if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex)) + // No need to set idxen if vindex is known to be zero. + return VIndexC->getZExtValue() != 0; + return 1; +} + SDValue SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -6697,8 +6860,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, @@ -6811,9 +6973,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_load_format: { unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -6824,11 +6984,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - - unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); - // We don't know the offset if vindex is non-zero, so clear it. - if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; @@ -6836,7 +6992,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6868,7 +7024,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]); return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: @@ -6888,8 +7044,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], - Ops[2])); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { @@ -6900,9 +7055,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -6983,9 +7136,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // vdata @@ -6997,14 +7148,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); unsigned Opcode = 0; switch (IntrID) { @@ -7042,7 +7191,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty()) { + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { DiagnosticInfoUnsupported NoFpRet(DAG.getMachineFunction().getFunction(), "return versions of fp atomics not supported", @@ -7063,6 +7212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); case Intrinsic::amdgcn_raw_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: @@ -7119,9 +7276,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(5)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -7134,13 +7289,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7161,7 +7314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7182,33 +7335,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], - Ops[4])); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_global_atomic_fadd: { - if (!Op.getValue(0).use_empty()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - MemSDNode *M = cast<MemSDNode>(Op); - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SDLoc DL(Op); MemSDNode *M = cast<MemSDNode>(Op); @@ -7224,6 +7355,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(RayDir.getValueType() == MVT::v4f16 || RayDir.getValueType() == MVT::v4f32); + if (!Subtarget->hasGFX10_AEncoding()) { + emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); + return SDValue(); + } + bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; bool Is64 = NodePtr.getValueType() == MVT::i64; unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa @@ -7279,7 +7415,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } + case Intrinsic::amdgcn_global_atomic_fadd: + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + LLVM_FALLTHROUGH; + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + unsigned Opcode = 0; + switch (IntrID) { + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: { + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); + } + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmin: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + } + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + } + default: + llvm_unreachable("unhandled atomic opcode"); + } + return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) return lowerImage(Op, ImageDimIntr, DAG, true); @@ -7448,9 +7632,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, VData, // vdata @@ -7461,7 +7643,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(7), // offset DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7486,7 +7668,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.second, // offset Op.getOperand(7), // format Op.getOperand(8), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idexen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7511,7 +7693,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idexen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7528,9 +7710,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, VData, @@ -7542,15 +7722,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. - if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -7597,7 +7775,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) @@ -7644,8 +7822,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. 
-unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, - SelectionDAG &DAG, SDValue *Offsets, - Align Alignment) const { +void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, + SelectionDAG &DAG, SDValue *Offsets, + Align Alignment) const { SDLoc DL(CombinedOffset); if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) { uint32_t Imm = C->getZExtValue(); @@ -7737,7 +7914,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); - return SOffset + ImmOffset; + return; } } if (DAG.isBaseWithConstantOffset(CombinedOffset)) { @@ -7750,13 +7927,12 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); - return 0; + return; } } Offsets[0] = CombinedOffset; Offsets[1] = DAG.getConstant(0, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); - return 0; } // Handle 8 bit and 16 bit buffer loads @@ -8263,8 +8439,8 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { // Returns immediate value for setting the F32 denorm mode when using the // S_DENORM_MODE instruction. -static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, - const SDLoc &SL, const GCNSubtarget *ST) { +static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, + const SDLoc &SL, const GCNSubtarget *ST) { assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction()) ? FP_DENORM_FLUSH_NONE @@ -8794,18 +8970,20 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( } // Returns true if argument is a boolean value which is not serialized into -// memory or argument and does not require v_cmdmask_b32 to be deserialized. +// memory or argument and does not require v_cndmask_b32 to be deserialized. static bool isBoolSGPR(SDValue V) { if (V.getValueType() != MVT::i1) return false; switch (V.getOpcode()) { - default: break; + default: + break; case ISD::SETCC: + case AMDGPUISD::FP_CLASS: + return true; case ISD::AND: case ISD::OR: case ISD::XOR: - case AMDGPUISD::FP_CLASS: - return true; + return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1)); } return false; } @@ -9206,63 +9384,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return SDValue(); } -// Instructions that will be lowered with a final instruction that zeros the -// high result bits. -// XXX - probably only need to list legal operations. -static bool fp16SrcZerosHighBits(unsigned Opc) { - switch (Opc) { - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - case ISD::FDIV: - case ISD::FREM: - case ISD::FMA: - case ISD::FMAD: - case ISD::FCANONICALIZE: - case ISD::FP_ROUND: - case ISD::UINT_TO_FP: - case ISD::SINT_TO_FP: - case ISD::FABS: - // Fabs is lowered to a bit operation, but it's an and which will clear the - // high bits anyway. 
- case ISD::FSQRT: - case ISD::FSIN: - case ISD::FCOS: - case ISD::FPOWI: - case ISD::FPOW: - case ISD::FLOG: - case ISD::FLOG2: - case ISD::FLOG10: - case ISD::FEXP: - case ISD::FEXP2: - case ISD::FCEIL: - case ISD::FTRUNC: - case ISD::FRINT: - case ISD::FNEARBYINT: - case ISD::FROUND: - case ISD::FFLOOR: - case ISD::FMINNUM: - case ISD::FMAXNUM: - case AMDGPUISD::FRACT: - case AMDGPUISD::CLAMP: - case AMDGPUISD::COS_HW: - case AMDGPUISD::SIN_HW: - case AMDGPUISD::FMIN3: - case AMDGPUISD::FMAX3: - case AMDGPUISD::FMED3: - case AMDGPUISD::FMAD_FTZ: - case AMDGPUISD::RCP: - case AMDGPUISD::RSQ: - case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::LDEXP: - return true; - default: - // fcopysign, select and others may be lowered to 32-bit bit operations - // which don't zero the high bits. - return false; - } -} - SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (!Subtarget->has16BitInsts() || @@ -9277,15 +9398,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, if (Src.getValueType() != MVT::i16) return SDValue(); - // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src - // FIXME: It is not universally true that the high bits are zeroed on gfx9. - if (Src.getOpcode() == ISD::BITCAST) { - SDValue BCSrc = Src.getOperand(0); - if (BCSrc.getValueType() == MVT::f16 && - fp16SrcZerosHighBits(BCSrc.getOpcode())) - return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc); - } - return SDValue(); } @@ -9482,19 +9594,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, // Could be anything. return false; - case ISD::BITCAST: { + case ISD::BITCAST: + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); + case ISD::TRUNCATE: { // Hack round the mess we make when legalizing extract_vector_elt - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i16 && - Src.getOpcode() == ISD::TRUNCATE) { - SDValue TruncSrc = Src.getOperand(0); + if (Op.getValueType() == MVT::i16) { + SDValue TruncSrc = Op.getOperand(0); if (TruncSrc.getValueType() == MVT::i32 && TruncSrc.getOpcode() == ISD::BITCAST && TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); } } - return false; } case ISD::INTRINSIC_WO_CHAIN: { @@ -9527,6 +9638,45 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, llvm_unreachable("invalid operation"); } +bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, + unsigned MaxDepth) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineInstr *MI = MRI.getVRegDef(Reg); + unsigned Opcode = MI->getOpcode(); + + if (Opcode == AMDGPU::G_FCANONICALIZE) + return true; + + if (Opcode == AMDGPU::G_FCONSTANT) { + auto F = MI->getOperand(1).getFPImm()->getValueAPF(); + if (F.isNaN() && F.isSignaling()) + return false; + return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF); + } + + if (MaxDepth == 0) + return false; + + switch (Opcode) { + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: { + if (Subtarget->supportsMinMaxDenormModes() || + denormalsEnabledForType(MRI.getType(Reg), MF)) + return true; + for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) { + if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1)) + return false; + } + return true; + } + default: + return denormalsEnabledForType(MRI.getType(Reg), MF) && + isKnownNeverSNaN(Reg, MRI); + } + + llvm_unreachable("invalid operation"); +} + // Constant fold canonicalize. 
SDValue SITargetLowering::getCanonicalConstantFP( SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { @@ -9694,15 +9844,19 @@ SDValue SITargetLowering::performIntMed3ImmCombine( } // If there isn't a 16-bit med3 operation, convert to 32-bit. - MVT NVT = MVT::i32; - unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + if (VT == MVT::i16) { + MVT NVT = MVT::i32; + unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + + SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); + return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); + } - SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); - return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); + return SDValue(); } static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { @@ -10408,7 +10562,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, EVT VT = N->getValueType(0); SDLoc SL(N); - if (!Subtarget->hasDot2Insts() || VT != MVT::f32) + if (!Subtarget->hasDot7Insts() || VT != MVT::f32) return SDValue(); // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> @@ -10791,7 +10945,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned NewDmask = 0; unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; - bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) || + bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || Node->getConstantOperandVal(LWEIdx)) ? 1 : 0; unsigned TFCLane = 0; bool HasChain = Node->getNumValues() > 1; @@ -11067,6 +11221,95 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, return Node; } +// Any MIMG instructions that use tfe or lwe require an initialization of the +// result register that will be written in the case of a memory access failure. +// The required code is also added to tie this init code to the result of the +// img instruction. +void SITargetLowering::AddIMGInit(MachineInstr &MI) const { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineBasicBlock &MBB = *MI.getParent(); + + MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); + MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); + MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); + + if (!TFE && !LWE) // intersect_ray + return; + + unsigned TFEVal = TFE ? TFE->getImm() : 0; + unsigned LWEVal = LWE->getImm(); + unsigned D16Val = D16 ? D16->getImm() : 0; + + if (!TFEVal && !LWEVal) + return; + + // At least one of TFE or LWE are non-zero + // We have to insert a suitable initialization of the result value and + // tie this to the dest of the image instruction. + + const DebugLoc &DL = MI.getDebugLoc(); + + int DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); + + // Calculate which dword we have to initialize to 0. + MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask); + + // check that dmask operand is found. 
+ assert(MO_Dmask && "Expected dmask operand in instruction"); + + unsigned dmask = MO_Dmask->getImm(); + // Determine the number of active lanes taking into account the + // Gather4 special case + unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask); + + bool Packed = !Subtarget->hasUnpackedD16VMem(); + + unsigned InitIdx = + D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; + + // Abandon attempt if the dst size isn't large enough + // - this is in fact an error but this is picked up elsewhere and + // reported correctly. + uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + if (DstSize < InitIdx) + return; + + // Create a register for the initialization value. + Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + unsigned NewDst = 0; // Final initialized value will be in here + + // If PRTStrictNull feature is enabled (the default) then initialize + // all the result registers to 0, otherwise just the error indication + // register (VGPRn+1) + unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1; + unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1); + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); + for (; SizeLeft; SizeLeft--, CurrIdx++) { + NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + // Initialize dword + Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) + .addImm(0); + // Insert into the super-reg + BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) + .addReg(PrevDst) + .addReg(SubReg) + .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx)); + + PrevDst = NewDst; + } + + // Add as an implicit operand + MI.addOperand(MachineOperand::CreateReg(NewDst, false, true)); + + // Tie the just added implicit operand to the dst + MI.tieOperands(DstIdx, MI.getNumOperands() - 1); +} + /// Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, @@ -11114,10 +11357,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); if (NoRetAtomicOp != -1) { if (!Node->hasAnyUseOfValue(0)) { - int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::glc1); - if (Glc1Idx != -1) - MI.RemoveOperand(Glc1Idx); + int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::cpol); + if (CPolIdx != -1) { + MachineOperand &CPol = MI.getOperand(CPolIdx); + CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC); + } MI.RemoveOperand(0); MI.setDesc(TII->get(NoRetAtomicOp)); return; @@ -11148,6 +11393,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, } return; } + + if (TII->isMIMG(MI) && !MI.mayStore()) + AddIMGInit(MI); } static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, @@ -11226,9 +11474,11 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, //===----------------------------------------------------------------------===// std::pair<unsigned, const TargetRegisterClass *> -SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, StringRef Constraint, MVT VT) const { + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_); + const TargetRegisterClass *RC = nullptr; if
(Constraint.size() == 1) { const unsigned BitWidth = VT.getSizeInBits(); @@ -11257,7 +11507,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::VGPR_32RegClass; break; default: - RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth); + RC = TRI->getVGPRClassForBitWidth(BitWidth); if (!RC) return std::make_pair(0U, nullptr); break; @@ -11271,7 +11521,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::AGPR_32RegClass; break; default: - RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth); + RC = TRI->getAGPRClassForBitWidth(BitWidth); if (!RC) return std::make_pair(0U, nullptr); break; @@ -11444,6 +11694,47 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op, return false; } +static int getAlignedAGPRClassID(unsigned UnalignedClassID) { + switch (UnalignedClassID) { + case AMDGPU::VReg_64RegClassID: + return AMDGPU::VReg_64_Align2RegClassID; + case AMDGPU::VReg_96RegClassID: + return AMDGPU::VReg_96_Align2RegClassID; + case AMDGPU::VReg_128RegClassID: + return AMDGPU::VReg_128_Align2RegClassID; + case AMDGPU::VReg_160RegClassID: + return AMDGPU::VReg_160_Align2RegClassID; + case AMDGPU::VReg_192RegClassID: + return AMDGPU::VReg_192_Align2RegClassID; + case AMDGPU::VReg_224RegClassID: + return AMDGPU::VReg_224_Align2RegClassID; + case AMDGPU::VReg_256RegClassID: + return AMDGPU::VReg_256_Align2RegClassID; + case AMDGPU::VReg_512RegClassID: + return AMDGPU::VReg_512_Align2RegClassID; + case AMDGPU::VReg_1024RegClassID: + return AMDGPU::VReg_1024_Align2RegClassID; + case AMDGPU::AReg_64RegClassID: + return AMDGPU::AReg_64_Align2RegClassID; + case AMDGPU::AReg_96RegClassID: + return AMDGPU::AReg_96_Align2RegClassID; + case AMDGPU::AReg_128RegClassID: + return AMDGPU::AReg_128_Align2RegClassID; + case AMDGPU::AReg_160RegClassID: + return AMDGPU::AReg_160_Align2RegClassID; + case AMDGPU::AReg_192RegClassID: + return AMDGPU::AReg_192_Align2RegClassID; + case AMDGPU::AReg_256RegClassID: + return AMDGPU::AReg_256_Align2RegClassID; + case AMDGPU::AReg_512RegClassID: + return AMDGPU::AReg_512_Align2RegClassID; + case AMDGPU::AReg_1024RegClassID: + return AMDGPU::AReg_1024_Align2RegClassID; + default: + return -1; + } +} + // Figure out which registers should be reserved for stack access. Only after // the function is legalized do we know all of the non-spill stack objects or if // calls are present. @@ -11452,6 +11743,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access. @@ -11474,7 +11766,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { Info->limitOccupancy(MF); if (ST.isWave32() && !MF.empty()) { - const SIInstrInfo *TII = ST.getInstrInfo(); for (auto &MBB : MF) { for (auto &MI : MBB) { TII->fixImplicitOperands(MI); @@ -11482,13 +11773,30 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } } + // FIXME: This is a hack to fixup AGPR classes to use the properly aligned + // classes if required. Ideally the register class constraints would differ + // per-subtarget, but there's no easy way to achieve that right now. 
This is + // not a problem for VGPRs because the correctly aligned VGPR class is implied + // from using them as the register class for legal types. + if (ST.needsAlignedVGPRs()) { + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + const Register Reg = Register::index2VirtReg(I); + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + if (!RC) + continue; + int NewClassID = getAlignedAGPRClassID(RC->getID()); + if (NewClassID != -1) + MRI.setRegClass(Reg, TRI->getRegClass(NewClassID)); + } + } + TargetLoweringBase::finalizeLowering(MF); // Allocate a VGPR for future SGPR Spill if // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used // FIXME: We won't need this hack if we split SGPR allocation from VGPR - if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill && - !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects()) + if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() && + !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction()) Info->reserveVGPRforSGPRSpills(MF); } @@ -11690,8 +11998,37 @@ bool SITargetLowering::isSDNodeSourceOfDivergence( case ISD::INTRINSIC_W_CHAIN: return AMDGPU::isIntrinsicSourceOfDivergence( cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); + case AMDGPUISD::ATOMIC_CMP_SWAP: + case AMDGPUISD::ATOMIC_INC: + case AMDGPUISD::ATOMIC_DEC: + case AMDGPUISD::ATOMIC_LOAD_FMIN: + case AMDGPUISD::ATOMIC_LOAD_FMAX: + case AMDGPUISD::BUFFER_ATOMIC_SWAP: + case AMDGPUISD::BUFFER_ATOMIC_ADD: + case AMDGPUISD::BUFFER_ATOMIC_SUB: + case AMDGPUISD::BUFFER_ATOMIC_SMIN: + case AMDGPUISD::BUFFER_ATOMIC_UMIN: + case AMDGPUISD::BUFFER_ATOMIC_SMAX: + case AMDGPUISD::BUFFER_ATOMIC_UMAX: + case AMDGPUISD::BUFFER_ATOMIC_AND: + case AMDGPUISD::BUFFER_ATOMIC_OR: + case AMDGPUISD::BUFFER_ATOMIC_XOR: + case AMDGPUISD::BUFFER_ATOMIC_INC: + case AMDGPUISD::BUFFER_ATOMIC_DEC: + case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: + case AMDGPUISD::BUFFER_ATOMIC_CSUB: + case AMDGPUISD::BUFFER_ATOMIC_FADD: + case AMDGPUISD::BUFFER_ATOMIC_FMIN: + case AMDGPUISD::BUFFER_ATOMIC_FMAX: + // Target-specific read-modify-write atomics are sources of divergence. + return true; + default: + if (auto *A = dyn_cast<AtomicSDNode>(N)) { + // Generic read-modify-write atomics are sources of divergence. + return A->readMem() && A->writeMem(); + } + return false; } - return false; } bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, @@ -11707,6 +12044,19 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, } } +bool SITargetLowering::denormalsEnabledForType(LLT Ty, + MachineFunction &MF) const { + switch (Ty.getScalarSizeInBits()) { + case 32: + return hasFP32Denormals(MF); + case 64: + case 16: + return hasFP64FP16Denormals(MF); + default: + return false; + } +} + bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN, @@ -11745,24 +12095,57 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (Ty->isHalfTy()) return AtomicExpansionKind::None; - if (!Ty->isFloatTy()) + if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; - // TODO: Do have these for flat. Older targets also had them for buffers. 
unsigned AS = RMW->getPointerAddressSpace(); - if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) { - if (!fpModeMatchesGlobalFPAtomicMode(RMW)) + if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && + Subtarget->hasAtomicFaddInsts()) { + // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe + // floating point atomic instructions. May generate more efficient code, + // but may not respect rounding and denormal modes, and may give incorrect + // results for certain memory destinations. + if (RMW->getFunction() + ->getFnAttribute("amdgpu-unsafe-fp-atomics") + .getValueAsString() != "true") + return AtomicExpansionKind::CmpXChg; + + if (Subtarget->hasGFX90AInsts()) { + if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) + return AtomicExpansionKind::CmpXChg; + + auto SSID = RMW->getSyncScopeID(); + if (SSID == SyncScope::System || + SSID == RMW->getContext().getOrInsertSyncScopeID("one-as")) + return AtomicExpansionKind::CmpXChg; + + return AtomicExpansionKind::None; + } + + if (AS == AMDGPUAS::FLAT_ADDRESS) return AtomicExpansionKind::CmpXChg; - return RMW->use_empty() ? AtomicExpansionKind::None : - AtomicExpansionKind::CmpXChg; + return RMW->use_empty() ? AtomicExpansionKind::None + : AtomicExpansionKind::CmpXChg; } // DS FP atomics do repect the denormal mode, but the rounding mode is fixed // to round-to-nearest-even. - return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ? - AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; + // The only exception is DS_ADD_F64 which never flushes regardless of mode. + if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) { + if (!Ty->isDoubleTy()) + return AtomicExpansionKind::None; + + return (fpModeMatchesGlobalFPAtomicMode(RMW) || + RMW->getFunction() + ->getFnAttribute("amdgpu-unsafe-fp-atomics") + .getValueAsString() == "true") + ? AtomicExpansionKind::None + : AtomicExpansionKind::CmpXChg; + } + + return AtomicExpansionKind::CmpXChg; } default: break; @@ -11872,10 +12255,11 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); } -std::pair<int, MVT> +std::pair<InstructionCost, MVT> SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const { - auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty); + std::pair<InstructionCost, MVT> Cost = + TargetLoweringBase::getTypeLegalizationCost(DL, Ty); auto Size = DL.getTypeSizeInBits(Ty); // Maximum load or store can handle 8 dwords for scalar and 4 for // vector ALU. 
Let's assume anything above 8 dwords is expensive diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 823d6eca9bf8..f3d34267a81d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -144,7 +144,11 @@ private: SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTrapHsaQueuePtr(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const; SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; @@ -227,10 +231,8 @@ private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. - /// \returns 0 If there is a non-constant offset or if the offset is 0. - /// Otherwise returns the constant offset. - unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, - SDValue *Offsets, Align Alignment = Align(4)) const; + void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, + SDValue *Offsets, Align Alignment = Align(4)) const; // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, @@ -283,7 +285,7 @@ public: } bool allowsMisalignedMemoryAccesses( - EVT VT, unsigned AS, unsigned Alignment, + EVT VT, unsigned AS, Align Alignment, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *IsFast = nullptr) const override; @@ -393,6 +395,7 @@ public: SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AddIMGInit(MachineInstr &MI) const; void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; @@ -439,7 +442,10 @@ public: bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; + bool isCanonicalized(Register Reg, MachineFunction &MF, + unsigned MaxDepth = 5) const; bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const; + bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const; bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, @@ -483,8 +489,8 @@ public: const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; - std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL, - Type *Ty) const; + std::pair<InstructionCost, MVT> getTypeLegalizationCost(const DataLayout &DL, + Type *Ty) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index 5611c9c5d57e..7ba20eb6027b 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -63,30 +63,10 @@ enum HardClauseType { HARDCLAUSE_ILLEGAL, }; -HardClauseType getHardClauseType(const MachineInstr &MI) { - // On current architectures we only get a benefit from clausing loads. 
- if (MI.mayLoad()) { - if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) - return HARDCLAUSE_VMEM; - if (SIInstrInfo::isFLAT(MI)) - return HARDCLAUSE_FLAT; - // TODO: LDS - if (SIInstrInfo::isSMRD(MI)) - return HARDCLAUSE_SMEM; - } - - // Don't form VALU clauses. It's not clear what benefit they give, if any. - - // In practice s_nop is the only internal instruction we're likely to see. - // It's safe to treat the rest as illegal. - if (MI.getOpcode() == AMDGPU::S_NOP) - return HARDCLAUSE_INTERNAL; - return HARDCLAUSE_ILLEGAL; -} - class SIInsertHardClauses : public MachineFunctionPass { public: static char ID; + const GCNSubtarget *ST = nullptr; SIInsertHardClauses() : MachineFunctionPass(ID) {} @@ -95,6 +75,34 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } + HardClauseType getHardClauseType(const MachineInstr &MI) { + + // On current architectures we only get a benefit from clausing loads. + if (MI.mayLoad()) { + if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { + if (ST->hasNSAClauseBug()) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA) + return HARDCLAUSE_ILLEGAL; + } + return HARDCLAUSE_VMEM; + } + if (SIInstrInfo::isFLAT(MI)) + return HARDCLAUSE_FLAT; + // TODO: LDS + if (SIInstrInfo::isSMRD(MI)) + return HARDCLAUSE_SMEM; + } + + // Don't form VALU clauses. It's not clear what benefit they give, if any. + + // In practice s_nop is the only internal instruction we're likely to see. + // It's safe to treat the rest as illegal. + if (MI.getOpcode() == AMDGPU::S_NOP) + return HARDCLAUSE_INTERNAL; + return HARDCLAUSE_ILLEGAL; + } + // Track information about a clause as we discover it. struct ClauseInfo { // The type of all (non-internal) instructions in the clause. @@ -132,12 +140,12 @@ public: if (skipFunction(MF.getFunction())) return false; - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasHardClauses()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasHardClauses()) return false; - const SIInstrInfo *SII = ST.getInstrInfo(); - const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *SII = ST->getInstrInfo(); + const TargetRegisterInfo *TRI = ST->getRegisterInfo(); bool Changed = false; for (auto &MBB : MF) { diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp deleted file mode 100644 index 9d31cd5cedc3..000000000000 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ /dev/null @@ -1,504 +0,0 @@ -//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass inserts branches on the 0 exec mask over divergent branches -/// branches when it's expected that jumping over the untaken control flow will -/// be cheaper than having every workitem no-op through it. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-insert-skips" - -static cl::opt<unsigned> SkipThresholdFlag( - "amdgpu-skip-threshold-legacy", - cl::desc("Number of instructions before jumping over divergent control flow"), - cl::init(12), cl::Hidden); - -namespace { - -class SIInsertSkips : public MachineFunctionPass { -private: - const SIRegisterInfo *TRI = nullptr; - const SIInstrInfo *TII = nullptr; - unsigned SkipThreshold = 0; - MachineDominatorTree *MDT = nullptr; - - MachineBasicBlock *EarlyExitBlock = nullptr; - bool EarlyExitClearsExec = false; - - bool shouldSkip(const MachineBasicBlock &From, - const MachineBasicBlock &To) const; - - bool dominatesAllReachable(MachineBasicBlock &MBB); - void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec); - void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL); - - bool kill(MachineInstr &MI); - void earlyTerm(MachineInstr &MI); - - bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); - -public: - static char ID; - - SIInsertSkips() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { - return "SI insert s_cbranch_execz instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // end anonymous namespace - -char SIInsertSkips::ID = 0; - -INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE, - "SI insert s_cbranch_execz instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE, - "SI insert s_cbranch_execz instructions", false, false) - -char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; - -static bool opcodeEmitsNoInsts(const MachineInstr &MI) { - if (MI.isMetaInstruction()) - return true; - - // Handle target specific opcodes. - switch (MI.getOpcode()) { - case AMDGPU::SI_MASK_BRANCH: - return true; - default: - return false; - } -} - -bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, - const MachineBasicBlock &To) const { - unsigned NumInstr = 0; - const MachineFunction *MF = From.getParent(); - - for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); - MBBI != End && MBBI != ToI; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); - NumInstr < SkipThreshold && I != E; ++I) { - if (opcodeEmitsNoInsts(*I)) - continue; - - // FIXME: Since this is required for correctness, this should be inserted - // during SILowerControlFlow. - - // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken - // when EXEC = 0. We should skip the loop lest it becomes infinite. - if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || - I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) - return true; - - if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) - return true; - - // These instructions are potentially expensive even if EXEC = 0. 
- if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || - I->getOpcode() == AMDGPU::S_WAITCNT) - return true; - - ++NumInstr; - if (NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -/// Check whether \p MBB dominates all blocks that are reachable from it. -bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) { - for (MachineBasicBlock *Other : depth_first(&MBB)) { - if (!MDT->dominates(&MBB, Other)) - return false; - } - return true; -} - -static void generateEndPgm(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - const SIInstrInfo *TII, bool IsPS) { - // "null export" - if (IsPS) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) - .addImm(AMDGPU::Exp::ET_NULL) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addImm(1) // vm - .addImm(0) // compr - .addImm(0); // en - } - // s_endpgm - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); -} - -void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB, - bool ClearExec) { - MachineFunction *MF = MBB.getParent(); - DebugLoc DL; - - if (!EarlyExitBlock) { - EarlyExitBlock = MF->CreateMachineBasicBlock(); - MF->insert(MF->end(), EarlyExitBlock); - generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, - MF->getFunction().getCallingConv() == - CallingConv::AMDGPU_PS); - EarlyExitClearsExec = false; - } - - if (ClearExec && !EarlyExitClearsExec) { - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto ExitI = EarlyExitBlock->getFirstNonPHI(); - BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0); - EarlyExitClearsExec = true; - } -} - -static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, - MachineDominatorTree *MDT) { - MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true); - - // Update dominator tree - using DomTreeT = DomTreeBase<MachineBasicBlock>; - SmallVector<DomTreeT::UpdateType, 16> DTUpdates; - for (MachineBasicBlock *Succ : SplitBB->successors()) { - DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); - DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); - } - DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); - MDT->getBase().applyUpdates(DTUpdates); -} - -/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given -/// iterator. Only applies to pixel shaders. -void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL) { - MachineFunction *MF = MBB.getParent(); - (void)MF; - assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS); - - // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a - // basic block that has no further successors (e.g., there was an - // `unreachable` there in IR). This can happen with original source of the - // form: - // - // if (uniform_condition) { - // write_to_memory(); - // discard; - // } - // - // In this case, we write the "null_export; s_endpgm" skip code in the - // already-existing basic block. 
- auto NextBBI = std::next(MBB.getIterator()); - bool NoSuccessor = - I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI); - - if (NoSuccessor) { - generateEndPgm(MBB, I, DL, TII, true); - } else { - ensureEarlyExitBlock(MBB, false); - - MachineInstr *BranchMI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(EarlyExitBlock); - - // Split the block if the branch will not come at the end. - auto Next = std::next(BranchMI->getIterator()); - if (Next != MBB.end() && !Next->isTerminator()) - splitBlock(MBB, *BranchMI, MDT); - - MBB.addSuccessor(EarlyExitBlock); - MDT->getBase().insertEdge(&MBB, EarlyExitBlock); - } -} - -/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions. -/// Return true unless the terminator is a no-op. -bool SIInsertSkips::kill(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - switch (MI.getOpcode()) { - case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: { - unsigned Opcode = 0; - - // The opcodes are inverted because the inline immediate has to be - // the first operand, e.g. from "x < imm" to "imm > x" - switch (MI.getOperand(2).getImm()) { - case ISD::SETOEQ: - case ISD::SETEQ: - Opcode = AMDGPU::V_CMPX_EQ_F32_e64; - break; - case ISD::SETOGT: - case ISD::SETGT: - Opcode = AMDGPU::V_CMPX_LT_F32_e64; - break; - case ISD::SETOGE: - case ISD::SETGE: - Opcode = AMDGPU::V_CMPX_LE_F32_e64; - break; - case ISD::SETOLT: - case ISD::SETLT: - Opcode = AMDGPU::V_CMPX_GT_F32_e64; - break; - case ISD::SETOLE: - case ISD::SETLE: - Opcode = AMDGPU::V_CMPX_GE_F32_e64; - break; - case ISD::SETONE: - case ISD::SETNE: - Opcode = AMDGPU::V_CMPX_LG_F32_e64; - break; - case ISD::SETO: - Opcode = AMDGPU::V_CMPX_O_F32_e64; - break; - case ISD::SETUO: - Opcode = AMDGPU::V_CMPX_U_F32_e64; - break; - case ISD::SETUEQ: - Opcode = AMDGPU::V_CMPX_NLG_F32_e64; - break; - case ISD::SETUGT: - Opcode = AMDGPU::V_CMPX_NGE_F32_e64; - break; - case ISD::SETUGE: - Opcode = AMDGPU::V_CMPX_NGT_F32_e64; - break; - case ISD::SETULT: - Opcode = AMDGPU::V_CMPX_NLE_F32_e64; - break; - case ISD::SETULE: - Opcode = AMDGPU::V_CMPX_NLT_F32_e64; - break; - case ISD::SETUNE: - Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; - break; - default: - llvm_unreachable("invalid ISD:SET cond code"); - } - - const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); - if (ST.hasNoSdstCMPX()) - Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode); - - assert(MI.getOperand(0).isReg()); - - if (TRI->isVGPR(MBB.getParent()->getRegInfo(), - MI.getOperand(0).getReg())) { - Opcode = AMDGPU::getVOPe32(Opcode); - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .add(MI.getOperand(1)) - .add(MI.getOperand(0)); - } else { - auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode)); - if (!ST.hasNoSdstCMPX()) - I.addReg(AMDGPU::VCC, RegState::Define); - - I.addImm(0) // src0 modifiers - .add(MI.getOperand(1)) - .addImm(0) // src1 modifiers - .add(MI.getOperand(0)); - - I.addImm(0); // omod - } - return true; - } - case AMDGPU::SI_KILL_I1_TERMINATOR: { - const MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - const MachineOperand &Op = MI.getOperand(0); - int64_t KillVal = MI.getOperand(1).getImm(); - assert(KillVal == 0 || KillVal == -1); - - // Kill all threads if Op0 is an immediate and equal to the Kill value. 
- if (Op.isImm()) { - int64_t Imm = Op.getImm(); - assert(Imm == 0 || Imm == -1); - - if (Imm == KillVal) { - BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32 - : AMDGPU::S_MOV_B64), Exec) - .addImm(0); - return true; - } - return false; - } - - unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; - if (ST.isWave32()) - Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32; - BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec) - .addReg(Exec) - .add(Op); - return true; - } - default: - llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); - } -} - -void SIInsertSkips::earlyTerm(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc DL = MI.getDebugLoc(); - - ensureEarlyExitBlock(MBB, true); - - auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)) - .addMBB(EarlyExitBlock); - auto Next = std::next(MI.getIterator()); - - if (Next != MBB.end() && !Next->isTerminator()) - splitBlock(MBB, *BranchMI, MDT); - - MBB.addSuccessor(EarlyExitBlock); - MDT->getBase().insertEdge(&MBB, EarlyExitBlock); -} - -// Returns true if a branch over the block was inserted. -bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, - MachineBasicBlock &SrcMBB) { - MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); - - if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) - return false; - - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); - - BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(DestBB); - - return true; -} - -bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - TII = ST.getInstrInfo(); - TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); - SkipThreshold = SkipThresholdFlag; - - SmallVector<MachineInstr *, 4> KillInstrs; - SmallVector<MachineInstr *, 4> EarlyTermInstrs; - bool MadeChange = false; - - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - switch (MI.getOpcode()) { - case AMDGPU::SI_MASK_BRANCH: - MadeChange |= skipMaskBranch(MI, MBB); - break; - - case AMDGPU::S_BRANCH: - // Optimize out branches to the next block. - // FIXME: Shouldn't this be handled by BranchFolding? - if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { - assert(&MI == &MBB.back()); - MI.eraseFromParent(); - MadeChange = true; - } - break; - - case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: - case AMDGPU::SI_KILL_I1_TERMINATOR: { - MadeChange = true; - bool CanKill = kill(MI); - - // Check if we can add an early "if exec=0 { end shader }". - // - // Note that we _always_ do this if it is correct, even if the kill - // happens fairly late in the shader, because the null export should - // generally still be cheaper than normal export(s). - // - // TODO: The dominatesAllReachable check is conservative: if the - // dominance is only missing due to _uniform_ branches, we could - // in fact insert the early-exit as well. - if (CanKill && - MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && - dominatesAllReachable(MBB)) { - // Mark the instruction for kill-if-dead insertion. We delay this - // change because it modifies the CFG. 
- KillInstrs.push_back(&MI); - } else { - MI.eraseFromParent(); - } - break; - } - - case AMDGPU::SI_KILL_CLEANUP: - if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && - dominatesAllReachable(MBB)) { - KillInstrs.push_back(&MI); - } else { - MI.eraseFromParent(); - } - break; - - case AMDGPU::SI_EARLY_TERMINATE_SCC0: - EarlyTermInstrs.push_back(&MI); - break; - - default: - break; - } - } - } - - for (MachineInstr *Instr : EarlyTermInstrs) { - // Early termination in GS does nothing - if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS) - earlyTerm(*Instr); - Instr->eraseFromParent(); - } - for (MachineInstr *Kill : KillInstrs) { - skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()), - Kill->getDebugLoc()); - Kill->eraseFromParent(); - } - KillInstrs.clear(); - EarlyTermInstrs.clear(); - EarlyExitBlock = nullptr; - - return MadeChange; -} diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c12745586da1..7d6f79922d2e 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -27,6 +27,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -131,7 +132,8 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { // We reserve a fixed number of VGPR slots in the scoring tables for // special tokens like SCMEM_LDS (needed for buffer load to LDS). enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets. + SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. + AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets. SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses. 
@@ -244,8 +246,8 @@ public: const SIRegisterInfo *TRI, unsigned OpNo) const; bool counterOutOfOrder(InstCounterType T) const; - bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; void determineWait(InstCounterType T, unsigned ScoreToWait, AMDGPU::Waitcnt &Wait) const; void applyWaitcnt(const AMDGPU::Waitcnt &Wait); @@ -417,7 +419,7 @@ public: } if (DebugCounter::isCounterSet(ForceLgkmCounter) && - DebugCounter::shouldExecute(ForceLgkmCounter)) { + DebugCounter::shouldExecute(ForceLgkmCounter)) { ForceEmitWaitcnt[LGKM_CNT] = true; } else { ForceEmitWaitcnt[LGKM_CNT] = false; @@ -441,6 +443,9 @@ public: WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); + bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, const MachineInstr *MI); }; } // end anonymous namespace @@ -451,8 +456,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, const SIRegisterInfo *TRI, unsigned OpNo) const { const MachineOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg()); - if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg())) + if (!TRI->isInAllocatableClass(Op.getReg())) return {-1, -1}; // A use via a PW operand does not need a waitcnt. @@ -463,9 +467,11 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); - if (TRI->isVGPR(*MRI, Op.getReg())) { + if (TRI->isVectorRegister(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); Result.first = Reg - RegisterEncoding.VGPR0; + if (TRI->isAGPR(*MRI, Op.getReg())) + Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); @@ -491,7 +497,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned OpNo, unsigned Val) { RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo); - assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg())); + assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, EXP_CNT, Val); } @@ -538,7 +544,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, AMDGPU::OpName::data1), CurrScore); } - } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 && + } else if (SIInstrInfo::isAtomicRet(Inst) && Inst.getOpcode() != AMDGPU::DS_GWS_INIT && Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V && Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR && @@ -549,7 +555,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { const MachineOperand &Op = Inst.getOperand(I); - if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) { + if (Op.isReg() && !Op.isDef() && + TRI->isVectorRegister(*MRI, Op.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -560,7 +567,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, &Inst, TII, TRI, MRI, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), 
CurrScore); - } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + } else if (SIInstrInfo::isAtomicRet(Inst)) { setExpScore( &Inst, TII, TRI, MRI, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), @@ -569,7 +576,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); - } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + } else if (SIInstrInfo::isAtomicRet(Inst)) { setExpScore( &Inst, TII, TRI, MRI, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), @@ -582,7 +589,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); - } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + } else if (SIInstrInfo::isAtomicRet(Inst)) { setExpScore( &Inst, TII, TRI, MRI, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), @@ -606,7 +613,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { MachineOperand &MO = Inst.getOperand(I); - if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) { + if (MO.isReg() && !MO.isDef() && + TRI->isVectorRegister(*MRI, MO.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -704,22 +712,23 @@ void WaitcntBrackets::print(raw_ostream &OS) { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. -bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | - simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | - simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) | - simplifyWaitcnt(VS_CNT, Wait.VsCnt); +void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + simplifyWaitcnt(VM_CNT, Wait.VmCnt); + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); + simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); + simplifyWaitcnt(VS_CNT, Wait.VsCnt); } -bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, +void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count) const { const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); - if (Count < UB && UB - Count > LB) - return true; - Count = ~0u; - return false; + // The number of outstanding events for this type, T, can be calculated + // as (UB - LB). If the current Count is greater than or equal to the number + // of outstanding events, then the wait for this counter is redundant. + if (Count >= UB - LB) + Count = ~0u; } void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, @@ -794,6 +803,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() { return new SIInsertWaitcnts(); } +/// Combine consecutive waitcnt instructions that precede \p MI and follow +/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added +/// by previous passes. Currently this pass conservatively assumes that these +/// preexisting waitcnt are required for correctness. 
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, + const MachineInstr *MI) { + bool Modified = false; + MachineInstr *WaitcntInstr = nullptr; + MachineInstr *WaitcntVsCntInstr = nullptr; + for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II); + &*II != MI; II = NextI, ++NextI) { + if (II->isMetaInstruction()) + continue; + + if (II->getOpcode() == AMDGPU::S_WAITCNT) { + // Conservatively update required wait if this waitcnt was added in an + // earlier pass. In this case it will not exist in the tracked waitcnt + // set. + if (!TrackedWaitcntSet.count(&*II)) { + unsigned IEnc = II->getOperand(0).getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); + Wait = Wait.combined(OldWait); + } + + // Merge consecutive waitcnt of the same type by erasing multiples. + if (!WaitcntInstr) { + WaitcntInstr = &*II; + } else { + II->eraseFromParent(); + Modified = true; + } + + } else { + assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + if (!TrackedWaitcntSet.count(&*II)) { + unsigned OldVSCnt = + TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); + Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); + } + + if (!WaitcntVsCntInstr) { + WaitcntVsCntInstr = &*II; + } else { + II->eraseFromParent(); + Modified = true; + } + } + } + + // Updated encoding of merged waitcnt with the required wait. + if (WaitcntInstr) { + if (Wait.hasWaitExceptVsCnt()) { + unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait); + unsigned OldEnc = WaitcntInstr->getOperand(0).getImm(); + if (OldEnc != NewEnc) { + WaitcntInstr->getOperand(0).setImm(NewEnc); + Modified = true; + } + ScoreBrackets.applyWaitcnt(Wait); + Wait.VmCnt = ~0u; + Wait.LgkmCnt = ~0u; + Wait.ExpCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr + << '\n'); + } else { + WaitcntInstr->eraseFromParent(); + Modified = true; + } + } + + if (WaitcntVsCntInstr) { + if (Wait.hasWaitVsCnt()) { + assert(ST->hasVscnt()); + unsigned OldVSCnt = + TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16) + ->getImm(); + if (Wait.VsCnt != OldVSCnt) { + TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16) + ->setImm(Wait.VsCnt); + Modified = true; + } + ScoreBrackets.applyWaitcnt(Wait); + Wait.VsCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Old Instr: " << MI + << "New Instr: " << *WaitcntVsCntInstr << '\n'); + } else { + WaitcntVsCntInstr->eraseFromParent(); + Modified = true; + } + } + + return Modified; +} + static bool readsVCCZ(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && @@ -829,15 +939,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { setForceEmitWaitcnt(); - bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); if (MI.isMetaInstruction()) return false; AMDGPU::Waitcnt Wait; + bool Modified = false; - // See if this instruction has a forced S_WAITCNT VM. - // TODO: Handle other cases of NeedsWaitcntVmBefore() + // FIXME: This should have already been handled by the memory legalizer. + // Removing this currently doesn't affect any lit tests, but we need to + // verify that nothing was relying on this. The number of buffer invalidates + // being handled here should not be expanded. 
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || @@ -1003,7 +1115,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); - const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg()); + const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (IsVGPR) { // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the @@ -1049,32 +1161,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } } - // Early-out if no wait is indicated. - if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { - bool Modified = false; - if (OldWaitcntInstr) { - for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); - &*II != &MI; II = NextI, ++NextI) { - if (II->isDebugInstr()) - continue; - - if (TrackedWaitcntSet.count(&*II)) { - TrackedWaitcntSet.erase(&*II); - II->eraseFromParent(); - Modified = true; - } else if (II->getOpcode() == AMDGPU::S_WAITCNT) { - int64_t Imm = II->getOperand(0).getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); - } else { - assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); - assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W)); - } - } - } - return Modified; - } + // Verify that the wait is actually needed. + ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt()); @@ -1088,57 +1176,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (ForceEmitWaitcnt[VS_CNT]) Wait.VsCnt = 0; - ScoreBrackets.applyWaitcnt(Wait); - - AMDGPU::Waitcnt OldWait; - bool Modified = false; - if (OldWaitcntInstr) { - for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); - &*II != &MI; II = NextI, NextI++) { - if (II->isDebugInstr()) - continue; - - if (II->getOpcode() == AMDGPU::S_WAITCNT) { - unsigned IEnc = II->getOperand(0).getImm(); - AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc); - OldWait = OldWait.combined(IWait); - if (!TrackedWaitcntSet.count(&*II)) - Wait = Wait.combined(IWait); - unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait); - if (IEnc != NewEnc) { - II->getOperand(0).setImm(NewEnc); - Modified = true; - } - Wait.VmCnt = ~0u; - Wait.LgkmCnt = ~0u; - Wait.ExpCnt = ~0u; - } else { - assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); - assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - - unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16) - ->getImm(); - OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt); - if (!TrackedWaitcntSet.count(&*II)) - Wait.VsCnt = std::min(Wait.VsCnt, ICnt); - if (Wait.VsCnt != ICnt) { - TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt); - Modified = true; - } - Wait.VsCnt = ~0u; - } - - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI - << "New Instr: " << *II << '\n'); - - if (!Wait.hasWait()) - return Modified; - } + // Try to merge the required wait with preexisting waitcnt instructions. + // Also erase redundant waitcnt. + Modified = + applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI); + } else { + // Update waitcnt brackets after determining the required wait. 
+ ScoreBrackets.applyWaitcnt(Wait); } - if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) { + // Build new waitcnt instructions unless no wait is needed or the old waitcnt + // instruction was modified to handle the required wait. + if (Wait.hasWaitExceptVsCnt()) { unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) @@ -1151,7 +1201,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( << "New Instr: " << *SWaitInst << '\n'); } - if (Wait.VsCnt != ~0u) { + if (Wait.hasWaitVsCnt()) { assert(ST->hasVscnt()); auto SWaitInst = @@ -1208,6 +1258,10 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { if (!TII->usesLGKM_CNT(MI)) return false; + // If in tgsplit mode then there can be no use of LDS. + if (ST->isTgSplitEnabled()) + return false; + // If there are no memory operands then conservatively assume the flat // operation may access LDS. if (MI.memoperands_empty()) @@ -1246,8 +1300,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ++FlatASCount; if (!ST->hasVscnt()) ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - else if (Inst.mayLoad() && - AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) + else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst); else ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst); @@ -1267,16 +1320,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && - // TODO: get a better carve out. - Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 && - Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC && - Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL && - Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV && - Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) { + !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { if (!ST->hasVscnt()) ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - else if ((Inst.mayLoad() && - AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) || + else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) || /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */ (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore())) ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst); @@ -1284,7 +1331,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst); if (ST->vmemWriteNeedsExpWaitcnt() && - (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { + (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { @@ -1424,7 +1471,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, Iter != E;) { MachineInstr &Inst = *Iter; - // Track pre-existing waitcnts from earlier iterations. + // Track pre-existing waitcnts that were added in earlier iterations or by + // the memory legalizer. 
if (Inst.getOpcode() == AMDGPU::S_WAITCNT || (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() && @@ -1473,8 +1521,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (TII->isSMRD(Inst)) { for (const MachineMemOperand *Memop : Inst.memoperands()) { - const Value *Ptr = Memop->getValue(); - SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); + // No need to handle invariant loads when avoiding WAR conflicts, as + // there cannot be a vector store to the same memory location. + if (!Memop->isInvariant()) { + const Value *Ptr = Memop->getValue(); + SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent())); + } } if (ST->hasReadVCCZBug()) { // This smem read could complete and clobber vccz at any time. @@ -1550,6 +1602,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); + bool Modified = false; + + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to do the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + for (MachineBasicBlock::iterator E = EntryBB.end(); + I != E && (I->isPHI() || I->isMetaInstruction()); ++I) + ; + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + if (ST->hasVscnt()) + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + + Modified = true; + } // Keep iterating over the blocks in reverse post order, inserting and // updating s_waitcnt where needed, until a fix point is reached. @@ -1557,7 +1631,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockInfos.insert({MBB, BlockInfo(MBB)}); std::unique_ptr<WaitcntBrackets> Brackets; - bool Modified = false; bool Repeat; do { Repeat = false; @@ -1657,26 +1730,5 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } - if (!MFI->isEntryFunction()) { - // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to the wait after the - // costly call sequence. - - // TODO: Could insert earlier and schedule more liberally with operations - // that only use caller preserved registers. - MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); - for (MachineBasicBlock::iterator E = EntryBB.end(); - I != E && (I->isPHI() || I->isMetaInstruction()); ++I) - ; - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); - if (ST->hasVscnt()) - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - - Modified = true; - } - return Modified; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 7ce042b67aba..e39f52875f1f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -113,7 +113,7 @@ class InstSI <dag outs, dag ins, string asm = "", // This field indicates that FLAT instruction accesses FLAT_GLBL segment. // Must be 0 for non-FLAT instructions. - field bit IsFlatGlobal = 0; + field bit FlatGlobal = 0; // Reads the mode register, usually for FP environment. 
field bit ReadsModeReg = 0; @@ -133,7 +133,13 @@ class InstSI <dag outs, dag ins, string asm = "", // This field indicates that FLAT instruction accesses FLAT_SCRATCH segment. // Must be 0 for non-FLAT instructions. - field bit IsFlatScratch = 0; + field bit FlatScratch = 0; + + // Atomic without a return. + field bit IsAtomicNoRet = 0; + + // Atomic with return. + field bit IsAtomicRet = 0; // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; @@ -193,7 +199,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{50} = D16Buf; - let TSFlags{51} = IsFlatGlobal; + let TSFlags{51} = FlatGlobal; let TSFlags{52} = FPDPRounding; @@ -203,7 +209,11 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{55} = IsDOT; - let TSFlags{56} = IsFlatScratch; + let TSFlags{56} = FlatScratch; + + let TSFlags{57} = IsAtomicNoRet; + + let TSFlags{58} = IsAtomicRet; let SchedRW = [Write32Bit]; @@ -251,6 +261,13 @@ class Enc64 { int Size = 8; } +def CPolBit { + int GLC = 0; + int SLC = 1; + int DLC = 2; + int SCC = 4; +} + class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; class VINTRPe <bits<2> op> : Enc32 { @@ -268,27 +285,25 @@ class VINTRPe <bits<2> op> : Enc32 { } class MIMGe : Enc64 { - bits<8> vdata; + bits<10> vdata; bits<4> dmask; bits<1> unorm; - bits<1> glc; + bits<5> cpol; bits<1> r128; bits<1> tfe; bits<1> lwe; - bits<1> slc; bit d16; bits<7> srsrc; bits<7> ssamp; let Inst{11-8} = dmask; let Inst{12} = unorm; - let Inst{13} = glc; + let Inst{13} = cpol{CPolBit.GLC}; let Inst{15} = r128; - let Inst{16} = tfe; let Inst{17} = lwe; - let Inst{25} = slc; + let Inst{25} = cpol{CPolBit.SLC}; let Inst{31-26} = 0x3c; - let Inst{47-40} = vdata; + let Inst{47-40} = vdata{7-0}; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; let Inst{63} = d16; @@ -299,7 +314,21 @@ class MIMGe_gfx6789 <bits<8> op> : MIMGe { bits<1> da; let Inst{0} = op{7}; + let Inst{7} = cpol{CPolBit.SCC}; + let Inst{14} = da; + let Inst{16} = tfe; + let Inst{24-18} = op{6-0}; + let Inst{39-32} = vaddr; +} + +class MIMGe_gfx90a <bits<8> op> : MIMGe { + bits<8> vaddr; + bits<1> da; + + let Inst{0} = op{7}; + let Inst{7} = cpol{CPolBit.SCC}; let Inst{14} = da; + let Inst{16} = vdata{9}; // ACC bit let Inst{24-18} = op{6-0}; let Inst{39-32} = vaddr; } @@ -308,13 +337,13 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe { bits<8> vaddr0; bits<3> dim; bits<2> nsa; - bits<1> dlc; bits<1> a16; let Inst{0} = op{7}; let Inst{2-1} = nsa; let Inst{5-3} = dim; - let Inst{7} = dlc; + let Inst{7} = cpol{CPolBit.DLC}; + let Inst{16} = tfe; let Inst{24-18} = op{6-0}; let Inst{39-32} = vaddr0; let Inst{62} = a16; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index dfd0075bf03a..7ab0f7a100c5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/MC/MCContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" @@ -107,20 +108,26 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA) const { - // TODO: The generic check fails for VALU instructions that should be - // rematerializable due to implicit reads of exec. We really want all of the - // generic logic for this except for this. 
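The SIInstrFormats.td hunk above folds the separate glc/slc/dlc (and new scc) operands into a single cpol value, indexed by the CPolBit def (GLC=0, SLC=1, DLC=2, SCC=4). An illustration of how such a combined cache-policy immediate can be composed and queried; the namespace and helpers here are hypothetical stand-ins, not the in-tree AMDGPU::CPol definitions:

// Bit masks derived from the CPolBit indices above.
namespace cpol_sketch {
constexpr unsigned GLC = 1u << 0;
constexpr unsigned SLC = 1u << 1;
constexpr unsigned DLC = 1u << 2;
constexpr unsigned SCC = 1u << 4;

constexpr unsigned make(bool Glc, bool Slc, bool Dlc, bool Scc) {
  return (Glc ? GLC : 0) | (Slc ? SLC : 0) | (Dlc ? DLC : 0) | (Scc ? SCC : 0);
}
constexpr bool has(unsigned CPol, unsigned Bit) { return (CPol & Bit) != 0; }
} // namespace cpol_sketch

In the MIMG encodings shown above, the single cpol operand is then routed to the individual encoding bits: cpol{GLC} to Inst{13}, cpol{SLC} to Inst{25}, cpol{SCC} to Inst{7} on the pre-gfx10 and gfx90a encodings, and cpol{DLC} to Inst{7} on gfx10, instead of carrying four separate operands.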
- switch (MI.getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: - case AMDGPU::V_ACCVGPR_READ_B32_e64: - case AMDGPU::V_ACCVGPR_WRITE_B32_e64: - // No implicit operands. - return MI.getNumOperands() == MI.getDesc().getNumOperands(); - default: - return false; + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) { + // Normally VALU use of exec would block the rematerialization, but that + // is OK in this case to have an implicit exec read as all VALU do. + // We really want all of the generic logic for this except for this. + + // Another potential implicit use is mode register. The core logic of + // the RA will not attempt rematerialization if mode is set anywhere + // in the function, otherwise it is safe since mode is not changed. + return !MI.hasImplicitDef() && + MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() && + !MI.mayRaiseFPException(); } + + return false; +} + +bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { + // Any implicit use of exec by VALU is not a real register read. + return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && + isVALU(*MO.getParent()); } bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, @@ -313,39 +320,22 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( } if (isMUBUF(LdSt) || isMTBUF(LdSt)) { - const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); - if (SOffset && SOffset->isReg()) { - // We can only handle this if it's a stack access, as any other resource - // would require reporting multiple base registers. - const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - if (AddrReg && !AddrReg->isFI()) - return false; - - const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); - const SIMachineFunctionInfo *MFI - = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); - if (RSrc->getReg() != MFI->getScratchRSrcReg()) - return false; - - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseOps.push_back(RSrc); - BaseOps.push_back(SOffset); - Offset = OffsetImm->getImm(); - } else { - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); - if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL - return false; + const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); + if (!RSrc) // e.g. BUFFER_WBINVL1_VOL + return false; + BaseOps.push_back(RSrc); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (BaseOp && !BaseOp->isFI()) BaseOps.push_back(BaseOp); - - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - if (BaseOp) - BaseOps.push_back(BaseOp); - - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - Offset = OffsetImm->getImm(); - if (SOffset) // soffset can be an inline immediate. + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); + Offset = OffsetImm->getImm(); + const MachineOperand *SOffset = + getNamedOperand(LdSt, AMDGPU::OpName::soffset); + if (SOffset) { + if (SOffset->isReg()) + BaseOps.push_back(SOffset); + else Offset += SOffset->getImm(); } // Get appropriate operand, and compute width accordingly. 
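A stand-alone model of the soffset handling in the rewritten MUBUF/MTBUF path above: an immediate soffset is folded into the reported byte offset, while a register soffset is reported as an additional base operand instead. The function name and the std::optional parameter are illustrative only, not upstream API:

#include <cstdint>
#include <optional>

// Byte offset getMemOperandsWithOffsetWidth would report for a MUBUF/MTBUF
// access: the offset field, plus soffset only when soffset is an immediate
// (a register soffset stays in the BaseOps list instead).
int64_t foldMubufOffset(int64_t OffsetImm,
                        std::optional<int64_t> ImmediateSOffset) {
  int64_t Offset = OffsetImm;
  if (ImmediateSOffset)
    Offset += *ImmediateSOffset;
  return Offset;
}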
@@ -576,15 +566,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!Tmp) report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); RS.setRegUsed(Tmp); - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. - while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) - break; - Tmp = Tmp2; - RS.setRegUsed(Tmp); + + if (!TII.getSubtarget().hasGFX90AInsts()) { + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. + while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); + } } // Insert copy to temporary VGPR. @@ -782,7 +775,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (RC == &AMDGPU::AGPR_32RegClass) { if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) @@ -790,6 +782,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // FIXME: Pass should maintain scavenger to avoid scan through the block on // every AGPR spill. RegScavenger RS; @@ -797,7 +795,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (RI.getRegSizeInBits(*RC) == 16) { + const unsigned Size = RI.getRegSizeInBits(*RC); + if (Size == 16) { assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || @@ -863,9 +862,27 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); + if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { + if (ST.hasPackedFP32Ops()) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); + return; + } + } + const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); if (RI.isSGPRClass(RC)) { - if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { + if (!RI.isSGPRClass(SrcRC)) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } @@ -873,12 +890,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.hasAGPRs(RC)) { - Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? + Opcode = (RI.hasVGPRs(SrcRC)) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; - } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; + } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && + (RI.isProperlyAlignedRC(*RC) && + (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { + // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. + if (ST.hasPackedFP32Ops()) { + Opcode = AMDGPU::V_PK_MOV_B32; + EltSize = 8; + } } // For the cases where we need an intermediate instruction/temporary register @@ -890,7 +916,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (Opcode == AMDGPU::INSTRUCTION_LIST_END) RS.reset(new RegScavenger()); - ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, 4); + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); // If there is an overlap, we can't kill the super-register on the last // instruction, since it will also kill the components made live by this def. @@ -911,6 +937,23 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, ImpDefSuper, ImpUseSuper); + } else if (Opcode == AMDGPU::V_PK_MOV_B32) { + Register DstSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + if (Idx == 0) + MIB.addReg(DestReg, RegState::Define | RegState::Implicit); } else { MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) @@ -969,7 +1012,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, .addImm(Value); return; } - if (RegClass == &AMDGPU::VReg_64RegClass) { + if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) .addImm(Value); return; @@ -1301,6 +1344,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S160_SAVE; case 24: return AMDGPU::SI_SPILL_S192_SAVE; + case 28: + return AMDGPU::SI_SPILL_S224_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: @@ -1326,6 +1371,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V160_SAVE; case 24: return AMDGPU::SI_SPILL_V192_SAVE; + case 28: + return AMDGPU::SI_SPILL_V224_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: @@ -1351,6 +1398,8 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_A160_SAVE; case 24: return AMDGPU::SI_SPILL_A192_SAVE; + case 28: + return AMDGPU::SI_SPILL_A224_SAVE; case 32: return AMDGPU::SI_SPILL_A256_SAVE; case 64: @@ -1434,6 +1483,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S160_RESTORE; case 24: return AMDGPU::SI_SPILL_S192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_S224_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: @@ -1459,6 +1510,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V160_RESTORE; case 24: return AMDGPU::SI_SPILL_V192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_V224_RESTORE; case 32: 
return AMDGPU::SI_SPILL_V256_RESTORE; case 64: @@ -1484,6 +1537,8 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_A160_RESTORE; case 24: return AMDGPU::SI_SPILL_A192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_A224_RESTORE; case 32: return AMDGPU::SI_SPILL_A256_RESTORE; case 64: @@ -1590,6 +1645,7 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { } bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { @@ -1640,6 +1696,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -1650,20 +1718,49 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { assert(!SrcOp.isFPImm()); if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); + APInt Lo(32, Imm.getLoBits(32).getZExtValue()); + APInt Hi(32, Imm.getHiBits(32).getZExtValue()); + if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Lo.getSExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Hi.getSExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + } } else { assert(SrcOp.isReg()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit | RegState::Define); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit | RegState::Define); + if (ST.hasPackedFP32Ops() && + !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) + .addImm(SISrcMods::OP_SEL_1) // src0_mod + .addReg(SrcOp.getReg()) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod + .addReg(SrcOp.getReg()) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit | RegState::Define); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + 
.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit | RegState::Define); + } } MI.eraseFromParent(); break; } @@ -1672,11 +1769,35 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { expandMovDPP64(MI); break; } + case AMDGPU::S_MOV_B64_IMM_PSEUDO: { + const MachineOperand &SrcOp = MI.getOperand(1); + assert(!SrcOp.isFPImm()); + APInt Imm(64, SrcOp.getImm()); + if (Imm.isIntN(32) || isInlineConstant(Imm)) { + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + + Register Dst = MI.getOperand(0).getReg(); + Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + APInt Lo(32, Imm.getLoBits(32).getZExtValue()); + APInt Hi(32, Imm.getHiBits(32).getZExtValue()); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) + .addImm(Lo.getSExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) + .addImm(Hi.getSExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + MI.eraseFromParent(); + break; + } case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); + auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); + FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) .add(MI.getOperand(2)); BuildMI(MBB, MI, DL, get(NotOpc), Exec) @@ -1687,8 +1808,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MI, DL, get(NotOpc), Exec) - .addReg(Exec); + auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); + FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) .add(MI.getOperand(2)); @@ -1848,16 +1969,29 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } - case AMDGPU::ENTER_WWM: { + case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when - // WWM is entered. + // Whole Wave Mode is entered. MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64)); break; } - case AMDGPU::EXIT_WWM: { + case AMDGPU::ENTER_STRICT_WQM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when - // WWM is exited. + // STRICT_WQM is entered. + const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; + const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); + BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); + + MI.eraseFromParent(); + break; + } + case AMDGPU::EXIT_STRICT_WWM: + case AMDGPU::EXIT_STRICT_WQM: { + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM/STRICT_WQM is exited. MI.setDesc(get(ST.isWave32() ?
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } @@ -1877,7 +2011,6 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { unsigned Part = 0; MachineInstr *Split[2]; - for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); if (Dst.isPhysical()) { @@ -2098,32 +2231,36 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // s_getpc_b64. Insert pc arithmetic code before last terminator. MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); - // TODO: Handle > 32-bit block address. - if (BrOffset >= 0) { - BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) - .addReg(PCReg, RegState::Define, AMDGPU::sub0) - .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); - BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) - .addReg(PCReg, RegState::Define, AMDGPU::sub1) - .addReg(PCReg, 0, AMDGPU::sub1) - .addImm(0); - } else { - // Backwards branch. - BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) + auto &MCCtx = MF->getContext(); + MCSymbol *PostGetPCLabel = + MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); + GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); + + MCSymbol *OffsetLo = + MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); + MCSymbol *OffsetHi = + MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); - BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) + .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) - .addImm(0); - } + .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); // Insert the indirect branch after the other terminator. BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) .addReg(PCReg); + auto ComputeBlockSize = [](const TargetInstrInfo *TII, + const MachineBasicBlock &MBB) { + unsigned Size = 0; + for (const MachineInstr &MI : MBB) + Size += TII->getInstSizeInBytes(MI); + return Size; + }; + // FIXME: If spilling is necessary, this will fail because this scavenger has // no emergency stack slots. It is non-trivial to spill in this situation, // because the restore code needs to be specially placed after the @@ -2168,7 +2305,16 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MRI.clearVirtRegs(); RS->setRegUsed(Scav); - return 4 + 8 + 4 + 4; + // Now, the distance could be defined. + auto *Offset = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), + MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); + // Add offset assignments. + auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); + OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); + auto *ShAmt = MCConstantExpr::create(32, MCCtx); + OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); + return ComputeBlockSize(this, MBB); } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { @@ -2263,18 +2409,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // Skip over the instructions that are artificially terminators for special // exec management. 
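The MCExpr plumbing above defines offset_lo and offset_hi from the distance (DestBB - post_getpc) with an AND mask and an arithmetic shift, so the s_add_u32/s_addc_u32 pair can form the full 64-bit target. A stand-alone arithmetic model of that split; the function name is made up for illustration:

#include <cstdint>
#include <utility>

// Split a signed 64-bit PC-relative distance the same way the
// createAnd(0xFFFFFFFF) / createAShr(32) expressions above do: the low
// 32 bits feed s_add_u32 and the sign-propagated high 32 bits feed
// s_addc_u32.
std::pair<uint32_t, uint32_t> splitFarBranchOffset(int64_t Distance) {
  uint32_t Lo = static_cast<uint32_t>(Distance & 0xFFFFFFFFLL);
  uint32_t Hi = static_cast<uint32_t>(Distance >> 32); // arithmetic shift
  return {Lo, Hi};
}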
- while (I != E && !I->isBranch() && !I->isReturn() && - I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { + while (I != E && !I->isBranch() && !I->isReturn()) { switch (I->getOpcode()) { - case AMDGPU::SI_MASK_BRANCH: case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: @@ -2292,34 +2438,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, if (I == E) return false; - if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) - return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); - - ++I; - - // TODO: Should be able to treat as fallthrough? - if (I == MBB.end()) - return true; - - if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) - return true; - - MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); - - // Specifically handle the case where the conditional branch is to the same - // destination as the mask branch. e.g. - // - // si_mask_branch BB8 - // s_cbranch_execz BB8 - // s_cbranch BB9 - // - // This is required to understand divergent loops which may need the branches - // to be relaxed. - if (TBB != MaskBrDest || Cond.empty()) - return true; - - auto Pred = Cond[0].getImm(); - return (Pred != EXECZ && Pred != EXECNZ); + return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); } unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, @@ -2330,11 +2449,6 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, unsigned RemovedSize = 0; while (I != MBB.end()) { MachineBasicBlock::iterator Next = std::next(I); - if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { - I = Next; - continue; - } - RemovedSize += getInstSizeInBytes(*I); I->eraseFromParent(); ++Count; @@ -2400,6 +2514,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineInstr *CondBr = BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); + fixImplicitOperands(*CondBr); BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(FBB); @@ -2593,6 +2708,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { case AMDGPU::COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: case AMDGPU::V_ACCVGPR_READ_B32_e64: + case AMDGPU::V_ACCVGPR_MOV_B32: return true; default: return false; @@ -2983,7 +3099,9 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, unsigned Opc = MI.getOpcode(); bool IsF16 = false; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; + bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; switch (Opc) { default: @@ -2994,13 +3112,15 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F64_e64: break; case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: - case AMDGPU::V_FMAC_F32_e32: { + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F64_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); const MachineOperand *Src0 = &MI.getOperand(Src0Idx); @@ -3026,7 +3146,7 @@ 
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); MachineInstrBuilder MIB; - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { @@ -3074,7 +3194,9 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64) + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + : IsF64 ? AMDGPU::V_FMA_F64_e64 + : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); if (pseudoToMCOpcode(NewOpc) == -1) return nullptr; @@ -3262,6 +3384,10 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); @@ -3271,6 +3397,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); case AMDGPU::OPERAND_REG_IMM_INT16: @@ -3382,6 +3509,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + // GFX90A does not have V_MUL_LEGACY_F32_e32. + if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) + return false; + int Op32 = AMDGPU::getVOPe32(Opcode); if (Op32 == -1) return false; @@ -3439,6 +3570,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F32_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F64_e64: if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; @@ -3663,7 +3795,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI.getOperand(i).isFPImm()) { + const MachineOperand &MO = MI.getOperand(i); + if (MO.isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " "all fp values to integers."; return false; @@ -3690,8 +3823,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { - const MachineOperand &MO = MI.getOperand(i); + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; return false; @@ -3712,12 +3845,37 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, continue; } - if (!MI.getOperand(i).isReg()) + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) continue; + // FIXME: Ideally we would have separate instruction definitions with the + // aligned register constraint. + // FIXME: We do not verify inline asm operands, but custom inline asm + // verification is broken anyway + if (ST.needsAlignedVGPRs()) { + const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); + const bool IsVGPR = RI.hasVGPRs(RC); + const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); + if ((IsVGPR || IsAGPR) && MO.getSubReg()) { + const TargetRegisterClass *SubRC = + RI.getSubRegClass(RC, MO.getSubReg()); + RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); + if (RC) + RC = SubRC; + } + + // Check that this is the aligned version of the class. + if (!RC || !RI.isProperlyAlignedRC(*RC)) { + ErrInfo = "Subtarget requires even aligned vector registers"; + return false; + } + } + if (RegClass != -1) { - Register Reg = MI.getOperand(i).getReg(); - if (Reg == AMDGPU::NoRegister || Reg.isVirtual()) + if (Reg.isVirtual()) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); @@ -3864,7 +4022,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned LiteralCount = 0; + bool UsesLiteral = false; + const MachineOperand *LiteralVal = nullptr; if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; @@ -3886,8 +4045,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, SGPRsUsed.push_back(SGPRUsed); } } else { - ++ConstantBusCount; - ++LiteralCount; + if (!UsesLiteral) { + ++ConstantBusCount; + UsesLiteral = true; + LiteralVal = &MO; + } else if (!MO.isIdenticalTo(*LiteralVal)) { + assert(isVOP3(MI)); + ErrInfo = "VOP3 instruction uses more than one literal"; + return false; + } } } } @@ -3911,15 +4077,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } - if (isVOP3(MI) && LiteralCount) { - if (!ST.hasVOP3Literal()) { - ErrInfo = "VOP3 instruction uses literal"; - return false; - } - if (LiteralCount > 1) { - ErrInfo = "VOP3 instruction uses more than one literal"; - return false; - } + if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { + ErrInfo = "VOP3 instruction uses literal"; + return false; } } @@ -4113,25 +4273,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, IsA16 = A16->getImm() != 0; } - bool PackDerivatives = IsA16 || BaseOpcode->G16; bool IsNSA = SRsrcIdx - VAddr0Idx > 1; - unsigned AddrWords = BaseOpcode->NumExtraArgs; - unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + - (BaseOpcode->LodOrClampOrMip ? 
1 : 0); - if (IsA16) - AddrWords += (AddrComponents + 1) / 2; - else - AddrWords += AddrComponents; - - if (BaseOpcode->Gradients) { - if (PackDerivatives) - // There are two gradients per coordinate, we pack them separately. - // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) - AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; - else - AddrWords += Dim->NumGradients; - } + unsigned AddrWords = + AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); unsigned VAddrWords; if (IsNSA) { @@ -4141,12 +4286,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; if (AddrWords > 8) AddrWords = 16; - else if (AddrWords > 4) - AddrWords = 8; - else if (AddrWords == 4) - AddrWords = 4; - else if (AddrWords == 3) - AddrWords = 3; } if (VAddrWords != AddrWords) { @@ -4187,8 +4326,89 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && ST.getGeneration() < AMDGPUSubtarget::GFX10) { + if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && + DC <= DppCtrl::ROW_NEWBCAST_LAST && + !ST.hasGFX90AInsts()) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_newbroadcast/row_share is not supported before " + "GFX90A/GFX10"; + return false; + } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_share and row_xmask are not supported before GFX10"; + return false; + } + } + + int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + + if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && + ((DstIdx >= 0 && + (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || + Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || + ((Src0Idx >= 0 && + (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || + Desc.OpInfo[Src0Idx].RegClass == + AMDGPU::VReg_64_Align2RegClassID)))) && + !AMDGPU::isLegal64BitDPPControl(DC)) { ErrInfo = "Invalid dpp_ctrl value: " - "row_share and row_xmask are not supported before GFX10"; + "64 bit dpp only support row_newbcast"; + return false; + } + } + + if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + uint16_t DataNameIdx = isDS(Opcode) ? 
AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; + const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); + const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data && !Data->isReg()) + Data = nullptr; + + if (ST.hasGFX90AInsts()) { + if (Dst && Data && + (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { + ErrInfo = "Invalid register class: " + "vdata and vdst should be both VGPR or AGPR"; + return false; + } + if (Data && Data2 && + (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { + ErrInfo = "Invalid register class: " + "both data operands should be VGPR or AGPR"; + return false; + } + } else { + if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || + (Data && RI.isAGPR(MRI, Data->getReg())) || + (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { + ErrInfo = "Invalid register class: " + "agpr loads and stores not supported on this GPU"; + return false; + } + } + } + + if (ST.needsAlignedVGPRs() && + (MI.getOpcode() == AMDGPU::DS_GWS_INIT || + MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { + const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); + Register Reg = Op->getReg(); + bool Aligned = true; + if (Reg.isPhysical()) { + Aligned = !(RI.getHWRegIndex(Reg) & 1); + } else { + const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && + !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + } + + if (!Aligned) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for DS_GWS instructions"; return false; } } @@ -4205,7 +4425,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; - case AMDGPU::WWM: return AMDGPU::WWM; + case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; + case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; case AMDGPU::S_MOV_B32: { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); return MI.getOperand(1).isReg() || @@ -4276,6 +4497,59 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { "Unexpected scalar opcode without corresponding vector one!"); } +static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, + const MachineRegisterInfo &MRI, + const MCInstrDesc &TID, + unsigned RCID, + bool IsAllocatable) { + if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + (TID.mayLoad() || TID.mayStore() || + (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + switch (RCID) { + case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; + case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; + case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; + case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; + case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; + default: + break; + } + } + return RCID; +} + +const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, + unsigned OpNum, const TargetRegisterInfo *TRI, + const MachineFunction &MF) + const { + if (OpNum >= TID.getNumOperands()) + return nullptr; + auto RegClass = TID.OpInfo[OpNum].RegClass; + bool IsAllocatable = false; + if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { + // vdst and vdata should be both VGPR or AGPR, same for the DS instructions + // with two data operands. 
Request a register class constrained to VGPR only + // if both operands are present, as Machine Copy Propagation cannot check this + // constraint (nor, possibly, can other passes). + // + // The check is limited to FLAT and DS because atomics in non-flat encoding + // have their vdst and vdata tied to be the same register. + const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, + AMDGPU::OpName::vdst); + const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, + (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata); + if (DataIdx != -1) { + IsAllocatable = VDstIdx != -1 || + AMDGPU::getNamedOperandIdx(TID.Opcode, + AMDGPU::OpName::data1) != -1; + } + } + RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, + IsAllocatable); + return RI.getRegClass(RegClass); +} + const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -4290,6 +4564,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, } unsigned RCID = Desc.OpInfo[OpNo].RegClass; + RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); return RI.getRegClass(RCID); } @@ -4308,8 +4583,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); - if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) - VRC = &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); + if (RI.getCommonSubClass(VRC64, VRC)) + VRC = VRC64; else VRC = &AMDGPU::VGPR_32RegClass; @@ -4466,7 +4742,40 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - return isLegalRegOperand(MRI, OpInfo, *MO); + if (!isLegalRegOperand(MRI, OpInfo, *MO)) + return false; + bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); + if (IsAGPR && !ST.hasMAIInsts()) + return false; + unsigned Opc = MI.getOpcode(); + if (IsAGPR && + (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) + return false; + // Atomics should have both vdst and vdata either vgpr or agpr. + const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, + isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); + if ((int)OpIdx == VDstIdx && DataIdx != -1 && + MI.getOperand(DataIdx).isReg() && + RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) + return false; + if ((int)OpIdx == DataIdx) { + if (VDstIdx != -1 && + RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) + return false; + // DS instructions with 2 src operands also must have tied RC. + const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::data1); + if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && + RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) + return false; + } + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && + RI.isSGPRReg(MRI, MO->getReg())) + return false; + return true; } // Handle non-register types that are treated like immediates.
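The new isOperandLegal checks above bundle several AGPR rules. A condensed, hedged model, with plain booleans standing in for the register-class and subtarget queries; this is not an upstream helper, just a restatement of the conditions:

// Sketch of the AGPR operand rules added above.
bool agprOperandLegalSketch(bool OpIsAGPR, bool HasMAIInsts,
                            bool HasGFX90AInsts, bool ReservedRegsFrozen,
                            bool IsMemDSOrMIMG, bool HasCounterpart,
                            bool CounterpartIsAGPR) {
  if (OpIsAGPR && !HasMAIInsts)
    return false;   // AGPRs only exist on MAI-capable subtargets
  if (OpIsAGPR && (!HasGFX90AInsts || !ReservedRegsFrozen) && IsMemDSOrMIMG)
    return false;   // memory/DS/MIMG reject AGPR data unless GFX90A with
                    // register reservations already frozen
  if (HasCounterpart && CounterpartIsAGPR != OpIsAGPR)
    return false;   // vdst and vdata must agree: both AGPR or both VGPR
  return true;
}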
@@ -4740,6 +5049,86 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, } } +bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { + unsigned Opc = Inst.getOpcode(); + int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); + if (OldSAddrIdx < 0) + return false; + + assert(isSegmentSpecificFLAT(Inst)); + + int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); + if (NewOpc < 0) + NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); + if (NewOpc < 0) + return false; + + MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); + MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); + if (RI.isSGPRReg(MRI, SAddr.getReg())) + return false; + + int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); + if (NewVAddrIdx < 0) + return false; + + int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); + + // Check vaddr; it must be zero or absent. + MachineInstr *VAddrDef = nullptr; + if (OldVAddrIdx >= 0) { + MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); + VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); + if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + !VAddrDef->getOperand(1).isImm() || + VAddrDef->getOperand(1).getImm() != 0) + return false; + } + + const MCInstrDesc &NewDesc = get(NewOpc); + Inst.setDesc(NewDesc); + + // Callers expect the iterator to be valid after this call, so modify the + // instruction in place. + if (OldVAddrIdx == NewVAddrIdx) { + MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); + // Clear use list from the old vaddr holding a zero register. + MRI.removeRegOperandFromUseList(&NewVAddr); + MRI.moveOperands(&NewVAddr, &SAddr, 1); + Inst.RemoveOperand(OldSAddrIdx); + // Update the use list with the pointer we have just moved from vaddr to + // saddr position. Otherwise the new vaddr will be missing from the use list. + MRI.removeRegOperandFromUseList(&NewVAddr); + MRI.addRegOperandToUseList(&NewVAddr); + } else { + assert(OldSAddrIdx == NewVAddrIdx); + + if (OldVAddrIdx >= 0) { + int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, + AMDGPU::OpName::vdst_in); + + // RemoveOperand doesn't try to fix up tied operand indexes as it goes, so + // it asserts. Untie the operands for now and retie them afterwards. + if (NewVDstIn != -1) { + int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + Inst.untieRegOperand(OldVDstIn); + } + + Inst.RemoveOperand(OldVAddrIdx); + + if (NewVDstIn != -1) { + int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); + Inst.tieOperands(NewVDst, NewVDstIn); + } + } + } + + if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) + VAddrDef->eraseFromParent(); + + return true; +} + // FIXME: Remove this when SelectionDAG is obsoleted.
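moveFlatAddrToVGPR only rewrites the SADDR form when the existing vaddr operand is either absent or a materialized zero, so the pointer moved out of saddr can simply take its place. A small stand-alone restatement of that precondition; the name and boolean parameters are hypothetical:

// True if a SADDR-form FLAT instruction may be rewritten to its VADDR form:
// either it has no vaddr operand at all, or vaddr has a unique def that is a
// V_MOV_B32 of the immediate 0 (which then becomes dead and is erased).
bool canFoldSAddrIntoVAddr(bool HasVAddr, bool VAddrHasUniqueDef,
                           bool DefIsVMovB32Imm, long long DefImm) {
  if (!HasVAddr)
    return true;
  return VAddrHasUniqueDef && DefIsVMovB32Imm && DefImm == 0;
}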
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { @@ -4752,6 +5141,9 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) return; + if (moveFlatAddrToVGPR(MI)) + return; + Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); SAddr->setReg(ToSGPR); } @@ -4905,7 +5297,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, .addReg(Exec) .addReg(SaveExec); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register @@ -5316,17 +5708,10 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, .add(*SOffset) .add(*Offset); - // Atomics do not have this operand. - if (const MachineOperand *GLC = - getNamedOperand(MI, AMDGPU::OpName::glc)) { - MIB.addImm(GLC->getImm()); + if (const MachineOperand *CPol = + getNamedOperand(MI, AMDGPU::OpName::cpol)) { + MIB.addImm(CPol->getImm()); } - if (const MachineOperand *DLC = - getNamedOperand(MI, AMDGPU::OpName::dlc)) { - MIB.addImm(DLC->getImm()); - } - - MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); if (const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe)) { @@ -5346,7 +5731,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, .addReg(NewSRsrc) .add(*SOffset) .add(*Offset) - .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) + .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol)) .cloneMemRefs(MI); } @@ -5449,6 +5834,11 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, Inst.eraseFromParent(); continue; + case AMDGPU::S_BREV_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); + Inst.eraseFromParent(); + continue; + case AMDGPU::S_NOT_B64: splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); Inst.eraseFromParent(); @@ -5654,6 +6044,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, // Only propagate through live-def of SCC. if (Op.isDef() && !Op.isDead()) addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); + if (Op.isUse()) + addSCCDefsToVALUWorklist(Op, Worklist); Inst.RemoveOperand(i); } } @@ -5999,7 +6391,7 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, void SIInstrInfo::splitScalar64BitUnaryOp( SetVectorType &Worklist, MachineInstr &Inst, - unsigned Opcode) const { + unsigned Opcode, bool Swap) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6032,6 +6424,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp( Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); + if (Swap) + std::swap(DestSub0, DestSub1); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) @@ -6341,7 +6736,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( case AMDGPU::COPY: case AMDGPU::WQM: case AMDGPU::SOFT_WQM: - case AMDGPU::WWM: + case AMDGPU::STRICT_WWM: + case AMDGPU::STRICT_WQM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: case AMDGPU::INSERT_SUBREG: @@ -6485,6 +6881,32 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, } } +// Instructions that use SCC may be converted to VALU instructions. When that +// happens, the SCC register is changed to VCC_LO. 
The instruction that defines +// SCC must be changed to an instruction that defines VCC. This function makes +// sure that the instruction that defines SCC is added to the moveToVALU +// worklist. +void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, + SetVectorType &Worklist) const { + assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); + + MachineInstr *SCCUseInst = Op.getParent(); + // Look for a preceding instruction that either defines VCC or SCC. If VCC + // then there is nothing to do because the defining instruction has been + // converted to a VALU already. If SCC then that instruction needs to be + // converted to a VALU. + for (MachineInstr &MI : + make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)), + SCCUseInst->getParent()->rend())) { + if (MI.modifiesRegister(AMDGPU::VCC, &RI)) + break; + if (MI.definesRegister(AMDGPU::SCC, &RI)) { + Worklist.insert(&MI); + break; + } + } +} + const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( const MachineInstr &Inst) const { const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); @@ -6499,7 +6921,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: case AMDGPU::SOFT_WQM: - case AMDGPU::WWM: { + case AMDGPU::STRICT_WWM: + case AMDGPU::STRICT_WQM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { if (RI.hasAGPRs(NewDstRC)) @@ -6614,7 +7037,7 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { - return (22ULL << 44) | // IMG_FORMAT_32_FLOAT + return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | (1ULL << 56) | // RESOURCE_LEVEL = 1 (3ULL << 60); // OOB_SELECT = 3 } @@ -6786,11 +7209,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { } switch (Opc) { - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - return 0; case TargetOpcode::BUNDLE: return getInstBundleSize(MI); case TargetOpcode::INLINEASM: @@ -6800,6 +7218,8 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); } default: + if (MI.isMetaInstruction()) + return 0; return DescSize; } } @@ -7026,36 +7446,92 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } +// Depending on the used address space and instructions, some immediate offsets +// are allowed and some are not. +// In general, flat instruction offsets can only be non-negative; global and +// scratch instruction offsets can also be negative. +// +// There are several bugs related to these offsets: +// On gfx10.1, flat instructions that go into the global address space cannot +// use an offset. +// +// For scratch instructions, the address can be either an SGPR or a VGPR.
+// The following offsets can be used, depending on the architecture (x means +// cannot be used): +// +----------------------------+------+------+ +// | Address-Mode | SGPR | VGPR | +// +----------------------------+------+------+ +// | gfx9 | | | +// | negative, 4-aligned offset | x | ok | +// | negative, unaligned offset | x | ok | +// +----------------------------+------+------+ +// | gfx10 | | | +// | negative, 4-aligned offset | ok | ok | +// | negative, unaligned offset | ok | x | +// +----------------------------+------+------+ +// | gfx10.3 | | | +// | negative, 4-aligned offset | ok | ok | +// | negative, unaligned offset | ok | ok | +// +----------------------------+------+------+ +// +// This function ignores the addressing mode, so if an offset cannot be used in +// one addressing mode, it is considered illegal. bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, - bool Signed) const { + uint64_t FlatVariant) const { // TODO: Should 0 be special cased? if (!ST.hasFlatInstOffsets()) return false; - if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) + if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && + (AddrSpace == AMDGPUAS::FLAT_ADDRESS || + AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) + return false; + + bool Signed = FlatVariant != SIInstrFlags::FLAT; + if (ST.hasNegativeScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch) + Signed = false; + if (ST.hasNegativeUnalignedScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && + (Offset % 4) != 0) { return false; + } unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); } -std::pair<int64_t, int64_t> SIInstrInfo::splitFlatOffset(int64_t COffsetVal, - unsigned AddrSpace, - bool IsSigned) const { +// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. +std::pair<int64_t, int64_t> +SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, + uint64_t FlatVariant) const { int64_t RemainderOffset = COffsetVal; int64_t ImmField = 0; - const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned); - if (IsSigned) { + bool Signed = FlatVariant != SIInstrFlags::FLAT; + if (ST.hasNegativeScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch) + Signed = false; + + const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed); + if (Signed) { // Use signed division by a power of two to truncate towards 0. 
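The signed split performed by splitFlatOffset can be modeled outside of LLVM: the immediate field is what remains after rounding the constant offset toward zero to a multiple of 2^(NumBits-1), and the negative-unaligned fixup then forces the immediate to a multiple of 4 where the scratch hardware bug requires it. A stand-alone sketch of that arithmetic, with a made-up function name and a boolean standing in for the subtarget/flat-variant checks:

#include <cassert>
#include <cstdint>
#include <utility>

// Returns {immediate field, remainder offset}, mirroring the signed path of
// splitFlatOffset above.
std::pair<int64_t, int64_t> splitSignedFlatOffset(int64_t COffsetVal,
                                                  unsigned NumBits,
                                                  bool NeedsAlign4) {
  int64_t D = int64_t(1) << (NumBits - 1);
  int64_t Remainder = (COffsetVal / D) * D; // truncates toward zero
  int64_t Imm = COffsetVal - Remainder;
  if (NeedsAlign4 && Imm < 0 && (Imm % 4) != 0) {
    // Make the immediate a multiple of 4 for the negative-unaligned case.
    Remainder += Imm % 4;
    Imm -= Imm % 4;
  }
  assert(Imm + Remainder == COffsetVal);
  return {Imm, Remainder};
}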
int64_t D = 1LL << (NumBits - 1); RemainderOffset = (COffsetVal / D) * D; ImmField = COffsetVal - RemainderOffset; + + if (ST.hasNegativeUnalignedScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && + (ImmField % 4) != 0) { + // Make ImmField a multiple of 4 + RemainderOffset += ImmField % 4; + ImmField -= ImmField % 4; + } } else if (COffsetVal >= 0) { ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); RemainderOffset = COffsetVal - ImmField; } - assert(isLegalFLATOffset(ImmField, AddrSpace, IsSigned)); + assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)); assert(RemainderOffset + ImmField == COffsetVal); return {ImmField, RemainderOffset}; } @@ -7069,7 +7545,8 @@ enum SIEncodingFamily { GFX80 = 4, GFX9 = 5, GFX10 = 6, - SDWA10 = 7 + SDWA10 = 7, + GFX90A = 8 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -7141,6 +7618,15 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { if (MCOp == -1) return Opcode; + if (ST.hasGFX90AInsts()) { + uint16_t NMCOp = (uint16_t)-1; + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); + if (NMCOp == (uint16_t)-1) + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); + if (NMCOp != (uint16_t)-1) + MCOp = NMCOp; + } + // (uint16_t)-1 means that Opcode is a pseudo instruction that has // no encoding in the given subtarget generation. if (MCOp == (uint16_t)-1) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index ce59fe86c688..fc5e5be03541 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -96,7 +96,8 @@ private: unsigned Opcode) const; void splitScalar64BitUnaryOp(SetVectorType &Worklist, - MachineInstr &Inst, unsigned Opcode) const; + MachineInstr &Inst, unsigned Opcode, + bool Swap = false) const; void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; @@ -122,6 +123,8 @@ private: void addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, SetVectorType &Worklist) const; + void addSCCDefsToVALUWorklist(MachineOperand &Op, + SetVectorType &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; @@ -158,8 +161,7 @@ public: // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. MO_REL32_HI = 5, - MO_LONG_BRANCH_FORWARD = 6, - MO_LONG_BRANCH_BACKWARD = 7, + MO_FAR_BRANCH_OFFSET = 6, MO_ABS32_LO = 8, MO_ABS32_HI = 9, @@ -171,9 +173,15 @@ public: return RI; } + const GCNSubtarget &getSubtarget() const { + return ST; + } + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA) const override; + bool isIgnorableUse(const MachineOperand &MO) const override; + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; @@ -501,28 +509,28 @@ public: // i.e. global_* or scratch_*. 
static bool isSegmentSpecificFLAT(const MachineInstr &MI) { auto Flags = MI.getDesc().TSFlags; - return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch); + return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); } bool isSegmentSpecificFLAT(uint16_t Opcode) const { auto Flags = get(Opcode).TSFlags; - return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch); + return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); } static bool isFLATGlobal(const MachineInstr &MI) { - return MI.getDesc().TSFlags & SIInstrFlags::IsFlatGlobal; + return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal; } bool isFLATGlobal(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::IsFlatGlobal; + return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal; } static bool isFLATScratch(const MachineInstr &MI) { - return MI.getDesc().TSFlags & SIInstrFlags::IsFlatScratch; + return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch; } bool isFLATScratch(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::IsFlatScratch; + return get(Opcode).TSFlags & SIInstrFlags::FlatScratch; } // Any FLAT encoded instruction, including global_* and scratch_*. @@ -538,6 +546,32 @@ public: return get(Opcode).TSFlags & SIInstrFlags::EXP; } + static bool isAtomicNoRet(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet; + } + + bool isAtomicNoRet(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsAtomicNoRet; + } + + static bool isAtomicRet(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet; + } + + bool isAtomicRet(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet; + } + + static bool isAtomic(const MachineInstr &MI) { + return MI.getDesc().TSFlags & (SIInstrFlags::IsAtomicRet | + SIInstrFlags::IsAtomicNoRet); + } + + bool isAtomic(uint16_t Opcode) const { + return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet | + SIInstrFlags::IsAtomicNoRet); + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -915,6 +949,10 @@ public: MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const; + /// Change SADDR form of a FLAT \p Inst to its VADDR form if saddr operand + /// was moved to VGPR. \returns true if succeeded. + bool moveFlatAddrToVGPR(MachineInstr &Inst) const; + /// Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. If present, \p MDT is updated. @@ -1039,13 +1077,13 @@ public: /// encoded instruction. If \p Signed, this is for an instruction that /// interprets the offset as signed. bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, - bool Signed) const; + uint64_t FlatVariant) const; /// Split \p COffsetVal into {immediate offset field, remainder offset} /// values. std::pair<int64_t, int64_t> splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, - bool IsSigned) const; + uint64_t FlatVariant) const; /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. 
/// Return -1 if the target-specific opcode for the pseudo instruction does @@ -1059,11 +1097,7 @@ public: const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) - const override { - if (OpNum >= TID.getNumOperands()) - return nullptr; - return RI.getRegClass(TID.OpInfo[OpNum].RegClass); - } + const override; void fixImplicitOperands(MachineInstr &MI) const; @@ -1166,26 +1200,39 @@ namespace AMDGPU { int getMUBUFNoLdsInst(uint16_t Opcode); LLVM_READONLY - int getAtomicRetOp(uint16_t Opcode); - - LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); LLVM_READONLY int getSOPKOp(uint16_t Opcode); + /// \returns SADDR form of a FLAT Global instruction given an \p Opcode + /// of a VADDR form. LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode); + /// \returns VADDR form of a FLAT Global instruction given an \p Opcode + /// of a SADDR form. + LLVM_READONLY + int getGlobalVaddrOp(uint16_t Opcode); + LLVM_READONLY int getVCMPXNoSDstOp(uint16_t Opcode); + /// \returns ST form with only immediate offset of a FLAT Scratch instruction + /// given an \p Opcode of an SS (SADDR) form. LLVM_READONLY int getFlatScratchInstSTfromSS(uint16_t Opcode); + /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode + /// of an SV (VADDR) form. LLVM_READONLY int getFlatScratchInstSSfromSV(uint16_t Opcode); + /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode + /// of an SS (SADDR) form. + LLVM_READONLY + int getFlatScratchInstSVfromSS(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 5adc9e817d41..25b647d34ec1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -16,7 +16,7 @@ class GCNPredicateControl : PredicateControl { Predicate VIAssemblerPredicate = isGFX8GFX9; } -// Execpt for the NONE field, this must be kept in sync with the +// Except for the NONE field, this must be kept in sync with the // SIEncodingFamily enum in AMDGPUInstrInfo.cpp def SIEncodingFamily { int NONE = -1; @@ -28,6 +28,7 @@ def SIEncodingFamily { int GFX9 = 5; int GFX10 = 6; int SDWA10 = 7; + int GFX90A = 8; } //===----------------------------------------------------------------------===// @@ -186,6 +187,8 @@ def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; +def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -265,21 +268,25 @@ class isFloatType<ValueType SrcVT> { !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v4f16.Value), !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2f64.Value)); + !eq(SrcVT.Value, v2f64.Value), + !eq(SrcVT.Value, v4f64.Value)); } class isIntType<ValueType SrcVT> { bit ret = !or(!eq(SrcVT.Value, i16.Value), !eq(SrcVT.Value, i32.Value), - !eq(SrcVT.Value, i64.Value)); + !eq(SrcVT.Value, i64.Value), + !eq(SrcVT.Value, v2i32.Value)); } class isPackedType<ValueType SrcVT> { bit ret 
= !or(!eq(SrcVT.Value, v2i16.Value), !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v4f16.Value)); + !eq(SrcVT.Value, v4f16.Value), + !eq(SrcVT.Value, v2f32.Value)); } + //===----------------------------------------------------------------------===// // PatFrags for global memory operations //===----------------------------------------------------------------------===// @@ -629,6 +636,11 @@ def add_ctpop : PatFrag < (add (ctpop $src0), $src1) >; +def xnor : PatFrag < + (ops node:$src0, node:$src1), + (not (xor $src0, $src1)) +>; + foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), @@ -802,26 +814,28 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); }], getNegV2I16Imm>; -//===----------------------------------------------------------------------===// -// MUBUF/SMEM Patterns -//===----------------------------------------------------------------------===// -def extract_glc : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8); +def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ + return fp16SrcZerosHighBits(N->getOpcode()); }]>; -def extract_slc : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); -}]>; -def extract_dlc : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); +//===----------------------------------------------------------------------===// +// MUBUF/SMEM Patterns +//===----------------------------------------------------------------------===// + +def extract_cpol : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & AMDGPU::CPol::ALL, SDLoc(N), MVT::i8); }]>; def extract_swz : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); }]>; +def set_glc : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -1074,6 +1088,12 @@ class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> : let ParserMatchClass = MatchClass; } +class NamedOperandU32Default1<string Name, AsmOperandClass MatchClass> : + OperandWithDefaultOps<i32, (ops (i32 1))> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; +} + let OperandType = "OPERAND_IMMEDIATE" in { def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; @@ -1097,18 +1117,14 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>; def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; -def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; -def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>; - -def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; -def GLC_0 : NamedOperandBit_0<"GLC", NamedMatchClass<"GLC">>; -def GLC_1 : NamedOperandBit_1<"GLC", NamedMatchClass<"GLC_1">>; - -def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; -def SLC_0 : NamedOperandBit_0<"SLC", NamedMatchClass<"SLC">>; +def CPol : NamedOperandU32<"CPol", NamedMatchClass<"CPol">>; +def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>; +def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>; def 
TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def TFE_0 : NamedOperandBit_0<"TFE", NamedMatchClass<"TFE">>; def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; +def SWZ_0 : NamedOperandBit_0<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; @@ -1243,7 +1259,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>; def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isVReg32"; + let PredicateMethod = "isVRegWithInputMods"; } def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { @@ -1270,7 +1286,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>; def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isVReg32"; + let PredicateMethod = "isVRegWithInputMods"; } def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { @@ -1363,11 +1379,6 @@ def DSTOMOD { int NONE = 0; } -def TRAPID{ - int LLVM_TRAP = 2; - int LLVM_DEBUG_TRAP = 3; -} - def HWREG { int MODE = 1; int STATUS = 2; @@ -1507,8 +1518,12 @@ class getVOP3SrcForVT<ValueType VT> { VSrc_128, !if(!eq(VT.Size, 64), !if(isFP, - VSrc_f64, - VSrc_b64), + !if(!eq(VT.Value, v2f32.Value), + VSrc_v2f32, + VSrc_f64), + !if(!eq(VT.Value, v2i32.Value), + VSrc_v2b32, + VSrc_b64)), !if(!eq(VT.Value, i1.Value), SSrc_i1, !if(isFP, @@ -1541,7 +1556,9 @@ class isModifierType<ValueType SrcVT> { !eq(SrcVT.Value, f32.Value), !eq(SrcVT.Value, f64.Value), !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v2i16.Value)); + !eq(SrcVT.Value, v2i16.Value), + !eq(SrcVT.Value, v2f32.Value), + !eq(SrcVT.Value, v2i32.Value)); } // Return type of input modifiers operand for specified input operand @@ -1598,8 +1615,11 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (!eq(NumSrcArgs, 1), !if (HasModifiers, // VOP1 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod0:$clamp, omod0:$omod) + !if(HasOMod, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod0:$clamp, omod0:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod0:$clamp)) /* else */, // VOP1 without modifiers !if (HasClamp, @@ -1695,7 +1715,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC, Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/, 0>.ret; } -class getInsDPPBase <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC, +class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { @@ -1705,45 +1725,45 @@ class getInsDPPBase <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass !if (!eq(NumSrcArgs, 1), !if (HasModifiers, // VOP1_DPP with modifiers - (ins DstRC:$old, Src0Mod:$src0_modifiers, + (ins OldRC:$old, Src0Mod:$src0_modifiers, Src0RC:$src0) /* else */, // VOP1_DPP without modifiers - (ins DstRC:$old, Src0RC:$src0) + (ins OldRC:$old, Src0RC:$src0) /* endif */), !if (HasModifiers, // VOP2_DPP with modifiers - (ins DstRC:$old, + (ins OldRC:$old, Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1) /* else */, // VOP2_DPP without modifiers - (ins DstRC:$old, + (ins OldRC:$old, Src0RC:$src0, Src1RC:$src1) ))); } -class getInsDPP <RegisterOperand DstRC, 
RegisterClass Src0RC, RegisterClass Src1RC, +class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { - dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs, + dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod>.ret, (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); } -class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC, +class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { - dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs, + dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod>.ret, (ins FI:$fi)); } -class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC, +class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { - dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs, + dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod>.ret, (ins dpp8:$dpp8, FI:$fi)); } @@ -1846,7 +1866,7 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers, // instruction. class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers, bit HasClamp, ValueType DstVT = i32> { - string dst = " $vdst"; + string dst = "$vdst"; string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1", @@ -1867,7 +1887,7 @@ class getAsmVOP3OpSel <int NumSrcArgs, bit Src0HasMods, bit Src1HasMods, bit Src2HasMods> { - string dst = " $vdst"; + string dst = "$vdst"; string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); string isrc1 = !if(!eq(NumSrcArgs, 1), "", @@ -1972,14 +1992,29 @@ class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs, string ret = dst#args#sdwa; } +class getHas64BitOps <int NumSrcArgs, ValueType DstVT, ValueType Src0VT, + ValueType Src1VT> { + bit ret = !if(!eq(NumSrcArgs, 3), + 0, + !if(!eq(DstVT.Size, 64), + 1, + !if(!eq(Src0VT.Size, 64), + 1, + !if(!eq(Src1VT.Size, 64), + 1, + 0 + ) + ) + ) + ); +} -// Function that checks if instruction supports DPP and SDWA -class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, - ValueType Src1VT = i32> { +class getHasSDWA <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32> { bit ret = !if(!eq(NumSrcArgs, 3), - 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3 + 0, // NumSrcArgs == 3 - No SDWA for VOP3 !if(!eq(DstVT.Size, 64), - 0, // 64-bit dst - No DPP or SDWA for 64-bit operands + 0, // 64-bit dst - No SDWA for 64-bit operands !if(!eq(Src0VT.Size, 64), 0, // 64-bit src0 !if(!eq(Src1VT.Size, 64), @@ -1993,8 +2028,42 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32> { - bit ret = !if(!eq(NumSrcArgs, 0), 0, - getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret); + bit ret = !if(!eq(NumSrcArgs, 3), + 0, // NumSrcArgs == 3 - No DPP for VOP3 + 1); +} + +class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32> { + bit ret = !and(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret, + 
getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret); +} + +// Function that checks if instruction supports DPP and SDWA +class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32> { + bit ret = !or(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret, + getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret); +} + +// Return an AGPR+VGPR operand class for the given VGPR register class. +class getLdStRegisterOperand<RegisterClass RC> { + RegisterOperand ret = + !if(!eq(RC.Size, 32), AVLdSt_32, + !if(!eq(RC.Size, 64), AVLdSt_64, + !if(!eq(RC.Size, 96), AVLdSt_96, + !if(!eq(RC.Size, 128), AVLdSt_128, + !if(!eq(RC.Size, 160), AVLdSt_160, + RegisterOperand<VReg_1> // invalid register + ))))); +} + +class BitOr<bit a, bit b> { + bit ret = !if(a, 1, !if(b, 1, 0)); +} + +class BitAnd<bit a, bit b> { + bit ret = !if(a, !if(b, 1, 0), 0); } def PatGenMode { @@ -2037,6 +2106,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case + field bit EmitDstSel = EmitDst; field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value); field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value); @@ -2077,12 +2147,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; - field bit HasExtSDWA = HasExt; - field bit HasExtSDWA9 = HasExt; + field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field bit HasExtSDWA9 = HasExtSDWA; field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; field bit IsDOT = 0; + field bit IsSingle = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -2134,7 +2206,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field string AsmDPP = !if(HasExtDPP, getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, ""); field string AsmDPP16 = getAsmDPP16<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; - field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0, DstVT>.ret; + // DPP8 encoding has no fields for modifiers, and it is enforced by setting + // the asm operand name via this HasModifiers flag + field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret; field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; @@ -2144,6 +2218,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; let HasExtDPP = 0; + let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; } @@ -2191,6 +2266,7 @@ def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>; def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>; +def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>; def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, 
f32, f32, untyped]>; @@ -2234,6 +2310,16 @@ def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>; def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>; def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>; +def VOP_V4F64_F64_F64_V4F64 : VOPProfile <[v4f64, f64, f64, v4f64]>; +def VOP_V1F64_F64_F64_V1F64 : VOPProfile <[v1f64, f64, f64, v1f64]>; + +def VOP_V2F32_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, v2f32]>; +def VOP_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, untyped]>; +def VOP_V2I32_V2I32_V2I32 : VOPProfile <[v2i32, v2i32, v2i32, untyped]>; +def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>; +def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>; +def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>; + class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; @@ -2372,7 +2458,8 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.GFX80)], [!cast<string>(SIEncodingFamily.GFX9)], [!cast<string>(SIEncodingFamily.GFX10)], - [!cast<string>(SIEncodingFamily.SDWA10)]]; + [!cast<string>(SIEncodingFamily.SDWA10)], + [!cast<string>(SIEncodingFamily.GFX90A)]]; } // Get equivalent SOPK instruction. @@ -2408,15 +2495,6 @@ def getMUBUFNoLdsInst : InstrMapping { let ValueCols = [["0"]]; } -// Maps an atomic opcode to its version with a return value. -def getAtomicRetOp : InstrMapping { - let FilterClass = "AtomicNoRet"; - let RowFields = ["NoRetOp"]; - let ColFields = ["IsRet"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - // Maps an atomic opcode to its returnless version. def getAtomicNoRetOp : InstrMapping { let FilterClass = "AtomicNoRet"; @@ -2435,6 +2513,15 @@ def getGlobalSaddrOp : InstrMapping { let ValueCols = [["1"]]; } +// Maps a GLOBAL SADDR to its VADDR form. +def getGlobalVaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + // Maps a v_cmpx opcode with sdst to opcode without sdst. 
def getVCMPXNoSDstOp : InstrMapping { let FilterClass = "VCMPXNoSDstTable"; @@ -2470,6 +2557,14 @@ def getFlatScratchInstSSfromSV : InstrMapping { let ValueCols = [["SS"]]; } +def getFlatScratchInstSVfromSS : InstrMapping { + let FilterClass = "FlatScratchInst"; + let RowFields = ["SVOp"]; + let ColFields = ["Mode"]; + let KeyCol = ["SS"]; + let ValueCols = [["SV"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 7c1cbd67c993..fbf4634bfc94 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -41,18 +41,21 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m < (i32 timm:$attrchan), (i32 timm:$attr), M0))] >; -let OtherPredicates = [has32BankLDS] in { +let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in { defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS] +} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus] -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { +let OtherPredicates = [has16BankLDS, isNotGFX90APlus], + Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 +} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus], + // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 +let OtherPredicates = [isNotGFX90APlus] in { let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < @@ -73,6 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m < [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc), (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; +} // End OtherPredicates = [isNotGFX90APlus] + } // End Uses = [MODE, M0, EXEC] //===----------------------------------------------------------------------===// @@ -86,11 +91,6 @@ def ATOMIC_FENCE : SPseudoInstSI< let maybeAtomic = 1; } -def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> { - let HasExt = 1; - let HasExtDPP = 1; -} - let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns @@ -104,13 +104,31 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VSrc_b64:$src0)>; + (ins VSrc_b64:$src0)> { + let isReMaterializable = 1; + let isAsCheapAsAMove = 1; + let isMoveImm = 1; + let SchedRW = [Write64Bit]; + let Size = 16; // Needs maximum 2 v_mov_b32 instructions 8 byte long each. +} // 64-bit vector move with dpp. Expanded post-RA. -def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> { +def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> { let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. } +// 64-bit scalar move immediate instruction. This is used to avoid subregs +// initialization and allow rematerialization. +def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst), + (ins i64imm:$src0)> { + let isReMaterializable = 1; + let isAsCheapAsAMove = 1; + let isMoveImm = 1; + let SchedRW = [WriteSALU, Write64Bit]; + let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each. + let Uses = []; +} + // Pseudoinstruction for @llvm.amdgcn.wqm. 
It is turned into a copy after the // WQM pass processes it. def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; @@ -119,17 +137,32 @@ def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; // turned into a copy by WQM pass, but does not seed WQM requirements. def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; -// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so +// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so // that the @earlyclobber is respected. The @earlyclobber is to make sure that -// the instruction that defines $src0 (which is run in WWM) doesn't +// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't // accidentally clobber inactive channels of $vdst. let Constraints = "@earlyclobber $vdst" in { -def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { +def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { + let Uses = [EXEC]; + let Defs = [EXEC, SCC]; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let Uses = [EXEC]; let Defs = [EXEC, SCC]; let hasSideEffects = 0; @@ -137,7 +170,7 @@ def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let mayStore = 0; } -def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { +def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; @@ -145,6 +178,7 @@ def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. +let Defs = [SCC] in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VGPR_32: $src, VSrc_b32:$inactive), [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { @@ -156,6 +190,7 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { let Constraints = "$src = $vdst"; } +} // End Defs = [SCC] let usesCustomInserter = 1, Defs = [VCC, EXEC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < @@ -230,6 +265,7 @@ def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>; def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; +def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>; } let WaveSizePredicate = isWave32 in { @@ -237,6 +273,7 @@ def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>; def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; +def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>; } @@ -255,19 +292,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), // SI pseudo instructions. 
These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. -// Dummy terminator instruction to use after control flow instructions -// replaced with exec mask operations. -def SI_MASK_BRANCH : VPseudoInstSI < - (outs), (ins brtarget:$target)> { - let isBranch = 0; - let isTerminator = 1; - let isBarrier = 0; - let SchedRW = []; - let hasNoSchedulingInfo = 1; - let FixedSize = 1; - let Size = 0; -} - let isTerminator = 1 in { let OtherPredicates = [EnableLateCFGStructurize] in { @@ -294,6 +318,14 @@ def SI_ELSE : CFPseudoInstSI < let hasSideEffects = 1; } +def SI_WATERFALL_LOOP : CFPseudoInstSI < + (outs), + (ins brtarget:$target), [], 1> { + let Size = 8; + let isBranch = 1; + let Defs = []; +} + def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_1:$saved, brtarget:$target), [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> { @@ -337,24 +369,22 @@ multiclass PseudoInstKill <dag ins> { // required in degenerate cases (when V_CMPX cannot be used due to constant // bus limitations) and because it allows us to avoid having to track SCC // liveness across basic blocks. - let Defs = [EXEC,VCC,SCC] in + let Defs = [EXEC,SCC] in def _PSEUDO : PseudoInstSI <(outs), ins> { let isConvergent = 1; let usesCustomInserter = 1; } - let Defs = [EXEC,VCC,SCC] in + let Defs = [EXEC,SCC] in def _TERMINATOR : SPseudoInstSI <(outs), ins> { let isTerminator = 1; } } defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; +let Defs = [VCC] in defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; -let Defs = [EXEC] in -def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>; - let Defs = [EXEC,VCC] in def SI_ILLEGAL_COPY : SPseudoInstSI < (outs unknown:$dst), (ins unknown:$src), @@ -376,6 +406,18 @@ def SI_PS_LIVE : PseudoInstSI < let SALU = 1; } +let Uses = [EXEC] in { +def SI_LIVE_MASK : PseudoInstSI < + (outs SReg_1:$dst), (ins), + [(set i1:$dst, (int_amdgcn_live_mask))]> { + let SALU = 1; +} +let Defs = [EXEC,SCC] in { +// Demote: Turn a pixel shader thread into a helper lane. 
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>; +} // End Defs = [EXEC,SCC] +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -463,7 +505,7 @@ def SI_CALL : SPseudoInstSI < // Tail call handling pseudo def SI_TCRETURN : SPseudoInstSI <(outs), - (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff), + (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff), [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { let Size = 4; let isCall = 1; @@ -476,6 +518,11 @@ def SI_TCRETURN : SPseudoInstSI <(outs), let isConvergent = 1; } +// Handle selecting indirect tail calls +def : GCNPat< + (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)), + (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff) +>; def ADJCALLSTACKUP : SPseudoInstSI< (outs), (ins i32imm:$amt0, i32imm:$amt1), @@ -654,6 +701,7 @@ defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>; defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>; +defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; @@ -697,6 +745,7 @@ defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; +defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>; defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; @@ -707,6 +756,7 @@ defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>; defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>; defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>; defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>; +defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>; defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; @@ -749,6 +799,16 @@ def : Pat < (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 SCSrc_i1:$src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 SCSrc_i1:$src, -1) +>; + // TODO: we could add more variants for other types of conditionals def : Pat < @@ -1021,6 +1081,38 @@ foreach Index = 0-4 in { >; } +foreach Index = 0-5 in { + def Extract_Element_v6i32_#Index : Extract_Element < + i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v6i32_#Index : Insert_Element < + i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v6f32_#Index : Extract_Element < + f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v6f32_#Index : Insert_Element < + f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-6 in { + def Extract_Element_v7i32_#Index : Extract_Element < + i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v7i32_#Index : Insert_Element < + i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v7f32_#Index : Extract_Element < + f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v7f32_#Index : Insert_Element < + f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + foreach Index = 
0-7 in { def Extract_Element_v8i32_#Index : Extract_Element < i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) @@ -1171,8 +1263,32 @@ def : BitConvert <v4f32, v2i64, VReg_128>; def : BitConvert <v2i64, v4f32, VReg_128>; // 160-bit bitcast -def : BitConvert <v5i32, v5f32, SGPR_160>; -def : BitConvert <v5f32, v5i32, SGPR_160>; +def : BitConvert <v5i32, v5f32, SReg_160>; +def : BitConvert <v5f32, v5i32, SReg_160>; +def : BitConvert <v5i32, v5f32, VReg_160>; +def : BitConvert <v5f32, v5i32, VReg_160>; + +// 192-bit bitcast +def : BitConvert <v6i32, v6f32, SReg_192>; +def : BitConvert <v6f32, v6i32, SReg_192>; +def : BitConvert <v6i32, v6f32, VReg_192>; +def : BitConvert <v6f32, v6i32, VReg_192>; +def : BitConvert <v3i64, v3f64, VReg_192>; +def : BitConvert <v3f64, v3i64, VReg_192>; +def : BitConvert <v3i64, v6i32, VReg_192>; +def : BitConvert <v3i64, v6f32, VReg_192>; +def : BitConvert <v3f64, v6i32, VReg_192>; +def : BitConvert <v3f64, v6f32, VReg_192>; +def : BitConvert <v6i32, v3i64, VReg_192>; +def : BitConvert <v6f32, v3i64, VReg_192>; +def : BitConvert <v6i32, v3f64, VReg_192>; +def : BitConvert <v6f32, v3f64, VReg_192>; + +// 224-bit bitcast +def : BitConvert <v7i32, v7f32, SReg_224>; +def : BitConvert <v7f32, v7i32, SReg_224>; +def : BitConvert <v7i32, v7f32, VReg_224>; +def : BitConvert <v7f32, v7i32, VReg_224>; // 256-bit bitcast def : BitConvert <v8i32, v8f32, SReg_256>; @@ -1349,6 +1465,19 @@ def : GCNPat < // sub1) // >; +// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead +// of the real value. +def : GCNPat < + (fneg (v2f32 SReg_64:$src)), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub1)) +>; + } // End let AddedComplexity = 1 def : GCNPat < @@ -1414,6 +1543,15 @@ def : GCNPat < >; def : GCNPat < + (getDivergentFrag<fneg>.ret (v2f32 VReg_64:$src)), + (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, + 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, 0, + 0, 0, 0, 0, 0) +> { + let SubtargetPredicate = HasPackedFP32Ops; +} + +def : GCNPat < (fcopysign f16:$src0, f16:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) >; @@ -1532,9 +1670,16 @@ def : GCNPat < /********** Intrinsic Patterns **********/ /********** ================== **********/ +let OtherPredicates = [isNotGFX90APlus] in // FIXME: Should use _e64 and select source modifiers. 
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; +let OtherPredicates = [isGFX90APlus] in +def : GCNPat < + (fpow f32:$src0, f32:$src1), + (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0)) +>; + def : GCNPat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -1793,6 +1938,8 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; +def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; @@ -1930,11 +2077,19 @@ def : GCNPat < //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// -def : GCNPat < - (i32 (AMDGPUfp16_zext f16:$src)), - (COPY $src) ->; +// Eliminate a zero extension from an fp16 operation if it already +// zeros the high bits of the 32-bit register. +// +// This is complicated on gfx9+. Some instructions maintain the legacy +// zeroing behavior, but others preserve the high bits. Some have a +// control bit to change the behavior. We can't simply say with +// certainty what the source behavior is without more context on how +// the src is lowered. e.g. fptrunc + fma may be lowered to a +// v_fma_mix* instruction which does not zero, or may not. +def : GCNPat< + (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), + (COPY VSrc_b16:$src)>; def : GCNPat < (i32 (trunc i64:$a)), @@ -2141,6 +2296,17 @@ def : GCNPat < SRCMODS.NONE, $src2) >; +let SubtargetPredicate = isGFX90APlus in +def : GCNPat < + (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)), + (f64 (VOP3NoMods f64:$src2))), + (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; + +// COPY is workaround tablegen bug from multiple outputs +// from S_LSHL_B32's multiple outputs from implicit scc def. 
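Stepping back to the fpow patterns above (both the POW_Common default and the new gfx90a variant): they implement the usual identity pow(x, y) = exp2(y * log2(x)), since v_exp_f32 and v_log_f32 are base-2 on this hardware, with v_mul_legacy_f32 in the middle. A small scalar sketch of that expansion, using plain libm calls purely for illustration; note the GPU legacy multiply's 0 * x == 0 rule (unlike an IEEE multiply) is what keeps corner cases such as pow(1.0, inf) from turning into NaN.

#include <cassert>
#include <cmath>

// Scalar illustration of the fpow expansion: exp2(y * log2(x)).
static float powViaExp2Log2(float X, float Y) {
  return std::exp2(Y * std::log2(X));
}

int main() {
  assert(std::fabs(powViaExp2Log2(2.0f, 10.0f) - 1024.0f) < 1e-2f);
  assert(std::fabs(powViaExp2Log2(8.0f, 1.0f / 3.0f) - 2.0f) < 1e-4f);
}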
def : GCNPat < (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) @@ -2207,9 +2373,13 @@ def : GCNPat < (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), + (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), + (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) +>; } // End SubtargetPredicate = HasVOP3PInsts - def : GCNPat < (v2f16 (scalar_to_vector f16:$src0)), (COPY $src0) @@ -2233,7 +2403,7 @@ def : GCNPat < def : GCNPat < (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src, + (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), (as_i32timm $bank_mask), (as_i1timm $bound_ctrl)) @@ -2242,7 +2412,7 @@ def : GCNPat < def : GCNPat < (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl), + (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), (as_i32timm $bank_mask), (as_i1timm $bound_ctrl)) >; @@ -2573,6 +2743,24 @@ def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction { } } +def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + +def G_AMDGPU_SMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. @@ -2614,6 +2802,8 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp new file mode 100644 index 000000000000..d560b477b8ba --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -0,0 +1,231 @@ +//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass mainly lowers early terminate pseudo instructions. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-late-branch-lowering" + +namespace { + +class SILateBranchLowering : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + MachineDominatorTree *MDT = nullptr; + + void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); + +public: + static char ID; + + unsigned MovOpc; + Register ExecReg; + + SILateBranchLowering() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI Final Branch Preparation"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char SILateBranchLowering::ID = 0; + +INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE, + "SI insert s_cbranch_execz instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE, + "SI insert s_cbranch_execz instructions", false, false) + +char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID; + +static void generateEndPgm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + const SIInstrInfo *TII, MachineFunction &MF) { + const Function &F = MF.getFunction(); + bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS; + + // Check if hardware has been configured to expect color or depth exports. + bool HasExports = + AMDGPU::getHasColorExport(F) || AMDGPU::getHasDepthExport(F); + + // Prior to GFX10, hardware always expects at least one export for PS. + bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget()); + + if (IsPS && (HasExports || MustExport)) { + // Generate "null export" if hardware is expecting PS to export. 
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) + .addImm(AMDGPU::Exp::ET_NULL) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addImm(1) // vm + .addImm(0) // compr + .addImm(0); // en + } + + // s_endpgm + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); +} + +static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, + MachineDominatorTree *MDT) { + MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true); + + // Update dominator tree + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); + MDT->getBase().applyUpdates(DTUpdates); +} + +void SILateBranchLowering::earlyTerm(MachineInstr &MI, + MachineBasicBlock *EarlyExitBlock) { + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc DL = MI.getDebugLoc(); + + auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)) + .addMBB(EarlyExitBlock); + auto Next = std::next(MI.getIterator()); + + if (Next != MBB.end() && !Next->isTerminator()) + splitBlock(MBB, *BranchMI, MDT); + + MBB.addSuccessor(EarlyExitBlock); + MDT->getBase().insertEdge(&MBB, EarlyExitBlock); +} + +bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + + MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + + SmallVector<MachineInstr *, 4> EarlyTermInstrs; + SmallVector<MachineInstr *, 1> EpilogInstrs; + bool MadeChange = false; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + switch (MI.getOpcode()) { + case AMDGPU::S_BRANCH: + // Optimize out branches to the next block. + // This only occurs in -O0 when BranchFolding is not executed. 
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { + assert(&MI == &MBB.back()); + MI.eraseFromParent(); + MadeChange = true; + } + break; + + case AMDGPU::SI_EARLY_TERMINATE_SCC0: + EarlyTermInstrs.push_back(&MI); + break; + + case AMDGPU::SI_RETURN_TO_EPILOG: + EpilogInstrs.push_back(&MI); + break; + + default: + break; + } + } + } + + // Lower any early exit branches first + if (!EarlyTermInstrs.empty()) { + MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock(); + DebugLoc DL; + + MF.insert(MF.end(), EarlyExitBlock); + BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc), + ExecReg) + .addImm(0); + generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF); + + for (MachineInstr *Instr : EarlyTermInstrs) { + // Early termination in GS does nothing + if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS) + earlyTerm(*Instr, EarlyExitBlock); + Instr->eraseFromParent(); + } + + EarlyTermInstrs.clear(); + MadeChange = true; + } + + // Now check return to epilog instructions occur at function end + if (!EpilogInstrs.empty()) { + MachineBasicBlock *EmptyMBBAtEnd = nullptr; + assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); + + // If there are multiple returns to epilog then all will + // become jumps to new empty end block. + if (EpilogInstrs.size() > 1) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + for (auto MI : EpilogInstrs) { + auto MBB = MI->getParent(); + if (MBB == &MF.back() && MI == &MBB->back()) + continue; + + // SI_RETURN_TO_EPILOG is not the last instruction. + // Jump to empty block at function end. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB->addSuccessor(EmptyMBBAtEnd); + MDT->getBase().insertEdge(MBB, EmptyMBBAtEnd); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + MI->eraseFromParent(); + MadeChange = true; + } + + EpilogInstrs.clear(); + } + + return MadeChange; +} diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b39420f3c7db..493c1ad87f93 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -104,9 +104,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned BaseOff; unsigned DMask; InstClassEnum InstClass; - bool GLC; - bool SLC; - bool DLC; + unsigned CPol = 0; bool UseST64; int AddrIdx[MaxAddressRegs]; const MachineOperand *AddrReg[MaxAddressRegs]; @@ -199,6 +197,7 @@ private: const CombineInfo &Paired); const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired); + const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, SmallVectorImpl<MachineInstr *> &InstsToMove); @@ -304,6 +303,16 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: return 4; + case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; + case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B32_gfx9: + return 1; + case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; + case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B64_gfx9: + return 2; default: return 0; } @@ -521,11 +530,7 @@ void 
SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { Offset &= 0xffff; } else if (InstClass != MIMG) { - GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); - if (InstClass != S_BUFFER_LOAD_IMM) { - SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); - } - DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); + CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); } AddressRegs Regs = getRegs(Opc, TII); @@ -675,10 +680,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, return false; // Check other optional immediate operands for equality. - unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, - AMDGPU::OpName::d16, AMDGPU::OpName::unorm, - AMDGPU::OpName::da, AMDGPU::OpName::r128, - AMDGPU::OpName::a16, AMDGPU::OpName::dlc}; + unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, + AMDGPU::OpName::unorm, AMDGPU::OpName::da, + AMDGPU::OpName::r128, AMDGPU::OpName::a16}; for (auto op : OperandsToMatch) { int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); @@ -725,6 +729,16 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat, return NewFormatInfo->Format; } +// Return the value in the inclusive range [Lo,Hi] that is aligned to the +// highest power of two. Note that the result is well defined for all inputs +// including corner cases like: +// - if Lo == Hi, return that value +// - if Lo == 0, return 0 (even though the "- 1" below underflows +// - if Lo > Hi, return 0 (as if the range wrapped around) +static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { + return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1); +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, CombineInfo &Paired, @@ -764,20 +778,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, return false; } - unsigned EltOffset0 = CI.Offset / CI.EltSize; - unsigned EltOffset1 = Paired.Offset / CI.EltSize; + uint32_t EltOffset0 = CI.Offset / CI.EltSize; + uint32_t EltOffset1 = Paired.Offset / CI.EltSize; CI.UseST64 = false; CI.BaseOff = 0; - // Handle DS instructions. + // Handle all non-DS instructions. if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && - CI.GLC == Paired.GLC && CI.DLC == Paired.DLC && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC); + CI.CPol == Paired.CPol && + (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); } - // Handle SMEM and VMEM instructions. // If the offset in elements doesn't fit in 8-bits, we might be able to use // the stride 64 versions. if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && @@ -800,22 +813,36 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, } // Try to shift base address to decrease offsets. 
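The base-shifting code that follows uses the mostAlignedValueInRange helper introduced above: within the window of base offsets that would make both element offsets encodable, it picks the one aligned to the largest power of two, so the same base register is more likely to be reusable by other load/store pairs. A standalone sketch of the bit trick, using a compiler builtin instead of LLVM's MathExtras helpers and assuming 0 < Lo <= Hi (the real helper also documents the Lo == 0 and Lo > Hi corner cases):

#include <cassert>
#include <cstdint>

// Sketch of mostAlignedValueInRange(Lo, Hi) for 0 < Lo <= Hi: the highest
// bit where (Lo - 1) and Hi differ marks the best achievable alignment;
// clearing everything below that bit in Hi yields a value that is still
// > Lo - 1 (hence >= Lo) and <= Hi.
static uint32_t mostAlignedInRange(uint32_t Lo, uint32_t Hi) {
  uint32_t Diff = (Lo - 1) ^ Hi;              // nonzero because Lo - 1 < Hi
  uint32_t HighBit = 31 - __builtin_clz(Diff);
  uint32_t Mask = ~((1u << HighBit) - 1u);    // keep HighBit and above
  return Hi & Mask;
}

int main() {
  assert(mostAlignedInRange(100, 150) == 128); // aligned to 128
  assert(mostAlignedInRange(33, 63) == 48);    // best alignment here is 16
  assert(mostAlignedInRange(7, 7) == 7);       // Lo == Hi returns that value
}

In the st64 path of the hunk below, the low six bits of the smaller element offset are then OR'd back into the chosen base (BaseOff |= Min & maskTrailingOnes<uint32_t>(6)) so that, after subtracting the base, both offsets remain multiples of 64 and fit the 8-bit ds_*2st64 offset fields.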
- unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0); - CI.BaseOff = std::min(CI.Offset, Paired.Offset); + uint32_t Min = std::min(EltOffset0, EltOffset1); + uint32_t Max = std::max(EltOffset0, EltOffset1); - if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { + const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; + if (((Max - Min) & ~Mask) == 0) { if (Modify) { - CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; - Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + // From the range of values we could use for BaseOff, choose the one that + // is aligned to the highest power of two, to maximise the chance that + // the same offset can be reused for other load/store pairs. + uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); + // Copy the low bits of the offsets, so that when we adjust them by + // subtracting BaseOff they will be multiples of 64. + BaseOff |= Min & maskTrailingOnes<uint32_t>(6); + CI.BaseOff = BaseOff * CI.EltSize; + CI.Offset = (EltOffset0 - BaseOff) / 64; + Paired.Offset = (EltOffset1 - BaseOff) / 64; CI.UseST64 = true; } return true; } - if (isUInt<8>(OffsetDiff)) { + if (isUInt<8>(Max - Min)) { if (Modify) { - CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize; - Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize; + // From the range of values we could use for BaseOff, choose the one that + // is aligned to the highest power of two, to maximise the chance that + // the same offset can be reused for other load/store pairs. + uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); + CI.BaseOff = BaseOff * CI.EltSize; + CI.Offset = EltOffset0 - BaseOff; + Paired.Offset = EltOffset1 - BaseOff; } return true; } @@ -841,6 +868,26 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, } } +const TargetRegisterClass * +SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { + if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + return TRI->getRegClassForReg(*MRI, Dst->getReg()); + } + if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { + return TRI->getRegClassForReg(*MRI, Src->getReg()); + } + if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { + return TRI->getRegClassForReg(*MRI, Src->getReg()); + } + if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + return TRI->getRegClassForReg(*MRI, Dst->getReg()); + } + if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { + return TRI->getRegClassForReg(*MRI, Src->getReg()); + } + return nullptr; +} + /// This function assumes that CI comes before Paired in a basic block. bool SILoadStoreOptimizer::checkAndPrepareMerge( CombineInfo &CI, CombineInfo &Paired, @@ -873,6 +920,9 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( DenseSet<Register> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); + const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); + bool IsAGPR = TRI->hasAGPRs(DataRC); + MachineBasicBlock::iterator E = std::next(Paired.I); MachineBasicBlock::iterator MBBI = std::next(CI.I); MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); @@ -941,6 +991,17 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( continue; if (&*MBBI == &*Paired.I) { + if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR) + return false; + // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data + // operands. 
However we are reporting that ds_write2 shall have + // only VGPR data so that machine copy propagation does not + // create an illegal instruction with a VGPR and AGPR sources. + // Consequenctially if we create such instruction the verifier + // will complain. + if (IsAGPR && CI.InstClass == DS_WRITE) + return false; + // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. @@ -1014,8 +1075,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC = - (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); @@ -1229,8 +1289,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) .addImm(MergedOffset) // offset - .addImm(CI.GLC) // glc - .addImm(CI.DLC) // dlc + .addImm(CI.CPol) // cpol .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); @@ -1289,10 +1348,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(MergedOffset) // offset - .addImm(CI.GLC) // glc - .addImm(CI.SLC) // slc + .addImm(CI.CPol) // cpol .addImm(0) // tfe - .addImm(CI.DLC) // dlc .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); @@ -1356,10 +1413,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(MergedOffset) // offset .addImm(JoinedFormat) // format - .addImm(CI.GLC) // glc - .addImm(CI.SLC) // slc + .addImm(CI.CPol) // cpol .addImm(0) // tfe - .addImm(CI.DLC) // dlc .addImm(0) // swz .addMemOperand( combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); @@ -1436,10 +1491,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset, Paired.Offset)) // offset .addImm(JoinedFormat) // format - .addImm(CI.GLC) // glc - .addImm(CI.SLC) // slc + .addImm(CI.CPol) // cpol .addImm(0) // tfe - .addImm(CI.DLC) // dlc .addImm(0) // swz .addMemOperand( combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); @@ -1536,18 +1589,12 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, case 16: return &AMDGPU::SGPR_512RegClass; } - } else { - switch (CI.Width + Paired.Width) { - default: - return nullptr; - case 2: - return &AMDGPU::VReg_64RegClass; - case 3: - return &AMDGPU::VReg_96RegClass; - case 4: - return &AMDGPU::VReg_128RegClass; - } } + + unsigned BitWidth = 32 * (CI.Width + Paired.Width); + return TRI->hasAGPRs(getDataRegClass(*CI.I)) + ? 
TRI->getAGPRClassForBitWidth(BitWidth) + : TRI->getVGPRClassForBitWidth(BitWidth); } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( @@ -1596,10 +1643,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset, Paired.Offset)) // offset - .addImm(CI.GLC) // glc - .addImm(CI.SLC) // slc + .addImm(CI.CPol) // cpol .addImm(0) // tfe - .addImm(CI.DLC) // dlc .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); @@ -1671,7 +1716,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, (void)HiHalf; LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); - Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); MachineInstr *FullBase = BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 5839e59b4d7f..0f2836e1e7fb 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -72,10 +72,9 @@ private: MachineRegisterInfo *MRI = nullptr; SetVector<MachineInstr*> LoweredEndCf; DenseSet<Register> LoweredIf; - SmallSet<MachineInstr *, 16> NeedsKillCleanup; + SmallSet<MachineBasicBlock *, 4> KillBlocks; const TargetRegisterClass *BoolRC = nullptr; - bool InsertKillCleanups; unsigned AndOpc; unsigned OrOpc; unsigned XorOpc; @@ -86,6 +85,8 @@ private: unsigned OrSaveExecOpc; unsigned Exec; + bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End); + void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); @@ -163,8 +164,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; -static bool hasKill(const MachineBasicBlock *Begin, - const MachineBasicBlock *End, const SIInstrInfo *TII) { +bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin, + const MachineBasicBlock *End) { DenseSet<const MachineBasicBlock*> Visited; SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors()); @@ -173,9 +174,8 @@ static bool hasKill(const MachineBasicBlock *Begin, if (MBB == End || !Visited.insert(MBB).second) continue; - for (auto &Term : MBB->terminators()) - if (TII->isKillTerminator(Term.getOpcode())) - return true; + if (KillBlocks.contains(MBB)) + return true; Worklist.append(MBB->succ_begin(), MBB->succ_end()); } @@ -211,32 +211,11 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // just cleared bits. bool SimpleIf = isSimpleIf(MI, MRI); - if (InsertKillCleanups) { - // Check for SI_KILL_*_TERMINATOR on full path of control flow and - // flag the associated SI_END_CF for insertion of a kill cleanup. 
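The reworked `hasKill` query can be read as "is any kill block reachable from `Begin` without passing through `End`", with the kill blocks collected once per function. A minimal stand-alone model of that structure is sketched below; `Block` stands in for `MachineBasicBlock`, and plain std containers replace `DenseSet`/`SmallVector`.

```cpp
#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Succs;
  bool HasKillTerminator = false; // e.g. SI_KILL_*_TERMINATOR or SI_DEMOTE_I1
};

struct KillQuery {
  std::unordered_set<const Block *> KillBlocks;

  // Populate once per function, mirroring the scan added to
  // runOnMachineFunction().
  void collect(const std::vector<Block *> &Fn) {
    for (Block *B : Fn)
      if (B->HasKillTerminator)
        KillBlocks.insert(B);
  }

  // Worklist walk over successors, stopping at End, as in the new hasKill().
  bool hasKill(const Block *Begin, const Block *End) const {
    std::unordered_set<const Block *> Visited;
    std::vector<Block *> Worklist(Begin->Succs.begin(), Begin->Succs.end());
    while (!Worklist.empty()) {
      Block *B = Worklist.back();
      Worklist.pop_back();
      if (B == End || !Visited.insert(B).second)
        continue;
      if (KillBlocks.count(B))
        return true;
      Worklist.insert(Worklist.end(), B->Succs.begin(), B->Succs.end());
    }
    return false;
  }
};
```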
- auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); - while (UseMI->getOpcode() != AMDGPU::SI_END_CF) { - assert(std::next(UseMI) == MRI->use_instr_nodbg_end()); - assert(UseMI->getOpcode() == AMDGPU::SI_ELSE); - MachineOperand &NextExec = UseMI->getOperand(0); - Register NextExecReg = NextExec.getReg(); - if (NextExec.isDead()) { - assert(!SimpleIf); - break; - } - UseMI = MRI->use_instr_nodbg_begin(NextExecReg); - } - if (UseMI->getOpcode() == AMDGPU::SI_END_CF) { - if (hasKill(MI.getParent(), UseMI->getParent(), TII)) { - NeedsKillCleanup.insert(&*UseMI); - SimpleIf = false; - } - } - } else if (SimpleIf) { + if (SimpleIf) { // Check for SI_KILL_*_TERMINATOR on path from if to endif. // if there is any such terminator simplifications are not safe. auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); - SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII); + SimpleIf = !hasKill(MI.getParent(), UseMI->getParent()); } // Add an implicit def of exec to discourage scheduling VALU after this which @@ -451,8 +430,6 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( auto E = B->end(); for ( ; It != E; ++It) { - if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP) - continue; if (TII->mayReadEXEC(*MRI, *It)) break; } @@ -505,18 +482,8 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { LoweredEndCf.insert(NewMI); - // If this ends control flow which contains kills (as flagged in emitIf) - // then insert an SI_KILL_CLEANUP immediately following the exec mask - // manipulation. This can be lowered to early termination if appropriate. - MachineInstr *CleanUpMI = nullptr; - if (NeedsKillCleanup.count(&MI)) - CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP)); - - if (LIS) { + if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); - if (CleanUpMI) - LIS->InsertMachineInstrInMaps(*CleanUpMI); - } MI.eraseFromParent(); @@ -633,6 +600,10 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { emitLoop(MI); break; + case AMDGPU::SI_WATERFALL_LOOP: + MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ)); + break; + case AMDGPU::SI_END_CF: SplitBB = emitEndCf(MI); break; @@ -811,8 +782,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable<LiveIntervals>(); MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); - InsertKillCleanups = - MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; if (ST.isWave32()) { AndOpc = AMDGPU::S_AND_B32; @@ -836,7 +805,27 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { Exec = AMDGPU::EXEC; } - SmallVector<MachineInstr *, 32> Worklist; + // Compute set of blocks with kills + const bool CanDemote = + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; + for (auto &MBB : MF) { + bool IsKillBlock = false; + for (auto &Term : MBB.terminators()) { + if (TII->isKillTerminator(Term.getOpcode())) { + KillBlocks.insert(&MBB); + IsKillBlock = true; + break; + } + } + if (CanDemote && !IsKillBlock) { + for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + KillBlocks.insert(&MBB); + break; + } + } + } + } MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(); @@ -853,18 +842,12 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { case AMDGPU::SI_IF: - SplitMBB = process(MI); - break; - case AMDGPU::SI_ELSE: case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_WATERFALL_LOOP: case AMDGPU::SI_LOOP: case AMDGPU::SI_END_CF: - // Only build worklist if SI_IF instructions 
must be processed first. - if (InsertKillCleanups) - Worklist.push_back(&MI); - else - SplitMBB = process(MI); + SplitMBB = process(MI); break; // FIXME: find a better place for this @@ -886,14 +869,11 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } - for (MachineInstr *MI : Worklist) - process(*MI); - optimizeEndCf(); LoweredEndCf.clear(); LoweredIf.clear(); - NeedsKillCleanup.clear(); + KillBlocks.clear(); return true; } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 9570680ad9cb..672266f0c11e 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -598,6 +598,11 @@ void SILowerI1Copies::lowerPhis() { MachineBasicBlock *PostDomBound = PDT->findNearestCommonDominator(DomBlocks); + + // FIXME: This fails to find irreducible cycles. If we have a def (other + // than a constant) in a pair of blocks that end up looping back to each + // other, it will be mishandle. Due to structurization this shouldn't occur + // in practice. unsigned FoundLoopLevel = LF.findLoop(PostDomBound); SSAUpdater.Initialize(DstReg); @@ -732,6 +737,9 @@ bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const { const MachineInstr *MI; for (;;) { MI = MRI->getUniqueVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF) + return true; + if (MI->getOpcode() != AMDGPU::COPY) break; @@ -808,9 +816,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DstReg, unsigned PrevReg, unsigned CurReg) { - bool PrevVal; + bool PrevVal = false; bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal); - bool CurVal; + bool CurVal = false; bool CurConstant = isConstantLaneMask(CurReg, CurVal); if (PrevConstant && CurConstant) { diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 30405059530e..38b9d85b653b 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -31,12 +31,6 @@ using MBBVector = SmallVector<MachineBasicBlock *, 4>; namespace { -static cl::opt<bool> EnableSpillVGPRToAGPR( - "amdgpu-spill-vgpr-to-agpr", - cl::desc("Enable spilling VGPRs to AGPRs"), - cl::ReallyHidden, - cl::init(true)); - class SILowerSGPRSpills : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; @@ -71,6 +65,7 @@ char SILowerSGPRSpills::ID = 0; INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -88,6 +83,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator I = SaveBlock.begin(); if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const CalleeSavedInfo &CS : CSI) { // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); @@ -96,8 +93,13 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MVT::i32); - TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, - TRI); + // If this value was already livein, we probably have a direct use of the + // incoming register value, so don't kill at the spill point. 
This happens + // since we pass some special inputs (workgroup IDs) in the callee saved + // range. + const bool IsLiveIn = MRI.isLiveIn(Reg); + TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(), + RC, TRI); if (LIS) { assert(std::distance(MIS.begin(), I) == 1); @@ -255,13 +257,10 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF, if (!LowestAvailableVGPR) LowestAvailableVGPR = PreReservedVGPR; - const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - Optional<int> FI; - // Check if we are reserving a CSR. Create a stack object for a possible spill - // in the function prologue. - if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR)) - FI = FrameInfo.CreateSpillStackObject(4, Align(4)); + // Create a stack object for a possible spill in the function prologue. + // Note Non-CSR VGPR also need this as we may overwrite inactive lanes. + Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4)); // Find saved info about the pre-reserved register. const auto *ReservedVGPRInfoItr = @@ -291,6 +290,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); VRM = getAnalysisIfAvailable<VirtRegMap>(); + LIS = getAnalysisIfAvailable<LiveIntervals>(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -300,29 +300,28 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { bool HasCSRs = spillCalleeSavedRegs(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (!MFI.hasStackObjects() && !HasCSRs) { SaveBlocks.clear(); RestoreBlocks.clear(); + if (FuncInfo->VGPRReservedForSGPRSpill) { + // Free the reserved VGPR for later possible use by frame lowering. + FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF); + MRI.freezeReservedRegs(MF); + } return false; } - MachineRegisterInfo &MRI = MF.getRegInfo(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() - && EnableSpillVGPRToAGPR; - bool MadeChange = false; - - const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts(); - std::unique_ptr<RegScavenger> RS; - bool NewReservedRegs = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs()); - if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) { + if (HasSGPRSpillToVGPR) { // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs // are spilled to VGPRs, in which case we can eliminate the stack usage. // @@ -331,33 +330,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { lowerShiftReservedVGPR(MF, ST); + // To track the spill frame indices handled in this pass. + BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator Next; for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { MachineInstr &MI = *I; Next = std::next(I); - if (SpillToAGPR && TII->isVGPRSpill(MI)) { - // Try to eliminate stack used by VGPR spills before frame - // finalization. 
- unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::vaddr); - int FI = MI.getOperand(FIOp).getIndex(); - Register VReg = - TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); - if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, - TRI->isAGPR(MRI, VReg))) { - NewReservedRegs = true; - if (!RS) - RS.reset(new RegScavenger()); - - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); - TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get()); - continue; - } - } - if (!TII->isSGPRSpill(MI)) continue; @@ -365,24 +346,32 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, + nullptr, LIS); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + SpillFIs.set(FI); } } } + // FIXME: Adding to live-ins redundant with reserving registers. for (MachineBasicBlock &MBB : MF) { for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) MBB.addLiveIn(SSpill.VGPR); - - for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) - MBB.addLiveIn(Reg); - - for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); + + // FIXME: The dead frame indices are replaced with a null register from + // the debug value instructions. We should instead, update it with the + // correct register value. But not sure the register value alone is + // adequate to lower the DIExpression. It should be worked out later. + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue() && MI.getOperand(0).isFI() && + SpillFIs[MI.getOperand(0).getIndex()]) { + MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + MI.getOperand(0).setIsDebug(); + } + } } MadeChange = true; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a0cdc7b1f4d..85cfe36df16a 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -8,6 +8,22 @@ #include "SIMachineFunctionInfo.h" #include "AMDGPUTargetMachine.h" +#include "AMDGPUSubtarget.h" +#include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include <cassert> +#include <vector> #define MAX_LANES 64 @@ -49,6 +65,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't // have any calls. const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && + CC != CallingConv::AMDGPU_Gfx && (!isEntryFunction() || HasCalls); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { @@ -61,6 +78,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } if (!isEntryFunction()) { + if (UseFixedABI) + ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; + // TODO: Pick a high register, and shift down, similar to a kernel. 
FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; @@ -119,13 +139,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (WorkItemIDZ) WorkItemIDY = true; - PrivateSegmentWaveByteOffset = true; + if (!ST.flatScratchIsArchitected()) { + PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - ArgInfo.PrivateSegmentWaveByteOffset = - ArgDescriptor::createRegister(AMDGPU::SGPR5); + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + ArgInfo.PrivateSegmentWaveByteOffset = + ArgDescriptor::createRegister(AMDGPU::SGPR5); + } } bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); @@ -156,13 +178,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls or stack objects that may require it before argument + // lowering. if (ST.hasFlatAddressSpace() && isEntryFunction() && - (isAmdHsaOrMesa || ST.enableFlatScratch())) { - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. - if (HasCalls || HasStackObjects || ST.enableFlatScratch()) - FlatScratchInit = true; + (isAmdHsaOrMesa || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { + FlatScratchInit = true; } Attribute A = F.getFnAttribute("amdgpu-git-ptr-high"); @@ -285,8 +308,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, assert(Size >= 4 && "invalid sgpr spill size"); assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); - const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); - // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs. for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { @@ -309,16 +330,24 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, // partially spill the SGPR to VGPRs. SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; + +#if 0 + DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), + "VGPRs for SGPR spilling", + 0, DS_Error); + MF.getFunction().getContext().diagnose(DiagOutOfRegs); +#endif return false; } - Optional<int> CSRSpillFI; - if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs && - isCalleeSavedReg(CSRegs, LaneVGPR)) { - CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4)); + Optional<int> SpillFI; + // We need to preserve inactive lanes, so always save, even caller-save + // registers. + if (!isEntryFunction()) { + SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4)); } - SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI)); + SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI)); // Add this register as live-in to all blocks to avoid machine verifer // complaining about use of an undefined physical register. 
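A simplified model of the SGPR-to-VGPR spill bookkeeping touched here may help: each spilled 32-bit SGPR lane is assigned a (VGPR, lane) slot, a fresh spill VGPR is taken every wave-size lanes, and after this change every spill VGPR in a non-entry function also gets a stack slot so its inactive lanes can be preserved. The sketch below uses illustrative names and std containers; it is not the actual `allocateSGPRSpillToVGPR` implementation.

```cpp
#include <optional>
#include <unordered_map>
#include <vector>

struct SpilledReg { unsigned VGPR; unsigned Lane; };
struct SGPRSpillVGPR { unsigned VGPR; std::optional<int> SpillFI; };

struct SpillState {
  unsigned WaveSize = 64;                 // lanes per VGPR (wave64 shown)
  unsigned NumVGPRSpillLanes = 0;
  std::vector<SGPRSpillVGPR> SpillVGPRs;
  std::unordered_map<int, std::vector<SpilledReg>> SGPRToVGPRSpills;

  // Assign (VGPR, lane) slots for an SGPR spill of NumLanes 32-bit lanes at
  // frame index FI. NextVGPR/NextFI stand in for register scavenging and
  // MachineFrameInfo::CreateSpillStackObject.
  void allocate(int FI, unsigned NumLanes, bool IsEntryFunction,
                unsigned (*NextVGPR)(), int (*NextFI)()) {
    auto &Slots = SGPRToVGPRSpills[FI];
    for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
      unsigned Lane = NumVGPRSpillLanes % WaveSize;
      if (Lane == 0) {
        // Need a fresh spill VGPR. In a non-entry function it also gets a
        // stack slot: we must preserve inactive lanes, so we always save it,
        // even if it is a caller-saved register.
        std::optional<int> SpillFI;
        if (!IsEntryFunction)
          SpillFI = NextFI();
        SpillVGPRs.push_back({NextVGPR(), SpillFI});
      }
      Slots.push_back({SpillVGPRs.back().VGPR, Lane});
    }
  }
};
```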
@@ -344,7 +373,7 @@ bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) { MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true); if (LaneVGPR == Register()) return false; - SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None)); + SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None)); FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR; return true; } @@ -437,6 +466,21 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { } } +int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI, + const SIRegisterInfo &TRI) { + if (ScavengeFI) + return *ScavengeFI; + if (isEntryFunction()) { + ScavengeFI = MFI.CreateFixedObject( + TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); + } else { + ScavengeFI = MFI.CreateStackObject( + TRI.getSpillSize(AMDGPU::SGPR_32RegClass), + TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false); + } + return *ScavengeFI; +} + MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -529,7 +573,8 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, } yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( - const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI) + const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI, + const llvm::MachineFunction &MF) : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()), DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()), @@ -543,6 +588,9 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) { + auto SFI = MFI.getOptionalScavengeFI(); + if (SFI) + ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); } void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { @@ -550,7 +598,8 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { } bool SIMachineFunctionInfo::initializeBaseYamlFields( - const yaml::SIMachineFunctionInfo &YamlMFI) { + const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, + PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) { ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign); LDSSize = YamlMFI.LDSSize; @@ -563,6 +612,24 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + + if (YamlMFI.ScavengeFI) { + auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); + if (!FIOrErr) { + // Create a diagnostic for a the frame index. 
+ const MemoryBuffer &Buffer = + *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); + + Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1, + SourceMgr::DK_Error, toString(FIOrErr.takeError()), + "", None, None); + SourceRange = YamlMFI.ScavengeFI->SourceRange; + return true; + } + ScavengeFI = *FIOrErr; + } else { + ScavengeFI = None; + } return false; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 35fb43162199..fb6d4f8841ab 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -17,6 +17,7 @@ #include "AMDGPUMachineFunction.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" +#include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/raw_ostream.h" @@ -288,10 +289,12 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { Optional<SIArgumentInfo> ArgInfo; SIMode Mode; + Optional<FrameIndex> ScavengeFI; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, - const TargetRegisterInfo &TRI); + const TargetRegisterInfo &TRI, + const llvm::MachineFunction &MF); void mappingImpl(yaml::IO &YamlIO) override; ~SIMachineFunctionInfo() = default; @@ -321,6 +324,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("highBitsOf32BitAddress", MFI.HighBitsOf32BitAddress, 0u); YamlIO.mapOptional("occupancy", MFI.Occupancy, 0); + YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); } }; @@ -445,15 +449,15 @@ public: bool hasReg() { return VGPR != 0;} }; - struct SGPRSpillVGPRCSR { + struct SGPRSpillVGPR { // VGPR used for SGPR spills Register VGPR; - // If the VGPR is a CSR, the stack slot used to save/restore it in the - // prolog/epilog. + // If the VGPR is is used for SGPR spills in a non-entrypoint function, the + // stack slot used to save/restore it in the prolog/epilog. Optional<int> FI; - SGPRSpillVGPRCSR(Register V, Optional<int> F) : VGPR(V), FI(F) {} + SGPRSpillVGPR(Register V, Optional<int> F) : VGPR(V), FI(F) {} }; struct VGPRSpillToAGPR { @@ -461,16 +465,16 @@ public: bool FullyAllocated = false; }; - SparseBitVector<> WWMReservedRegs; - - void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); } + // Map WWM VGPR to a stack slot that is used to save/restore it in the + // prolog/epilog. + MapVector<Register, Optional<int>> WWMReservedRegs; private: // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; unsigned NumVGPRSpillLanes = 0; - SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs; + SmallVector<SGPRSpillVGPR, 2> SpillVGPRs; DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills; @@ -480,6 +484,10 @@ private: // VGPRs used for AGPR spills. SmallVector<MCPhysReg, 32> SpillVGPR; + // Emergency stack slot. Sometimes, we create this before finalizing the stack + // frame, so save it here and add it to the RegScavenger later. + Optional<int> ScavengeFI; + public: // FIXME /// If this is set, an SGPR used for save/restore of the register used for the /// frame pointer. 
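The new `ScavengeFI` member supports a lazily created emergency slot: it is materialized on first request, as a fixed object at offset 0 for entry functions and as an ordinary spill-sized stack object otherwise, and reused afterwards. A minimal sketch of that pattern, with stand-in types for `MachineFrameInfo` and the register info, follows.

```cpp
#include <optional>

struct FrameInfoLike {
  int NextFixed = -1;
  int NextStack = 0;
  int CreateFixedObject(unsigned /*Size*/, int /*Offset*/) { return NextFixed--; }
  int CreateStackObject(unsigned /*Size*/, unsigned /*Align*/) { return NextStack++; }
};

struct FuncInfoLike {
  bool IsEntryFunction = false;
  std::optional<int> ScavengeFI;

  // Create the emergency scavenging slot only when something needs it, and
  // hand back the same frame index on every later request.
  int getScavengeFI(FrameInfoLike &MFI, unsigned SpillSize, unsigned SpillAlign) {
    if (ScavengeFI)
      return *ScavengeFI;
    ScavengeFI = IsEntryFunction
                     ? MFI.CreateFixedObject(SpillSize, /*Offset=*/0)
                     : MFI.CreateStackObject(SpillSize, SpillAlign);
    return *ScavengeFI;
  }
};
```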
@@ -497,7 +505,14 @@ public: // FIXME public: SIMachineFunctionInfo(const MachineFunction &MF); - bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI); + bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, + const MachineFunction &MF, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange); + + void reserveWWMRegister(Register Reg, Optional<int> FI) { + WWMReservedRegs.insert(std::make_pair(Reg, FI)); + } ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { auto I = SGPRToVGPRSpills.find(FrameIndex); @@ -505,9 +520,7 @@ public: ArrayRef<SpilledReg>() : makeArrayRef(I->second); } - ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const { - return SpillVGPRs; - } + ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; } void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) { SpillVGPRs[Index].VGPR = NewVGPR; @@ -538,6 +551,9 @@ public: bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); void removeDeadFrameIndices(MachineFrameInfo &MFI); + int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI); + Optional<int> getOptionalScavengeFI() const { return ScavengeFI; } + bool hasCalculatedTID() const { return TIDReg != 0; }; Register getTIDReg() const { return TIDReg; }; void setTIDReg(Register Reg) { TIDReg = Reg; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3caa75e4d958..71be73c2f0e4 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -84,22 +84,6 @@ enum class SIAtomicAddrSpace { LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) }; -/// Sets named bit \p BitName to "true" if present in instruction \p MI. -/// \returns Returns true if \p MI is modified, false otherwise. -template <uint16_t BitName> -bool enableNamedBit(const MachineBasicBlock::iterator &MI) { - int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); - if (BitIdx == -1) - return false; - - MachineOperand &Bit = MI->getOperand(BitIdx); - if (Bit.getImm() != 0) - return false; - - Bit.setImm(1); - return true; -} - class SIMemOpInfo final { private: @@ -129,12 +113,43 @@ private: IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal) { + + if (Ordering == AtomicOrdering::NotAtomic) { + assert(Scope == SIAtomicScope::NONE && + OrderingAddrSpace == SIAtomicAddrSpace::NONE && + !IsCrossAddressSpaceOrdering && + FailureOrdering == AtomicOrdering::NotAtomic); + return; + } + + assert(Scope != SIAtomicScope::NONE && + (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != + SIAtomicAddrSpace::NONE && + (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != + SIAtomicAddrSpace::NONE && + !isStrongerThan(FailureOrdering, Ordering)); + // There is also no cross address space ordering if the ordering // address space is the same as the instruction address space and // only contains a single address space. if ((OrderingAddrSpace == InstrAddrSpace) && isPowerOf2_32(uint32_t(InstrAddrSpace))) this->IsCrossAddressSpaceOrdering = false; + + // Limit the scope to the maximum supported by the instruction's address + // spaces. 
+ if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) == + SIAtomicAddrSpace::NONE) { + this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD); + } else if ((InstrAddrSpace & + ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) == + SIAtomicAddrSpace::NONE) { + this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP); + } else if ((InstrAddrSpace & + ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS | + SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { + this->Scope = std::min(Scope, SIAtomicScope::AGENT); + } } public: @@ -202,12 +217,12 @@ private: void reportUnsupported(const MachineBasicBlock::iterator &MI, const char *Msg) const; - /// Inspects the target synchonization scope \p SSID and determines + /// Inspects the target synchronization scope \p SSID and determines /// the SI atomic scope it corresponds to, the address spaces it /// covers, and whether the memory ordering applies between address /// spaces. Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> - toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const; + toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const; /// \return Return a bit set of the address spaces accessed by \p AS. SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; @@ -257,6 +272,11 @@ protected: SICacheControl(const GCNSubtarget &ST); + /// Sets named bit \p BitName to "true" if present in instruction \p MI. + /// \returns Returns true if \p MI is modified, false otherwise. + bool enableNamedBit(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Bit) const; + public: /// Create a cache control for the subtarget \p ST. @@ -269,6 +289,20 @@ public: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const = 0; + /// Update \p MI memory store instruction to bypass any caches up to + /// the \p Scope memory scope for address spaces \p + /// AddrSpace. Return true iff the instruction was modified. + virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const = 0; + + /// Update \p MI memory read-modify-write instruction to bypass any caches up + /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true + /// iff the instruction was modified. + virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const = 0; + /// Update \p MI memory instruction of kind \p Op associated with address /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return /// true iff the instruction was modified. @@ -324,13 +358,13 @@ protected: /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit<AMDGPU::OpName::glc>(MI); + return enableNamedBit(MI, AMDGPU::CPol::GLC); } /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. 
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit<AMDGPU::OpName::slc>(MI); + return enableNamedBit(MI, AMDGPU::CPol::SLC); } public: @@ -341,6 +375,14 @@ public: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, @@ -377,13 +419,54 @@ public: }; +class SIGfx90ACacheControl : public SIGfx7CacheControl { +public: + + SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; + + bool insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIGfx10CacheControl : public SIGfx7CacheControl { protected: /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit<AMDGPU::OpName::dlc>(MI); + return enableNamedBit(MI, AMDGPU::CPol::DLC); } public: @@ -424,7 +507,7 @@ private: /// Return true iff instruction \p MI is a atomic instruction that /// returns a result. 
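The patch folds the separate glc/slc/dlc immediates into a single `cpol` operand, and the cache-control hooks now OR individual policy bits into that immediate. The sketch below models the operand as a plain integer; the bit values are illustrative only, not the authoritative encoding from `SIDefines.h`.

```cpp
#include <cstdint>
#include <optional>

namespace CPol {
enum CPol : unsigned { GLC = 1u << 0, SLC = 1u << 1, DLC = 1u << 2 };
} // namespace CPol

// Stand-in for a MachineInstr that may or may not carry a cpol operand.
struct MemInstr {
  std::optional<uint64_t> CPolImm; // absent on instructions without cpol
};

// Mirrors SICacheControl::enableNamedBit: set the requested policy bit and
// report whether the instruction could be modified.
static bool enableNamedBit(MemInstr &MI, unsigned Bit) {
  if (!MI.CPolImm)
    return false; // instruction has no cache-policy operand
  *MI.CPolImm |= Bit;
  return true;
}

static bool enableGLCBit(MemInstr &MI) { return enableNamedBit(MI, CPol::GLC); }
static bool enableSLCBit(MemInstr &MI) { return enableNamedBit(MI, CPol::SLC); }
static bool enableDLCBit(MemInstr &MI) { return enableNamedBit(MI, CPol::DLC); }
```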
bool isAtomicRet(const MachineInstr &MI) const { - return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1; + return SIInstrInfo::isAtomicRet(MI); } /// Removes all processed atomic pseudo instructions from the current @@ -476,7 +559,7 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, - SIAtomicAddrSpace InstrScope) const { + SIAtomicAddrSpace InstrAddrSpace) const { if (SSID == SyncScope::System) return std::make_tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, @@ -499,23 +582,23 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, true); if (SSID == MMI->getSystemOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SYSTEM, - SIAtomicAddrSpace::ATOMIC & InstrScope, + SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::AGENT, - SIAtomicAddrSpace::ATOMIC & InstrScope, + SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WORKGROUP, - SIAtomicAddrSpace::ATOMIC & InstrScope, + SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WAVEFRONT, - SIAtomicAddrSpace::ATOMIC & InstrScope, + SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SINGLETHREAD, - SIAtomicAddrSpace::ATOMIC & InstrScope, + SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); return None; } @@ -557,7 +640,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( IsVolatile |= MMO->isVolatile(); InstrAddrSpace |= toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); - AtomicOrdering OpOrdering = MMO->getOrdering(); + AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); if (OpOrdering != AtomicOrdering::NotAtomic) { const auto &IsSyncScopeInclusion = MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); @@ -568,9 +651,9 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( } SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); - Ordering = - isStrongerThan(Ordering, OpOrdering) ? - Ordering : MMO->getOrdering(); + Ordering = isStrongerThan(Ordering, OpOrdering) + ? 
Ordering + : MMO->getSuccessOrdering(); assert(MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); FailureOrdering = @@ -591,7 +674,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = ScopeOrNone.getValue(); if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || - ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || + ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { reportUnsupported(MI, "Unsupported atomic address space"); return None; } @@ -659,7 +743,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( } return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, - IsCrossAddressSpaceOrdering); + IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); } Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( @@ -682,9 +766,21 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { InsertCacheInv = !AmdgcnSkipCacheInvalidations; } +bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Bit) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); + if (!CPol) + return false; + + CPol->setImm(CPol->getImm() | Bit); + return true; +} + /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); + if (ST.hasGFX90AInsts()) + return std::make_unique<SIGfx90ACacheControl>(ST); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) @@ -725,6 +821,32 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( return Changed; } +bool SIGfx6CacheControl::enableStoreCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(!MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + /// The L1 cache is write through so does not need to be bypassed. There is no + /// bypass control for the L2 cache at the isa level. + + return Changed; +} + +bool SIGfx6CacheControl::enableRMWCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + /// The L1 cache is write through so does not need to be bypassed. There is no + /// bypass control for the L2 cache at the isa level. + + return Changed; +} + bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal) const { @@ -968,6 +1090,292 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx90ACacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to bypass the L1 which is per CU. 
+ // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. + if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + return Changed; +} + +bool SIGfx90ACacheControl::enableStoreCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(!MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + /// Do not set glc for store atomic operations as they implicitly write + /// through the L1 cache. + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. Store atomics implicitly write through the L1 + // cache. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + return Changed; +} + +bool SIGfx90ACacheControl::enableRMWCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + /// Do not set glc for RMW atomic operations as they implicitly bypass + /// the L1 cache, and the glc bit is instead used to indicate if they are + /// return or no-return. + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. RMW atomics implicitly bypass the L1 cache. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + return Changed; +} + +bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + // Only handle load and store, not atomic read-modify-write insructions. The + // latter use glc to indicate if the atomic returns a result and so must not + // be used for cache control. + assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. 
+ assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + if (Op == SIMemOp::LOAD) { + Changed |= enableGLCBit(MI); + } + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + + return Changed; + } + + if (IsNonTemporal) { + // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + return Changed; + } + + return Changed; +} + +bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + if (ST.isTgSplitEnabled()) { + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to wait for global or GDS memory operations + // to complete to ensure they are visible to waves in the other CUs. + // Otherwise in non-threadgroup split mode all waves of a work-group are on + // the same CU, so no need to wait for global memory as all waves in the + // work-group access the same the L1, nor wait for GDS as access are ordered + // on a CU. + if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | + SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && + (Scope == SIAtomicScope::WORKGROUP)) { + // Same as GFX7 using agent scope. + Scope = SIAtomicScope::AGENT; + } + // In threadgroup split mode LDS cannot be allocated so no need to wait for + // LDS memory operations. + AddrSpace &= ~SIAtomicAddrSpace::LDS; + } + return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, + IsCrossAddrSpaceOrdering, Pos); +} + +bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + if (!InsertCacheInv) + return false; + + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and + // CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; + case SIAtomicScope::AGENT: + // Same as GFX7. + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to invalidate the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be invalidated. 
+ if (ST.isTgSplitEnabled()) { + // Same as GFX7 using agent scope. + Scope = SIAtomicScope::AGENT; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Same as GFX7. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + if (Pos == Position::AFTER) + --MI; + + Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); + + return Changed; +} + +bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by the + // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); + // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT + // vmcnt(0)" needed by the "BUFFER_WBL2". + Changed = true; + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Same as GFX7. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (Pos == Position::AFTER) + --MI; + + Changed |= + SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, + IsCrossAddrSpaceOrdering, Pos); + + return Changed; +} + bool SIGfx10CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -1292,6 +1700,13 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { + if (MOI.getOrdering() == AtomicOrdering::Monotonic || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), + MOI.getOrderingAddrSpace()); + } + if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) Changed |= CC->insertRelease(MI, MOI.getScope(), @@ -1336,7 +1751,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, Position::BEFORE); // TODO: If both release and invalidate are happening they could be combined - // to use the single "BUFFER_WBL2" instruction. This could be done by + // to use the single "BUFFER_WBINV*" instruction. This could be done by // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to // track cache invalidate and write back instructions. 
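The change to `expandStore` above can be summarized as a small decision table over the atomic ordering: monotonic, release, and seq_cst stores now ask the cache-control implementation for store cache bypass, and release/seq_cst stores still get a release sequence inserted first. The sketch below reduces `SICacheControl` to two hooks and omits the scope and address-space arguments; it is a model of the control flow, not the real interface.

```cpp
enum class AtomicOrdering { NotAtomic, Monotonic, Release, SequentiallyConsistent };

struct CacheControlLike {
  virtual bool enableStoreCacheBypass() = 0; // e.g. adjust cpol bits
  virtual bool insertRelease() = 0;          // e.g. writeback + s_waitcnt
  virtual ~CacheControlLike() = default;
};

static bool expandAtomicStore(AtomicOrdering Ordering, CacheControlLike &CC) {
  bool Changed = false;

  // New in this patch: every atomic store ordering consults the store
  // cache-bypass hook; the implementation decides what (if anything) to do
  // for the given scope.
  if (Ordering == AtomicOrdering::Monotonic ||
      Ordering == AtomicOrdering::Release ||
      Ordering == AtomicOrdering::SequentiallyConsistent)
    Changed |= CC.enableStoreCacheBypass();

  // Release semantics still require making earlier writes visible first.
  if (Ordering == AtomicOrdering::Release ||
      Ordering == AtomicOrdering::SequentiallyConsistent)
    Changed |= CC.insertRelease();

  return Changed;
}
```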
@@ -1360,6 +1775,15 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { + if (MOI.getOrdering() == AtomicOrdering::Monotonic || + MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), + MOI.getInstrAddrSpace()); + } + if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::AcquireRelease || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || @@ -1375,7 +1799,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { Changed |= CC->insertWait(MI, MOI.getScope(), - MOI.getOrderingAddrSpace(), + MOI.getInstrAddrSpace(), isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), @@ -1401,7 +1825,7 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { // Unbundle instructions after the post-RA scheduler. - if (MI->isBundle()) { + if (MI->isBundle() && MI->mayLoadOrStore()) { MachineBasicBlock::instr_iterator II(MI->getIterator()); for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); I != E && I->isBundledWithPred(); ++I) { diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 54f20912d0a9..b9c839fe28ba 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -220,6 +220,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. 
+ MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 162e96655df2..5f89f3826683 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -416,15 +416,20 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { continue; Register SavedExec = I->getOperand(0).getReg(); - if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec) && - MRI->use_instr_nodbg_begin(SavedExec)->getParent() == - I->getParent()) { - LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n'); - LIS->RemoveMachineInstrFromMaps(*I); - I->eraseFromParent(); - MRI->replaceRegWith(SavedExec, ExecReg); - LIS->removeInterval(SavedExec); - Changed = true; + if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec)) { + MachineInstr *SingleExecUser = &*MRI->use_instr_nodbg_begin(SavedExec); + int Idx = SingleExecUser->findRegisterUseOperandIdx(SavedExec); + assert(Idx != -1); + if (SingleExecUser->getParent() == I->getParent() && + !SingleExecUser->getOperand(Idx).isImplicit() && + TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) { + LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n'); + LIS->RemoveMachineInstrFromMaps(*I); + I->eraseFromParent(); + MRI->replaceRegWith(SavedExec, ExecReg); + LIS->removeInterval(SavedExec); + Changed = true; + } } break; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp new file mode 100644 index 000000000000..307c9eba9d3b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -0,0 +1,637 @@ +//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else +/// structures and waterfall loops. +/// +/// When we do structurization, we usually transform an if-else into two +/// sucessive if-then (with a flow block to do predicate inversion). Consider a +/// simple case after structurization: A divergent value %a was defined before +/// if-else and used in both THEN (use in THEN is optional) and ELSE part: +/// bb.if: +/// %a = ... +/// ... +/// bb.then: +/// ... = op %a +/// ... // %a can be dead here +/// bb.flow: +/// ... +/// bb.else: +/// ... = %a +/// ... +/// bb.endif +/// +/// As register allocator has no idea of the thread-control-flow, it will just +/// assume %a would be alive in the whole range of bb.then because of a later +/// use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect +/// to exec mask. For this if-else case, the lanes active in bb.then will be +/// inactive in bb.else, and vice-versa. So we are safe to say that %a was dead +/// after the last use in bb.then until the end of the block. The reason is +/// the instructions in bb.then will only overwrite lanes that will never be +/// accessed in bb.else. 
+/// +/// This pass aims to tell the register allocator that %a is in fact dead, +/// by inserting a phi-node in bb.flow saying that %a is undef when coming +/// from bb.then, and then replacing the uses in bb.else with the result of the +/// newly inserted phi. +/// +/// Two key conditions must be met to ensure correctness: +/// 1.) The def-point should be in the same loop-level as if-else-endif to make +/// sure the second loop iteration still gets correct data. +/// 2.) There should be no further uses after the IF-ELSE region. +/// +/// +/// Waterfall loops get inserted around instructions that use divergent values +/// but can only be executed with a uniform value. For example, an indirect call +/// to a divergent address: +/// bb.start: +/// %a = ... +/// %fun = ... +/// ... +/// bb.loop: +/// call %fun (%a) +/// ... // %a can be dead here +/// loop %bb.loop +/// +/// The loop block is executed multiple times, but it is run exactly once for +/// each active lane. Similar to the if-else case, the register allocator +/// assumes that %a is live throughout the loop as it is used again in the next +/// iteration. If %a is a VGPR that is unused after the loop, it does not need +/// to be live after its last use in the loop block. By inserting a phi-node at +/// the start of bb.loop that is undef when coming from bb.loop, the register +/// allocator knows that the value of %a does not need to be preserved through +/// iterations of the loop. +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-opt-vgpr-liverange" + +namespace { + +class SIOptimizeVGPRLiveRange : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + LiveVariables *LV = nullptr; + MachineDominatorTree *MDT = nullptr; + const MachineLoopInfo *Loops = nullptr; + MachineRegisterInfo *MRI = nullptr; + +public: + static char ID; + + MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const; + + void collectElseRegionBlocks(MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &) const; + + void + collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks, + SmallVectorImpl<Register> &CandidateRegs) const; + + void collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const; + + void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, + SmallVectorImpl<MachineInstr *> &Uses) const; + + void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If, + MachineBasicBlock *Flow) const; + + void updateLiveRangeInElseRegion( + Register Reg, Register NewReg, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const; + + void + optimizeLiveRange(Register Reg, MachineBasicBlock *If, + MachineBasicBlock *Flow, MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const; + + void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const; +
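The transformation described above boils down to a single step: insert a PHI in the flow block whose incoming value from the THEN side is an undef register, and point the ELSE-side uses (and the ENDIF phis) at that PHI. The snippet below is only a condensed reading aid: it restates what optimizeLiveRange() further down in this file does, with the LiveVariables bookkeeping omitted, and it borrows that function's names (Reg, If, Flow).

    // Build a PHI in the flow block: the value is kept on the IF path and marked
    // undef on the THEN path, because THEN and ELSE run with complementary exec
    // masks and the THEN block can only clobber lanes ELSE never reads.
    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
    Register NewReg = MRI->createVirtualRegister(RC);
    Register UndefReg = MRI->createVirtualRegister(RC);
    MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
                                      TII->get(TargetOpcode::PHI), NewReg);
    for (MachineBasicBlock *Pred : Flow->predecessors()) {
      if (Pred == If)
        PHI.addReg(Reg).addMBB(Pred);                       // still defined here
      else
        PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); // THEN side: undef
    }
    // Every use of Reg inside the ELSE region (and in the ENDIF phis) is then
    // rewritten to NewReg, so the allocator no longer keeps Reg live across
    // bb.then and bb.flow.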
SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI Optimize VGPR LiveRange"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveVariables>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<LiveVariables>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +} // end anonymous namespace + +// Check whether the MBB is a else flow block and get the branching target which +// is the Endif block +MachineBasicBlock * +SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const { + for (auto &BR : MBB->terminators()) { + if (BR.getOpcode() == AMDGPU::SI_ELSE) + return BR.getOperand(2).getMBB(); + } + return nullptr; +} + +void SIOptimizeVGPRLiveRange::collectElseRegionBlocks( + MachineBasicBlock *Flow, MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &Blocks) const { + assert(Flow != Endif); + + MachineBasicBlock *MBB = Endif; + unsigned Cur = 0; + while (MBB) { + for (auto *Pred : MBB->predecessors()) { + if (Pred != Flow && !Blocks.contains(Pred)) + Blocks.insert(Pred); + } + + if (Cur < Blocks.size()) + MBB = Blocks[Cur++]; + else + MBB = nullptr; + } + + LLVM_DEBUG({ + dbgs() << "Found Else blocks: "; + for (auto *MBB : Blocks) + dbgs() << printMBBReference(*MBB) << ' '; + dbgs() << '\n'; + }); +} + +/// Find the instructions(excluding phi) in \p MBB that uses the \p Reg. +void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock( + Register Reg, MachineBasicBlock *MBB, + SmallVectorImpl<MachineInstr *> &Uses) const { + for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == MBB && !UseMI.isPHI()) + Uses.push_back(&UseMI); + } +} + +/// Collect the killed registers in the ELSE region which are not alive through +/// the whole THEN region. +void SIOptimizeVGPRLiveRange::collectCandidateRegisters( + MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks, + SmallVectorImpl<Register> &CandidateRegs) const { + + SmallSet<Register, 8> KillsInElse; + + for (auto *Else : ElseBlocks) { + for (auto &MI : Else->instrs()) { + if (MI.isDebugInstr()) + continue; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg() || MO.isDef()) + continue; + + Register MOReg = MO.getReg(); + // We can only optimize AGPR/VGPR virtual register + if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg)) + continue; + + if (MO.readsReg()) { + LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg); + const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + // Make sure two conditions are met: + // a.) the value is defined before/in the IF block + // b.) should be defined in the same loop-level. + if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) && + Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) { + // Check if the register is live into the endif block. If not, + // consider it killed in the else region. 
+ LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg); + if (!VI.isLiveIn(*Endif, MOReg, *MRI)) { + KillsInElse.insert(MOReg); + } else { + LLVM_DEBUG(dbgs() << "Excluding " << printReg(MOReg, TRI) + << " as Live in Endif\n"); + } + } + } + } + } + } + + // Check the phis in the Endif, looking for value coming from the ELSE + // region. Make sure the phi-use is the last use. + for (auto &MI : Endif->phis()) { + for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) { + auto &MO = MI.getOperand(Idx); + auto *Pred = MI.getOperand(Idx + 1).getMBB(); + if (Pred == Flow) + continue; + assert(ElseBlocks.contains(Pred) && "Should be from Else region\n"); + + if (!MO.isReg() || !MO.getReg() || MO.isUndef()) + continue; + + Register Reg = MO.getReg(); + if (Reg.isPhysical() || !TRI->isVectorRegister(*MRI, Reg)) + continue; + + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + + if (VI.isLiveIn(*Endif, Reg, *MRI)) { + LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI) + << " as Live in Endif\n"); + continue; + } + // Make sure two conditions are met: + // a.) the value is defined before/in the IF block + // b.) should be defined in the same loop-level. + const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent(); + if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) && + Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) + KillsInElse.insert(Reg); + } + } + + auto IsLiveThroughThen = [&](Register Reg) { + for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; + ++I) { + if (!I->readsReg()) + continue; + auto *UseMI = I->getParent(); + auto *UseMBB = UseMI->getParent(); + if (UseMBB == Flow || UseMBB == Endif) { + if (!UseMI->isPHI()) + return true; + + auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB(); + // The register is live through the path If->Flow or Flow->Endif. + // we should not optimize for such cases. + if ((UseMBB == Flow && IncomingMBB != If) || + (UseMBB == Endif && IncomingMBB == Flow)) + return true; + } + } + return false; + }; + + for (auto Reg : KillsInElse) { + if (!IsLiveThroughThen(Reg)) + CandidateRegs.push_back(Reg); + } +} + +/// Collect the registers used in the waterfall loop block that are defined +/// before. +void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const { + + for (auto &MI : Loop->instrs()) { + if (MI.isDebugInstr()) + continue; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg() || MO.isDef()) + continue; + + Register MOReg = MO.getReg(); + // We can only optimize AGPR/VGPR virtual register + if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg)) + continue; + + if (MO.readsReg()) { + const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + // Make sure the value is defined before the LOOP block + if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + // If the variable is used after the loop, the register coalescer will + // merge the newly created register and remove the phi node again. + // Just do nothing in that case. 
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg); + bool IsUsed = false; + for (auto *Succ : Loop->successors()) { + if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { + IsUsed = true; + break; + } + } + if (!IsUsed) { + LLVM_DEBUG(dbgs() << "Found candidate reg: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + CandidateRegs.insert(MOReg); + } else { + LLVM_DEBUG(dbgs() << "Reg is used after loop, ignoring: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + } + } + } + } + } +} + +// Re-calculate the liveness of \p Reg in the THEN-region +void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( + Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { + + SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming; + + MachineBasicBlock *ThenEntry = nullptr; + for (auto *Succ : If->successors()) { + if (Succ != Flow) { + ThenEntry = Succ; + break; + } + } + assert(ThenEntry && "No successor in Then region?"); + + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + df_iterator_default_set<MachineBasicBlock *, 16> Visited; + + for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { + if (MBB == Flow) + break; + + // Clear Live bit, as we will recalculate afterwards + LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB) + << '\n'); + OldVarInfo.AliveBlocks.reset(MBB->getNumber()); + } + + // Get the blocks the Reg should be alive through + for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; + ++I) { + auto *UseMI = I->getParent(); + if (UseMI->isPHI() && I->readsReg()) { + if (Visited.contains(UseMI->getParent())) + PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB()); + } + } + + Visited.clear(); + + for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { + if (MBB == Flow) + break; + + SmallVector<MachineInstr *> Uses; + // PHI instructions has been processed before. + findNonPHIUsesInBlock(Reg, MBB, Uses); + + if (Uses.size() == 1) { + LLVM_DEBUG(dbgs() << "Found one Non-PHI use in " + << printMBBReference(*MBB) << '\n'); + LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin())); + } else if (Uses.size() > 1) { + // Process the instructions in-order + LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in " + << printMBBReference(*MBB) << '\n'); + for (MachineInstr &MI : *MBB) { + if (llvm::is_contained(Uses, &MI)) + LV->HandleVirtRegUse(Reg, MBB, MI); + } + } + + // Mark Reg alive through the block if this is a PHI incoming block + if (PHIIncoming.contains(MBB)) + LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(), + MBB); + } + + // Set the isKilled flag if we get new Kills in the THEN region. 
+ for (auto *MI : OldVarInfo.Kills) { + if (Visited.contains(MI->getParent())) + MI->addRegisterKilled(Reg, TRI); + } +} + +void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion( + Register Reg, Register NewReg, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const { + LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + + // Transfer aliveBlocks from Reg to NewReg + for (auto *MBB : ElseBlocks) { + unsigned BBNum = MBB->getNumber(); + if (OldVarInfo.AliveBlocks.test(BBNum)) { + NewVarInfo.AliveBlocks.set(BBNum); + LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB) + << '\n'); + OldVarInfo.AliveBlocks.reset(BBNum); + } + } + + // Transfer the possible Kills in ElseBlocks from Reg to NewReg + auto I = OldVarInfo.Kills.begin(); + while (I != OldVarInfo.Kills.end()) { + if (ElseBlocks.contains((*I)->getParent())) { + NewVarInfo.Kills.push_back(*I); + I = OldVarInfo.Kills.erase(I); + } else { + ++I; + } + } +} + +void SIOptimizeVGPRLiveRange::optimizeLiveRange( + Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow, + MachineBasicBlock *Endif, + SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const { + // Insert a new PHI, marking the value from the THEN region being + // undef. + LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); + const auto *RC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(RC); + Register UndefReg = MRI->createVirtualRegister(RC); + MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : Flow->predecessors()) { + if (Pred == If) + PHI.addReg(Reg).addMBB(Pred); + else + PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); + } + + // Replace all uses in the ELSE region or the PHIs in ENDIF block + // Use early increment range because setReg() will update the linked list. + for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { + auto *UseMI = O.getParent(); + auto *UseBlock = UseMI->getParent(); + // Replace uses in Endif block + if (UseBlock == Endif) { + assert(UseMI->isPHI() && "Uses should be PHI in Endif block"); + O.setReg(NewReg); + continue; + } + + // Replace uses in Else region + if (ElseBlocks.contains(UseBlock)) + O.setReg(NewReg); + } + + // The optimized Reg is not alive through Flow blocks anymore. + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + OldVarInfo.AliveBlocks.reset(Flow->getNumber()); + + updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks); + updateLiveRangeInThenRegion(Reg, If, Flow); +} + +void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *Loop) const { + // Insert a new PHI, marking the value from the last loop iteration undef. + LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); + const auto *RC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(RC); + Register UndefReg = MRI->createVirtualRegister(RC); + + // Replace all uses in the LOOP region + // Use early increment range because setReg() will update the linked list. 
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { + auto *UseMI = O.getParent(); + auto *UseBlock = UseMI->getParent(); + // Replace uses in Loop block + if (UseBlock == Loop) + O.setReg(NewReg); + } + + MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : Loop->predecessors()) { + if (Pred == Loop) + PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); + else + PHI.addReg(Reg).addMBB(Pred); + } + + LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + + // collectWaterfallCandidateRegisters only collects registers that are dead + // after the loop. So we know that the old reg is not live throughout the + // whole block anymore. + OldVarInfo.AliveBlocks.reset(Loop->getNumber()); + + // Mark the last use as kill + for (auto &MI : reverse(Loop->instrs())) { + if (MI.readsRegister(NewReg, TRI)) { + MI.addRegisterKilled(NewReg, TRI); + NewVarInfo.Kills.push_back(&MI); + break; + } + } + assert(!NewVarInfo.Kills.empty() && + "Failed to find last usage of register in loop"); +} + +char SIOptimizeVGPRLiveRange::ID = 0; + +INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, + "SI Optimize VGPR LiveRange", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE, + "SI Optimize VGPR LiveRange", false, false) + +char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID; + +FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() { + return new SIOptimizeVGPRLiveRange(); +} + +bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + Loops = &getAnalysis<MachineLoopInfo>(); + LV = &getAnalysis<LiveVariables>(); + MRI = &MF.getRegInfo(); + + if (skipFunction(MF.getFunction())) + return false; + + bool MadeChange = false; + + // TODO: we need to think about the order of visiting the blocks to get + // optimal result for nesting if-else cases. + for (MachineBasicBlock &MBB : MF) { + for (auto &MI : MBB.terminators()) { + // Detect the if-else blocks + if (MI.getOpcode() == AMDGPU::SI_IF) { + MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB(); + auto *Endif = getElseTarget(IfTarget); + if (!Endif) + continue; + + SmallSetVector<MachineBasicBlock *, 16> ElseBlocks; + SmallVector<Register> CandidateRegs; + + LLVM_DEBUG(dbgs() << "Checking IF-ELSE-ENDIF: " + << printMBBReference(MBB) << ' ' + << printMBBReference(*IfTarget) << ' ' + << printMBBReference(*Endif) << '\n'); + + // Collect all the blocks in the ELSE region + collectElseRegionBlocks(IfTarget, Endif, ElseBlocks); + + // Collect the registers can be optimized + collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks, + CandidateRegs); + MadeChange |= !CandidateRegs.empty(); + // Now we are safe to optimize. + for (auto Reg : CandidateRegs) + optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); + } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " + << printMBBReference(MBB) << '\n'); + + SmallSetVector<Register, 16> CandidateRegs; + collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + MadeChange |= !CandidateRegs.empty(); + // Now we are safe to optimize. 
+ for (auto Reg : CandidateRegs) + optimizeWaterfallLiveRange(Reg, &MBB); + } + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index ab05081e55d5..e05aafe5e291 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -48,10 +48,18 @@ private: SmallSet<Register, 16> Defs; - bool isDependentLoad(const MachineInstr &MI) const; + void collectUsedRegUnits(const MachineInstr &MI, + BitVector &UsedRegUnits) const; + bool isBundleCandidate(const MachineInstr &MI) const; + bool isDependentLoad(const MachineInstr &MI) const; + bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const; }; +constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF | + SIInstrFlags::SMRD | SIInstrFlags::DS | + SIInstrFlags::FLAT | SIInstrFlags::MIMG; + } // End anonymous namespace. INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false) @@ -80,55 +88,125 @@ bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const { return false; } +void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, + BitVector &UsedRegUnits) const { + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.readsReg()) + continue; + + Register Reg = Op.getReg(); + assert(!Op.getSubReg() && + "subregister indexes should not be present after RA"); + + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + UsedRegUnits.set(*Units); + } +} + +bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const { + const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags; + return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled(); +} + +bool SIPostRABundler::canBundle(const MachineInstr &MI, + const MachineInstr &NextMI) const { + const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags; + + return (IMemFlags != 0 && MI.mayLoadOrStore() && !NextMI.isBundled() && + NextMI.mayLoad() == MI.mayLoad() && NextMI.mayStore() == MI.mayStore() && + ((NextMI.getDesc().TSFlags & MemFlags) == IMemFlags) && + !isDependentLoad(NextMI)); +} + bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); - bool Changed = false; - const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF | - SIInstrFlags::SMRD | SIInstrFlags::DS | - SIInstrFlags::FLAT | SIInstrFlags::MIMG; + BitVector BundleUsedRegUnits(TRI->getNumRegUnits()); + BitVector KillUsedRegUnits(TRI->getNumRegUnits()); + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::instr_iterator Next; MachineBasicBlock::instr_iterator B = MBB.instr_begin(); MachineBasicBlock::instr_iterator E = MBB.instr_end(); + for (auto I = B; I != E; I = Next) { Next = std::next(I); + if (!isBundleCandidate(*I)) + continue; + + assert(Defs.empty()); + + if (I->getNumExplicitDefs() != 0) + Defs.insert(I->defs().begin()->getReg()); + + MachineBasicBlock::instr_iterator BundleStart = I; + MachineBasicBlock::instr_iterator BundleEnd = I; + unsigned ClauseLength = 1; + for (I = Next; I != E; I = Next) { + Next = std::next(I); + + assert(BundleEnd != I); + if (canBundle(*BundleEnd, *I)) { + BundleEnd = I; + if (I->getNumExplicitDefs() != 0) + Defs.insert(I->defs().begin()->getReg()); + ++ClauseLength; + } else if (!I->isMetaInstruction()) { + // Allow meta instructions in between bundle candidates, but do not + // start or end a bundle on one. 
+ // + // TODO: It may be better to move meta instructions like dbg_value + // after the bundle. We're relying on the memory legalizer to unbundle + // these. + break; + } + } + + Next = std::next(BundleEnd); + if (ClauseLength > 1) { + Changed = true; + + // Before register allocation, kills are inserted after potential soft + // clauses to hint register allocation. Look for kills that look like + // this, and erase them. + if (Next != E && Next->isKill()) { + + // TODO: Should maybe back-propagate kill flags to the bundle. + for (const MachineInstr &BundleMI : make_range(BundleStart, Next)) + collectUsedRegUnits(BundleMI, BundleUsedRegUnits); + + BundleUsedRegUnits.flip(); - const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags; + while (Next != E && Next->isKill()) { + MachineInstr &Kill = *Next; + collectUsedRegUnits(Kill, KillUsedRegUnits); - if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() || - B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() || - ((B->getDesc().TSFlags & MemFlags) != IMemFlags) || - isDependentLoad(*I)) { + KillUsedRegUnits &= BundleUsedRegUnits; - if (B != I) { - if (std::next(B) != I) { - finalizeBundle(MBB, B, I); - Changed = true; + // Erase the kill if it's a subset of the used registers. + // + // TODO: Should we just remove all kills? Is there any real reason to + // keep them after RA? + if (KillUsedRegUnits.none()) { + ++Next; + Kill.eraseFromParent(); + } else + break; + + KillUsedRegUnits.reset(); } - Next = I; + + BundleUsedRegUnits.reset(); } - B = Next; - Defs.clear(); - continue; + finalizeBundle(MBB, BundleStart, Next); } - if (I->getNumExplicitDefs() == 0) - continue; - - Defs.insert(I->defs().begin()->getReg()); - } - - if (B != E && std::next(B) != E) { - finalizeBundle(MBB, B, E); - Changed = true; + Defs.clear(); } - - Defs.clear(); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index dc08d9dcb9bb..c2e2875ed6bf 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -38,6 +38,9 @@ private: RegisterClassInfo RegClassInfo; std::vector<unsigned> RegsToRewrite; +#ifndef NDEBUG + void printWWMInfo(const MachineInstr &MI); +#endif public: static char ID; @@ -139,13 +142,26 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { } SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); for (unsigned Reg : RegsToRewrite) { LIS->removeInterval(Reg); const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); - MFI->ReserveWWMRegister(PhysReg); + + // Check if PhysReg is already reserved + if (!MFI->WWMReservedRegs.count(PhysReg)) { + Optional<int> FI; + if (!MFI->isEntryFunction()) { + // Create a stack object for a possible spill in the function prologue. + // Note: Non-CSR VGPR also need this as we may overwrite inactive lanes. 
+ const TargetRegisterClass *RC = TRI->getPhysRegClass(PhysReg); + FI = FrameInfo.CreateSpillStackObject(TRI->getSpillSize(*RC), + TRI->getSpillAlign(*RC)); + } + MFI->reserveWWMRegister(PhysReg, FI); + } } RegsToRewrite.clear(); @@ -154,6 +170,31 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { MRI->freezeReservedRegs(MF); } +#ifndef NDEBUG +LLVM_DUMP_METHOD void +SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) { + + unsigned Opc = MI.getOpcode(); + + if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) { + dbgs() << "Entering "; + } else { + assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM); + dbgs() << "Exiting "; + } + + if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) { + dbgs() << "Strict WWM "; + } else { + assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM); + dbgs() << "Strict WQM "; + } + + dbgs() << "region: " << MI; +} + +#endif + bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n"); @@ -185,21 +226,23 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) RegsAssigned |= processDef(MI.getOperand(0)); - if (MI.getOpcode() == AMDGPU::ENTER_WWM) { - LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n"); + if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM || + MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) { + LLVM_DEBUG(printWWMInfo(MI)); InWWM = true; continue; } - if (MI.getOpcode() == AMDGPU::EXIT_WWM) { - LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n"); + if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM || + MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) { + LLVM_DEBUG(printWWMInfo(MI)); InWWM = false; } if (!InWWM) continue; - LLVM_DEBUG(dbgs() << "processing " << MI << "\n"); + LLVM_DEBUG(dbgs() << "Processing " << MI); for (MachineOperand &DefOpnd : MI.defs()) { RegsAssigned |= processDef(DefOpnd); diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 9ca43512cd91..dce0f4b0df5f 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -14,13 +14,20 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; #define DEBUG_TYPE "si-pre-emit-peephole" +static unsigned SkipThreshold; + +static cl::opt<unsigned, true> SkipThresholdFlag( + "amdgpu-skip-threshold", cl::Hidden, + cl::desc( + "Number of instructions before jumping over divergent control flow"), + cl::location(SkipThreshold), cl::init(12)); + namespace { class SIPreEmitPeephole : public MachineFunctionPass { @@ -30,6 +37,13 @@ private: bool optimizeVccBranch(MachineInstr &MI) const; bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const; + bool getBlockDestinations(MachineBasicBlock &SrcMBB, + MachineBasicBlock *&TrueMBB, + MachineBasicBlock *&FalseMBB, + SmallVectorImpl<MachineOperand> &Cond); + bool mustRetainExeczBranch(const MachineBasicBlock &From, + const MachineBasicBlock &To) const; + bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); public: static char ID; @@ -219,8 +233,11 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, return false; // Scan back to find an identical S_SET_GPR_IDX_ON - for (MachineBasicBlock::iterator I = 
std::next(First.getIterator()), - E = MI.getIterator(); I != E; ++I) { + for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), + E = MI.getIterator(); + I != E; ++I) { + if (I->isBundle()) + continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: return false; @@ -249,9 +266,77 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, } } - MI.eraseFromParent(); + MI.eraseFromBundle(); for (MachineInstr *RI : ToRemove) - RI->eraseFromParent(); + RI->eraseFromBundle(); + return true; +} + +bool SIPreEmitPeephole::getBlockDestinations( + MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB, + MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) { + if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond)) + return false; + + if (!FalseMBB) + FalseMBB = SrcMBB.getNextNode(); + + return true; +} + +bool SIPreEmitPeephole::mustRetainExeczBranch( + const MachineBasicBlock &From, const MachineBasicBlock &To) const { + unsigned NumInstr = 0; + const MachineFunction *MF = From.getParent(); + + for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might never be taken when EXEC = 0. + // Hence we should retain cbranch out of the loop lest it become infinite. + if (I->isConditionalBranch()) + return true; + + if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) + return true; + + // These instructions are potentially expensive even if EXEC = 0. + if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || + TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) + return true; + + ++NumInstr; + if (NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +// Returns true if the skip branch instruction is removed. +bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, + MachineBasicBlock &SrcMBB) { + MachineBasicBlock *TrueMBB = nullptr; + MachineBasicBlock *FalseMBB = nullptr; + SmallVector<MachineOperand, 1> Cond; + + if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond)) + return false; + + // Consider only the forward branches. 
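Schematically, the pattern removeExeczBranch() targets looks like this (block numbers are illustrative; the actual decision is exactly the forward-branch check and the mustRetainExeczBranch() walk that follow):

    // %bb.0 (SrcMBB):
    //   s_cbranch_execz %bb.2    ; MI: skip the divergent region when no lane is active
    // %bb.1 (FalseMBB):
    //   ...                      ; short, and harmless to execute with EXEC = 0
    // %bb.2 (TrueMBB):
    //   ...
    // When the skipped region is cheap and has no unwanted effects with EXEC = 0,
    // the s_cbranch_execz is erased and %bb.2 is dropped from %bb.0's successors.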
+ if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || + mustRetainExeczBranch(*FalseMBB, *TrueMBB)) + return false; + + LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); + MI.eraseFromParent(); + SrcMBB.removeSuccessor(TrueMBB); + return true; } @@ -259,52 +344,25 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MachineBasicBlock *EmptyMBBAtEnd = nullptr; bool Changed = false; + MF.RenumberBlocks(); + for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator(); - MachineBasicBlock::iterator TermI = MBBE; - // Check first terminator for VCC branches to optimize + MachineBasicBlock::iterator TermI = MBB.getFirstTerminator(); + // Check first terminator for branches to optimize if (TermI != MBB.end()) { MachineInstr &MI = *TermI; switch (MI.getOpcode()) { case AMDGPU::S_CBRANCH_VCCZ: case AMDGPU::S_CBRANCH_VCCNZ: Changed |= optimizeVccBranch(MI); - continue; - default: + break; + case AMDGPU::S_CBRANCH_EXECZ: + Changed |= removeExeczBranch(MI, MBB); break; } } - // Check all terminators for SI_RETURN_TO_EPILOG - // FIXME: This is not an optimization and should be moved somewhere else. - while (TermI != MBB.end()) { - MachineInstr &MI = *TermI; - if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { - assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); - - // Graphics shaders returning non-void shouldn't contain S_ENDPGM, - // because external bytecode will be appended at the end. - if (&MBB != &MF.back() || &MI != &MBB.back()) { - // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block - // at the end and jump there. - if (!EmptyMBBAtEnd) { - EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); - MF.insert(MF.end(), EmptyMBBAtEnd); - } - - MBB.addSuccessor(EmptyMBBAtEnd); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(EmptyMBBAtEnd); - MI.eraseFromParent(); - MBBE = MBB.getFirstTerminator(); - TermI = MBBE; - continue; - } - } - TermI++; - } if (!ST.hasVGPRIndexMode()) continue; @@ -315,10 +373,10 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a // second is not needed. Do expensive checks in the optimizeSetGPR() // and limit the distance to 20 instructions for compile time purposes. - for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) { - MachineInstr &MI = *MBBI; - ++MBBI; - + // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions + // may be bundled with the instructions they modify. 
+ for (auto &MI : + make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) { if (Count == Threshold) SetGPRMI = nullptr; else diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 9b72d0829d80..b13afceba20e 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -41,10 +41,13 @@ struct SIProgramInfo { uint32_t ScratchBlocks = 0; uint64_t ComputePGMRSrc2 = 0; + uint64_t ComputePGMRSrc3GFX90A = 0; uint32_t NumVGPR = 0; uint32_t NumArchVGPR = 0; uint32_t NumAccVGPR = 0; + uint32_t AccumOffset = 0; + uint32_t TgSplit = 0; uint32_t NumSGPR = 0; uint32_t LDSSize = 0; bool FlatUsed = false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 7a45d8c54f9a..bba5bf7fdbc3 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -43,6 +43,233 @@ std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable; static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9}; +namespace llvm { + +// A temporary struct to spill SGPRs. +// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits +// just v_writelane and v_readlane. +// +// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR +// is saved to scratch (or the other way around for loads). +// For this, a VGPR is required where the needed lanes can be clobbered. The +// RegScavenger can provide a VGPR where currently active lanes can be +// clobbered, but we still need to save inactive lanes. +// The high-level steps are: +// - Try to scavenge SGPR(s) to save exec +// - Try to scavenge VGPR +// - Save needed, all or inactive lanes of a TmpVGPR +// - Spill/Restore SGPRs using TmpVGPR +// - Restore TmpVGPR +// +// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we +// cannot scavenge temporary SGPRs to save exec, we use the following code: +// buffer_store_dword TmpVGPR ; only if active lanes need to be saved +// s_not exec, exec +// buffer_store_dword TmpVGPR ; save inactive lanes +// s_not exec, exec +struct SGPRSpillBuilder { + struct PerVGPRData { + unsigned PerVGPR; + unsigned NumVGPRs; + int64_t VGPRLanes; + }; + + // The SGPR to save + Register SuperReg; + MachineBasicBlock::iterator MI; + ArrayRef<int16_t> SplitParts; + unsigned NumSubRegs; + bool IsKill; + const DebugLoc &DL; + + /* When spilling to stack */ + // The SGPRs are written into this VGPR, which is then written to scratch + // (or vice versa for loads). + Register TmpVGPR = AMDGPU::NoRegister; + // Temporary spill slot to save TmpVGPR to. + int TmpVGPRIndex = 0; + // If TmpVGPR is live before the spill or if it is scavenged. + bool TmpVGPRLive = false; + // Scavenged SGPR to save EXEC. + Register SavedExecReg = AMDGPU::NoRegister; + // Stack index to write the SGPRs to. 
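A simplified sketch of how the memory path of an SGPR spill is expected to drive this builder, assuming the same TRI/TII/MI/Index/RS values that spillSGPR() below already has in hand (the real code additionally iterates per 32/64-lane chunk and tracks kill and tied-operand state):

    // TRI and TII are the SIRegisterInfo/SIInstrInfo references, as in spillSGPR().
    SGPRSpillBuilder SB(TRI, TII, IsWave32, MI, Index, RS);
    SB.prepare();                                   // scavenge TmpVGPR, set up exec
    for (unsigned Lane = 0; Lane < SB.NumSubRegs; ++Lane) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(TRI.getSubReg(SB.SuperReg, SB.SplitParts[Lane]));
      // Move one SGPR into one lane of the temporary VGPR.
      BuildMI(SB.MBB, MI, SB.DL, TII.get(AMDGPU::V_WRITELANE_B32), SB.TmpVGPR)
          .addReg(SubReg)
          .addImm(Lane)
          .addReg(SB.TmpVGPR);
    }
    SB.readWriteTmpVGPR(0, /*IsLoad*/ false);       // write TmpVGPR to the stack slot
    SB.restore();                                   // restore TmpVGPR and exec

The members that follow, starting with the stack Index referred to just above, simply carry this state between prepare(), the lane moves, and restore().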
+ int Index; + unsigned EltSize = 4; + + RegScavenger *RS; + MachineBasicBlock &MBB; + MachineFunction &MF; + SIMachineFunctionInfo &MFI; + const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + bool IsWave32; + Register ExecReg; + unsigned MovOpc; + unsigned NotOpc; + + SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, + bool IsWave32, MachineBasicBlock::iterator MI, int Index, + RegScavenger *RS) + : SuperReg(MI->getOperand(0).getReg()), MI(MI), + IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index), + RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()), + MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), + IsWave32(IsWave32) { + const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg); + SplitParts = TRI.getRegSplitParts(RC, EltSize); + NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + + if (IsWave32) { + ExecReg = AMDGPU::EXEC_LO; + MovOpc = AMDGPU::S_MOV_B32; + NotOpc = AMDGPU::S_NOT_B32; + } else { + ExecReg = AMDGPU::EXEC; + MovOpc = AMDGPU::S_MOV_B64; + NotOpc = AMDGPU::S_NOT_B64; + } + + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && + SuperReg != AMDGPU::EXEC && "exec should never spill"); + } + + PerVGPRData getPerVGPRData() { + PerVGPRData Data; + Data.PerVGPR = IsWave32 ? 32 : 64; + Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR; + Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL; + return Data; + } + + // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is + // free. + // Writes these instructions if an SGPR can be scavenged: + // s_mov_b64 s[6:7], exec ; Save exec + // s_mov_b64 exec, 3 ; Wanted lanemask + // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot + // + // Writes these instructions if no SGPR can be scavenged: + // buffer_store_dword v0 ; Only if no free VGPR was found + // s_not_b64 exec, exec + // buffer_store_dword v0 ; Save inactive lanes + // ; exec stays inverted, it is flipped back in + // ; restore. + void prepare() { + // Scavenged temporary VGPR to use. It must be scavenged once for any number + // of spilled subregs. + // FIXME: The liveness analysis is limited and does not tell if a register + // is in use in lanes that are currently inactive. We can never be sure if + // a register as actually in use in another lane, so we need to save all + // used lanes of the chosen VGPR. + assert(RS && "Cannot spill SGPR to memory without RegScavenger"); + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false); + + // Reserve temporary stack slot + TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); + if (TmpVGPR) { + // Found a register that is dead in the currently active lanes, we only + // need to spill inactive lanes. + TmpVGPRLive = false; + } else { + // Pick v0 because it doesn't make a difference. + TmpVGPR = AMDGPU::VGPR0; + TmpVGPRLive = true; + } + + // Try to scavenge SGPRs to save exec + assert(!SavedExecReg && "Exec is already saved, refuse to save again"); + const TargetRegisterClass &RC = + IsWave32 ? 
AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass; + RS->setRegUsed(SuperReg); + SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false); + + int64_t VGPRLanes = getPerVGPRData().VGPRLanes; + + if (SavedExecReg) { + RS->setRegUsed(SavedExecReg); + // Set exec to needed lanes + BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg); + auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes); + if (!TmpVGPRLive) + I.addReg(TmpVGPR, RegState::ImplicitDefine); + // Spill needed lanes + TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); + } else { + // Spill active lanes + if (TmpVGPRLive) + TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, + /*IsKill*/ false); + // Spill inactive lanes + auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + if (!TmpVGPRLive) + I.addReg(TmpVGPR, RegState::ImplicitDefine); + TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); + } + } + + // Writes these instructions if an SGPR can be scavenged: + // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot + // s_waitcnt vmcnt(0) ; If a free VGPR was found + // s_mov_b64 exec, s[6:7] ; Save exec + // + // Writes these instructions if no SGPR can be scavenged: + // buffer_load_dword v0 ; Restore inactive lanes + // s_waitcnt vmcnt(0) ; If a free VGPR was found + // s_not_b64 exec, exec + // buffer_load_dword v0 ; Only if no free VGPR was found + void restore() { + if (SavedExecReg) { + // Restore used lanes + TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, + /*IsKill*/ false); + // Restore exec + auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg) + .addReg(SavedExecReg, RegState::Kill); + // Add an implicit use of the load so it is not dead. + // FIXME This inserts an unnecessary waitcnt + if (!TmpVGPRLive) { + I.addReg(TmpVGPR, RegState::ImplicitKill); + } + } else { + // Restore inactive lanes + TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, + /*IsKill*/ false); + auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + if (!TmpVGPRLive) { + I.addReg(TmpVGPR, RegState::ImplicitKill); + } + // Restore active lanes + if (TmpVGPRLive) + TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); + } + } + + // Write TmpVGPR to memory or read TmpVGPR from memory. + // Either using a single buffer_load/store if exec is set to the needed mask + // or using + // buffer_load + // s_not exec, exec + // buffer_load + // s_not exec, exec + void readWriteTmpVGPR(unsigned Offset, bool IsLoad) { + if (SavedExecReg) { + // Spill needed lanes + TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); + } else { + // Spill active lanes + TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, + /*IsKill*/ false); + // Spill inactive lanes + BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); + BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + } + } +}; + +} // namespace llvm + SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { @@ -122,7 +349,9 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( case CallingConv::Fast: case CallingConv::Cold: case CallingConv::AMDGPU_Gfx: - return CSR_AMDGPU_HighRegs_SaveList; + return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() + ? 
CSR_AMDGPU_HighRegs_With_AGPRs_SaveList + : CSR_AMDGPU_HighRegs_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -143,7 +372,9 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, case CallingConv::Fast: case CallingConv::Cold: case CallingConv::AMDGPU_Gfx: - return CSR_AMDGPU_HighRegs_RegMask; + return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() + ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask + : CSR_AMDGPU_HighRegs_RegMask; default: return nullptr; } @@ -172,7 +403,7 @@ bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When we need stack realignment, we can't reference off of the // stack pointer, so we reserve a base pointer. const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.getNumFixedObjects() && needsStackRealignment(MF); + return MFI.getNumFixedObjects() && shouldRealignStack(MF); } Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } @@ -181,6 +412,14 @@ const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { return CSR_AMDGPU_AllVGPRs_RegMask; } +const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { + return CSR_AMDGPU_AllAGPRs_RegMask; +} + +const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { + return CSR_AMDGPU_AllVectorRegs_RegMask; +} + const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { return CSR_AMDGPU_AllAllocatableSRegs_RegMask; } @@ -263,6 +502,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); + // TODO: In an entry function without calls and AGPRs used it is possible + // to use the whole register budget for VGPRs. Even more it shall + // be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (ST.hasGFX90AInsts()) + MaxNumVGPRs /= 2; unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); @@ -323,9 +568,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } - for (MCRegister Reg : MFI->WWMReservedRegs) { - reserveRegisterTuples(Reserved, Reg); + for (auto Reg : MFI->WWMReservedRegs) { + reserveRegisterTuples(Reserved, Reg.first); + } + + // Reserve VGPRs used for SGPR spilling. + // Note we treat freezeReservedRegs unusually because we run register + // allocation in two phases. It's OK to re-freeze with new registers for the + // second run. +#if 0 + for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { + for (auto &SpilledVGPR : SpilledFI.second) + reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); } +#endif // FIXME: Stop using reserved registers for this. for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) @@ -340,7 +596,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { +bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); // On entry, the base address is 0, so it can't possibly need any more // alignment. 
@@ -350,7 +606,7 @@ bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const { if (Info->isEntryFunction()) return false; - return TargetRegisterInfo::canRealignStack(MF); + return TargetRegisterInfo::shouldRealignStack(MF); } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { @@ -408,7 +664,7 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, } bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { - if (!MI->mayLoadOrStore()) + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return false; int64_t FullOffset = Offset + getScratchInstrOffset(MI); @@ -417,7 +673,8 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); const SIInstrInfo *TII = ST.getInstrInfo(); - return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true); + return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch); } Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, @@ -457,7 +714,7 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addFrameIndex(FrameIdx); if (ST.enableFlatScratch() ) { - BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg) + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); return BaseReg; @@ -500,7 +757,8 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI)); if (IsFlat) { - assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) && + assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch) && "offset should be legal"); FIOp->ChangeToRegister(BaseReg, false); OffsetOp->setImm(NewOffset); @@ -531,7 +789,8 @@ bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); const SIInstrInfo *TII = ST.getInstrInfo(); - return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true); + return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch); } const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( @@ -566,6 +825,13 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_A256_SAVE: case AMDGPU::SI_SPILL_A256_RESTORE: return 8; + case AMDGPU::SI_SPILL_S224_SAVE: + case AMDGPU::SI_SPILL_S224_RESTORE: + case AMDGPU::SI_SPILL_V224_SAVE: + case AMDGPU::SI_SPILL_V224_RESTORE: + case AMDGPU::SI_SPILL_A224_SAVE: + case AMDGPU::SI_SPILL_A224_RESTORE: + return 7; case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S192_RESTORE: case AMDGPU::SI_SPILL_V192_SAVE: @@ -667,13 +933,11 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - int Index, - unsigned Lane, - unsigned ValueReg, - bool IsKill) { - MachineBasicBlock *MBB = MI->getParent(); - MachineFunction *MF = MI->getParent()->getParent(); + int Index, unsigned Lane, + unsigned ValueReg, bool IsKill) { + MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -691,8 +955,8 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_ACCVGPR_READ_B32_e64; - auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) - .addReg(Src, getKillRegState(IsKill)); + auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + .addReg(Src, getKillRegState(IsKill)); MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); return MIB; } @@ -716,7 +980,7 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) + if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr()) return true; MachineInstrBuilder NewMI = @@ -725,10 +989,8 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc + .addImm(0) // cpol .addImm(0) // tfe - .addImm(0) // dlc .addImm(0) // swz .cloneMemRefs(*MI); @@ -774,23 +1036,20 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, return LoadStoreOp; } -void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - int Index, - Register ValueReg, - bool IsKill, - MCRegister ScratchOffsetReg, - int64_t InstOffset, - MachineMemOperand *MMO, - RegScavenger *RS) const { - MachineBasicBlock *MBB = MI->getParent(); - MachineFunction *MF = MI->getParent()->getParent(); +void SIRegisterInfo::buildSpillLoadStore( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, + MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, + RegScavenger *RS, LivePhysRegs *LiveRegs) const { + assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both"); + + MachineFunction *MF = MBB.getParent(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); const MCInstrDesc *Desc = &TII->get(LoadStoreOp); - const DebugLoc &DL = MI->getDebugLoc(); + const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); bool IsStore = Desc->mayStore(); bool IsFlat = TII->isFLATScratch(LoadStoreOp); @@ -798,7 +1057,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MCRegister SOffset = ScratchOffsetReg; const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); - const bool IsAGPR = hasAGPRs(RC); + // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. + const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC); const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; // Always use 4 byte operations for AGPRs because we need to scavenge @@ -823,9 +1083,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, assert((IsFlat || ((Offset % EltSize) == 0)) && "unexpected VGPR spill offset"); - bool IsOffsetLegal = IsFlat - ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true) - : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); + bool IsOffsetLegal = + IsFlat ? 
TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch) + : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { SOffset = MCRegister(); @@ -836,9 +1097,17 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, Offset *= ST.getWavefrontSize(); // We don't have access to the register scavenger if this function is called - // during PEI::scavengeFrameVirtualRegs(). - if (RS) + // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. + if (RS) { SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); + } else if (LiveRegs) { + for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { + if (LiveRegs->available(MF->getRegInfo(), Reg)) { + SOffset = Reg; + break; + } + } + } if (!SOffset) { // There are no free SGPRs, and since we are in the process of spilling @@ -860,10 +1129,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, report_fatal_error("could not scavenge SGPR to spill in entry function"); if (ScratchOffsetReg == AMDGPU::NoRegister) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) - .addImm(Offset); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); } @@ -916,7 +1184,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, Register Sub = IsSubReg ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane))) : ValueReg; - auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill); + auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill); if (!MIB.getInstr()) break; if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) { @@ -962,9 +1230,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, RS->setRegUsed(TmpReg); } if (IsStore) { - auto AccRead = BuildMI(*MBB, MI, DL, - TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) - .addReg(SubReg, getKillRegState(IsKill)); + auto AccRead = BuildMI(MBB, MI, DL, + TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) + .addReg(SubReg, getKillRegState(IsKill)); if (NeedSuperRegDef) AccRead.addReg(ValueReg, RegState::ImplicitDefine); AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); @@ -977,9 +1245,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, commonAlignment(Alignment, RemRegOffset)); - auto MIB = BuildMI(*MBB, MI, DL, *Desc) - .addReg(SubReg, - getDefRegState(!IsStore) | getKillRegState(IsKill)); + auto MIB = + BuildMI(MBB, MI, DL, *Desc) + .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); if (!IsFlat) MIB.addReg(FuncInfo->getScratchRSrcReg()); @@ -990,11 +1258,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MIB.addReg(SOffset, SOffsetRegState); } MIB.addImm(Offset + RemRegOffset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe for MUBUF or dlc for FLAT + .addImm(0); // cpol if (!IsFlat) - MIB.addImm(0) // dlc + MIB.addImm(0) // tfe .addImm(0); // swz MIB.addMemOperand(NewMMO); @@ -1002,9 +1268,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MIB.addReg(ValueReg, RegState::ImplicitDefine); if (!IsStore && TmpReg != AMDGPU::NoRegister) { - MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), + MIB = 
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), FinalReg) - .addReg(TmpReg, RegState::Kill); + .addReg(TmpReg, RegState::Kill); MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); } @@ -1014,321 +1280,239 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, if (ScratchOffsetRegDelta != 0) { // Subtract the offset we added to the ScratchOffset register. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(SOffset) - .addImm(ScratchOffsetRegDelta); + .addImm(-ScratchOffsetRegDelta); } } -// Generate a VMEM access which loads or stores the VGPR containing an SGPR -// spill such that all the lanes set in VGPRLanes are loaded or stored. -// This generates exec mask manipulation and will use SGPRs available in MI -// or VGPR lanes in the VGPR to save and restore the exec mask. -void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, - int Index, int Offset, - unsigned EltSize, Register VGPR, - int64_t VGPRLanes, - RegScavenger *RS, - bool IsLoad) const { - MachineBasicBlock *MBB = MI->getParent(); - MachineFunction *MF = MBB->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - - Register SuperReg = MI->getOperand(0).getReg(); - const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); - unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - unsigned FirstPart = Offset * 32; - unsigned ExecLane = 0; - - bool IsKill = MI->getOperand(0).isKill(); - const DebugLoc &DL = MI->getDebugLoc(); - - // Cannot handle load/store to EXEC - assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && - SuperReg != AMDGPU::EXEC && "exec should never spill"); - - // On Wave32 only handle EXEC_LO. - // On Wave64 only update EXEC_HI if there is sufficent space for a copy. - bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI; - - unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - Register SavedExecReg; - - // Backup EXEC - if (OnlyExecLo) { - SavedExecReg = - NumSubRegs == 1 - ? SuperReg - : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane])); - } else { - // If src/dst is an odd size it is possible subreg0 is not aligned. - for (; ExecLane < (NumSubRegs - 1); ++ExecLane) { - SavedExecReg = getMatchingSuperReg( - getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0, - &AMDGPU::SReg_64_XEXECRegClass); - if (SavedExecReg) - break; - } - } - assert(SavedExecReg); - BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg); - - // Setup EXEC - BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes); - +void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, + int Offset, bool IsLoad, + bool IsKill) const { // Load/store VGPR - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo(); assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); - Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) - ? getBaseRegister() - : getFrameRegister(*MF); + Register FrameReg = + FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF) + ? 
getBaseRegister() + : getFrameRegister(SB.MF); Align Alignment = FrameInfo.getObjectAlign(Index); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(*MF, Index); - MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index); + MachineMemOperand *MMO = SB.MF.getMachineMemOperand( PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, - EltSize, Alignment); + SB.EltSize, Alignment); if (IsLoad) { unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; - buildSpillLoadStore(MI, Opc, - Index, - VGPR, false, - FrameReg, - Offset * EltSize, MMO, - RS); + buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg, + Offset * SB.EltSize, MMO, SB.RS); } else { unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; - buildSpillLoadStore(MI, Opc, Index, VGPR, - IsKill, FrameReg, - Offset * EltSize, MMO, RS); + buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg, + Offset * SB.EltSize, MMO, SB.RS); // This only ever adds one VGPR spill - MFI->addToSpilledVGPRs(1); - } - - // Restore EXEC - BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg) - .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill)); - - // Restore clobbered SGPRs - if (IsLoad) { - // Nothing to do; register will be overwritten - } else if (!IsKill) { - // Restore SGPRs from appropriate VGPR lanes - if (!OnlyExecLo) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), - getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1])) - .addReg(VGPR) - .addImm(ExecLane + 1); - } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), - NumSubRegs == 1 ? SavedExecReg - : Register(getSubReg( - SuperReg, SplitParts[FirstPart + ExecLane]))) - .addReg(VGPR, RegState::Kill) - .addImm(ExecLane); + SB.MFI.addToSpilledVGPRs(1); } } bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { - MachineBasicBlock *MBB = MI->getParent(); - MachineFunction *MF = MBB->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills - = MFI->getSGPRToVGPRSpills(Index); + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = + SB.MFI.getSGPRToVGPRSpills(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; - const SIInstrInfo *TII = ST.getInstrInfo(); - - Register SuperReg = MI->getOperand(0).getReg(); - bool IsKill = MI->getOperand(0).isKill(); - const DebugLoc &DL = MI->getDebugLoc(); - - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && - SuperReg != MFI->getFrameOffsetReg())); - - assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && - SuperReg != AMDGPU::EXEC && "exec should never spill"); - - unsigned EltSize = 4; - const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - - ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); - unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() && + SB.SuperReg != SB.MFI.getFrameOffsetReg())); if (SpillToVGPR) { - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - Register SubReg = NumSubRegs == 1 - ? 
SuperReg - : Register(getSubReg(SuperReg, SplitParts[i])); + for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { + Register SubReg = + SB.NumSubRegs == 1 + ? SB.SuperReg + : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; - bool UseKill = IsKill && i == NumSubRegs - 1; + bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1; // Mark the "old value of vgpr" input undef only if this is the first sgpr // spill to this specific vgpr in the first basic block. - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) - .addReg(SubReg, getKillRegState(UseKill)) - .addImm(Spill.Lane) - .addReg(Spill.VGPR); + auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg, getKillRegState(UseKill)) + .addImm(Spill.Lane) + .addReg(Spill.VGPR); + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } - if (i == 0 && NumSubRegs > 1) { + if (i == 0 && SB.NumSubRegs > 1) { // We may be spilling a super-register which is only partially defined, // and need to ensure later spills think the value is defined. - MIB.addReg(SuperReg, RegState::ImplicitDefine); + MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); } - if (NumSubRegs > 1) - MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit); + if (SB.NumSubRegs > 1) + MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit); // FIXME: Since this spills to another register instead of an actual // frame index, we should delete the frame index when all references to // it are fixed. } } else { - // Scavenged temporary VGPR to use. It must be scavenged once for any number - // of spilled subregs. - Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - RS->setRegUsed(TmpVGPR); + SB.prepare(); - // SubReg carries the "Kill" flag when SubReg == SuperReg. - unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); + // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. + unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); - unsigned PerVGPR = 32; - unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; - int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; + // Per VGPR helper data + auto PVD = SB.getPerVGPRData(); - for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { + for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { unsigned TmpVGPRFlags = RegState::Undef; // Write sub registers into the VGPR - for (unsigned i = Offset * PerVGPR, - e = std::min((Offset + 1) * PerVGPR, NumSubRegs); + for (unsigned i = Offset * PVD.PerVGPR, + e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); i < e; ++i) { - Register SubReg = NumSubRegs == 1 - ? SuperReg - : Register(getSubReg(SuperReg, SplitParts[i])); + Register SubReg = + SB.NumSubRegs == 1 + ? SB.SuperReg + : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); MachineInstrBuilder WriteLane = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR) + BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), + SB.TmpVGPR) .addReg(SubReg, SubKillState) - .addImm(i % PerVGPR) - .addReg(TmpVGPR, TmpVGPRFlags); + .addImm(i % PVD.PerVGPR) + .addReg(SB.TmpVGPR, TmpVGPRFlags); TmpVGPRFlags = 0; + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane); + else + LIS->InsertMachineInstrInMaps(*WriteLane); + } + // There could be undef components of a spilled super register. 
// TODO: Can we detect this and skip the spill? - if (NumSubRegs > 1) { - // The last implicit use of the SuperReg carries the "Kill" flag. + if (SB.NumSubRegs > 1) { + // The last implicit use of the SB.SuperReg carries the "Kill" flag. unsigned SuperKillState = 0; - if (i + 1 == NumSubRegs) - SuperKillState |= getKillRegState(IsKill); - WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState); + if (i + 1 == SB.NumSubRegs) + SuperKillState |= getKillRegState(SB.IsKill); + WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); } } // Write out VGPR - buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, - RS, false); + SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false); } + + SB.restore(); } MI->eraseFromParent(); - MFI->addToSpilledSGPRs(NumSubRegs); + SB.MFI.addToSpilledSGPRs(SB.NumSubRegs); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); + return true; } bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { - MachineFunction *MF = MI->getParent()->getParent(); - MachineBasicBlock *MBB = MI->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills - = MFI->getSGPRToVGPRSpills(Index); + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = + SB.MFI.getSGPRToVGPRSpills(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; - const SIInstrInfo *TII = ST.getInstrInfo(); - const DebugLoc &DL = MI->getDebugLoc(); - - Register SuperReg = MI->getOperand(0).getReg(); - - assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && - SuperReg != AMDGPU::EXEC && "exec should never spill"); - - unsigned EltSize = 4; - - const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - - ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); - unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - if (SpillToVGPR) { - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - Register SubReg = NumSubRegs == 1 - ? SuperReg - : Register(getSubReg(SuperReg, SplitParts[i])); + for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) { + Register SubReg = + SB.NumSubRegs == 1 + ? 
SB.SuperReg + : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; - auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane); - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); + auto MIB = + BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane); + if (SB.NumSubRegs > 1 && i == 0) + MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } + } } else { - Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - RS->setRegUsed(TmpVGPR); + SB.prepare(); - unsigned PerVGPR = 32; - unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; - int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; + // Per VGPR helper data + auto PVD = SB.getPerVGPRData(); - for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { + for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { // Load in VGPR data - buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, - RS, true); + SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true); // Unpack lanes - for (unsigned i = Offset * PerVGPR, - e = std::min((Offset + 1) * PerVGPR, NumSubRegs); + for (unsigned i = Offset * PVD.PerVGPR, + e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); i < e; ++i) { - Register SubReg = NumSubRegs == 1 - ? SuperReg - : Register(getSubReg(SuperReg, SplitParts[i])); + Register SubReg = + SB.NumSubRegs == 1 + ? SB.SuperReg + : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); bool LastSubReg = (i + 1 == e); - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) - .addReg(TmpVGPR, getKillRegState(LastSubReg)) - .addImm(i); - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); + auto MIB = BuildMI(SB.MBB, MI, SB.DL, + SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) + .addImm(i); + if (SB.NumSubRegs > 1 && i == 0) + MIB.addReg(SB.SuperReg, RegState::ImplicitDefine); + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } } } + + SB.restore(); } MI->eraseFromParent(); + + if (LIS) + LIS->removeAllRegUnitsForPhysReg(SB.SuperReg); + return true; } @@ -1338,28 +1522,31 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, - RegScavenger *RS) const { + RegScavenger *RS, + LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S224_SAVE: case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, true); + return spillSGPR(MI, FI, RS, LIS, true); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S224_RESTORE: case AMDGPU::SI_SPILL_S192_RESTORE: case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return 
restoreSGPR(MI, FI, RS, true); + return restoreSGPR(MI, FI, RS, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } @@ -1389,6 +1576,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S224_SAVE: case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: @@ -1403,6 +1591,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S224_RESTORE: case AMDGPU::SI_SPILL_S192_RESTORE: case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: @@ -1417,6 +1606,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V224_SAVE: case AMDGPU::SI_SPILL_V192_SAVE: case AMDGPU::SI_SPILL_V160_SAVE: case AMDGPU::SI_SPILL_V128_SAVE: @@ -1426,6 +1616,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A1024_SAVE: case AMDGPU::SI_SPILL_A512_SAVE: case AMDGPU::SI_SPILL_A256_SAVE: + case AMDGPU::SI_SPILL_A224_SAVE: case AMDGPU::SI_SPILL_A192_SAVE: case AMDGPU::SI_SPILL_A160_SAVE: case AMDGPU::SI_SPILL_A128_SAVE: @@ -1439,13 +1630,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; - buildSpillLoadStore(MI, Opc, - Index, - VData->getReg(), VData->isKill(), - FrameReg, - TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), - *MI->memoperands_begin(), - RS); + auto *MBB = MI->getParent(); + buildSpillLoadStore( + *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), + *MI->memoperands_begin(), RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); MI->eraseFromParent(); break; @@ -1456,6 +1645,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_V192_RESTORE: + case AMDGPU::SI_SPILL_V224_RESTORE: case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_V512_RESTORE: case AMDGPU::SI_SPILL_V1024_RESTORE: @@ -1465,6 +1655,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A128_RESTORE: case AMDGPU::SI_SPILL_A160_RESTORE: case AMDGPU::SI_SPILL_A192_RESTORE: + case AMDGPU::SI_SPILL_A224_RESTORE: case AMDGPU::SI_SPILL_A256_RESTORE: case AMDGPU::SI_SPILL_A512_RESTORE: case AMDGPU::SI_SPILL_A1024_RESTORE: { @@ -1475,18 +1666,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; - buildSpillLoadStore(MI, Opc, - Index, - VData->getReg(), VData->isKill(), - FrameReg, - TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), - *MI->memoperands_begin(), - RS); + auto *MBB = MI->getParent(); + buildSpillLoadStore( + *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), + *MI->memoperands_begin(), RS); MI->eraseFromParent(); break; } default: { + // Other access to frame index const DebugLoc &DL = MI->getDebugLoc(); int64_t Offset = FrameInfo.getObjectOffset(Index); @@ -1507,7 +1697,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, TII->getNamedOperand(*MI, AMDGPU::OpName::offset); int64_t NewOffset = Offset + OffsetOp->getImm(); if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, - true)) { + SIInstrFlags::FlatScratch)) { OffsetOp->setImm(NewOffset); if (FrameReg) return; @@ -1580,9 +1770,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, FIOp.setIsKill(false); } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg) - .addReg(FrameReg) - .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) + .addReg(FrameReg) + .addImm(Offset); if (!UseSGPR) BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) @@ -1590,10 +1780,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (TmpSReg == FrameReg) { // Undo frame register modification. - BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32), + BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), FrameReg) - .addReg(FrameReg) - .addImm(Offset); + .addReg(FrameReg) + .addImm(-Offset); } return; @@ -1667,17 +1857,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) - .addReg(ScaledReg, RegState::Kill) - .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) + .addReg(ScaledReg, RegState::Kill) + .addImm(Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) .addReg(ScaledReg, RegState::Kill); // If there were truly no free SGPRs, we need to undo everything. 
if (!TmpScaledReg.isValid()) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) - .addReg(ScaledReg, RegState::Kill) - .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) + .addReg(ScaledReg, RegState::Kill) + .addImm(-Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); @@ -1735,14 +1925,8 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { return AMDGPUInstPrinter::getRegisterName(Reg); } -const TargetRegisterClass * -SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth == 1) - return &AMDGPU::VReg_1RegClass; - if (BitWidth <= 16) - return &AMDGPU::VGPR_LO16RegClass; - if (BitWidth <= 32) - return &AMDGPU::VGPR_32RegClass; +static const TargetRegisterClass * +getAnyVGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 64) return &AMDGPU::VReg_64RegClass; if (BitWidth <= 96) @@ -1753,6 +1937,8 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) { return &AMDGPU::VReg_160RegClass; if (BitWidth <= 192) return &AMDGPU::VReg_192RegClass; + if (BitWidth <= 224) + return &AMDGPU::VReg_224RegClass; if (BitWidth <= 256) return &AMDGPU::VReg_256RegClass; if (BitWidth <= 512) @@ -1763,12 +1949,44 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) { return nullptr; } +static const TargetRegisterClass * +getAlignedVGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::VReg_64_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::VReg_96_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::VReg_128_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::VReg_160_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::VReg_192_Align2RegClass; + if (BitWidth <= 224) + return &AMDGPU::VReg_224_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::VReg_256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::VReg_512_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::VReg_1024_Align2RegClass; + + return nullptr; +} + const TargetRegisterClass * -SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) { +SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { + if (BitWidth == 1) + return &AMDGPU::VReg_1RegClass; if (BitWidth <= 16) - return &AMDGPU::AGPR_LO16RegClass; + return &AMDGPU::VGPR_LO16RegClass; if (BitWidth <= 32) - return &AMDGPU::AGPR_32RegClass; + return &AMDGPU::VGPR_32RegClass; + return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) + : getAnyVGPRClassForBitWidth(BitWidth); +} + +static const TargetRegisterClass * +getAnyAGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 64) return &AMDGPU::AReg_64RegClass; if (BitWidth <= 96) @@ -1779,6 +1997,8 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) { return &AMDGPU::AReg_160RegClass; if (BitWidth <= 192) return &AMDGPU::AReg_192RegClass; + if (BitWidth <= 224) + return &AMDGPU::AReg_224RegClass; if (BitWidth <= 256) return &AMDGPU::AReg_256RegClass; if (BitWidth <= 512) @@ -1789,6 +2009,40 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) { return nullptr; } +static const TargetRegisterClass * +getAlignedAGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::AReg_64_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::AReg_96_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::AReg_128_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::AReg_160_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::AReg_192_Align2RegClass; + if (BitWidth <= 224) + return &AMDGPU::AReg_224_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::AReg_256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::AReg_512_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AReg_1024_Align2RegClass; + + return nullptr; +} + +const TargetRegisterClass * +SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { + if (BitWidth <= 16) + return &AMDGPU::AGPR_LO16RegClass; + if (BitWidth <= 32) + return &AMDGPU::AGPR_32RegClass; + return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth) + : getAnyAGPRClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 16) @@ -1805,6 +2059,8 @@ SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { return &AMDGPU::SGPR_160RegClass; if (BitWidth <= 192) return &AMDGPU::SGPR_192RegClass; + if (BitWidth <= 224) + return &AMDGPU::SGPR_224RegClass; if (BitWidth <= 256) return &AMDGPU::SGPR_256RegClass; if (BitWidth <= 512) @@ -1827,29 +2083,51 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::AGPR_32RegClass, + &AMDGPU::AGPR_32RegClass, + &AMDGPU::VReg_64_Align2RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::AReg_64_Align2RegClass, &AMDGPU::AReg_64RegClass, + &AMDGPU::VReg_96_Align2RegClass, &AMDGPU::VReg_96RegClass, &AMDGPU::SReg_96RegClass, + &AMDGPU::AReg_96_Align2RegClass, &AMDGPU::AReg_96RegClass, + &AMDGPU::VReg_128_Align2RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, + &AMDGPU::AReg_128_Align2RegClass, &AMDGPU::AReg_128RegClass, + &AMDGPU::VReg_160_Align2RegClass, &AMDGPU::VReg_160RegClass, &AMDGPU::SReg_160RegClass, + &AMDGPU::AReg_160_Align2RegClass, &AMDGPU::AReg_160RegClass, + &AMDGPU::VReg_192_Align2RegClass, &AMDGPU::VReg_192RegClass, &AMDGPU::SReg_192RegClass, + &AMDGPU::AReg_192_Align2RegClass, &AMDGPU::AReg_192RegClass, + &AMDGPU::VReg_224_Align2RegClass, + &AMDGPU::VReg_224RegClass, + &AMDGPU::SReg_224RegClass, + &AMDGPU::AReg_224_Align2RegClass, + &AMDGPU::AReg_224RegClass, + &AMDGPU::VReg_256_Align2RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, + &AMDGPU::AReg_256_Align2RegClass, &AMDGPU::AReg_256RegClass, + &AMDGPU::VReg_512_Align2RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, + &AMDGPU::AReg_512_Align2RegClass, &AMDGPU::AReg_512RegClass, &AMDGPU::SReg_1024RegClass, + 
&AMDGPU::VReg_1024_Align2RegClass, &AMDGPU::VReg_1024RegClass, + &AMDGPU::AReg_1024_Align2RegClass, &AMDGPU::AReg_1024RegClass, &AMDGPU::SCC_CLASSRegClass, &AMDGPU::Pseudo_SReg_32RegClass, @@ -1949,6 +2227,16 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return RC; } +const TargetRegisterClass * +SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, + const TargetRegisterClass *SubRC, + unsigned SubIdx) const { + // Ensure this subregister index is aligned in the super register. + const TargetRegisterClass *MatchRC = + getMatchingSuperRegClass(SuperRC, SubRC, SubIdx); + return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr; +} + bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) @@ -2147,6 +2435,12 @@ MCRegister SIRegisterInfo::getVCC() const { return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; } +const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { + // VGPR tuples have an alignment requirement on gfx90a variants. + return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass + : &AMDGPU::VReg_64RegClass; +} + const TargetRegisterClass * SIRegisterInfo::getRegClass(unsigned RCID) const { switch ((int)RCID) { @@ -2234,6 +2528,18 @@ MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { return AMDGPU::NoRegister; } +bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { + if (!ST.needsAlignedVGPRs()) + return true; + + if (hasVGPRs(&RC)) + return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); + if (hasAGPRs(&RC)) + return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); + + return true; +} + bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { switch (PhysReg) { case AMDGPU::SGPR_NULL: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 963da9b3536b..2a92051e5fb2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -21,7 +21,9 @@ namespace llvm { class GCNSubtarget; class LiveIntervals; +class LivePhysRegs; class RegisterBank; +struct SGPRSpillBuilder; class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPUGenRegisterInfo { @@ -79,7 +81,7 @@ public: bool hasBasePointer(const MachineFunction &MF) const; Register getBaseRegister() const; - bool canRealignStack(const MachineFunction &MF) const override; + bool shouldRealignStack(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -106,18 +108,18 @@ public: const TargetRegisterClass *getPointerRegClass( const MachineFunction &MF, unsigned Kind = 0) const override; - void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index, - int Offset, unsigned EltSize, Register VGPR, - int64_t VGPRLanes, RegScavenger *RS, - bool IsLoad) const; + void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, + bool IsLoad, bool IsKill = true) const; /// If \p OnlyToVGPR is true, this will only succeed if this bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; void 
eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, @@ -125,7 +127,8 @@ public: RegScavenger *RS) const override; bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS) const; + int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr) const; StringRef getRegAsmName(MCRegister Reg) const override; @@ -134,8 +137,13 @@ public: return getEncodingValue(Reg) & 0xff; } - static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth); - static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth); + LLVM_READONLY + const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY + const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// Return the 'base' register class for this register. @@ -182,12 +190,21 @@ public: const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; - /// \returns The register class that is used for a sub-register of \p RC for - /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will - /// be returned. + /// \returns The canonical register class that is used for a sub-register of + /// \p RC for the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC + /// will be returned. const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + /// Returns a register class which is compatible with \p SuperRC, such that a + /// subregister exists with class \p SubRC with subregister index \p + /// SubIdx. If this is impossible (e.g., an unaligned subregister index within + /// a register tuple), return null. + const TargetRegisterClass * + getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, + const TargetRegisterClass *SubRC, + unsigned SubIdx) const; + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, @@ -268,6 +285,10 @@ public: : &AMDGPU::SReg_64_XEXECRegClass; } + // Return the appropriate register class to use for 64-bit VGPRs for the + // subtarget. + const TargetRegisterClass *getVGPR64Class() const; + MCRegister getVCC() const; const TargetRegisterClass *getRegClass(unsigned RCID) const; @@ -279,6 +300,8 @@ public: LiveIntervals *LIS) const; const uint32_t *getAllVGPRRegMask() const; + const uint32_t *getAllAGPRRegMask() const; + const uint32_t *getAllVectorRegMask() const; const uint32_t *getAllAllocatableSRegMask() const; // \returns number of 32 bit registers covered by a \p LM @@ -306,6 +329,10 @@ public: // \returns \p Reg otherwise. MCPhysReg get32BitRegister(MCPhysReg Reg) const; + // Returns true if a given register class is properly aligned for + // the subtarget. + bool isProperlyAlignedRC(const TargetRegisterClass &RC) const; + /// Return all SGPR128 which satisfy the waves per execution unit requirement /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; @@ -318,16 +345,16 @@ public: /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const; -private: - void buildSpillLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - int Index, - Register ValueReg, - bool ValueIsKill, - MCRegister ScratchOffsetReg, - int64_t InstrOffset, - MachineMemOperand *MMO, - RegScavenger *RS) const; + // Insert spill or restore instructions. + // When lowering spill pseudos, the RegScavenger should be set. 
+ // For creating spill instructions during frame lowering, where no scavenger + // is available, LiveRegs can be used. + void buildSpillLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, unsigned LoadStoreOp, + int Index, Register ValueReg, bool ValueIsKill, + MCRegister ScratchOffsetReg, int64_t InstrOffset, + MachineMemOperand *MMO, RegScavenger *RS, + LivePhysRegs *LiveRegs = nullptr) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 92390f1f3297..6e3c4e8775f3 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -36,12 +36,12 @@ foreach Index = 1...31 in { foreach Size = {2...6,8,16} in { foreach Index = Indexes<!sub(33, Size)>.slice in { - def !foldl("", Indexes<Size>.slice, acc, cur, - !strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) : + def !interleave(!foreach(cur, Indexes<Size>.slice, "sub"#!add(cur, Index)), + "_") : SubRegIndex<!mul(Size, 32), !shl(Index, 5)> { let CoveringSubRegIndices = - !foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur, - !listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))])); + !foreach(cur, Indexes<Size>.slice, + !cast<SubRegIndex>(sub#!add(cur, Index))); } } } @@ -58,6 +58,7 @@ class getSubRegs<int size> { list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3]; list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4]; list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5]; + list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6]; list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, @@ -77,9 +78,10 @@ class getSubRegs<int size> { !if(!eq(size, 4), ret4, !if(!eq(size, 5), ret5, !if(!eq(size, 6), ret6, - !if(!eq(size, 8), ret8, - !if(!eq(size, 16), ret16, - ret32))))))); + !if(!eq(size, 7), ret7, + !if(!eq(size, 8), ret8, + !if(!eq(size, 16), ret16, + ret32)))))))); } // Generates list of sequential register tuple names. @@ -350,9 +352,12 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">; // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">; -// SGPR 192-bit registers +// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs. def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">; +// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs. 
+def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">; + // SGPR 256-bit registers def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">; @@ -368,6 +373,7 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, let isAllocatable = 0; } +// Trap handler TMP 16-bit registers def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "TTMP%u_LO16", 0, 15))> { let Size = 16; @@ -377,11 +383,25 @@ def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, // Trap handler TMP 64-bit registers def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">; +// Trap handler TMP 96-bit registers +def TTMP_96Regs : SIRegisterTuples<getSubRegs<3>.ret, TTMP_32, 15, 3, 3, "ttmp">; + // Trap handler TMP 128-bit registers def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">; +// Trap handler TMP 160-bit registers +def TTMP_160Regs : SIRegisterTuples<getSubRegs<5>.ret, TTMP_32, 15, 4, 5, "ttmp">; + +// Trap handler TMP 192-bit registers +def TTMP_192Regs : SIRegisterTuples<getSubRegs<6>.ret, TTMP_32, 15, 4, 6, "ttmp">; + +// Trap handler TMP 224-bit registers +def TTMP_224Regs : SIRegisterTuples<getSubRegs<7>.ret, TTMP_32, 15, 4, 7, "ttmp">; + +// Trap handler TMP 256-bit registers def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">; +// Trap handler TMP 512-bit registers def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">; class TmpRegTuplesBase<int index, int size, @@ -508,6 +528,9 @@ def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">; // VGPR 192-bit registers def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">; +// VGPR 224-bit registers +def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">; + // VGPR 256-bit registers def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">; @@ -547,6 +570,9 @@ def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">; // AGPR 192-bit registers def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">; +// AGPR 224-bit registers +def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">; + // AGPR 256-bit registers def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">; @@ -682,111 +708,53 @@ def SReg_1 : RegisterClass<"AMDGPU", [i1], 32, let isAllocatable = 0; } -// Requires 2 s_mov_b64 to copy -let CopyCost = 2 in { - -// There are no 3-component scalar instructions, but this is needed -// for symmetry with VGPRs. -def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, - (add SGPR_96Regs)> { - let AllocationPriority = 14; -} - -def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, - (add SGPR_96)> { - let AllocationPriority = 14; -} - -def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, - (add SGPR_128Regs)> { - let AllocationPriority = 15; -} - -def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, - (add TTMP_128Regs)> { - let isAllocatable = 0; -} - -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add SGPR_128, TTMP_128)> { - let isAllocatable = 0; -} - -} // End CopyCost = 2 - -// There are no 5-component scalar instructions, but this is needed -// for symmetry with VGPRs. 
-def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add SGPR_160Regs)> { - let AllocationPriority = 16; -} - -def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add SGPR_160)> { - // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated - // subclasses of SGPR_160 to be marked unallocatable too. -} - -def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> { - let Size = 192; - let AllocationPriority = 17; -} - -def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> { - let Size = 192; - let isAllocatable = 0; -} - -def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> { - let AllocationPriority = 18; -} - -def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> { - let isAllocatable = 0; -} +multiclass SRegClass<int numRegs, int priority, + list<ValueType> regTypes, + SIRegisterTuples regList, + SIRegisterTuples ttmpList = regList, + int copyCost = !sra(!add(numRegs, 1), 1)> { + defvar hasTTMP = !ne(regList, ttmpList); + defvar suffix = !cast<string>(!mul(numRegs, 32)); + defvar sgprName = !strconcat("SGPR_", suffix); + defvar ttmpName = !strconcat("TTMP_", suffix); -def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, - (add SGPR_256, TTMP_256)> { - // Requires 4 s_mov_b64 to copy - let CopyCost = 4; - let isAllocatable = 0; -} + let AllocationPriority = priority, CopyCost = copyCost in { + def "" # sgprName : RegisterClass<"AMDGPU", regTypes, 32, (add regList)> { + } -def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, - (add SGPR_512Regs)> { - let AllocationPriority = 19; -} + if hasTTMP then { + def "" # ttmpName : RegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> { + let isAllocatable = 0; + } + } -def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, - (add TTMP_512Regs)> { - let isAllocatable = 0; + def SReg_ # suffix : + RegisterClass<"AMDGPU", regTypes, 32, + !con(!dag(add, [!cast<RegisterClass>(sgprName)], ["sgpr"]), + !if(hasTTMP, + !dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]), + (add)))> { + let isAllocatable = 0; + } + } } -def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, - (add SGPR_512, TTMP_512)> { - // Requires 8 s_mov_b64 to copy - let CopyCost = 8; - let isAllocatable = 0; -} +defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; +defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; +defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; +defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; +defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } -def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, - (add SGPR_1024Regs)> { - let AllocationPriority = 20; -} - -def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, - (add SGPR_1024)> { - let CopyCost = 16; - let isAllocatable = 0; -} - // Register class for all vector registers (VGPRs + Interpolation Registers) -class 
VRegClass<int numRegs, list<ValueType> regTypes, dag regList> : +class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : RegisterClass<"AMDGPU", regTypes, 32, regList> { let Size = !mul(numRegs, 32); @@ -796,31 +764,48 @@ class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> : let Weight = numRegs; } -def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], - (add VGPR_64)>; -def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, i128], (add VGPR_128)>; -def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; -def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>; -def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; -def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; -def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; +// Define a register tuple class, along with one requiring an even +// aligned base register. +multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { + // Define the regular class. + def "" : VRegClassBase<numRegs, regTypes, regList>; -class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> : - VRegClass<numRegs, regTypes, regList> { - // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr - let CopyCost = !add(numRegs, numRegs, 1); + // Define 2-aligned variant + def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>; +} + +defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], + (add VGPR_64)>; +defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; +defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>; +defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; + +defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; +defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; +defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; +defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; + +multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { + let CopyCost = !add(numRegs, numRegs, 1) in { + // Define the regular class. 
+ def "" : VRegClassBase<numRegs, regTypes, regList>; + + // Define 2-aligned variant + def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>; + } } -def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], +defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], (add AGPR_64)>; -def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; -def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; -def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; -def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>; -def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; -def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; -def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; +defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; +defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; +defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; +defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; +defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; +defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; +defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; +defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; } // End GeneratePressureSet = 0 @@ -847,21 +832,36 @@ def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, let isAllocatable = 0; } -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { +def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; } -def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add AGPR_32, VGPR_32)> { let isAllocatable = 0; } -def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, +def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add AReg_64, VReg_64)> { let isAllocatable = 0; } } // End GeneratePressureSet = 0 +def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32, + (add AReg_96, VReg_96)> { + let isAllocatable = 0; +} + +def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add AReg_128, VReg_128)> { + let isAllocatable = 0; +} + +def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32, + (add AReg_160, VReg_160)> { + let isAllocatable = 0; +} + //===----------------------------------------------------------------------===// // Register operands //===----------------------------------------------------------------------===// @@ -912,21 +912,38 @@ multiclass SIRegOperand32 <string rc, string MatchName, string opType, } } -multiclass SIRegOperand <string rc, string MatchName, string opType> : - SIRegOperand32<rc, MatchName, opType> { +multiclass SIRegOperand64 <string rc, string MatchName, string opType, + string rc_suffix = "_64", bit Vectors = 1> { let OperandNamespace = "AMDGPU" in { - def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { + def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_INT64"; let ParserMatchClass = RegImmMatcher<MatchName#"B64">; } - def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> { + def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { let OperandType = opType#"_FP64"; let ParserMatchClass = RegImmMatcher<MatchName#"F64">; } + + if Vectors then + def _v2f32 : 
RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { + let OperandType = opType#"_V2FP32"; + let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">; + let DecoderMethod = "decodeOperand_VSrcV232"; + } + if Vectors then + def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { + let OperandType = opType#"_V2INT32"; + let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">; + let DecoderMethod = "decodeOperand_VSrcV232"; + } } } +multiclass SIRegOperand <string rc, string MatchName, string opType> : + SIRegOperand32<rc, MatchName, opType>, + SIRegOperand64<rc, MatchName, opType>; + // FIXME: 64-bit sources can sometimes use 32-bit constants. multiclass RegImmOperand <string rc, string MatchName> : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">; @@ -938,10 +955,18 @@ multiclass RegInlineOperand32 <string rc, string MatchName, string rc_suffix = "_32"> : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>; +multiclass RegInlineOperand64 <string rc, string MatchName, + string rc_suffix = "_64"> + : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>; + multiclass RegInlineOperandAC <string rc, string MatchName, string rc_suffix = "_32"> : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>; +multiclass RegInlineOperandAC64 <string rc, string MatchName, + string rc_suffix = "_64"> + : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>; + //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// @@ -971,7 +996,7 @@ def VSrc_128 : RegisterOperand<VReg_128> { } //===----------------------------------------------------------------------===// -// VSrc_* Operands with an VGPR +// VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// // This is for operands with the enum(9), VSrc encoding restriction, @@ -1001,6 +1026,13 @@ defm VCSrc : RegInlineOperand<"VS", "VCSrc">; //===----------------------------------------------------------------------===// defm VISrc : RegInlineOperand32<"VGPR", "VISrc">; +let DecoderMethod = "decodeOperand_VReg_64" in +defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">; +defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">; +let DecoderMethod = "decodeOperand_VReg_256" in +defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">; +defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">; +defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">; //===----------------------------------------------------------------------===// // AVSrc_* Operands with an AGPR or VGPR @@ -1016,6 +1048,31 @@ def AVSrc_64 : RegisterOperand<AV_64> { let EncoderMethod = "getAVOperandEncoding"; } +def AVLdSt_32 : RegisterOperand<AV_32> { + let DecoderMethod = "DecodeAVLdSt_32RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_64 : RegisterOperand<AV_64> { + let DecoderMethod = "DecodeAVLdSt_64RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_96 : RegisterOperand<AV_96> { + let DecoderMethod = "DecodeAVLdSt_96RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_128 : RegisterOperand<AV_128> { + let DecoderMethod = "DecodeAVLdSt_128RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_160 : RegisterOperand<AV_160> { + let DecoderMethod = 
"DecodeAVLdSt_160RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// @@ -1024,3 +1081,8 @@ defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">; defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">; defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">; defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">; + +let DecoderMethod = "decodeOperand_AReg_64" in +defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">; +let DecoderMethod = "decodeOperand_AReg_256" in +defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">; diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp deleted file mode 100644 index d30ff4a3fd15..000000000000 --- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp +++ /dev/null @@ -1,159 +0,0 @@ -//===-- SIRemoveShortExecBranches.cpp ------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass optmizes the s_cbranch_execz instructions. -/// The pass removes this skip instruction for short branches, -/// if there is no unwanted sideeffect in the fallthrough code sequence. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-remove-short-exec-branches" - -static unsigned SkipThreshold; - -static cl::opt<unsigned, true> SkipThresholdFlag( - "amdgpu-skip-threshold", cl::Hidden, - cl::desc( - "Number of instructions before jumping over divergent control flow"), - cl::location(SkipThreshold), cl::init(12)); - -namespace { - -class SIRemoveShortExecBranches : public MachineFunctionPass { -private: - const SIInstrInfo *TII = nullptr; - bool getBlockDestinations(MachineBasicBlock &SrcMBB, - MachineBasicBlock *&TrueMBB, - MachineBasicBlock *&FalseMBB, - SmallVectorImpl<MachineOperand> &Cond); - bool mustRetainExeczBranch(const MachineBasicBlock &From, - const MachineBasicBlock &To) const; - bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - -public: - static char ID; - - SIRemoveShortExecBranches() : MachineFunctionPass(ID) { - initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; - -} // End anonymous namespace. 
- -INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE, - "SI remove short exec branches", false, false) - -char SIRemoveShortExecBranches::ID = 0; - -char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID; - -bool SIRemoveShortExecBranches::getBlockDestinations( - MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB, - MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) { - if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond)) - return false; - - if (!FalseMBB) - FalseMBB = SrcMBB.getNextNode(); - - return true; -} - -bool SIRemoveShortExecBranches::mustRetainExeczBranch( - const MachineBasicBlock &From, const MachineBasicBlock &To) const { - unsigned NumInstr = 0; - const MachineFunction *MF = From.getParent(); - - for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); - MBBI != End && MBBI != ToI; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might never be taken when EXEC = 0. - // Hence we should retain cbranch out of the loop lest it become infinite. - if (I->isConditionalBranch()) - return true; - - if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) - return true; - - if (TII->isKillTerminator(I->getOpcode())) - return true; - - // These instructions are potentially expensive even if EXEC = 0. - if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || - I->getOpcode() == AMDGPU::S_WAITCNT) - return true; - - ++NumInstr; - if (NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -// Returns true if the skip branch instruction is removed. -bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI, - MachineBasicBlock &SrcMBB) { - MachineBasicBlock *TrueMBB = nullptr; - MachineBasicBlock *FalseMBB = nullptr; - SmallVector<MachineOperand, 1> Cond; - - if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond)) - return false; - - // Consider only the forward branches. - if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || - mustRetainExeczBranch(*FalseMBB, *TrueMBB)) - return false; - - LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); - MI.eraseFromParent(); - SrcMBB.removeSuccessor(TrueMBB); - - return true; -} - -bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - TII = ST.getInstrInfo(); - MF.RenumberBlocks(); - bool Changed = false; - - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - if (MBBI == MBB.end()) - continue; - - MachineInstr &MI = *MBBI; - switch (MI.getOpcode()) { - case AMDGPU::S_CBRANCH_EXECZ: - Changed = removeExeczBranch(MI, MBB); - break; - default: - break; - } - } - - return Changed; -} diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index db4a009e08d7..b24c061af7ab 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -54,10 +54,15 @@ def WriteTrans64 : SchedWrite; // Half rate 64-bit instructions. def Write64Bit : SchedWrite; +// Integer multiplications. +def WriteIntMul : SchedWrite; + // mAI multipass instructions. 
def Write2PassMAI : SchedWrite; def Write8PassMAI : SchedWrite; def Write16PassMAI : SchedWrite; +def Write4PassDGEMM : SchedWrite; +def Write8PassDGEMM : SchedWrite; // FIXME: Should there be a class for instructions which are VALU // instructions and have VALU rates, but write to the SALU (i.e. VOPC @@ -80,6 +85,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; +def SIDPFullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? @@ -101,6 +107,9 @@ def HWVMEM : ProcResource<1> { def HWVALU : ProcResource<1> { let BufferSize = 1; } +def HWTransVALU : ProcResource<1> { // Transcendental VALU + let BufferSize = 1; +} def HWRC : ProcResource<1> { // Register destination cache let BufferSize = 1; } @@ -137,11 +146,13 @@ multiclass SICommonWriteRes { def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ??? def : HWVALUWriteRes<Write32Bit, 1>; - def : HWVALUWriteRes<Write64Bit, 2>; def : HWVALUWriteRes<WriteFloatCvt, 4>; def : HWVALUWriteRes<WriteTrans32, 4>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; + def : HWVALUWriteRes<Write4PassDGEMM, 4>; + def : HWVALUWriteRes<Write8PassDGEMM, 16>; + let ResourceCycles = [2] in def : HWWriteRes<Write2PassMAI, [HWXDL], 2>; let ResourceCycles = [8] in @@ -150,7 +161,6 @@ multiclass SICommonWriteRes { def : HWWriteRes<Write16PassMAI, [HWXDL], 16>; def : ReadAdvance<MIVGPRRead, -2>; - def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; // Technically mfma reads can be from 0 to 4 cycles but that does not make // sense to model because its register setup is huge. In particular if we @@ -159,10 +169,6 @@ multiclass SICommonWriteRes { // need to consume 2 or 4 more vgprs to be initialized before the acc // write sequence. Just assume worst case here. 
def : ReadAdvance<MIMFMARead, -4>; - - def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; - def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; - def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; } def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; @@ -176,11 +182,13 @@ let SchedModel = SIFullSpeedModel in { defm : SICommonWriteRes; -def : HWVALUWriteRes<WriteFloatFMA, 1>; -def : HWVALUWriteRes<WriteDouble, 4>; -def : HWVALUWriteRes<WriteDoubleAdd, 2>; -def : HWVALUWriteRes<WriteDoubleCvt, 4>; -def : HWVALUWriteRes<WriteTrans64, 4>; +def : HWVALUWriteRes<Write64Bit, 2>; +def : HWVALUWriteRes<WriteIntMul, 4>; +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 4>; +def : HWVALUWriteRes<WriteDoubleAdd, 2>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>; +def : HWVALUWriteRes<WriteTrans64, 4>; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -190,16 +198,44 @@ let SchedModel = SIQuarterSpeedModel in { defm : SICommonWriteRes; -def : HWVALUWriteRes<WriteFloatFMA, 16>; -def : HWVALUWriteRes<WriteDouble, 16>; -def : HWVALUWriteRes<WriteDoubleAdd, 8>; -def : HWVALUWriteRes<WriteDoubleCvt, 4>; -def : HWVALUWriteRes<WriteTrans64, 16>; +def : HWVALUWriteRes<Write64Bit, 2>; +def : HWVALUWriteRes<WriteIntMul, 4>; +def : HWVALUWriteRes<WriteFloatFMA, 16>; +def : HWVALUWriteRes<WriteDouble, 16>; +def : HWVALUWriteRes<WriteDoubleAdd, 8>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>; +def : HWVALUWriteRes<WriteTrans64, 16>; def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; } // End SchedModel = SIQuarterSpeedModel +let SchedModel = SIDPFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 1>; +def : HWVALUWriteRes<WriteDoubleAdd, 1>; +def : HWVALUWriteRes<WriteDoubleCvt, 1>; +def : HWVALUWriteRes<WriteTrans64, 4>; +def : HWVALUWriteRes<WriteIntMul, 1>; +def : HWVALUWriteRes<Write64Bit, 1>; + +def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X")>; +def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; +def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; + +} // End SchedModel = SIDPFullSpeedModel + let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). 
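The ReadAdvance<MIMFMARead, -4> retained above combines with the WriteRes latencies in the usual scheduler way: a negative advance lengthens the producer-to-consumer edge, which is how the worst-case accumulator setup time for MFMA operands is modelled. A minimal sketch of that combination follows, assuming the standard subtract-and-clamp rule; the function name and the clamp are illustrative, not code from this patch.

#include <algorithm>

// Sketch only: effective dependence latency seen by a read that is "advanced"
// by ReadAdvanceCycles. With ReadAdvance<MIMFMARead, -4>, a producer with a
// 2-cycle WriteRes (Write2PassMAI above) is treated as roughly a 6-cycle
// dependence for the consuming MFMA in this model.
static int effectiveOperandLatency(int WriteLatency, int ReadAdvanceCycles) {
  return std::max(WriteLatency - ReadAdvanceCycles, 0); // -(-4) adds 4 cycles
}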
@@ -207,13 +243,14 @@ let SchedModel = GFX10SpeedModel in { def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>; -def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>; +def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 10>; def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>; def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>; def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>; def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>; -def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>; +def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>; +def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 24>; def : HWWriteRes<WriteBranch, [HWBranch], 32>; def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 2628070f219c..45dd57ea1be4 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -75,17 +75,19 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineOperand &MovSrc = Def->getOperand(1); bool ConstantFolded = false; - if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || - isUInt<32>(MovSrc.getImm()))) { - Src0.ChangeToImmediate(MovSrc.getImm()); - ConstantFolded = true; - } else if (MovSrc.isFI()) { - Src0.ChangeToFrameIndex(MovSrc.getIndex()); - ConstantFolded = true; - } else if (MovSrc.isGlobal()) { - Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), - MovSrc.getTargetFlags()); - ConstantFolded = true; + if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) { + if (MovSrc.isImm() && + (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) { + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } else if (MovSrc.isFI()) { + Src0.ChangeToFrameIndex(MovSrc.getIndex()); + ConstantFolded = true; + } else if (MovSrc.isGlobal()) { + Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), + MovSrc.getTargetFlags()); + ConstantFolded = true; + } } if (ConstantFolded) { @@ -230,9 +232,14 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { RC = &AMDGPU::VReg_96RegClass; } else if (Info->VAddrDwords == 4) { RC = &AMDGPU::VReg_128RegClass; - } else if (Info->VAddrDwords <= 8) { + } else if (Info->VAddrDwords == 5) { + RC = &AMDGPU::VReg_160RegClass; + } else if (Info->VAddrDwords == 6) { + RC = &AMDGPU::VReg_192RegClass; + } else if (Info->VAddrDwords == 7) { + RC = &AMDGPU::VReg_224RegClass; + } else if (Info->VAddrDwords == 8) { RC = &AMDGPU::VReg_256RegClass; - NewAddrDwords = 8; } else { RC = &AMDGPU::VReg_512RegClass; NewAddrDwords = 16; @@ -571,7 +578,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, dropInstructionKeepingImpDefs(*MovY, TII); MachineInstr *Next = &*std::next(MovT.getIterator()); - if (MRI.use_nodbg_empty(T)) { + if (T.isVirtual() && MRI.use_nodbg_empty(T)) { dropInstructionKeepingImpDefs(MovT, TII); } else { Xop.setIsKill(false); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 0640e24b37ec..38548eaf9478 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -7,14 +7,17 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass adds instructions to enable whole quad mode for pixel -/// shaders, and whole 
wavefront mode for all programs. +/// This pass adds instructions to enable whole quad mode (strict or non-strict) +/// for pixel shaders, and strict whole wavefront mode for all programs. +/// +/// The "strict" prefix indicates that inactive lanes do not take part in +/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will +/// always be enabled irrespective of control flow decisions. Conversely in +/// non-strict WQM inactive lanes may control flow decisions. /// /// Whole quad mode is required for derivative computations, but it interferes -/// with shader side effects (stores and atomics). This pass is run on the -/// scheduled machine IR but before register coalescing, so that machine SSA is -/// available for analysis. It ensures that WQM is enabled when necessary, but -/// disabled around stores and atomics. +/// with shader side effects (stores and atomics). It ensures that WQM is +/// enabled when necessary, but disabled around stores and atomics. /// /// When necessary, this pass creates a function prolog /// @@ -28,12 +31,21 @@ /// ... /// S_MOV_B64 EXEC, Tmp /// -/// We also compute when a sequence of instructions requires Whole Wavefront -/// Mode (WWM) and insert instructions to save and restore it: +/// We also compute when a sequence of instructions requires strict whole +/// wavefront mode (StrictWWM) and insert instructions to save and restore it: +/// +/// S_OR_SAVEEXEC_B64 Tmp, -1 +/// ... +/// S_MOV_B64 EXEC, Tmp +/// +/// When a sequence of instructions requires strict whole quad mode (StrictWQM) +/// we use a similar save and restore mechanism and force whole quad mode for +/// those instructions: /// -/// S_OR_SAVEEXEC_B64 Tmp, -1 -/// ... -/// S_MOV_B64 EXEC, Tmp +/// S_MOV_B64 Tmp, EXEC +/// S_WQM_B64 EXEC, EXEC +/// ... 
+/// S_MOV_B64 EXEC, Tmp /// /// In order to avoid excessive switching during sequences of Exact /// instructions, the pass first analyzes which instructions must be run in WQM @@ -62,8 +74,10 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/IR/CallingConv.h" #include "llvm/InitializePasses.h" #include "llvm/Support/raw_ostream.h" @@ -76,8 +90,10 @@ namespace { enum { StateWQM = 0x1, - StateWWM = 0x2, - StateExact = 0x4, + StateStrictWWM = 0x2, + StateStrictWQM = 0x4, + StateExact = 0x8, + StateStrict = StateStrictWWM | StateStrictWQM, }; struct PrintState { @@ -89,19 +105,23 @@ public: #ifndef NDEBUG static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { - if (PS.State & StateWQM) - OS << "WQM"; - if (PS.State & StateWWM) { - if (PS.State & StateWQM) - OS << '|'; - OS << "WWM"; - } - if (PS.State & StateExact) { - if (PS.State & (StateWQM | StateWWM)) - OS << '|'; - OS << "Exact"; - } + static const std::pair<char, const char *> Mapping[] = { + std::make_pair(StateWQM, "WQM"), + std::make_pair(StateStrictWWM, "StrictWWM"), + std::make_pair(StateStrictWQM, "StrictWQM"), + std::make_pair(StateExact, "Exact")}; + char State = PS.State; + for (auto M : Mapping) { + if (State & M.first) { + OS << M.second; + State &= ~M.first; + + if (State) + OS << '|'; + } + } + assert(State == 0); return OS; } #endif @@ -116,6 +136,8 @@ struct BlockInfo { char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + bool NeedsLowering = false; }; struct WorkItem { @@ -129,23 +151,33 @@ struct WorkItem { class SIWholeQuadMode : public MachineFunctionPass { private: - CallingConv::ID CallingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; unsigned AndOpc; - unsigned XorTermrOpc; + unsigned AndN2Opc; + unsigned XorOpc; + unsigned AndSaveExecOpc; unsigned OrSaveExecOpc; - unsigned Exec; + unsigned WQMOpc; + Register Exec; + Register LiveMaskReg; DenseMap<const MachineInstr *, InstrInfo> Instructions; MapVector<MachineBasicBlock *, BlockInfo> Blocks; - SmallVector<MachineInstr *, 1> LiveMaskQueries; + + // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction + DenseMap<const MachineInstr *, char> StateTransition; + + SmallVector<MachineInstr *, 2> LiveMaskQueries; SmallVector<MachineInstr *, 4> LowerToMovInstrs; SmallVector<MachineInstr *, 4> LowerToCopyInstrs; + SmallVector<MachineInstr *, 4> KillInstrs; void printInfo(); @@ -153,6 +185,8 @@ private: std::vector<WorkItem> &Worklist); void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); + void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, + std::vector<WorkItem> &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); @@ -167,17 +201,27 @@ private: MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveWQM, unsigned LiveMaskReg); + Register SaveWQM); void toWQM(MachineBasicBlock 
&MBB, MachineBasicBlock::iterator Before, - unsigned SavedWQM); - void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveOrig); - void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); - - void lowerLiveMaskQueries(unsigned LiveMaskReg); + Register SavedWQM); + void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + Register SaveOrig, char StrictStateNeeded); + void fromStrictMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, Register SavedOrig, + char NonStrictState, char CurrentStrictState); + + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); + + MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, + bool IsWQM); + MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); + + void lowerBlock(MachineBasicBlock &MBB); + void processBlock(MachineBasicBlock &MBB, bool IsEntry); + + void lowerLiveMaskQueries(); void lowerCopyInstrs(); + void lowerKillInstrs(bool IsWQM); public: static char ID; @@ -193,9 +237,17 @@ public: AU.addRequired<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); - AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } }; } // end anonymous namespace @@ -205,6 +257,8 @@ char SIWholeQuadMode::ID = 0; INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -241,8 +295,6 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, assert(!(Flag & StateExact) && Flag != 0); - LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); - // Remove any disabled states from the flag. The user that required it gets // an undefined value in the helper lanes. For example, this can happen if // the result of an atomic is used by instruction that requires WQM, where @@ -254,6 +306,7 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, if ((II.Needs & Flag) == Flag) return; + LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); II.Needs |= Flag; Worklist.push_back(&MI); } @@ -262,108 +315,167 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist) { - assert(!MRI->isSSA()); - LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); - if (!UseLRQ.valueIn()) + const VNInfo *Value = UseLRQ.valueIn(); + if (!Value) return; - SmallPtrSet<const VNInfo *, 4> Visited; - SmallVector<const VNInfo *, 4> ToProcess; - ToProcess.push_back(UseLRQ.valueIn()); + // Note: this code assumes that lane masks on AMDGPU completely + // cover registers. + const LaneBitmask UseLanes = + SubReg ? 
TRI->getSubRegIndexLaneMask(SubReg) + : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) + : LaneBitmask::getNone()); + + // Perform a depth-first iteration of the LiveRange graph marking defs. + // Stop processing of a given branch when all use lanes have been defined. + // The first definition stops processing for a physical register. + struct PhiEntry { + const VNInfo *Phi; + unsigned PredIdx; + LaneBitmask DefinedLanes; + + PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes) + : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {} + }; + using VisitKey = std::pair<const VNInfo *, LaneBitmask>; + SmallVector<PhiEntry, 2> PhiStack; + SmallSet<VisitKey, 4> Visited; + LaneBitmask DefinedLanes; + unsigned NextPredIdx = 0; // Only used for processing phi nodes do { - const VNInfo *Value = ToProcess.pop_back_val(); - Visited.insert(Value); + const VNInfo *NextValue = nullptr; + const VisitKey Key(Value, DefinedLanes); + + if (!Visited.count(Key)) { + Visited.insert(Key); + // On first visit to a phi then start processing first predecessor + NextPredIdx = 0; + } if (Value->isPHIDef()) { - // Need to mark all defs used in the PHI node + // Each predecessor node in the phi must be processed as a subgraph const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def); assert(MBB && "Phi-def has no defining MBB"); - for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); - PI != PE; ++PI) { + + // Find next predecessor to process + unsigned Idx = NextPredIdx; + auto PI = MBB->pred_begin() + Idx; + auto PE = MBB->pred_end(); + for (; PI != PE && !NextValue; ++PI, ++Idx) { if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) { - if (!Visited.count(VN)) - ToProcess.push_back(VN); + if (!Visited.count(VisitKey(VN, DefinedLanes))) + NextValue = VN; } } + + // If there are more predecessors to process; add phi to stack + if (PI != PE) + PhiStack.emplace_back(Value, Idx, DefinedLanes); } else { MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); assert(MI && "Def has no defining instruction"); - markInstruction(*MI, Flag, Worklist); - // Iterate over all operands to find relevant definitions - for (const MachineOperand &Op : MI->operands()) { - if (!(Op.isReg() && Op.getReg() == Reg)) - continue; + if (Reg.isVirtual()) { + // Iterate over all operands to find relevant definitions + bool HasDef = false; + for (const MachineOperand &Op : MI->operands()) { + if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg)) + continue; + + // Compute lanes defined and overlap with use + LaneBitmask OpLanes = + Op.isUndef() ? LaneBitmask::getAll() + : TRI->getSubRegIndexLaneMask(Op.getSubReg()); + LaneBitmask Overlap = (UseLanes & OpLanes); - // Does this def cover whole register? 
- bool DefinesFullReg = - Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg; - if (!DefinesFullReg) { - // Partial definition; need to follow and mark input value + // Record if this instruction defined any of use + HasDef |= Overlap.any(); + + // Mark any lanes defined + DefinedLanes |= OpLanes; + } + + // Check if all lanes of use have been defined + if ((DefinedLanes & UseLanes) != UseLanes) { + // Definition not complete; need to process input value LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI)); if (const VNInfo *VN = LRQ.valueIn()) { - if (!Visited.count(VN)) - ToProcess.push_back(VN); + if (!Visited.count(VisitKey(VN, DefinedLanes))) + NextValue = VN; } } + + // Only mark the instruction if it defines some part of the use + if (HasDef) + markInstruction(*MI, Flag, Worklist); + } else { + // For physical registers simply mark the defining instruction + markInstruction(*MI, Flag, Worklist); } } - } while (!ToProcess.empty()); -} -/// Mark all instructions defining the uses in \p MI with \p Flag. -void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, - std::vector<WorkItem> &Worklist) { + if (!NextValue && !PhiStack.empty()) { + // Reach end of chain; revert to processing last phi + PhiEntry &Entry = PhiStack.back(); + NextValue = Entry.Phi; + NextPredIdx = Entry.PredIdx; + DefinedLanes = Entry.DefinedLanes; + PhiStack.pop_back(); + } - LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " - << MI); + Value = NextValue; + } while (Value); +} - for (const MachineOperand &Use : MI.uses()) { - if (!Use.isReg() || !Use.isUse()) - continue; +void SIWholeQuadMode::markOperand(const MachineInstr &MI, + const MachineOperand &Op, char Flag, + std::vector<WorkItem> &Worklist) { + assert(Op.isReg()); + Register Reg = Op.getReg(); - Register Reg = Use.getReg(); + // Ignore some hardware registers + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + return; + default: + break; + } + LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op + << " for " << MI); + if (Reg.isVirtual()) { + LiveRange &LR = LIS->getInterval(Reg); + markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist); + } else { // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. - if (!Reg.isVirtual()) { - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) + for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid(); + ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) continue; - for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid(); - ++RegUnit) { - LiveRange &LR = LIS->getRegUnit(*RegUnit); - const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); - if (!Value) - continue; + markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); + } + } +} - if (MRI->isSSA()) { - // Since we're in machine SSA, we do not need to track physical - // registers across basic blocks. - if (Value->isPHIDef()) - continue; - markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, - Worklist); - } else { - markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); - } - } +/// Mark all instructions defining the uses in \p MI with \p Flag. 
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist) { + LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " + << MI); + for (const MachineOperand &Use : MI.uses()) { + if (!Use.isReg() || !Use.isUse()) continue; - } - - if (MRI->isSSA()) { - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, Flag, Worklist); - } else { - LiveRange &LR = LIS->getInterval(Reg); - markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist); - } + markOperand(MI, Use, Flag, Worklist); } } @@ -392,6 +504,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, char Flags = 0; if (TII->isWQM(Opcode)) { + // If LOD is not supported WQM is not needed. + if (!ST->hasExtendedImageInsts()) + continue; // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. @@ -407,27 +522,31 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, LowerToCopyInstrs.push_back(&MI); SoftWQMInstrs.push_back(&MI); continue; - } else if (Opcode == AMDGPU::WWM) { - // The WWM intrinsic doesn't make the same guarantee, and plus it needs - // to be executed in WQM or Exact so that its copy doesn't clobber - // inactive lanes. - markInstructionUses(MI, StateWWM, Worklist); - GlobalFlags |= StateWWM; + } else if (Opcode == AMDGPU::STRICT_WWM) { + // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus + // it needs to be executed in WQM or Exact so that its copy doesn't + // clobber inactive lanes. + markInstructionUses(MI, StateStrictWWM, Worklist); + GlobalFlags |= StateStrictWWM; + LowerToMovInstrs.push_back(&MI); + continue; + } else if (Opcode == AMDGPU::STRICT_WQM) { + // STRICT_WQM is similar to STRICTWWM, but instead of enabling all + // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in + // quads that have at least one active thread. + markInstructionUses(MI, StateStrictWQM, Worklist); + GlobalFlags |= StateStrictWQM; LowerToMovInstrs.push_back(&MI); continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { - III.Disabled = StateWWM; + III.Disabled = StateStrict; MachineOperand &Inactive = MI.getOperand(2); if (Inactive.isReg()) { if (Inactive.isUndef()) { LowerToCopyInstrs.push_back(&MI); } else { - Register Reg = Inactive.getReg(); - if (Reg.isVirtual()) { - for (MachineInstr &DefMI : MRI->def_instructions(Reg)) - markInstruction(DefMI, StateWWM, Worklist); - } + markOperand(MI, Inactive, StateStrictWWM, Worklist); } } SetInactiveInstrs.push_back(&MI); @@ -439,15 +558,21 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, Worklist.push_back(&MBB); } GlobalFlags |= StateExact; - III.Disabled = StateWQM | StateWWM; + III.Disabled = StateWQM | StateStrict; continue; } else { - if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || + Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR || + Opcode == AMDGPU::SI_DEMOTE_I1) { + KillInstrs.push_back(&MI); + BBI.NeedsLowering = true; } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are // only used, outputs are only defined. + // FIXME: is this still valid? 
for (const MachineOperand &MO : MI.defs()) { if (!MO.isReg()) continue; @@ -510,7 +635,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { - char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds; + char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { @@ -526,10 +651,12 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (II.Needs != 0) markInstructionUses(MI, II.Needs, Worklist); - // Ensure we process a block containing WWM, even if it does not require any - // WQM transitions. - if (II.Needs & StateWWM) - BI.Needs |= StateWWM; + // Ensure we process a block containing StrictWWM/StrictWQM, even if it does + // not require any WQM transitions. + if (II.Needs & StateStrictWWM) + BI.Needs |= StateStrictWWM; + if (II.Needs & StateStrictWQM) + BI.Needs |= StateStrictWQM; } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -604,6 +731,339 @@ SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, return Restore; } +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " + << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = + BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS); + + // Convert last instruction in block to a terminator. + // Note: this only covers the expected patterns + unsigned NewOpcode = 0; + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + NewOpcode = AMDGPU::S_AND_B32_term; + break; + case AMDGPU::S_AND_B64: + NewOpcode = AMDGPU::S_AND_B64_term; + break; + case AMDGPU::S_MOV_B32: + NewOpcode = AMDGPU::S_MOV_B32_term; + break; + case AMDGPU::S_MOV_B64: + NewOpcode = AMDGPU::S_MOV_B64_term; + break; + default: + break; + } + if (NewOpcode) + TermMI->setDesc(TII->get(NewOpcode)); + + if (SplitBB != BB) { + // Update dominator trees + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, BB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB}); + if (MDT) + MDT->getBase().applyUpdates(DTUpdates); + if (PDT) + PDT->getBase().applyUpdates(DTUpdates); + + // Link blocks + MachineInstr *MI = + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + LIS->InsertMachineInstrInMaps(*MI); + } + + return SplitBB; +} + +MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, + MachineInstr &MI) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opcode = 0; + + assert(MI.getOperand(0).isReg()); + + // Comparison is for live lanes; however here we compute the inverse + // (killed lanes). This is because VCMP will always generate 0 bits + // for inactive lanes so a mask of live lanes would not be correct + // inside control flow. + // Invert the comparison by swapping the operands and adjusting + // the comparison codes. 
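Because V_CMP writes 0 for inactive lanes, a direct "live lanes" compare would be wrong under divergent control flow, so the lowering here computes the killed lanes instead and clears them from both the live mask and EXEC with ANDN2. A minimal sketch of that mask arithmetic for a wave64, kept separate from the BuildMI sequence below; the helper name is illustrative:

#include <cstdint>

// Sketch of the S_ANDN2_B64 updates emitted below: killed lanes (VCC) are
// removed from the live-lane mask and from EXEC.
static uint64_t clearKilledLanes(uint64_t Mask, uint64_t KilledLanes) {
  return Mask & ~KilledLanes;
}
// LiveMask = clearKilledLanes(LiveMask, VCC);
// EXEC     = clearKilledLanes(EXEC, VCC);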
+ + switch (MI.getOperand(2).getImm()) { + case ISD::SETUEQ: + Opcode = AMDGPU::V_CMP_LG_F32_e64; + break; + case ISD::SETUGT: + Opcode = AMDGPU::V_CMP_GE_F32_e64; + break; + case ISD::SETUGE: + Opcode = AMDGPU::V_CMP_GT_F32_e64; + break; + case ISD::SETULT: + Opcode = AMDGPU::V_CMP_LE_F32_e64; + break; + case ISD::SETULE: + Opcode = AMDGPU::V_CMP_LT_F32_e64; + break; + case ISD::SETUNE: + Opcode = AMDGPU::V_CMP_EQ_F32_e64; + break; + case ISD::SETO: + Opcode = AMDGPU::V_CMP_O_F32_e64; + break; + case ISD::SETUO: + Opcode = AMDGPU::V_CMP_U_F32_e64; + break; + case ISD::SETOEQ: + case ISD::SETEQ: + Opcode = AMDGPU::V_CMP_NEQ_F32_e64; + break; + case ISD::SETOGT: + case ISD::SETGT: + Opcode = AMDGPU::V_CMP_NLT_F32_e64; + break; + case ISD::SETOGE: + case ISD::SETGE: + Opcode = AMDGPU::V_CMP_NLE_F32_e64; + break; + case ISD::SETOLT: + case ISD::SETLT: + Opcode = AMDGPU::V_CMP_NGT_F32_e64; + break; + case ISD::SETOLE: + case ISD::SETLE: + Opcode = AMDGPU::V_CMP_NGE_F32_e64; + break; + case ISD::SETONE: + case ISD::SETNE: + Opcode = AMDGPU::V_CMP_NLG_F32_e64; + break; + default: + llvm_unreachable("invalid ISD:SET cond code"); + } + + // Pick opcode based on comparison type. + MachineInstr *VcmpMI; + const MachineOperand &Op0 = MI.getOperand(0); + const MachineOperand &Op1 = MI.getOperand(1); + if (TRI->isVGPR(*MRI, Op0.getReg())) { + Opcode = AMDGPU::getVOPe32(Opcode); + VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); + } else { + VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .addReg(AMDGPU::VCC, RegState::Define) + .addImm(0) // src0 modifiers + .add(Op1) + .addImm(0) // src1 modifiers + .add(Op0) + .addImm(0); // omod + } + + // VCC represents lanes killed. + Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + + MachineInstr *MaskUpdateMI = + BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .addReg(VCC); + + // State of SCC represents whether any lanes are live in mask, + // if SCC is 0 then no lanes will be alive anymore. + MachineInstr *EarlyTermMI = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); + + MachineInstr *ExecMaskMI = + BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); + + assert(MBB.succ_size() == 1); + MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(*MBB.succ_begin()); + + // Update live intervals + LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI); + MBB.remove(&MI); + + LIS->InsertMachineInstrInMaps(*MaskUpdateMI); + LIS->InsertMachineInstrInMaps(*ExecMaskMI); + LIS->InsertMachineInstrInMaps(*EarlyTermMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + return NewTerm; +} + +MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, + MachineInstr &MI, bool IsWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *MaskUpdateMI = nullptr; + + const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1); + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + MachineInstr *ComputeKilledMaskMI = nullptr; + Register CndReg = !Op.isImm() ? Op.getReg() : Register(); + Register TmpReg; + + // Is this a static or dynamic kill? 
+ if (Op.isImm()) { + if (Op.getImm() == KillVal) { + // Static: all active lanes are killed + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .addReg(Exec); + } else { + // Static: kill does nothing + MachineInstr *NewTerm = nullptr; + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + LIS->RemoveMachineInstrFromMaps(MI); + } else { + assert(MBB.succ_size() == 1); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(*MBB.succ_begin()); + LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); + } + MBB.remove(&MI); + return NewTerm; + } + } else { + if (!KillVal) { + // Op represents live lanes after kill, + // so exec mask needs to be factored in. + TmpReg = MRI->createVirtualRegister(TRI->getBoolRC()); + ComputeKilledMaskMI = + BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec); + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .addReg(TmpReg); + } else { + // Op represents lanes to kill + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .add(Op); + } + } + + // State of SCC represents whether any lanes are live in mask, + // if SCC is 0 then no lanes will be alive anymore. + MachineInstr *EarlyTermMI = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); + + // In the case we got this far some lanes are still live, + // update EXEC to deactivate lanes as appropriate. + MachineInstr *NewTerm; + MachineInstr *WQMMaskMI = nullptr; + Register LiveMaskWQM; + if (IsDemote) { + // Demotes deactive quads with only helper lanes + LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); + WQMMaskMI = + BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec) + .addReg(Exec) + .addReg(LiveMaskWQM); + } else { + // Kills deactivate lanes + if (Op.isImm()) { + unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); + } else if (!IsWQM) { + NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) + .addReg(Exec) + .addReg(LiveMaskReg); + } else { + unsigned Opcode = KillVal ? AndN2Opc : AndOpc; + NewTerm = + BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); + } + } + + // Update live intervals + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + assert(EarlyTermMI); + assert(MaskUpdateMI); + assert(NewTerm); + if (ComputeKilledMaskMI) + LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI); + LIS->InsertMachineInstrInMaps(*MaskUpdateMI); + LIS->InsertMachineInstrInMaps(*EarlyTermMI); + if (WQMMaskMI) + LIS->InsertMachineInstrInMaps(*WQMMaskMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + if (CndReg) { + LIS->removeInterval(CndReg); + LIS->createAndComputeVirtRegInterval(CndReg); + } + if (TmpReg) + LIS->createAndComputeVirtRegInterval(TmpReg); + if (LiveMaskWQM) + LIS->createAndComputeVirtRegInterval(LiveMaskWQM); + + return NewTerm; +} + +// Replace (or supplement) instructions accessing live mask. +// This can only happen once all the live mask registers have been created +// and the execute state (WQM/StrictWWM/Exact) of instructions is known. 
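The demote path in lowerKillI1 above keeps a quad alive as long as any of its lanes is still live: it applies the wave's WQM operation to the live mask and ANDs the result into EXEC. A minimal sketch of that whole-quad expansion for a wave64, assuming the usual grouping of four consecutive lanes per quad; the helper name is illustrative, not code from this patch:

#include <cstdint>

// Sketch of S_WQM_B64 as used by the demote lowering: every 4-lane quad with
// at least one live lane becomes fully enabled, so its helper lanes survive.
static uint64_t wholeQuadMask(uint64_t LiveMask) {
  uint64_t Result = 0;
  for (unsigned Quad = 0; Quad < 64; Quad += 4)
    if ((LiveMask >> Quad) & 0xF)
      Result |= 0xFull << Quad;
  return Result;
}
// Demote: EXEC = EXEC & wholeQuadMask(LiveMask);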
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + const BlockInfo &BI = BII->second; + if (!BI.NeedsLowering) + return; + + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); + + SmallVector<MachineInstr *, 4> SplitPoints; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) + State = StateTransition[&MI]; + + MachineInstr *SplitPoint = nullptr; + switch (MI.getOpcode()) { + case AMDGPU::SI_DEMOTE_I1: + case AMDGPU::SI_KILL_I1_TERMINATOR: + SplitPoint = lowerKillI1(MBB, MI, State == StateWQM); + break; + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + SplitPoint = lowerKillF32(MBB, MI); + break; + default: + break; + } + if (SplitPoint) + SplitPoints.push_back(SplitPoint); + + II = Next; + } + + // Perform splitting after instruction scan to simplify iteration. + if (!SplitPoints.empty()) { + MachineBasicBlock *BB = &MBB; + for (MachineInstr *MI : SplitPoints) { + BB = splitBlock(BB, MI); + } + } +} + // Return an iterator in the (inclusive) range [First, Last] at which // instructions can be safely inserted, keeping in mind that some of the // instructions we want to add necessarily clobber SCC. @@ -680,93 +1140,108 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveWQM, unsigned LiveMaskReg) { + Register SaveWQM) { MachineInstr *MI; if (SaveWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64), - SaveWQM) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM) .addReg(LiveMaskReg); } else { - unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64), - Exec) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec) .addReg(Exec) .addReg(LiveMaskReg); } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedWQM) { + Register SavedWQM) { MachineInstr *MI; - unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; if (SavedWQM) { MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? 
- AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec); } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } -void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, - MachineBasicBlock::iterator Before, - unsigned SaveOrig) { +void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + Register SaveOrig, char StrictStateNeeded) { MachineInstr *MI; - assert(SaveOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) - .addImm(-1); + assert(StrictStateNeeded == StateStrictWWM || + StrictStateNeeded == StateStrictWQM); + + if (StrictStateNeeded == StateStrictWWM) { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), + SaveOrig) + .addImm(-1); + } else { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), + SaveOrig) + .addImm(-1); + } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateStrictWWM; } -void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, - MachineBasicBlock::iterator Before, - unsigned SavedOrig) { +void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + Register SavedOrig, char NonStrictState, + char CurrentStrictState) { MachineInstr *MI; assert(SavedOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), - ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) - .addReg(SavedOrig); + assert(CurrentStrictState == StateStrictWWM || + CurrentStrictState == StateStrictWQM); + + if (CurrentStrictState == StateStrictWWM) { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), + Exec) + .addReg(SavedOrig); + } else { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), + Exec) + .addReg(SavedOrig); + } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonStrictState; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; - const BlockInfo &BI = BII->second; + BlockInfo &BI = BII->second; // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) + if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; + } LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); - unsigned SavedWQMReg = 0; - unsigned SavedNonWWMReg = 0; - bool WQMFromExec = isEntry; - char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; - char NonWWMState = 0; + Register SavedWQMReg; + Register SavedNonStrictReg; + bool WQMFromExec = IsEntry; + char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; + char NonStrictState = 0; const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); - if (isEntry) { + if (IsEntry) { // Skip the instruction that saves LiveMask if (II != IE && II->getOpcode() == AMDGPU::COPY) ++II; @@ -776,22 +1251,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, // Exact or vice versa. MachineBasicBlock::iterator FirstWQM = IE; - // This stores the first instruction where it's safe to switch from WWM to - // Exact/WQM or to switch to WWM. 
It must always be the same as, or after, - // FirstWQM since if it's safe to switch to/from WWM, it must be safe to - // switch to/from WQM as well. - MachineBasicBlock::iterator FirstWWM = IE; + // This stores the first instruction where it's safe to switch from Strict + // mode to Exact/WQM or to switch to Strict mode. It must always be the same + // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must + // be safe to switch to/from WQM as well. + MachineBasicBlock::iterator FirstStrict = IE; + + // Record initial state is block information. + BI.InitialState = State; for (;;) { MachineBasicBlock::iterator Next = II; - char Needs = StateExact | StateWQM; // WWM is disabled by default + char Needs = StateExact | StateWQM; // Strict mode is disabled by default. char OutNeeds = 0; if (FirstWQM == IE) FirstWQM = II; - if (FirstWWM == IE) - FirstWWM = II; + if (FirstStrict == IE) + FirstStrict = II; // First, figure out the allowed states (Needs) based on the propagated // flags. @@ -801,8 +1279,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) { auto III = Instructions.find(&MI); if (III != Instructions.end()) { - if (III->second.Needs & StateWWM) - Needs = StateWWM; + if (III->second.Needs & StateStrictWWM) + Needs = StateStrictWWM; + else if (III->second.Needs & StateStrictWQM) + Needs = StateStrictWQM; else if (III->second.Needs & StateWQM) Needs = StateWQM; else @@ -811,8 +1291,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, } } else { // If the instruction doesn't actually need a correct EXEC, then we can - // safely leave WWM enabled. - Needs = StateExact | StateWQM | StateWWM; + // safely leave Strict mode enabled. + Needs = StateExact | StateWQM | StateStrict; } if (MI.isTerminator() && OutNeeds == StateExact) @@ -832,32 +1312,56 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, // Now, transition if necessary. if (!(Needs & State)) { MachineBasicBlock::iterator First; - if (State == StateWWM || Needs == StateWWM) { - // We must switch to or from WWM - First = FirstWWM; + if (State == StateStrictWWM || Needs == StateStrictWWM || + State == StateStrictWQM || Needs == StateStrictWQM) { + // We must switch to or from Strict mode. + First = FirstStrict; } else { - // We only need to switch to/from WQM, so we can use FirstWQM + // We only need to switch to/from WQM, so we can use FirstWQM. First = FirstWQM; } + // Whether we need to save SCC depends on start and end states. 
+ bool SaveSCC = false; + switch (State) { + case StateExact: + case StateStrictWWM: + case StateStrictWQM: + // Exact/Strict -> Strict: save SCC + // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec + // Exact/Strict -> Exact: no save + SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); + break; + case StateWQM: + // WQM -> Exact/Strict: save SCC + SaveSCC = !(Needs & StateWQM); + break; + default: + llvm_unreachable("Unknown state"); + break; + } MachineBasicBlock::iterator Before = - prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); - - if (State == StateWWM) { - assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); - LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); - SavedNonWWMReg = 0; - State = NonWWMState; + prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC); + + if (State & StateStrict) { + assert(State == StateStrictWWM || State == StateStrictWQM); + assert(SavedNonStrictReg); + fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State); + + LIS->createAndComputeVirtRegInterval(SavedNonStrictReg); + SavedNonStrictReg = 0; + State = NonStrictState; } - if (Needs == StateWWM) { - NonWWMState = State; - assert(!SavedNonWWMReg); - SavedNonWWMReg = MRI->createVirtualRegister(BoolRC); - toWWM(MBB, Before, SavedNonWWMReg); - State = StateWWM; + if (Needs & StateStrict) { + NonStrictState = State; + assert(Needs == StateStrictWWM || Needs == StateStrictWQM); + assert(!SavedNonStrictReg); + SavedNonStrictReg = MRI->createVirtualRegister(BoolRC); + + toStrictMode(MBB, Before, SavedNonStrictReg, Needs); + State = Needs; + } else { if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) { @@ -865,7 +1369,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, SavedWQMReg = MRI->createVirtualRegister(BoolRC); } - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -879,17 +1383,18 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, } State = StateWQM; } else { - // We can get here if we transitioned from WWM to a non-WWM state that - // already matches our needs, but we shouldn't need to do anything. + // We can get here if we transitioned from StrictWWM to a + // non-StrictWWM state that already matches our needs, but we + // shouldn't need to do anything. assert(Needs & State); } } } - if (Needs != (StateExact | StateWQM | StateWWM)) { + if (Needs != (StateExact | StateWQM | StateStrict)) { if (Needs != (StateExact | StateWQM)) FirstWQM = IE; - FirstWWM = IE; + FirstStrict = IE; } if (II == IE) @@ -898,10 +1403,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, II = Next; } assert(!SavedWQMReg); - assert(!SavedNonWWMReg); + assert(!SavedNonStrictReg); } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +void SIWholeQuadMode::lowerLiveMaskQueries() { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -931,9 +1436,12 @@ void SIWholeQuadMode::lowerCopyInstrs() { const unsigned MovOp = TII->getMovOpcode(regClass); MI->setDesc(TII->get(MovOp)); - // And make it implicitly depend on exec (like all VALU movs should do). 
- MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - } else if (!MRI->isSSA()) { + // Check that it already implicitly depends on exec (like all VALU movs + // should do). + assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) { + return MO.isUse() && MO.getReg() == AMDGPU::EXEC; + })); + } else { // Remove early-clobber and exec dependency from simple SGPR copies. // This allows some to be eliminated during/post RA. LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); @@ -969,13 +1477,38 @@ void SIWholeQuadMode::lowerCopyInstrs() { } } +void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { + for (MachineInstr *MI : KillInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *SplitPoint = nullptr; + switch (MI->getOpcode()) { + case AMDGPU::SI_DEMOTE_I1: + case AMDGPU::SI_KILL_I1_TERMINATOR: + SplitPoint = lowerKillI1(*MBB, *MI, IsWQM); + break; + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + SplitPoint = lowerKillF32(*MBB, *MI); + break; + default: + continue; + } + if (SplitPoint) + splitBlock(MBB, SplitPoint); + } +} + bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() + << " ------------- \n"); + LLVM_DEBUG(MF.dump();); + Instructions.clear(); Blocks.clear(); LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); - CallingConv = MF.getFunction().getCallingConv(); + KillInstrs.clear(); + StateTransition.clear(); ST = &MF.getSubtarget<GCNSubtarget>(); @@ -983,64 +1516,72 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); + MDT = &getAnalysis<MachineDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTree>(); if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; - XorTermrOpc = AMDGPU::S_XOR_B32_term; + AndN2Opc = AMDGPU::S_ANDN2_B32; + XorOpc = AMDGPU::S_XOR_B32; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + WQMOpc = AMDGPU::S_WQM_B32; Exec = AMDGPU::EXEC_LO; } else { AndOpc = AMDGPU::S_AND_B64; - XorTermrOpc = AMDGPU::S_XOR_B64_term; + AndN2Opc = AMDGPU::S_ANDN2_B64; + XorOpc = AMDGPU::S_XOR_B64; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + WQMOpc = AMDGPU::S_WQM_B64; Exec = AMDGPU::EXEC; } - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); + + LiveMaskReg = Exec; - lowerLiveMaskQueries(LiveMaskReg); + // Shader is simple does not need any state changes or any complex lowering + if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && + LowerToMovInstrs.empty() && KillInstrs.empty()) { + lowerLiveMaskQueries(); + return 
!LiveMaskQueries.empty(); + } - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - auto MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32 - : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; - } + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); } LLVM_DEBUG(printInfo()); + lowerLiveMaskQueries(); lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + // Shader only needs WQM + if (GlobalFlags == StateWQM) { + auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + lowerKillInstrs(true); + } else { + for (auto BII : Blocks) + processBlock(*BII.first, BII.first == &Entry); + // Lowering blocks causes block splitting so perform as a second pass. + for (auto BII : Blocks) + lowerBlock(*BII.first); + } - if (LiveMaskReg) + // Compute live range for live mask + if (LiveMaskReg != Exec) LIS->createAndComputeVirtRegInterval(LiveMaskReg); // Physical registers like SCC aren't tracked by default anyway, so just @@ -1048,5 +1589,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // the analysis results. LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + // If we performed any kills then recompute EXEC + if (!KillInstrs.empty()) + LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + return true; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 5b8896c21832..8502ed61b366 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -57,10 +57,19 @@ class SM_Real <SM_Pseudo ps> Instruction Opcode = !cast<Instruction>(NAME); // copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AsmMatchConverter = ps.AsmMatchConverter; + let LGKM_CNT = ps.LGKM_CNT; + let SMRD = ps.SMRD; + let mayStore = ps.mayStore; + let mayLoad = ps.mayLoad; + let hasSideEffects = ps.hasSideEffects; let UseNamedOperandTable = ps.UseNamedOperandTable; - let SMRD = ps.SMRD; + let SchedRW = ps.SchedRW; + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; + + let TSFlags = ps.TSFlags; bit is_buffer = ps.is_buffer; @@ -69,6 +78,7 @@ class SM_Real <SM_Pseudo ps> bits<7> sdst; bits<32> offset; bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); + bits<5> cpol; } class SM_Probe_Pseudo <string opName, dag ins, bit isImm> @@ -120,8 +130,8 @@ multiclass SM_Pseudo_Loads<string opName, RegisterClass dstClass> { def _IMM : SM_Load_Pseudo <opName, (outs dstClass:$sdst), - (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc), - " $sdst, $sbase, $offset$glc$dlc", []> { + (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol), + " $sdst, $sbase, $offset$cpol", []> { let offset_is_imm = 1; let BaseClass = baseClass; let 
PseudoInstr = opName # "_IMM"; @@ -131,8 +141,8 @@ multiclass SM_Pseudo_Loads<string opName, def _SGPR : SM_Load_Pseudo <opName, (outs dstClass:$sdst), - (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc), - " $sdst, $sbase, $offset$glc$dlc", []> { + (ins baseClass:$sbase, SReg_32:$soff, CPol:$cpol), + " $sdst, $sbase, $offset$cpol", []> { let BaseClass = baseClass; let PseudoInstr = opName # "_SGPR"; let has_glc = 1; @@ -144,8 +154,8 @@ multiclass SM_Pseudo_Stores<string opName, RegisterClass baseClass, RegisterClass srcClass> { def _IMM : SM_Store_Pseudo <opName, - (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc), - " $sdata, $sbase, $offset$glc$dlc", []> { + (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, CPol:$cpol), + " $sdata, $sbase, $offset$cpol", []> { let offset_is_imm = 1; let BaseClass = baseClass; let SrcClass = srcClass; @@ -153,8 +163,8 @@ multiclass SM_Pseudo_Stores<string opName, } def _SGPR : SM_Store_Pseudo <opName, - (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc), - " $sdata, $sbase, $offset$glc$dlc", []> { + (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, CPol:$cpol), + " $sdata, $sbase, $offset$cpol", []> { let BaseClass = baseClass; let SrcClass = srcClass; let PseudoInstr = opName # "_SGPR"; @@ -227,24 +237,32 @@ class SM_Atomic_Pseudo <string opName, let ScalarStore = 1; let hasSideEffects = 1; let maybeAtomic = 1; + + let IsAtomicNoRet = !not(isRet); + let IsAtomicRet = isRet; + + let AsmMatchConverter = "cvtSMEMAtomic"; } class SM_Pseudo_Atomic<string opName, RegisterClass baseClass, RegisterClass dataClass, bit isImm, - bit isRet> : + bit isRet, + string opNameWithSuffix = opName # !if(isImm, + !if(isRet, "_IMM_RTN", "_IMM"), + !if(isRet, "_SGPR_RTN", "_SGPR")), + Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> : SM_Atomic_Pseudo<opName, !if(isRet, (outs dataClass:$sdst), (outs)), !if(isImm, - (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, DLC:$dlc), - (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)), - !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc", - isRet> { + (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, CPolTy:$cpol), + (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, CPolTy:$cpol)), + !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset$cpol", + isRet>, + AtomicNoRet <opNameWithSuffix, isRet> { let offset_is_imm = isImm; - let PseudoInstr = opName # !if(isImm, - !if(isRet, "_IMM_RTN", "_IMM"), - !if(isRet, "_SGPR_RTN", "_SGPR")); + let PseudoInstr = opNameWithSuffix; let Constraints = !if(isRet, "$sdst = $sdata", ""); let DisableEncoding = !if(isRet, "$sdata", ""); @@ -456,13 +474,13 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps, SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_si : SMRD_Real_si <op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol); } // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _SGPR_si : SMRD_Real_si <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); } } @@ -490,32 +508,31 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> : SM_Real<ps> , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> , Enc64 { - bit glc; - let 
AssemblerPredicate = isGFX8GFX9; let DecoderNamespace = "GFX8"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); - let Inst{16} = !if(ps.has_glc, glc, ?); + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); let Inst{17} = imm; let Inst{25-18} = op; let Inst{31-26} = 0x30; //encoding // VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed. // Offset value is corrected accordingly when offset is encoded/decoded. - let Inst{52-32} = !if(ps.has_offset, offset{20-0}, ?); + let Inst{38-32} = !if(ps.has_offset, offset{6-0}, ?); + let Inst{52-39} = !if(ps.has_offset, !if(imm, offset{20-7}, ?), ?); } multiclass SM_Real_Loads_vi<bits<8> op, string ps, SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_vi : SMEM_Real_vi <op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_vi : SMEM_Real_vi <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); } } @@ -533,11 +550,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps, // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _IMM_vi : SMEM_Real_Store_vi <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); } } @@ -589,15 +606,16 @@ defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">; //===----------------------------------------------------------------------===// class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps> - : SMEM_Real_vi <op, ps> { + : SMEM_Real_vi <op, ps>, + AtomicNoRet <!subst("_RTN","",NAME), ps.glc> { bits<7> sdata; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; - let glc = ps.glc; - let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0}); + let cpol{CPolBit.GLC} = ps.glc; + let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0}); } multiclass SM_Real_Atomics_vi<bits<8> op, string ps> { @@ -686,13 +704,7 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> : let AssemblerPredicate = isGFX7Only; let DecoderNamespace = "GFX7"; - let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc); - - let LGKM_CNT = ps.LGKM_CNT; - let mayLoad = ps.mayLoad; - let mayStore = ps.mayStore; - let hasSideEffects = ps.hasSideEffects; - let SchedRW = ps.SchedRW; + let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, CPol:$cpol); let Inst{7-0} = 0xff; let Inst{8} = 0; @@ -764,26 +776,26 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { // 1. IMM offset def : GCNPat < (smrd_load (SMRDImm i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0, 0)) + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0)) >; // 2. 
32-bit IMM offset on CI def : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> { + (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { let OtherPredicates = [isGFX7Only]; } // 3. SGPR offset def : GCNPat < (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0, 0)) + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) >; // 4. No offset def : GCNPat < (vt (smrd_load (i64 SReg_64:$sbase))), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0, 0)) + (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0)) >; } @@ -791,8 +803,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { // 1. Offset as an immediate def : GCNPat < (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_glc $cachepolicy), - (extract_dlc $cachepolicy)))> { + (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> { let AddedComplexity = 2; } @@ -800,7 +811,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { def : GCNPat < (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)), (!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset, - (extract_glc $cachepolicy), (extract_dlc $cachepolicy))> { + (extract_cpol $cachepolicy))> { let OtherPredicates = [isGFX7Only]; let AddedComplexity = 1; } @@ -808,8 +819,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { // 3. Offset loaded in an 32bit SGPR def : GCNPat < (SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_glc $cachepolicy), - (extract_dlc $cachepolicy))) + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_cpol $cachepolicy))) >; } @@ -858,14 +868,16 @@ def : GCNPat < >; } // let OtherPredicates = [HasSMemTimeInst] -let OtherPredicates = [HasNoSMemTimeInst] in { +let OtherPredicates = [HasShaderCyclesRegister] in { def : GCNPat < (i64 (readcyclecounter)), (REG_SEQUENCE SReg_64, (S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0, - (S_MOV_B32 (i32 0)), sub1) ->; -} // let OtherPredicates = [HasNoSMemTimeInst] + (S_MOV_B32 (i32 0)), sub1)> { + // Prefer this to s_memtime because it has lower and more predictable latency. + let AddedComplexity = 1; +} +} // let OtherPredicates = [HasShaderCyclesRegister] //===----------------------------------------------------------------------===// // GFX10. 
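The new readcyclecounter pattern above prefers the SHADER_CYCLES hardware register over s_memtime when the subtarget exposes it. As an aid to reading the REG_SEQUENCE in that pattern, here is a standalone C++ sketch of the value the selected instructions compute; the function name is invented for illustration, and the wrap-around caveat follows directly from the zeroed high half.

  // Low 32 bits come from S_GETREG_B32 of HWREG.SHADER_CYCLES (sub0);
  // the high half is S_MOV_B32 0 (sub1), so the counter wraps modulo 2^32.
  #include <cstdint>

  uint64_t composeShaderCyclesResult(uint32_t ShaderCyclesLow) {
    uint64_t Lo = ShaderCyclesLow; // sub0 of the REG_SEQUENCE
    uint64_t Hi = 0;               // sub1 of the REG_SEQUENCE
    return (Hi << 32) | Lo;
  }

Because the high half is constant zero, callers that need a monotonically increasing 64-bit count still have to handle the 32-bit wrap themselves; the pattern only trades that for the lower, more predictable latency noted in the comment.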
@@ -873,16 +885,13 @@ def : GCNPat < class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> : SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 { - bit glc; - bit dlc; - let AssemblerPredicate = isGFX10Plus; let DecoderNamespace = "GFX10"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); - let Inst{14} = !if(ps.has_dlc, dlc, ?); - let Inst{16} = !if(ps.has_glc, glc, ?); + let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?); + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); let Inst{25-18} = op; let Inst{31-26} = 0x3d; let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?); @@ -894,10 +903,10 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps, SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); } } @@ -914,11 +923,11 @@ multiclass SM_Real_Stores_gfx10<bits<8> op, string ps, // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); } } @@ -973,18 +982,18 @@ defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">; defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">; class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps> - : SMEM_Real_gfx10 <op, ps> { + : SMEM_Real_gfx10 <op, ps>, + AtomicNoRet <!subst("_RTN","",NAME), ps.glc> { bits<7> sdata; - bit dlc; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; - let glc = ps.glc; + let cpol{CPolBit.GLC} = ps.glc; - let Inst{14} = !if(ps.has_dlc, dlc, 0); - let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0}); + let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0); + let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0}); } multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 7426af931a62..e9697017aac0 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -59,6 +59,8 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> : real_name # " " # ps.AsmOperands, []>, Enc32 { + let SALU = 1; + let SOP1 = 1; let isPseudo = 0; let isCodeGenOnly = 0; let Size = 4; @@ -66,6 +68,9 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = 
ps.mayStore; // encoding bits<7> sdst; @@ -157,7 +162,7 @@ let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def S_MOV_B32 : SOP1_32 <"s_mov_b32">; def S_MOV_B64 : SOP1_64 <"s_mov_b64">; - } // End isRematerializeable = 1 + } // End isReMaterializable = 1 let Uses = [SCC] in { def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">; @@ -192,10 +197,14 @@ def : GCNPat < >; } +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def S_BREV_B32 : SOP1_32 <"s_brev_b32", [(set i32:$sdst, (bitreverse i32:$src0))] >; -def S_BREV_B64 : SOP1_64 <"s_brev_b64">; +def S_BREV_B64 : SOP1_64 <"s_brev_b64", + [(set i64:$sdst, (bitreverse i64:$src0))] +>; +} // End isReMaterializable = 1, isAsCheapAsAMove = 1 let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; @@ -208,6 +217,7 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64", >; } // End Defs = [SCC] +let isReMaterializable = 1 in { def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64", @@ -235,11 +245,13 @@ def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16", [(set i32:$sdst, (sext_inreg i32:$src0, i16))] >; +} // End isReMaterializable = 1 def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>; def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>; + def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", [(set i64:$sdst, (int_amdgcn_s_getpc))] >; @@ -291,7 +303,9 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; } // End SubtargetPredicate = isGFX6GFX7GFX8GFX9 let Defs = [SCC] in { -def S_ABS_I32 : SOP1_32 <"s_abs_i32">; +def S_ABS_I32 : SOP1_32 <"s_abs_i32", + [(set i32:$sdst, (abs i32:$src0))] + >; } // End Defs = [SCC] let SubtargetPredicate = HasVGPRIndexMode in { @@ -309,6 +323,7 @@ let SubtargetPredicate = isGFX9Plus in { def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">; } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] + let isReMaterializable = 1 in def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">; } // End SubtargetPredicate = isGFX9Plus @@ -363,14 +378,19 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, real_name # " " # ps.AsmOperands, []>, Enc32 { + let SALU = 1; + let SOP2 = 1; let isPseudo = 0; let isCodeGenOnly = 0; // copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AsmMatchConverter = ps.AsmMatchConverter; + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; let UseNamedOperandTable = ps.UseNamedOperandTable; - let TSFlags = ps.TSFlags; + let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; // encoding bits<7> sdst; @@ -596,6 +616,7 @@ def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64", >; } // End Defs = [SCC] +let isReMaterializable = 1 in { def S_BFM_B32 : SOP2_32 <"s_bfm_b32", [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>; def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">; @@ -605,7 +626,7 @@ def S_MUL_I32 : SOP2_32 <"s_mul_i32", [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> { let isCommutable = 1; } - +} // End isReMaterializable = 1 } // End AddedComplexity = 1 let Defs = [SCC] in { @@ -640,9 +661,11 @@ let SubtargetPredicate = isGFX8GFX9 in { } let SubtargetPredicate = isGFX9Plus in { - def 
S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; - def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; - def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; + let isReMaterializable = 1 in { + def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; + def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; + def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; + } // End isReMaterializable = 1 let Defs = [SCC] in { def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32", @@ -659,12 +682,12 @@ let SubtargetPredicate = isGFX9Plus in { >; } // End Defs = [SCC] - let isCommutable = 1 in { + let isCommutable = 1, isReMaterializable = 1 in { def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32", [(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>; def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32", [(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>; - } + } // End isCommutable = 1, isReMaterializable = 1 } // End SubtargetPredicate = isGFX9Plus //===----------------------------------------------------------------------===// @@ -693,6 +716,8 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, class SOPK_Real<bits<5> op, SOPK_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # " " # ps.AsmOperands, []> { + let SALU = 1; + let SOPK = 1; let isPseudo = 0; let isCodeGenOnly = 0; @@ -701,6 +726,11 @@ class SOPK_Real<bits<5> op, SOPK_Pseudo ps> : let AsmMatchConverter = ps.AsmMatchConverter; let DisableEncoding = ps.DisableEncoding; let Constraints = ps.Constraints; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let isBranch = ps.isBranch; + let isCall = ps.isCall; // encoding bits<7> sdst; @@ -947,15 +977,20 @@ class SOPC_Real<bits<7> op, SOPC_Pseudo ps, string real_name = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, real_name # " " # ps.AsmOperands, []>, Enc32 { + let SALU = 1; + let SOPC = 1; let isPseudo = 0; let isCodeGenOnly = 0; // copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let OtherPredicates = ps.OtherPredicates; - let AsmMatchConverter = ps.AsmMatchConverter; + let SubtargetPredicate = ps.SubtargetPredicate; + let OtherPredicates = ps.OtherPredicates; + let AsmMatchConverter = ps.AsmMatchConverter; let UseNamedOperandTable = ps.UseNamedOperandTable; - let TSFlags = ps.TSFlags; + let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; // encoding bits<8> src0; @@ -1075,15 +1110,20 @@ class SOPPRelaxTable <bit isRelaxed, string keyName, string gfxip> { class SOPP_Real<bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []> { + let SALU = 1; + let SOPP = 1; let isPseudo = 0; let isCodeGenOnly = 0; // copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let OtherPredicates = ps.OtherPredicates; - let AsmMatchConverter = ps.AsmMatchConverter; + let SubtargetPredicate = ps.SubtargetPredicate; + let OtherPredicates = ps.OtherPredicates; + let AsmMatchConverter = ps.AsmMatchConverter; let UseNamedOperandTable = ps.UseNamedOperandTable; - let TSFlags = ps.TSFlags; + let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; bits <16> simm16; } @@ -1226,7 +1266,8 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16", 
[(int_amdgcn_s_waitcnt timm:$simm16)]>; -def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i16imm:$simm16), "$simm16">; +def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", + [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; // On SI the documentation says sleep for approximately 64 * low 2 @@ -1433,8 +1474,9 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> { //===----------------------------------------------------------------------===// multiclass SOP1_Real_gfx10<bits<8> op> { - def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, - Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx10 : SOP1_Real<op, ps>, + Select_gfx10<ps.Mnemonic>; } defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; @@ -1462,8 +1504,9 @@ defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; multiclass SOP1_Real_gfx6_gfx7<bits<8> op> { - def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, - Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx6_gfx7 : SOP1_Real<op, ps>, + Select_gfx6_gfx7<ps.Mnemonic>; } multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> : @@ -1524,8 +1567,9 @@ defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx10<bits<7> op> { - def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>, - Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOP2_Pseudo>(NAME); + def _gfx10 : SOP2_Real<op, ps>, + Select_gfx10<ps.Mnemonic>; } defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; @@ -1543,8 +1587,9 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { - def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP_Pseudo>(NAME)>, - Select_gfx6_gfx7<!cast<SOP_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOP_Pseudo>(NAME); + def _gfx6_gfx7 : SOP2_Real<op, ps>, + Select_gfx6_gfx7<ps.Mnemonic>; } multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> : @@ -1600,13 +1645,15 @@ defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; //===----------------------------------------------------------------------===// multiclass SOPK_Real32_gfx10<bits<5> op> { - def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, - Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx10 : SOPK_Real32<op, ps>, + Select_gfx10<ps.Mnemonic>; } multiclass SOPK_Real64_gfx10<bits<5> op> { - def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, - Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx10 : SOPK_Real64<op, ps>, + Select_gfx10<ps.Mnemonic>; } defm S_VERSION : SOPK_Real32_gfx10<0x001>; @@ -1623,13 +1670,15 @@ defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; //===----------------------------------------------------------------------===// multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> { - def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, - Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx6_gfx7 : SOPK_Real32<op, ps>, + Select_gfx6_gfx7<ps.Mnemonic>; } multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> { - def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, - Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPK_Pseudo>(NAME); + def 
_gfx6_gfx7 : SOPK_Real64<op, ps>, + Select_gfx6_gfx7<ps.Mnemonic>; } multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> : @@ -1665,21 +1714,24 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; //===----------------------------------------------------------------------===// multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic> { - def _gfx6_gfx7 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>, - Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>, - SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">; + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx6_gfx7 : SOPP_Real_32<op, ps, real_name>, + Select_gfx6_gfx7<ps.Mnemonic>, + SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">; } multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { - def _vi : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>, - Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>, - SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">; + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _vi : SOPP_Real_32<op, ps, real_name>, + Select_vi<ps.Mnemonic>, + SOPPRelaxTable<0, ps.KeyName, "_vi">; } multiclass SOPP_Real_32_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { - def _gfx10 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>, - Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>, - SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">; + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx10 : SOPP_Real_32<op, ps, real_name>, + Select_gfx10<ps.Mnemonic>, + SOPPRelaxTable<0, ps.KeyName, "_gfx10">; } multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : @@ -1693,21 +1745,24 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = //64 bit encodings, for Relaxation multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { - def _gfx6_gfx7 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>, - Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>, - SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">; + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx6_gfx7 : SOPP_Real_64<op, ps, real_name>, + Select_gfx6_gfx7<ps.Mnemonic>, + SOPPRelaxTable<1, ps.KeyName, "_gfx6_gfx7">; } multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { - def _vi : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>, - Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>, - SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">; + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _vi : SOPP_Real_64<op, ps, real_name>, + Select_vi<ps.Mnemonic>, + SOPPRelaxTable<1, ps.KeyName, "_vi">; } multiclass SOPP_Real_64_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { - def _gfx10 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>, - Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>, - SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">; + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx10 : SOPP_Real_64<op, ps, real_name>, + Select_gfx10<ps.Mnemonic>, + SOPPRelaxTable<1, ps.KeyName, "_gfx10">; } multiclass SOPP_Real_64_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : @@ -1727,18 +1782,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> { defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>; 
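The 32-bit and 64-bit SOPP branch encodings registered through SOPPRelaxTable above exist for branch relaxation: the short form only carries a 16-bit signed offset, so an out-of-range branch has to be rewritten into the longer sequence. The following standalone C++ sketch shows the range check involved; the assumption that the immediate counts 4-byte words relative to the next instruction is stated here for illustration and is not taken from this patch, and both function names are invented.

  #include <cstdint>

  // Assumption for illustration: simm16 is a signed displacement in 4-byte
  // words relative to the instruction following the branch.
  bool fitsShortSOPPBranch(int64_t ByteDisplacement) {
    int64_t Words = ByteDisplacement / 4;
    return Words >= INT16_MIN && Words <= INT16_MAX;
  }

  // If the displacement does not fit, relaxation would pick the longer form
  // recorded with isRelaxed = 1 in SOPPRelaxTable.
  const char *pickEncoding(int64_t ByteDisplacement) {
    return fitsShortSOPPBranch(ByteDisplacement) ? "SOPP_Real_32"
                                                 : "SOPP_Real_64";
  }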
defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">; -defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>; defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; -defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>; -defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>; -defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>; -defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>; -defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>; -defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>; -defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>; -defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>; -defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>; -defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>; defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>; defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>; @@ -1765,23 +1809,40 @@ defm S_ROUND_MODE : SOPP_Real_32_gfx10<0x024>; defm S_DENORM_MODE : SOPP_Real_32_gfx10<0x025>; defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>; +let isBranch = 1 in { +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>; +defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>; +defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>; +defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>; +defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>; +} + //===----------------------------------------------------------------------===// // SOPC - GFX6, GFX7, GFX8, GFX9, GFX10 //===----------------------------------------------------------------------===// multiclass SOPC_Real_gfx6_gfx7<bits<7> op> { - def _gfx6_gfx7 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, - Select_gfx6_gfx7<!cast<SOPC_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPC_Pseudo>(NAME); + def _gfx6_gfx7 : SOPC_Real<op, ps>, + Select_gfx6_gfx7<ps.Mnemonic>; } multiclass SOPC_Real_gfx8_gfx9<bits<7> op> { - def _vi : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, - Select_vi<!cast<SOPC_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPC_Pseudo>(NAME); + def _vi : SOPC_Real<op, ps>, + Select_vi<ps.Mnemonic>; } multiclass SOPC_Real_gfx10<bits<7> op> { - def _gfx10 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, - Select_gfx10<!cast<SOPC_Pseudo>(NAME).Mnemonic>; + defvar ps = !cast<SOPC_Pseudo>(NAME); + def _gfx10 : SOPC_Real<op, ps>, + Select_gfx10<ps.Mnemonic>; } multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> : diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 
c8a85d76a55b..0bee9022975e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -15,19 +15,19 @@ namespace AMDGPU { namespace SendMsg { // This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h. -const char* const IdSymbolic[] = { +const char *const IdSymbolic[ID_GAPS_LAST_] = { nullptr, "MSG_INTERRUPT", "MSG_GS", "MSG_GS_DONE", - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, + "MSG_SAVEWAVE", + "MSG_STALL_WAVE_GEN", + "MSG_HALT_WAVES", + "MSG_ORDERED_PS_DONE", + "MSG_EARLY_PRIM_DEALLOC", "MSG_GS_ALLOC_REQ", "MSG_GET_DOORBELL", - nullptr, + "MSG_GET_DDID", nullptr, nullptr, nullptr, @@ -35,7 +35,7 @@ const char* const IdSymbolic[] = { }; // These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h. -const char* const OpSysSymbolic[] = { +const char *const OpSysSymbolic[OP_SYS_LAST_] = { nullptr, "SYSMSG_OP_ECC_ERR_INTERRUPT", "SYSMSG_OP_REG_RD", @@ -43,7 +43,7 @@ const char* const OpSysSymbolic[] = { "SYSMSG_OP_TTRACE_PC" }; -const char* const OpGsSymbolic[] = { +const char *const OpGsSymbolic[OP_GS_LAST_] = { "GS_OP_NOP", "GS_OP_CUT", "GS_OP_EMIT", diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index 3eb27c5e5f42..d1deb570a938 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H +#include "SIDefines.h" + namespace llvm { class StringLiteral; @@ -17,9 +19,9 @@ namespace AMDGPU { namespace SendMsg { // Symbolic names for the sendmsg(...) syntax. -extern const char* const IdSymbolic[]; -extern const char* const OpSysSymbolic[]; -extern const char* const OpGsSymbolic[]; +extern const char *const IdSymbolic[ID_GAPS_LAST_]; +extern const char *const OpSysSymbolic[OP_SYS_LAST_]; +extern const char *const OpGsSymbolic[OP_GS_LAST_]; } // namespace SendMsg diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4c1e4dec7ecb..29bbf50cbfdc 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -30,7 +30,8 @@ static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion( "amdhsa-code-object-version", llvm::cl::Hidden, - llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3)); + llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4), + llvm::cl::ZeroOrMore); namespace { @@ -96,23 +97,36 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) { return ELF::ELFABIVERSION_AMDGPU_HSA_V2; case 3: return ELF::ELFABIVERSION_AMDGPU_HSA_V3; + case 4: + return ELF::ELFABIVERSION_AMDGPU_HSA_V4; default: - return ELF::ELFABIVERSION_AMDGPU_HSA_V3; + report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") + + Twine(AmdhsaCodeObjectVersion)); } } bool isHsaAbiVersion2(const MCSubtargetInfo *STI) { - if (const auto &&HsaAbiVer = getHsaAbiVersion(STI)) - return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2; + if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) + return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2; return false; } bool isHsaAbiVersion3(const MCSubtargetInfo *STI) { - if (const auto &&HsaAbiVer = getHsaAbiVersion(STI)) - return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3; + if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) + return *HsaAbiVer == 
ELF::ELFABIVERSION_AMDGPU_HSA_V3; + return false; +} + +bool isHsaAbiVersion4(const MCSubtargetInfo *STI) { + if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) + return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4; return false; } +bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) { + return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI); +} + #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL @@ -141,6 +155,34 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { return NewInfo ? NewInfo->Opcode : -1; } +unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, + const MIMGDimInfo *Dim, bool IsA16, + bool IsG16Supported) { + unsigned AddrWords = BaseOpcode->NumExtraArgs; + unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + if (IsA16) + AddrWords += divideCeil(AddrComponents, 2); + else + AddrWords += AddrComponents; + + // Note: For subtargets that support A16 but not G16, enabling A16 also + // enables 16 bit gradients. + // For subtargets that support A16 (operand) and G16 (done with a different + // instruction encoding), they are independent. + + if (BaseOpcode->Gradients) { + if ((IsA16 && !IsG16Supported) || BaseOpcode->G16) + // There are two gradients per coordinate, we pack them separately. + // For the 3d case, + // we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) + AddrWords += alignTo<2>(Dim->NumGradients / 2); + else + AddrWords += Dim->NumGradients; + } + return AddrWords; +} + struct MUBUFInfo { uint16_t Opcode; uint16_t BaseOpcode; @@ -148,6 +190,7 @@ struct MUBUFInfo { bool has_vaddr; bool has_srsrc; bool has_soffset; + bool IsBufferInv; }; struct MTBUFInfo { @@ -164,12 +207,23 @@ struct SMInfo { bool IsBuffer; }; +struct VOPInfo { + uint16_t Opcode; + bool IsSingle; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL #define GET_SMInfoTable_DECL #define GET_SMInfoTable_IMPL +#define GET_VOP1InfoTable_DECL +#define GET_VOP1InfoTable_IMPL +#define GET_VOP2InfoTable_DECL +#define GET_VOP2InfoTable_IMPL +#define GET_VOP3InfoTable_DECL +#define GET_VOP3InfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -232,11 +286,31 @@ bool getMUBUFHasSoffset(unsigned Opc) { return Info ? Info->has_soffset : false; } +bool getMUBUFIsBufferInv(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->IsBufferInv : false; +} + bool getSMEMIsBuffer(unsigned Opc) { const SMInfo *Info = getSMEMOpcodeHelper(Opc); return Info ? Info->IsBuffer : false; } +bool getVOP1IsSingle(unsigned Opc) { + const VOPInfo *Info = getVOP1OpcodeHelper(Opc); + return Info ? Info->IsSingle : false; +} + +bool getVOP2IsSingle(unsigned Opc) { + const VOPInfo *Info = getVOP2OpcodeHelper(Opc); + return Info ? Info->IsSingle : false; +} + +bool getVOP3IsSingle(unsigned Opc) { + const VOPInfo *Info = getVOP3OpcodeHelper(Opc); + return Info ? Info->IsSingle : false; +} + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. 
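The address-word computation introduced in getAddrSizeMIMGOp above is easy to misread because of the mixed A16/G16 packing rules. Below is a compact standalone C++ restatement using invented struct and function names (the real code reads the TableGen-generated MIMG tables); it is a sketch for sanity-checking the arithmetic, not a drop-in replacement.

  struct BaseOpcodeSketch {
    unsigned NumExtraArgs;
    bool Coordinates, LodOrClampOrMip, Gradients, G16;
  };
  struct DimSketch {
    unsigned NumCoords, NumGradients;
  };

  static unsigned roundUpToEven(unsigned X) { return (X + 1) & ~1u; }

  unsigned addrWordsSketch(const BaseOpcodeSketch &Base, const DimSketch &Dim,
                           bool IsA16, bool IsG16Supported) {
    unsigned Words = Base.NumExtraArgs;
    unsigned Comps = (Base.Coordinates ? Dim.NumCoords : 0) +
                     (Base.LodOrClampOrMip ? 1 : 0);
    // A16 packs two 16-bit address components per 32-bit word (divideCeil).
    Words += IsA16 ? (Comps + 1) / 2 : Comps;
    if (Base.Gradients) {
      // 16-bit gradients (A16 without G16 support, or explicit G16) are
      // packed in pairs and padded to an even word count (alignTo<2>).
      if ((IsA16 && !IsG16Supported) || Base.G16)
        Words += roundUpToEven(Dim.NumGradients / 2);
      else
        Words += Dim.NumGradients;
    }
    return Words;
  }

For instance, a 2D gradient sample (NumCoords = 2, NumGradients = 4, no extra args, no LOD/clamp) with A16 on a target without G16 support would yield one word of packed coordinates plus two words of packed gradients, three address words in total.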
@@ -247,7 +321,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) - : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) { + : STI(STI), XnackSetting(TargetIDSetting::Any), + SramEccSetting(TargetIDSetting::Any) { if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) XnackSetting = TargetIDSetting::Unsupported; if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) @@ -334,25 +409,109 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { } } -void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { - auto TargetTriple = STI->getTargetTriple(); - auto Version = getIsaVersion(STI->getCPU()); +std::string AMDGPUTargetID::toString() const { + std::string StringRep = ""; + raw_string_ostream StreamRep(StringRep); - Stream << TargetTriple.getArchName() << '-' - << TargetTriple.getVendorName() << '-' - << TargetTriple.getOSName() << '-' - << TargetTriple.getEnvironmentName() << '-' - << "gfx" - << Version.Major - << Version.Minor - << Version.Stepping; + auto TargetTriple = STI.getTargetTriple(); + auto Version = getIsaVersion(STI.getCPU()); - if (hasXNACK(*STI)) - Stream << "+xnack"; - if (hasSRAMECC(*STI)) - Stream << "+sramecc"; + StreamRep << TargetTriple.getArchName() << '-' + << TargetTriple.getVendorName() << '-' + << TargetTriple.getOSName() << '-' + << TargetTriple.getEnvironmentName() << '-'; - Stream.flush(); + std::string Processor = ""; + // TODO: Following else statement is present here because we used various + // alias names for GPUs up until GFX9 (e.g. 'fiji' is same as 'gfx803'). + // Remove once all aliases are removed from GCNProcessors.td. + if (Version.Major >= 9) + Processor = STI.getCPU().str(); + else + Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) + + Twine(Version.Stepping)) + .str(); + + std::string Features = ""; + if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) { + switch (*HsaAbiVersion) { + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + // Code object V2 only supported specific processors and had fixed + // settings for the XNACK. 
+ if (Processor == "gfx600") { + } else if (Processor == "gfx601") { + } else if (Processor == "gfx602") { + } else if (Processor == "gfx700") { + } else if (Processor == "gfx701") { + } else if (Processor == "gfx702") { + } else if (Processor == "gfx703") { + } else if (Processor == "gfx704") { + } else if (Processor == "gfx705") { + } else if (Processor == "gfx801") { + if (!isXnackOnOrAny()) + report_fatal_error( + "AMD GPU code object V2 does not support processor " + Processor + + " without XNACK"); + } else if (Processor == "gfx802") { + } else if (Processor == "gfx803") { + } else if (Processor == "gfx805") { + } else if (Processor == "gfx810") { + if (!isXnackOnOrAny()) + report_fatal_error( + "AMD GPU code object V2 does not support processor " + Processor + + " without XNACK"); + } else if (Processor == "gfx900") { + if (isXnackOnOrAny()) + Processor = "gfx901"; + } else if (Processor == "gfx902") { + if (isXnackOnOrAny()) + Processor = "gfx903"; + } else if (Processor == "gfx904") { + if (isXnackOnOrAny()) + Processor = "gfx905"; + } else if (Processor == "gfx906") { + if (isXnackOnOrAny()) + Processor = "gfx907"; + } else if (Processor == "gfx90c") { + if (isXnackOnOrAny()) + report_fatal_error( + "AMD GPU code object V2 does not support processor " + Processor + + " with XNACK being ON or ANY"); + } else { + report_fatal_error( + "AMD GPU code object V2 does not support processor " + Processor); + } + break; + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + // xnack. + if (isXnackOnOrAny()) + Features += "+xnack"; + // In code object v2 and v3, "sramecc" feature was spelled with a + // hyphen ("sram-ecc"). + if (isSramEccOnOrAny()) + Features += "+sram-ecc"; + break; + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + // sramecc. + if (getSramEccSetting() == TargetIDSetting::Off) + Features += ":sramecc-"; + else if (getSramEccSetting() == TargetIDSetting::On) + Features += ":sramecc+"; + // xnack. + if (getXnackSetting() == TargetIDSetting::Off) + Features += ":xnack-"; + else if (getXnackSetting() == TargetIDSetting::On) + Features += ":xnack+"; + break; + default: + break; + } + } + + StreamRep << Processor << Features; + + StreamRep.flush(); + return StringRep; } unsigned getWavefrontSize(const MCSubtargetInfo *STI) { @@ -402,6 +561,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) { unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) { // FIXME: Need to take scratch memory into account. + if (isGFX90A(*STI)) + return 8; if (!isGFX10Plus(*STI)) return 10; return hasGFX10_3Insts(*STI) ? 16 : 20; @@ -531,6 +692,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) { unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, Optional<bool> EnableWavefrontSize32) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 8; + bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); @@ -543,6 +707,8 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, Optional<bool> EnableWavefrontSize32) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 8; bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : @@ -552,12 +718,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 512; if (!isGFX10Plus(*STI)) return 256; return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 
1024 : 512; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 512; return 256; } @@ -653,6 +823,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); } + if (AMDGPU::isGFX90A(*STI)) { + AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, + STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0); + } return KD; } @@ -1049,23 +1224,32 @@ int64_t getMsgId(const StringRef Name) { return ID_UNKNOWN_; } -static bool isValidMsgId(int64_t MsgId) { - return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId]; -} - bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) { if (Strict) { - if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL) + switch (MsgId) { + case ID_SAVEWAVE: + return isVI(STI) || isGFX9Plus(STI); + case ID_STALL_WAVE_GEN: + case ID_HALT_WAVES: + case ID_ORDERED_PS_DONE: + case ID_GS_ALLOC_REQ: + case ID_GET_DOORBELL: return isGFX9Plus(STI); - else - return isValidMsgId(MsgId); + case ID_EARLY_PRIM_DEALLOC: + return isGFX9(STI); + case ID_GET_DDID: + return isGFX10Plus(STI); + default: + return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId]; + } } else { return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId); } } StringRef getMsgName(int64_t MsgId) { - return isValidMsgId(MsgId)? IdSymbolic[MsgId] : ""; + assert(0 <= MsgId && MsgId < ID_GAPS_LAST_); + return IdSymbolic[MsgId]; } int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { @@ -1080,7 +1264,9 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { return OP_UNKNOWN_; } -bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) { +bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI, + bool Strict) { + assert(isValidMsgId(MsgId, STI, Strict)); if (!Strict) return 0 <= OpId && isUInt<OP_WIDTH_>(OpId); @@ -1103,7 +1289,9 @@ StringRef getMsgOpName(int64_t MsgId, int64_t OpId) { return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId]; } -bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) { +bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, + const MCSubtargetInfo &STI, bool Strict) { + assert(isValidMsgOp(MsgId, OpId, STI, Strict)); if (!Strict) return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId); @@ -1156,6 +1344,17 @@ unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } +bool getHasColorExport(const Function &F) { + // As a safe default always respond as if PS has color exports. + return getIntegerAttribute( + F, "amdgpu-color-export", + F.getCallingConv() == CallingConv::AMDGPU_PS ? 
1 : 0) != 0; +} + +bool getHasDepthExport(const Function &F) { + return getIntegerAttribute(F, "amdgpu-depth-export", 0) != 0; +} + bool isShader(CallingConv::ID cc) { switch(cc) { case CallingConv::AMDGPU_VS: @@ -1259,6 +1458,10 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } +bool isGFX10_AEncoding(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding]; +} + bool isGFX10_BEncoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]; } @@ -1267,6 +1470,14 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts]; } +bool isGFX90A(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; +} + +bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); @@ -1374,6 +1585,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return true; default: return false; @@ -1413,41 +1627,67 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_64RegClassID: case AMDGPU::AReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: + case AMDGPU::VReg_64_Align2RegClassID: + case AMDGPU::AReg_64_Align2RegClassID: return 64; case AMDGPU::SGPR_96RegClassID: case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: case AMDGPU::AReg_96RegClassID: + case AMDGPU::VReg_96_Align2RegClassID: + case AMDGPU::AReg_96_Align2RegClassID: + case AMDGPU::AV_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: case AMDGPU::VReg_128RegClassID: case AMDGPU::AReg_128RegClassID: + case AMDGPU::VReg_128_Align2RegClassID: + case AMDGPU::AReg_128_Align2RegClassID: + case AMDGPU::AV_128RegClassID: return 128; case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: case AMDGPU::VReg_160RegClassID: case AMDGPU::AReg_160RegClassID: + case AMDGPU::VReg_160_Align2RegClassID: + case AMDGPU::AReg_160_Align2RegClassID: + case AMDGPU::AV_160RegClassID: return 160; case AMDGPU::SGPR_192RegClassID: case AMDGPU::SReg_192RegClassID: case AMDGPU::VReg_192RegClassID: case AMDGPU::AReg_192RegClassID: + case AMDGPU::VReg_192_Align2RegClassID: + case AMDGPU::AReg_192_Align2RegClassID: return 192; + case AMDGPU::SGPR_224RegClassID: + case AMDGPU::SReg_224RegClassID: + case AMDGPU::VReg_224RegClassID: + case AMDGPU::AReg_224RegClassID: + case AMDGPU::VReg_224_Align2RegClassID: + case AMDGPU::AReg_224_Align2RegClassID: + return 224; case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: case AMDGPU::VReg_256RegClassID: case AMDGPU::AReg_256RegClassID: + case AMDGPU::VReg_256_Align2RegClassID: + case AMDGPU::AReg_256_Align2RegClassID: return 256; case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: case AMDGPU::VReg_512RegClassID: case AMDGPU::AReg_512RegClassID: + case AMDGPU::VReg_512_Align2RegClassID: + case AMDGPU::AReg_512_Align2RegClassID: return 512; case AMDGPU::SGPR_1024RegClassID: case 
AMDGPU::SReg_1024RegClassID: case AMDGPU::VReg_1024RegClassID: case AMDGPU::AReg_1024RegClassID: + case AMDGPU::VReg_1024_Align2RegClassID: + case AMDGPU::AReg_1024_Align2RegClassID: return 1024; default: llvm_unreachable("Unexpected register class"); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f9378693cf48..72c872dec5ba 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -44,6 +44,12 @@ bool isHsaAbiVersion2(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 3, /// false otherwise. bool isHsaAbiVersion3(const MCSubtargetInfo *STI); +/// \returns True if HSA OS ABI Version identification is 4, +/// false otherwise. +bool isHsaAbiVersion4(const MCSubtargetInfo *STI); +/// \returns True if HSA OS ABI Version identification is 3 or 4, +/// false otherwise. +bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI); struct GcnBufferFormatInfo { unsigned Format; @@ -78,6 +84,7 @@ enum class TargetIDSetting { class AMDGPUTargetID { private: + const MCSubtargetInfo &STI; TargetIDSetting XnackSetting; TargetIDSetting SramEccSetting; @@ -145,10 +152,10 @@ public: void setTargetIDFromFeaturesString(StringRef FS); void setTargetIDFromTargetIDStream(StringRef TargetID); -}; -/// Streams isa version string for given subtarget \p STI into \p Stream. -void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); + /// \returns String representation of an object. + std::string toString() const; +}; /// \returns Wavefront size for given subtarget \p STI. unsigned getWavefrontSize(const MCSubtargetInfo *STI); @@ -284,6 +291,7 @@ struct MIMGBaseOpcodeInfo { bool Coordinates; bool LodOrClampOrMip; bool HasD16; + bool MSAA; }; LLVM_READONLY @@ -293,6 +301,7 @@ struct MIMGDimInfo { MIMGDim Dim; uint8_t NumCoords; uint8_t NumGradients; + bool MSAA; bool DA; uint8_t Encoding; const char *AsmSuffix; @@ -338,6 +347,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, LLVM_READONLY int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels); +LLVM_READONLY +unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, + const MIMGDimInfo *Dim, bool IsA16, + bool IsG16Supported); + struct MIMGInfo { uint16_t Opcode; uint16_t BaseOpcode; @@ -386,9 +400,21 @@ LLVM_READONLY bool getMUBUFHasSoffset(unsigned Opc); LLVM_READONLY +bool getMUBUFIsBufferInv(unsigned Opc); + +LLVM_READONLY bool getSMEMIsBuffer(unsigned Opc); LLVM_READONLY +bool getVOP1IsSingle(unsigned Opc); + +LLVM_READONLY +bool getVOP2IsSingle(unsigned Opc); + +LLVM_READONLY +bool getVOP3IsSingle(unsigned Opc); + +LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, @@ -459,6 +485,14 @@ struct Waitcnt { return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u; } + bool hasWaitExceptVsCnt() const { + return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u; + } + + bool hasWaitVsCnt() const { + return VsCnt != ~0u; + } + bool dominates(const Waitcnt &Other) const { return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt; @@ -627,10 +661,12 @@ LLVM_READNONE bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true); LLVM_READNONE -bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true); +bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI, + bool Strict = true); LLVM_READNONE -bool 
isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true); +bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, + const MCSubtargetInfo &STI, bool Strict = true); LLVM_READNONE bool msgRequiresOp(int64_t MsgId); @@ -653,6 +689,10 @@ uint64_t encodeMsg(uint64_t MsgId, unsigned getInitialPSInputAddr(const Function &F); +bool getHasColorExport(const Function &F); + +bool getHasDepthExport(const Function &F); + LLVM_READNONE bool isShader(CallingConv::ID CC); @@ -701,8 +741,11 @@ bool isGFX9Plus(const MCSubtargetInfo &STI); bool isGFX10(const MCSubtargetInfo &STI); bool isGFX10Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); +bool isGFX10_AEncoding(const MCSubtargetInfo &STI); bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); +bool isGFX90A(const MCSubtargetInfo &STI); +bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); @@ -746,12 +789,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: return 4; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return 8; case AMDGPU::OPERAND_REG_IMM_INT16: @@ -847,6 +895,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, const GCNSubtarget *Subtarget, Align Alignment = Align(4)); +LLVM_READNONE +inline bool isLegal64BitDPPControl(unsigned DC) { + return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; +} + /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp new file mode 100644 index 000000000000..da8fcf3900bb --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -0,0 +1,355 @@ +//===- AMDGPULDSUtils.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AMDGPU LDS related helper utility functions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPULDSUtils.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/ReplaceConstant.h" + +using namespace llvm; + +namespace llvm { + +namespace AMDGPU { + +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; + + // Collect all address taken functions within the module. 
+ void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { + auto *CGN = GI->second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { + SmallPtrSet<Function *, 8> ReachableCallees; + + // Call graph node which represents this kernel. + auto *KCGN = CG[K]; + + // Go through all call graph nodes reachable from the node representing this + // kernel, visit all their call sites, if the call site is direct, add + // corresponding callee to reachable callee set, if it is indirect, resolve + // the indirect call site to potential reachable callees, add them to + // reachable callee set, and repeat the process for the newly added + // potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. + // + SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN)); + SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { + auto *RCB = cast<CallBase>(GI->first.getValue()); + auto *RCGN = GI->second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + CGNStack.append(df_begin(ACGN), df_end(ACGN)); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +void collectReachableCallees( + Module &M, + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet<Function *, 8> LDSAccessors; + SmallVector<User *, 8> UserStack(GV->users()); + SmallPtrSet<User *, 8> VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa<GlobalValue>(U)) + return SmallPtrSet<Function *, 8>(); + + // Recursively explore constant users. + if (isa<Constant>(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. 
+ Function *F = cast<Instruction>(U)->getFunction(); + if (!AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; + SmallVector<User *, 8> UserStack; + SmallPtrSet<User *, 8> VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa<GlobalValue>(UU)) + continue; + + if (isa<Constant>(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast<Instruction>(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + +bool isKernelCC(const Function *Func) { + return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); +} + +Align getAlign(DataLayout const &DL, const GlobalVariable *GV) { + return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL), + GV->getValueType()); +} + +static void collectFunctionUses(User *U, const Function *F, + SetVector<Instruction *> &InstUsers) { + SmallVector<User *> Stack{U}; + + while (!Stack.empty()) { + U = Stack.pop_back_val(); + + if (auto *I = dyn_cast<Instruction>(U)) { + if (I->getFunction() == F) + InstUsers.insert(I); + continue; + } + + if (!isa<ConstantExpr>(U)) + continue; + + append_range(Stack, U->users()); + } +} + +void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) { + SetVector<Instruction *> InstUsers; + + collectFunctionUses(C, F, InstUsers); + for (Instruction *I : InstUsers) { + convertConstantExprsToInstructions(I, C); + } +} + +bool hasUserInstruction(const GlobalValue *GV) { + SmallPtrSet<const User *, 8> Visited; + SmallVector<const User *, 16> Stack(GV->users()); + + while (!Stack.empty()) { + const User *U = Stack.pop_back_val(); + + if (!Visited.insert(U).second) + continue; + + if (isa<Instruction>(U)) + return true; + + append_range(Stack, U->users()); + } + + return false; +} + +bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { + // We are not interested in kernel LDS lowering for module LDS itself. + if (F && GV.getName() == "llvm.amdgcn.module.lds") + return false; + + bool Ret = false; + SmallPtrSet<const User *, 8> Visited; + SmallVector<const User *, 16> Stack(GV.users()); + SmallPtrSet<const GlobalValue *, 8> GlobalUsers; + + assert(!F || isKernelCC(F)); + + while (!Stack.empty()) { + const User *V = Stack.pop_back_val(); + Visited.insert(V); + + if (auto *G = dyn_cast<GlobalValue>(V)) { + StringRef GName = G->getName(); + if (F && GName != "llvm.used" && GName != "llvm.compiler.used") { + // For kernel LDS lowering, if G is not a compiler.used list, then we + // cannot lower the lds GV since we cannot replace the use of GV within + // G. + return false; + } + GlobalUsers.insert(G); + continue; + } + + if (auto *I = dyn_cast<Instruction>(V)) { + const Function *UF = I->getFunction(); + if (UF == F) { + // Used from this kernel, we want to put it into the structure. + Ret = true; + } else if (!F) { + // For module LDS lowering, lowering is required if the user instruction + // is from non-kernel function. 
+ Ret |= !isKernelCC(UF); + } + continue; + } + + // User V should be a constant, recursively visit users of V. + assert(isa<Constant>(V) && "Expected a constant."); + append_range(Stack, V->users()); + } + + if (!F && !Ret) { + // For module LDS lowering, we have not yet decided if we should lower GV or + // not. Explore all global users of GV, and check if atleast one of these + // global users appear as an use within an instruction (possibly nested use + // via constant expression), if so, then conservately lower LDS. + for (auto *G : GlobalUsers) + Ret |= hasUserInstruction(G); + } + + return Ret; +} + +std::vector<GlobalVariable *> findVariablesToLower(Module &M, + const Function *F) { + std::vector<llvm::GlobalVariable *> LocalVars; + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + continue; + } + if (!GV.hasInitializer()) { + // addrspace(3) without initializer implies cuda/hip extern __shared__ + // the semantics for such a variable appears to be that all extern + // __shared__ variables alias one another, in which case this transform + // is not required + continue; + } + if (!isa<UndefValue>(GV.getInitializer())) { + // Initializers are unimplemented for local address space. + // Leave such variables in place for consistent error reporting. + continue; + } + if (GV.isConstant()) { + // A constant undef variable can't be written to, and any load is + // undef, so it should be eliminated by the optimizer. It could be + // dropped by the back end if not. This pass skips over it. + continue; + } + if (!shouldLowerLDSToStruct(GV, F)) { + continue; + } + LocalVars.push_back(&GV); + } + return LocalVars; +} + +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { + SmallPtrSet<GlobalValue *, 32> UsedList; + + SmallVector<GlobalValue *, 32> TmpVec; + collectUsedGlobalVariables(M, TmpVec, true); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + TmpVec.clear(); + collectUsedGlobalVariables(M, TmpVec, false); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + return UsedList; +} + +} // end namespace AMDGPU + +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h new file mode 100644 index 000000000000..ffcafb9b76ce --- /dev/null +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -0,0 +1,70 @@ +//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AMDGPU LDS related helper utility functions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H + +#include "AMDGPU.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Constants.h" + +namespace llvm { + +class ConstantExpr; + +namespace AMDGPU { + +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. 
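/// A minimal usage sketch (illustrative; assumes a Module \p M that defines
/// one or more kernels):
/// \code
///   DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
///   AMDGPU::collectReachableCallees(M, KernelToCallees);
///   // KernelToCallees[K] now holds the functions reachable from kernel K.
/// \endcode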
+void collectReachableCallees( + Module &M, + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees); + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. +DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + +bool isKernelCC(const Function *Func); + +Align getAlign(DataLayout const &DL, const GlobalVariable *GV); + +/// \returns true if a given global variable \p GV (or its global users) appear +/// as an use within some instruction (either from kernel or from non-kernel). +bool hasUserInstruction(const GlobalValue *GV); + +/// \returns true if an LDS global requres lowering to a module LDS structure +/// if \p F is not given. If \p F is given it must be a kernel and function +/// \returns true if an LDS global is directly used from that kernel and it +/// is safe to replace its uses with a kernel LDS structure member. +bool shouldLowerLDSToStruct(const GlobalVariable &GV, + const Function *F = nullptr); + +std::vector<GlobalVariable *> findVariablesToLower(Module &M, + const Function *F = nullptr); + +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M); + +/// Replace all uses of constant \p C with instructions in \p F. +void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); +} // end namespace AMDGPU + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index b7dd757a8af3..f6b5975f1934 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -41,7 +41,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) { } return; } - BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA; + BlobType = ELF::NT_AMD_PAL_METADATA; NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); if (!NamedMD || !NamedMD->getNumOperands()) { // Emit msgpack metadata by default @@ -69,7 +69,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) { // Metadata. bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) { BlobType = Type; - if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA) + if (Type == ELF::NT_AMD_PAL_METADATA) return setFromLegacyBlob(Blob); return setFromMsgPackBlob(Blob); } @@ -243,6 +243,27 @@ void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF, Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val); } +// Set the amount of LDS used in bytes in the metadata. +void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF, + unsigned Val) { + auto Node = getShaderFunction(MF.getFunction().getName()); + Node[".lds_size"] = MsgPackDoc.getNode(Val); +} + +// Set the number of used vgprs in the metadata. +void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF, + unsigned Val) { + auto Node = getShaderFunction(MF.getFunction().getName()); + Node[".vgpr_count"] = MsgPackDoc.getNode(Val); +} + +// Set the number of used vgprs in the metadata. 
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF, + unsigned Val) { + auto Node = getShaderFunction(MF.getFunction().getName()); + Node[".sgpr_count"] = MsgPackDoc.getNode(Val); +} + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. void AMDGPUPALMetadata::setWave32(unsigned CC) { @@ -592,6 +613,41 @@ static const char *getRegisterName(unsigned RegNum) { {0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"}, {0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"}, + {0x2e28, "COMPUTE_PGM_RSRC3"}, + {0x2e2a, "COMPUTE_SHADER_CHKSUM"}, + {0x2e24, "COMPUTE_USER_ACCUM_0"}, + {0x2e25, "COMPUTE_USER_ACCUM_1"}, + {0x2e26, "COMPUTE_USER_ACCUM_2"}, + {0x2e27, "COMPUTE_USER_ACCUM_3"}, + {0xa1ff, "GE_MAX_OUTPUT_PER_SUBGROUP"}, + {0xa2d3, "GE_NGG_SUBGRP_CNTL"}, + {0xc25f, "GE_STEREO_CNTL"}, + {0xc262, "GE_USER_VGPR_EN"}, + {0xc258, "IA_MULTI_VGT_PARAM_PIPED"}, + {0xa210, "PA_STEREO_CNTL"}, + {0xa1c2, "SPI_SHADER_IDX_FORMAT"}, + {0x2c80, "SPI_SHADER_PGM_CHKSUM_GS"}, + {0x2d00, "SPI_SHADER_PGM_CHKSUM_HS"}, + {0x2c06, "SPI_SHADER_PGM_CHKSUM_PS"}, + {0x2c45, "SPI_SHADER_PGM_CHKSUM_VS"}, + {0x2c88, "SPI_SHADER_PGM_LO_GS"}, + {0x2cb2, "SPI_SHADER_USER_ACCUM_ESGS_0"}, + {0x2cb3, "SPI_SHADER_USER_ACCUM_ESGS_1"}, + {0x2cb4, "SPI_SHADER_USER_ACCUM_ESGS_2"}, + {0x2cb5, "SPI_SHADER_USER_ACCUM_ESGS_3"}, + {0x2d32, "SPI_SHADER_USER_ACCUM_LSHS_0"}, + {0x2d33, "SPI_SHADER_USER_ACCUM_LSHS_1"}, + {0x2d34, "SPI_SHADER_USER_ACCUM_LSHS_2"}, + {0x2d35, "SPI_SHADER_USER_ACCUM_LSHS_3"}, + {0x2c32, "SPI_SHADER_USER_ACCUM_PS_0"}, + {0x2c33, "SPI_SHADER_USER_ACCUM_PS_1"}, + {0x2c34, "SPI_SHADER_USER_ACCUM_PS_2"}, + {0x2c35, "SPI_SHADER_USER_ACCUM_PS_3"}, + {0x2c72, "SPI_SHADER_USER_ACCUM_VS_0"}, + {0x2c73, "SPI_SHADER_USER_ACCUM_VS_1"}, + {0x2c74, "SPI_SHADER_USER_ACCUM_VS_2"}, + {0x2c75, "SPI_SHADER_USER_ACCUM_VS_3"}, + {0, nullptr}}; auto Entry = RegInfoTable; for (; Entry->Num && Entry->Num != RegNum; ++Entry) @@ -653,7 +709,7 @@ void AMDGPUPALMetadata::toString(std::string &String) { // a .note record of the specified AMD type. Returns an empty blob if // there is no PAL metadata, void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) { - if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA) + if (Type == ELF::NT_AMD_PAL_METADATA) toLegacyBlob(Blob); else if (Type) toMsgPackBlob(Blob); @@ -790,7 +846,7 @@ const char *AMDGPUPALMetadata::getVendor() const { } // Get .note record type of metadata blob to be emitted: -// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or +// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or // ELF::NT_AMDGPU_METADATA (MsgPack format), or // 0 (no PAL metadata). unsigned AMDGPUPALMetadata::getType() const { @@ -799,12 +855,12 @@ unsigned AMDGPUPALMetadata::getType() const { // Return whether the blob type is legacy PAL metadata. bool AMDGPUPALMetadata::isLegacy() const { - return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA; + return BlobType == ELF::NT_AMD_PAL_METADATA; } // Set legacy PAL metadata format. void AMDGPUPALMetadata::setLegacy() { - BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA; + BlobType = ELF::NT_AMD_PAL_METADATA; } // Erase all PAL metadata. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 8fa1f738487c..7fdd9a8429c1 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -80,6 +80,21 @@ public: // Set the stack frame size of a function in the metadata. 
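  // The setters below record per-function resource figures on the named
  // shader-function node of the MsgPack document. A minimal usage sketch,
  // assuming an AMDGPUPALMetadata instance PALMD and counts computed
  // elsewhere (all names illustrative):
  //   PALMD.setFunctionScratchSize(MF, StackFrameSize);
  //   PALMD.setFunctionLdsSize(MF, LdsSizeInBytes);
  //   PALMD.setFunctionNumUsedVgprs(MF, NumVgprs);
  //   PALMD.setFunctionNumUsedSgprs(MF, NumSgprs);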
void setFunctionScratchSize(const MachineFunction &MF, unsigned Val); + // Set the amount of LDS used in bytes in the metadata. This is an optional + // advisory record for logging etc; wave dispatch actually uses the rsrc1 + // register for the shader stage to determine the amount of LDS to allocate. + void setFunctionLdsSize(const MachineFunction &MF, unsigned Val); + + // Set the number of used vgprs in the metadata. This is an optional advisory + // record for logging etc; wave dispatch actually uses the rsrc1 register for + // the shader stage to determine the number of vgprs to allocate. + void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val); + + // Set the number of used sgprs in the metadata. This is an optional advisory + // record for logging etc; wave dispatch actually uses the rsrc1 register for + // the shader stage to determine the number of sgprs to allocate. + void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val); + // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. void setWave32(unsigned CC); @@ -95,7 +110,7 @@ public: const char *getVendor() const; // Get .note record type of metadata blob to be emitted: - // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or + // ELF::NT_AMD_PAL_METADATA (legacy key=val format), or // ELF::NT_AMDGPU_METADATA (MsgPack format), or // 0 (no PAL metadata). unsigned getType() const; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index f1e470031982..35d5fe13ad30 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -60,9 +60,12 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On } class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : + VOP_Real <ps>, InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { + let VALU = 1; + let VOP1 = 1; let isPseudo = 0; let isCodeGenOnly = 0; @@ -79,6 +82,10 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let TRANS = ps.TRANS; } class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -144,6 +151,15 @@ def VOP1_F64_I32 : VOPProfileI2F <f64, i32>; def VOP1_F32_I32 : VOPProfileI2F <f32, i32>; def VOP1_F16_I16 : VOPProfileI2F <f16, i16>; +class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> : + VOPProfile<[dstVt, srcVt, untyped, untyped]> { + + let HasOMod = 1; +} +def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>; +def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>; +def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>; + //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -187,8 +203,10 @@ def V_READFIRSTLANE_B32 : let Inst{31-25} = 0x3f; //encoding } +let isReMaterializable = 1 in { let SchedRW = [WriteDoubleCvt] in { -defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; +// OMod clears exceptions when set in this instruction +defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>; let mayRaiseFPException = 0 in { defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", 
VOP1_F64_I32, sint_to_fp>; @@ -196,7 +214,8 @@ defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; -defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; +// OMod clears exceptions when set in this instruction +defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>; let mayRaiseFPException = 0 in { defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; @@ -213,11 +232,12 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; } -defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; -defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; -let FPDPRounding = 1 in { +// OMod clears exceptions when set in these 2 instructions +defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_uint>; +defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_sint>; +let FPDPRounding = 1, isReMaterializable = 0 in { defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; -} // End FPDPRounding = 1 +} // End FPDPRounding = 1, isReMaterializable = 0 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; @@ -268,7 +288,7 @@ defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; let SchedRW = [WriteDoubleAdd] in { -defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; let FPDPRounding = 1 in { defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; @@ -277,6 +297,7 @@ defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>; +} // End isReMaterializable = 1 let VOPAsmPrefer32Bit = 1 in { defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>; @@ -337,6 +358,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_MOVRELS>; defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>; } // End Uses = [M0, EXEC] +let isReMaterializable = 1 in { let SubtargetPredicate = isGFX6GFX7 in { let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_LOG_CLAMP_F32 : @@ -351,12 +373,12 @@ let SubtargetPredicate = isGFX6GFX7 in { VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>; } // End TRANS = 1, SchedRW = [WriteTrans32] - let SchedRW = [WriteDouble] in { + let SchedRW = [WriteTrans64] in { defm V_RCP_CLAMP_F64 : VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>; defm V_RSQ_CLAMP_F64 : VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; - } // End SchedRW = [WriteDouble] + } // End SchedRW = [WriteTrans64] } // End SubtargetPredicate = isGFX6GFX7 let SubtargetPredicate = isGFX7GFX8GFX9 in { @@ -374,6 +396,7 @@ let SubtargetPredicate = isGFX7Plus in { defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>; } // End SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = isGFX7Plus +} // End isReMaterializable = 1 
let SubtargetPredicate = Has16BitInsts in { @@ -381,8 +404,9 @@ let FPDPRounding = 1 in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; } // End FPDPRounding = 1 -defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; -defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +// OMod clears exceptions when set in these two instructions +defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_uint>; +defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_sint>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -393,7 +417,7 @@ defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; -defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16_SPECIAL_OMOD, int_amdgcn_frexp_exp>; defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; @@ -434,11 +458,12 @@ let SubtargetPredicate = isGFX9Plus in { let SchedRW = [Write64Bit, Write64Bit]; } + let isReMaterializable = 1 in defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; let mayRaiseFPException = 0 in { - defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; - defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; + defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16_SPECIAL_OMOD>; + defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16_SPECIAL_OMOD>; } // End mayRaiseFPException = 0 } // End SubtargetPredicate = isGFX9Plus @@ -461,6 +486,18 @@ let SubtargetPredicate = isGFX10Plus in { } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus +def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> { + let DstRC = RegisterOperand<AGPR_32>; + let Src0RC32 = RegisterOperand<AGPR_32>; + let Asm32 = " $vdst, $src0"; +} + +def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1> { + let SubtargetPredicate = isGFX90APlus; + let isReMaterializable = 1; + let isAsCheapAsAMove = 1; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. 
//===----------------------------------------------------------------------===// @@ -471,6 +508,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1 let Defs = ps.Defs; let SchedRW = ps.SchedRW; let Uses = ps.Uses; + let TRANS = ps.TRANS; bits<8> vdst; let Inst{8-0} = 0xfa; @@ -498,9 +536,6 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : let Inst{16-9} = op; let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); let Inst{31-25} = 0x3f; - - let AssemblerPredicate = HasDPP8; - let SubtargetPredicate = HasDPP8; } //===----------------------------------------------------------------------===// @@ -823,6 +858,8 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>; defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>; defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>; +defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>; + // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. vdst can't be treated as a def for codegen purposes, // and an implicit use and def of the super register should be added. diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 7a334eaadaed..7860b7e7f8a6 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -81,9 +81,12 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf } class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : + VOP_Real <ps>, InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { + let VALU = 1; + let VOP2 = 1; let isPseudo = 0; let isCodeGenOnly = 0; @@ -101,6 +104,9 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; } class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -237,7 +243,9 @@ multiclass VOP2eInst <string opName, } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, - Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + Commutable_REV<revOp#"_e64", !eq(revOp, opName)> { + let isReMaterializable = 1; + } } } @@ -267,10 +275,9 @@ class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm)); field bit HasExt = 0; + let IsSingle = 1; - // Hack to stop printing _e64 - let DstRC = RegisterOperand<VGPR_32>; - field string Asm32 = " $vdst, $src0, $src1, $imm"; + field string Asm32 = "$vdst, $src0, $src1, $imm"; } def VOP_MADAK_F16 : VOP_MADAK <f16>; @@ -280,37 +287,38 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); field bit HasExt = 0; + let IsSingle = 1; - // Hack to stop printing _e64 - let DstRC = RegisterOperand<VGPR_32>; - field string Asm32 = " $vdst, $src0, $imm, $src1"; + field string Asm32 = "$vdst, $src0, $imm, $src1"; } def VOP_MADMK_F16 : VOP_MADMK <f16>; def VOP_MADMK_F32 : VOP_MADMK <f32>; +class getRegisterOperandForVT<ValueType VT> { + RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>; +} + // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. 
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); - let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2); + let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - VGPR_32:$src2, // stub argument + getVregSrcForVT<Src2VT>.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); - let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - VGPR_32:$src2, // stub argument + getVregSrcForVT<Src2VT>.ret:$src2, // stub argument dpp8:$dpp8, FI:$fi); - let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - VGPR_32:$src2, // stub argument + getVregSrcForVT<Src2VT>.ret:$src2, // stub argument clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); @@ -335,6 +343,8 @@ def VOP_MAC_F16 : VOP_MAC <f16>; def VOP_MAC_F32 : VOP_MAC <f32>; let HasExtDPP = 0 in def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>; +let HasExtSDWA = 0, HasExt64BitDPP = 1 in +def VOP_MAC_F64 : VOP_MAC <f64>; class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> { let HasClamp = 0; @@ -448,6 +458,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let HasExt = 0; let HasExtDPP = 0; + let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; } @@ -464,6 +475,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { let HasExt = 0; let HasExtDPP = 0; + let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; } @@ -473,10 +485,11 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { //===----------------------------------------------------------------------===// defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -let SubtargetPredicate = HasMadMacF32Insts in +let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; let isCommutable = 1 in { +let isReMaterializable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>; defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>; defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">; @@ -498,6 +511,7 @@ defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_ls defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>; +} // End isReMaterializable = 1 let mayRaiseFPException = 0 in { let OtherPredicates = [HasMadMacF32Insts] in { @@ -510,6 +524,7 @@ defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_MAC_LEGACY_F32>; } // End Constraints = "$vdst = $src2", DisableEncoding="$src2", // isConvertibleToThreeAddress = 1 +let isReMaterializable = 1 in def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; } // End OtherPredicates = [HasMadMacF32Insts] } // End mayRaiseFPException = 0 @@ -524,7 +539,7 @@ defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", 
VOP2b_I32_I1_I32_I32_I1, null_frag, " defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>; -let SubtargetPredicate = HasAddNoCarryInsts in { +let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>; defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; @@ -543,12 +558,12 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, } // End $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 +let isReMaterializable = 1 in { defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>; defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; -defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>; @@ -572,7 +587,9 @@ defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>; defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>; } // End SubtargetPredicate = isGFX6GFX7 } // End isCommutable = 1 +} // End isReMaterializable = 1 +defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : GCNPat< @@ -672,7 +689,8 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; let SubtargetPredicate = HasDLInsts in { -defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>; +let isReMaterializable = 1 in +defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>; let Constraints = "$vdst = $src2", DisableEncoding = "$src2", @@ -692,6 +710,14 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>; } // End SubtargetPredicate = HasFmaLegacy32 +let SubtargetPredicate = isGFX90APlus, + Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1, + SchedRW = [WriteDoubleAdd] in +defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>; + let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1, @@ -735,17 +761,21 @@ let AddedComplexity = 30 in { } } // End AddedComplexity = 30 +let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in { +def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; + +let isCommutable = 1 in +def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; +} + let SubtargetPredicate = isGFX10Plus in { -def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; -let FPDPRounding = 1 in +let FPDPRounding = 1 in { def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; -let isCommutable = 1 in { -def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; -let FPDPRounding = 1 in +let isCommutable = 1 in def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, 
[], "">; -} // End isCommutable = 1 +} // End FPDPRounding = 1 let Constraints = "$vdst = $src2", DisableEncoding="$src2", @@ -913,8 +943,6 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, let Inst{30-25} = op; let Inst{31} = 0x0; - let AssemblerPredicate = HasDPP8; - let SubtargetPredicate = HasDPP8; let OtherPredicates = ps.OtherPredicates; } @@ -1122,14 +1150,18 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { multiclass VOP3Only_Real_gfx10<bits<10> op> { def _e64_gfx10 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, - VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + let IsSingle = 1; + } } //===---------------------------- VOP3beOnly ----------------------------===// multiclass VOP3beOnly_Real_gfx10<bits<10> op> { def _e64_gfx10 : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, - VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + let IsSingle = 1; + } } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" @@ -1177,7 +1209,10 @@ defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; defm V_MAX_F16 : VOP2_Real_gfx10<0x039>; defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; + +let IsSingle = 1 in { defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; +} // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : @@ -1251,20 +1286,20 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>, VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; } - multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string PseudoName = NAME> { + multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string opName = NAME> { def _e32_gfx6_gfx7 : - VOP2_Real<!cast<VOP2_Pseudo>(PseudoName#"_e32"), SIEncodingFamily.SI>, - VOP2e<op{5-0}, !cast<VOP2_Pseudo>(PseudoName#"_e32").Pfl>; + VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.SI>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl>; } - multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> { + multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string opName = NAME> { def _e64_gfx6_gfx7 : - VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>, - VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>; + VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.SI>, + VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(opName#"_e64").Pfl>; } - multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> { + multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string opName = NAME> { def _e64_gfx6_gfx7 : - VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>, - VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>; + VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.SI>, + VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(opName#"_e64").Pfl>; } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" @@ -1281,16 +1316,16 @@ multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> : VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>; multiclass VOP2be_Real_gfx6_gfx7_with_name<bits<6> op, - string PseudoName, string asmName> { - defvar ps32 = !cast<VOP2_Pseudo>(PseudoName#"_e32"); - defvar ps64 = !cast<VOP3_Pseudo>(PseudoName#"_e64"); + string opName, string asmName> { + defvar ps32 = !cast<VOP2_Pseudo>(opName#"_e32"); + 
defvar ps64 = !cast<VOP3_Pseudo>(opName#"_e64"); let AsmString = asmName # ps32.AsmOperands in { - defm "" : VOP2_Real_e32_gfx6_gfx7<op, PseudoName>; + defm "" : VOP2_Real_e32_gfx6_gfx7<op, opName>; } let AsmString = asmName # ps64.AsmOperands in { - defm "" : VOP2be_Real_e64_gfx6_gfx7<op, PseudoName>; + defm "" : VOP2be_Real_e64_gfx6_gfx7<op, opName>; } } @@ -1391,10 +1426,7 @@ multiclass VOP2_Real_e64only_vi <bits<10> op> { def _e64_vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { - // Hack to stop printing _e64 - VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64"); - let OutOperandList = (outs VGPR_32:$vdst); - let AsmString = ps.Mnemonic # " " # ps.AsmOperands; + let IsSingle = 1; } } @@ -1525,6 +1557,7 @@ defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>; defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>; defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>; +let AssemblerPredicate = isGCN3ExcludingGFX90A in defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>; defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>; defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>; @@ -1641,6 +1674,42 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; } // End SubtargetPredicate = HasDLInsts +let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in { + multiclass VOP2_Real_e32_gfx90a <bits<6> op> { + def _e32_gfx90a : + VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX90A>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; + } + + multiclass VOP2_Real_e64_gfx90a <bits<10> op> { + def _e64_gfx90a : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>, + VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + + multiclass Base_VOP2_Real_e32e64_gfx90a <bits<6> op> : + VOP2_Real_e32_gfx90a<op>, + VOP2_Real_e64_gfx90a<{0, 1, 0, 0, op{5-0}}>; + + multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> : + Base_VOP2_Real_e32e64_gfx90a<op> { + + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx90a : + VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>, + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { + let DecoderNamespace = "SDWA9"; + } + } +} // End AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" + +let SubtargetPredicate = isGFX90APlus in { + defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>; + let IsSingle = 1 in { + defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>; + } +} // End SubtargetPredicate = isGFX90APlus + multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> { def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42dc995609f0..ee3b87f487d0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -184,47 +184,24 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); - - // FIXME: Hack to stop printing _e64 - let Outs64 = (outs DstRC.RegClass:$vdst); - let Asm64 = - " " # !if(Features.HasOpSel, - getAsmVOP3OpSel<NumSrcArgs, - HasIntClamp, - P.HasOMod, - HasSrc0FloatMods, - HasSrc1FloatMods, - HasSrc2FloatMods>.ret, - !if(Features.HasClamp, - getAsm64<HasDst, NumSrcArgs, HasIntClamp, - HasModifiers, HasOMod, DstVT>.ret, - P.Asm64)); - let 
NeedPatGen = P.NeedPatGen; + let IsSingle = 1; } class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); - let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; -} - -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { - // FIXME: Hack to stop printing _e64 - let DstRC = RegisterOperand<VGPR_32>; + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; + let IsSingle = 1; } -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { - // FIXME: Hack to stop printing _e64 - let DstRC = RegisterOperand<VReg_64>; -} +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; - - // FIXME: Hack to stop printing _e64 - let DstRC = RegisterOperand<VReg_64>; + let IsSingle = 1; let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); - let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp"; + let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } //===----------------------------------------------------------------------===// @@ -287,7 +264,7 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { let HasOMod = !ne(DstVT.Value, f16.Value); let HasHigh = 1; - let Outs64 = (outs VGPR_32:$vdst); + let Outs64 = (outs DstRC.RegClass:$vdst); let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret; let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret; } @@ -298,6 +275,7 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { let isCommutable = 1 in { +let isReMaterializable = 1 in { let mayRaiseFPException = 0 in { let SubtargetPredicate = HasMadMacF32Insts in { defm V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; @@ -325,12 +303,13 @@ defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_l defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteQuarterRate32] in { +let SchedRW = [WriteIntMul] in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>; -} // End SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteIntMul] +} // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { // v_div_fmas_f32: @@ -351,6 +330,7 @@ defm V_DIV_FMAS_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f64", VOP_F64_F64_F6 } // End isCommutable = 1 +let isReMaterializable = 1 in { let mayRaiseFPException = 0 in { defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>; defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>; @@ -364,22 +344,27 @@ defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGP defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>; -let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does -defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>; -defm V_MIN3_I32 : 
VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>; -defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>; -defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>; -defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>; -defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>; -defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>; -defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>; -defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>; +// XXX - No FPException seems suspect but manual doesn't say it does +let mayRaiseFPException = 0 in { + let isCommutable = 1 in { + defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>; + defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>; + defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>; + defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>; + defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>; + defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>; + } // End isCommutable = 1 + defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>; + defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>; + defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>; } // End mayRaiseFPException = 0 -defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; +let isCommutable = 1 in { + defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; + defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; +} // End isCommutable = 1 defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>; @@ -388,6 +373,7 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>; defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>; } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 +} // End isReMaterializable = 1 let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. 
@@ -399,6 +385,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1>; } // End mayRaiseFPException = 0 +let isReMaterializable = 1 in defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; let Constraints = "@earlyclobber $vdst" in { @@ -406,6 +393,7 @@ defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64 } // End Constraints = "@earlyclobber $vdst" +let isReMaterializable = 1 in { let SchedRW = [WriteDouble] in { defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>; } // End SchedRW = [WriteDouble] @@ -423,12 +411,14 @@ let SchedRW = [Write64Bit] in { defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>; } // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write64Bit] +} // End isReMaterializable = 1 def : GCNPat< (i32 (getDivergentFrag<sext>.ret i16:$src)), (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))) >; +let isReMaterializable = 1 in { let SubtargetPredicate = isGFX6GFX7GFX10 in { defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; } // End SubtargetPredicate = isGFX6GFX7GFX10 @@ -438,6 +428,7 @@ let SubtargetPredicate = isGFX8Plus in { defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>; } // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write32Bit] +} // End isReMaterializable = 1 let SubtargetPredicate = isGFX7Plus in { @@ -447,10 +438,10 @@ defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32 } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] let isCommutable = 1 in { -let SchedRW = [WriteQuarterRate32, WriteSALU] in { +let SchedRW = [WriteIntMul, WriteSALU] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; -} // End SchedRW = [WriteQuarterRate32, WriteSALU] +} // End SchedRW = [WriteIntMul, WriteSALU] } // End isCommutable = 1 } // End SubtargetPredicate = isGFX7Plus @@ -476,6 +467,7 @@ let renamedInGFX9 = 1 in { let FPDPRounding = 1 in { defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; let Uses = [MODE, M0, EXEC] in { + let OtherPredicates = [isNotGFX90APlus] in // For some reason the intrinsic operands are in a different order // from the instruction operands. def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, @@ -497,24 +489,24 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in { let SubtargetPredicate = isGFX9Plus in { defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>; defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>; +let OtherPredicates = [isNotGFX90APlus] in def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9Plus -let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { +// This predicate should only apply to the selection pattern. The +// instruction still exists and should decode on subtargets with +// other bank counts. 
+let OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers), (i32 timm:$attrchan), (i32 timm:$attr), - (i1 timm:$high), M0))]> { - // This predicate should only apply to the selection pattern. The - // instruction still exists and should decode on subtargets with - // other bank counts. - let OtherPredicates = [has32BankLDS]; -} - + (i1 timm:$high), M0))]>; +} // End OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 +let OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; -} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1 +} // End OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -527,11 +519,11 @@ def : GCNPat< ), VGPR_32)), sub1) >; -let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in { +let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; -} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] +} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { @@ -618,16 +610,16 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< } let SubtargetPredicate = isGFX9Plus in { -defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; -defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; -defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; - -defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; - +let isCommutable = 1, isReMaterializable = 1 in { + defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>; + defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +} // End isCommutable = 1, isReMaterializable = 1 +// TODO src0 contains the opsel bit for dst, so if we commute, need to mask and swap this +// to the new src0. 
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>; defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>; defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>; @@ -649,8 +641,13 @@ defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32 defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; -defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>; +defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; + +let isReMaterializable = 1 in { defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>; +defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +} // End isReMaterializable = 1 class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < @@ -729,7 +726,9 @@ class PermlaneDiscardVDstIn<SDPatternOperator permlane, let SubtargetPredicate = isGFX10Plus in { - defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + let isCommutable = 1, isReMaterializable = 1 in { + defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { @@ -833,6 +832,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64"); let AsmString = asmName # ps.AsmOperands; + let IsSingle = 1; } } multiclass VOP3be_Real_gfx10<bits<10> op> { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 64e70b8f64b0..48f5eb1dc272 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -10,71 +10,82 @@ // VOP3P Classes //===----------------------------------------------------------------------===// -class VOP3PInst<string OpName, VOPProfile P, - SDPatternOperator node = null_frag, - bit HasExplicitClamp = 0> : - VOP3P_Pseudo<OpName, P, - !if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret) ->; +// Used for FMA_MIX* and MAD_MIX* insts +// Their operands are only sort of f16 operands. Depending on +// op_sel_hi, these may be interpreted as f32. The inline immediate +// values are really f16 converted to f32, so we treat these as f16 +// operands. +class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, + bit useTiedOutput = 0> : VOP3_Profile<P, Features> { + bit UseTiedOutput = useTiedOutput; + + dag srcs = + (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, + FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, + FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + + // FIXME: clampmod0 misbehaves with the non-default vdst_in + // following it. For now workaround this by requiring clamp + // in tied patterns. This should use undef_tied_input, but it + // seems underdeveloped and doesn't apply the right register + // class constraints. 
+ dag mods = !con(!if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in), + (ins clampmod0:$clamp)), + (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi)); + // We use Ins64 because that is the one which populates InOperandList + // due to the logic in class VOP3_Pseudo + let Ins64 = !con(srcs, mods); + let Asm64 = + "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; +} + +multiclass VOP3PInst<string OpName, VOPProfile P, + SDPatternOperator node = null_frag, bit HasExplicitClamp = 0> { + def NAME : VOP3P_Pseudo<OpName, P, + !if (P.HasModifiers, + getVOP3PModPat<P, node, HasExplicitClamp>.ret, + getVOP3Pat<P, node>.ret)>; +} + // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. -class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0, - SDPatternOperator node = null_frag> : - VOP3P_Pseudo<OpName, P> { - // These operands are only sort of f16 operands. Depending on - // op_sel_hi, these may be interpreted as f32. The inline immediate - // values are really f16 converted to f32, so we treat these as f16 - // operands. - let InOperandList = - !con( - !con( - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2), - // FIXME: clampmod0 misbehaves with the non-default vdst_in - // following it. For now workaround this by requiring clamp - // in tied patterns. This should use undef_tied_input, but it - // seems underdeveloped and doesn't apply the right register - // class constraints. - !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in), - (ins clampmod0:$clamp))), - (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi)); - - let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", ""); - let DisableEncoding = !if(UseTiedOutput, "$vdst_in", ""); - let AsmOperands = - " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; +multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P, + SDPatternOperator node = null_frag> { + def NAME : VOP3P_Pseudo<OpName, P> { + let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); + } } let isCommutable = 1 in { -def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; -def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; let FPDPRounding = 1 in { -def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; -def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>; -def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>; +defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; +defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>; +defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>; } // End FPDPRounding = 1 -def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; -def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; +defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", 
VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; +defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; -def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; -def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; -def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; +defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; +defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; -def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; -def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>; -def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>; -def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; +defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; +defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>; +defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>; +defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; } -def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; -def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; +defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; -def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>; -def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; -def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; +defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>; +defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; +defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; let SubtargetPredicate = HasVOP3PInsts in { @@ -169,14 +180,14 @@ let SubtargetPredicate = HasMadMixInsts in { // Size of src arguments (16/32) is controlled by op_sel. // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. let isCommutable = 1, mayRaiseFPException = 0 in { -def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; +defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. 
-def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; +defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>; let ClampLo = 0, ClampHi = 1 in { -def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; +defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>; } } // End FPDPRounding = 1 } @@ -188,14 +199,14 @@ defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>; // Essentially the same as the mad_mix versions let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { -def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; +defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. -def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; +defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>; let ClampLo = 0, ClampHi = 1 in { -def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; +defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>; } } // End FPDPRounding = 1 } @@ -287,25 +298,30 @@ class SDot2Pat<Instruction Inst> : GCNPat < let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", - VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, - AMDGPUfdot2, 1/*ExplicitClamp*/>; -def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", +defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>; -def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", +defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>; -def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot7Insts in { + +defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", + VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, + AMDGPUfdot2, 1/*ExplicitClamp*/>; +defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; -def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", +defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; -} // End SubtargetPredicate = HasDot2Insts +} // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { -def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", +defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; -def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", +defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; } // End SubtargetPredicate = HasDot1Insts @@ -319,7 +335,7 @@ foreach Type = ["U", "I"] in def : GCNPat < !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))), - (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + (!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), 
$src1, (i32 8), $src2, (i1 0))>; foreach Type = ["U", "I"] in let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in @@ -327,7 +343,7 @@ foreach Type = ["U", "I"] in !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [1, 2, 3, 4, 5, 6, 7], lhs, y, (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), - (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; // Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase // in the compile time. Directly handle the pattern generated by the FE here. @@ -337,12 +353,19 @@ foreach Type = ["U", "I"] in !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [7, 1, 2, 3, 4, 5, 6], lhs, y, (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), - (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; def ADst_32 : VOPDstOperand<AGPR_32>; +def ADst_64 : VOPDstOperand<AReg_64>; def ADst_128 : VOPDstOperand<AReg_128>; +def ADst_256 : VOPDstOperand<AReg_256>; def ADst_512 : VOPDstOperand<AReg_512>; def ADst_1024 : VOPDstOperand<AReg_1024>; +def VDst_64 : VOPDstOperand<VReg_64>; +def VDst_128 : VOPDstOperand<VReg_128>; +def VDst_256 : VOPDstOperand<VReg_256>; +def VDst_512 : VOPDstOperand<VReg_512>; +def VDst_1024 : VOPDstOperand<VReg_1024>; def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> { let Src0RC64 = ARegSrc_32; @@ -362,7 +385,10 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC let Src2RC64 = _SrcRC; let HasOpSel = 0; let HasClamp = 0; - let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp"; + let HasIntClamp = 0; + let HasOMod = 0; + let HasModifiers = 0; + let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); } @@ -378,6 +404,29 @@ def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, A def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>; def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>; def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>; +def VOPProfileMAI_F32_V4I16_X4 : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>; +def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>; +def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>; +def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>; +def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>; + +def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>; +def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>; +def VOPProfileMAI_F32_F32_X32_VCD : 
VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, VISrc_1024_f32, VDst_1024>; +def VOPProfileMAI_I32_I32_X4_VCD : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, VISrc_128_b32, VDst_128>; +def VOPProfileMAI_I32_I32_X16_VCD : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, VISrc_512_b32, VDst_512>; +def VOPProfileMAI_I32_I32_X32_VCD : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, VISrc_1024_b32, VDst_1024>; +def VOPProfileMAI_F32_V2I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, VISrc_128_b32, VDst_128>; +def VOPProfileMAI_F32_V2I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32, VDst_512>; +def VOPProfileMAI_F32_V2I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>; +def VOPProfileMAI_F32_V4F16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; +def VOPProfileMAI_F32_V4F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; +def VOPProfileMAI_F32_V4F16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>; +def VOPProfileMAI_F32_V4I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; +def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; +def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>; +def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>; +def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>; let Predicates = [HasMAIInsts] in { @@ -388,32 +437,57 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in { } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 -// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
-let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { -defm V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>; -defm V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>; -defm V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>; -defm V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>; -defm V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>; -defm V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>; -defm V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>; -defm V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>; -defm V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>; -defm V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>; -defm V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>; -defm V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>; -defm V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>; -defm V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>; -defm V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>; -defm V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>; -defm V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>; -defm V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>; -defm V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>; -defm V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>; -} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 +multiclass MAIInst<string OpName, string P, SDPatternOperator node> { + let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { + // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
+ defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>; + + let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in + defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>; + } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 +} + +defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>; +defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; +defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; +defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>; +defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>; +defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>; +defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>; +defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>; +defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; +defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; +defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>; +defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>; +defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>; +defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>; +defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>; +defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>; +defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>; +defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>; +defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>; +defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>; } // End SubtargetPredicate = HasMAIInsts +let Predicates = [isGFX90APlus] in { + defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>; + defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>; + defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>; + defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>; + defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>; + + defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>; + defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>; +} // End Predicates = [isGFX90APlus] + +let SubtargetPredicate = HasPackedFP32Ops, 
isCommutable = 1 in { + defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>; + defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>; + defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>; + defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; +} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 + def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; @@ -430,23 +504,36 @@ multiclass VOP3P_Real_vi<bits<7> op> { VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> { let AssemblerPredicate = HasVOP3PInsts; let DecoderNamespace = "GFX8"; + let VOP3P = 1; } } multiclass VOP3P_Real_MAI<bits<7> op> { def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, - VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> { let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; - let Inst{14} = 1; // op_sel_hi(2) default value - let Inst{59} = 1; // op_sel_hi(0) default value - let Inst{60} = 1; // op_sel_hi(1) default value + let Inst{14} = ?; // op_sel_hi(2) + let Inst{59} = ?; // op_sel_hi(0) + let Inst{60} = ?; // op_sel_hi(1) } } -multiclass VOP3P_Real_MFMA<bits<7> op> { +multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> { + let SubtargetPredicate = isGFX90AOnly, + AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in { + def _gfx90a_acd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>, + VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, 1>; + + def _gfx90a_vcd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>, + VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>; + } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" +} + +multiclass VOP3P_Real_MFMA<bits<7> op> : + VOP3P_Real_MFMA_gfx90a <op> { def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, - VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { + VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> { let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; } @@ -494,13 +581,18 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>; let SubtargetPredicate = HasDot2Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>; defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>; defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>; + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot7Insts in { + +defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>; defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>; defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>; -} // End SubtargetPredicate = HasDot2Insts +} // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { @@ -536,16 +628,31 @@ defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>; } // End SubtargetPredicate = HasMAIInsts +defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>; +defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>; +defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x65>; +defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx90a <0x66>; +defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>; +defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>; +defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>; + +let SubtargetPredicate = 
HasPackedFP32Ops in { + defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; + defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; + defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>; + defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>; +} // End SubtargetPredicate = HasPackedFP32Ops + //===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in { multiclass VOP3P_Real_gfx10<bits<7> op> { def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>, VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>; defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>; @@ -572,13 +679,18 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>; let SubtargetPredicate = HasDot2Insts in { -defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>; defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; + +} // End SubtargetPredicate = HasDot2Insts + +let SubtargetPredicate = HasDot7Insts in { + +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>; defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>; -} // End SubtargetPredicate = HasDot2Insts +} // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 99599c5cd667..c0cc91029d11 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -27,10 +27,6 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); let Inst{24-17} = op; let Inst{31-25} = 0x3e; // encoding - - // VOPC disallows dst_sel and dst_unused as they have no effect on destination - let Inst{42-40} = 0; - let Inst{44-43} = 0; } class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> { @@ -56,6 +52,8 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt let Asm32 = "$src0, $src1"; // The destination for 32-bit encoding is implicit. let HasDst32 = 0; + // VOPC disallows dst_sel and dst_unused as they have no effect on destination + let EmitDstSel = 0; let Outs64 = (outs VOPDstS64orS32:$sdst); list<SchedReadWrite> Schedule = sched; } @@ -106,6 +104,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> : InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { + let VALU = 1; + let VOPC = 1; let isPseudo = 0; let isCodeGenOnly = 0; @@ -121,6 +121,9 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> : let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; } class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -760,7 +763,7 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. 
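(Aside, not part of the patch.) For context on the gfx90a MAI and packed-FP32 instructions defined and encoded above (V_MFMA_F64_*, the _vgprcd variants, V_PK_FMA_F32 and friends), here is a rough source-level sketch of how they are typically reached. It assumes the usual clang builtin name for the f64 MFMA intrinsic and a gfx90a compilation target; nothing below is added by this diff.

    // Compile for an amdgcn gfx90a target; illustrative only.
    typedef double v4d __attribute__((ext_vector_type(4)));
    typedef float  v2f __attribute__((ext_vector_type(2)));

    // Assumed builtin: __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, acc,
    // cbsz, abid, blgp).  Whether the accumulator ends up in AGPRs (the
    // default MAIInst form) or VGPRs (the _vgprcd form) is the compiler's
    // choice, not the programmer's.
    v4d mfma_f64_example(double a, double b, v4d acc) {
      return __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, acc, 0, 0, 0);
    }

    // Plain <2 x float> arithmetic is the kind of code that can be selected
    // to V_PK_MUL_F32 / V_PK_ADD_F32 (and, with contraction enabled,
    // V_PK_FMA_F32) on subtargets with HasPackedFP32Ops.
    v2f pk_example(v2f a, v2f b, v2f c) {
      return a * b + c;
    }
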
-multiclass ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> { +multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { let WaveSizePredicate = isWave64 in def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), @@ -807,7 +810,7 @@ defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>; defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>; defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>; -multiclass FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> { +multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { let WaveSizePredicate = isWave64 in def : GCNPat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 282c1002d3c9..5f6f664ea3e7 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -140,10 +140,18 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : let VOP3P = 1; } +class VOP_Real<VOP_Pseudo ps> { + Instruction Opcode = !cast<Instruction>(NAME); + bit IsSingle = ps.Pfl.IsSingle; +} + class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : + VOP_Real <ps>, InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { + let VALU = 1; + let VOP3 = 1; let isPseudo = 0; let isCodeGenOnly = 0; let UseNamedOperandTable = 1; @@ -162,6 +170,10 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let TRANS = ps.TRANS; VOPProfile Pfl = ps.Pfl; } @@ -317,7 +329,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 { let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) - let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -326,14 +338,14 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0) - let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1) + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1) let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } -class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 { +class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 { bits<8> vdst; bits<10> src0; bits<10> src1; @@ -341,14 +353,13 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 { bits<3> blgp; bits<3> cbsz; bits<4> abid; - bits<1> clamp; let Inst{7-0} = vdst; let Inst{10-8} = !if(P.HasSrc1, cbsz, 0); let Inst{14-11} = !if(P.HasSrc1, abid, 0); - let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + let Inst{15} = acc_cd; let Inst{22-16} = op; let Inst{31-23} = 0x1a7; //encoding @@ -411,8 +422,8 
@@ class VOP_SDWAe<VOPProfile P> : Enc64 { bits<1> clamp; let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); - let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0); - let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0); + let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?); + let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); @@ -462,8 +473,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> { bits<1> clamp; bits<2> omod; - let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0); - let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0); + let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?); + let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0); } @@ -515,12 +526,13 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> { + let VALU = 1; + let SDWA = 1; let isPseudo = 0; let isCodeGenOnly = 0; let Defs = ps.Defs; let Uses = ps.Uses; - let SchedRW = ps.SchedRW; let hasSideEffects = ps.hasSideEffects; let Constraints = ps.Constraints; @@ -536,17 +548,22 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let TRANS = ps.TRANS; } class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> { + let VALU = 1; + let SDWA = 1; let isPseudo = 0; let isCodeGenOnly = 0; let Defs = ps.Defs; let Uses = ps.Uses; - let SchedRW = ps.SchedRW; let hasSideEffects = ps.hasSideEffects; let Constraints = ps.Constraints; @@ -564,6 +581,10 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let TRANS = ps.TRANS; } class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : @@ -628,8 +649,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : string AsmOperands = P.AsmDPP; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); - let SubtargetPredicate = HasDPP; - let AssemblerPredicate = HasDPP; + let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); + let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -643,12 +664,13 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { + let VALU = 1; + let DPP = 1; let isPseudo = 0; let isCodeGenOnly = 0; let Defs = ps.Defs; let Uses = ps.Uses; - let SchedRW = ps.SchedRW; let hasSideEffects = ps.hasSideEffects; let Constraints = ps.Constraints; @@ -665,6 +687,10 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let TRANS = ps.TRANS; } 
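(Aside, not part of the patch.) A recurring change in the Real encoding classes above (VOPC_Real, VOP3_Real, VOP_SDWA_Real, VOP_DPP_Real) is that they now set VALU/VOPC/SDWA/DPP and copy SchedRW, mayLoad, mayStore and TRANS from the pseudo. A small sketch of why that matters, assuming the usual SIInstrFlags bit names from SIDefines.h; the helpers below are illustrative, not something this diff adds.

    #include "SIDefines.h"
    #include "llvm/MC/MCInstrDesc.h"

    // TSFlags-based queries like these only give the right answer for the
    // MC-level (real) opcodes if the *_Real classes propagate the flags,
    // which is what the lets above arrange.
    static bool isVALUEncoding(const llvm::MCInstrDesc &Desc) {
      return Desc.TSFlags & llvm::SIInstrFlags::VALU;
    }

    static bool isSDWAEncoding(const llvm::MCInstrDesc &Desc) {
      return Desc.TSFlags & llvm::SIInstrFlags::SDWA;
    }
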
class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, @@ -683,8 +709,8 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, let Size = 8; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); - let SubtargetPredicate = HasDPP; - let AssemblerPredicate = HasDPP; + let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); + let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -795,3 +821,17 @@ include "VOP1Instructions.td" include "VOP2Instructions.td" include "VOP3Instructions.td" include "VOP3PInstructions.td" + + +class VOPInfoTable <string Format> : GenericTable { + let FilterClass = Format # "_Real"; + let CppTypeName = "VOPInfo"; + let Fields = ["Opcode", "IsSingle"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "get" # Format # "OpcodeHelper"; +} + +def VOP1InfoTable : VOPInfoTable<"VOP1">; +def VOP2InfoTable : VOPInfoTable<"VOP2">; +def VOP3InfoTable : VOPInfoTable<"VOP3">;
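(Aside, not part of the patch.) The VOPInfoTable definitions at the end emit searchable tables keyed by opcode. A sketch of the C++ side that would consume them, assuming the conventional GenericTable output: the backend supplies the VOPInfo struct matching the listed fields, and the searchable-tables emitter generates the getVOP1OpcodeHelper / getVOP2OpcodeHelper / getVOP3OpcodeHelper lookups named by PrimaryKeyName. The wrapper below is hypothetical and exists only to show the intended query shape.

    #include <cstdint>

    namespace llvm {
    namespace AMDGPU {

    // Matches CppTypeName = "VOPInfo" and Fields = ["Opcode", "IsSingle"].
    struct VOPInfo {
      uint16_t Opcode;
      bool IsSingle;
    };

    // Assumed to be generated from VOP1InfoTable above; returns null when
    // the opcode has no entry.
    const VOPInfo *getVOP1OpcodeHelper(uint16_t Opcode);

    // Hypothetical wrapper: ask whether a VOP1 opcode carries the IsSingle
    // flag recorded in the table.
    inline bool isVOP1Single(uint16_t Opcode) {
      const VOPInfo *Info = getVOP1OpcodeHelper(Opcode);
      return Info && Info->IsSingle;
    }

    } // namespace AMDGPU
    } // namespace llvm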