Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 46
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 261
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 40
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 64
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 65
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 588
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 110
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 528
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 948
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 45
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 43
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 47
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 30
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 519
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 257
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 484
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 52
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 31
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 881
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 39
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 400
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 67
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPTNote.h | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 43
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 154
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 76
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 128
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 421
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 460
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 514
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h | 79
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 220
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 391
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 208
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 42
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 116
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1072
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 609
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 197
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 311
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 262
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 148
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 822
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 162
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNProcessors.td | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp | 862
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 51
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 163
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 107
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 326
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 84
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td | 770
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 87
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp | 169
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 82
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 534
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 185
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 702
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1218
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 58
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 504
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 330
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 55
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 918
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 85
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 231
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 278
-rw-r--r--  llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 231
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 153
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 96
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 95
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 111
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 42
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 498
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 637
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 136
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 55
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 144
-rw-r--r--  llvm/lib/Target/AMDGPU/SIProgramInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 932
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 69
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 322
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp | 159
-rw-r--r--  llvm/lib/Target/AMDGPU/SISchedule.td | 73
-rw-r--r--  llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 35
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 1025
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 141
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 205
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 306
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 63
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp | 355
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h | 70
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 68
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 71
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 161
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 144
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 324
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 74
152 files changed, 18265 insertions, 8988 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 677c49331cd5..ca088e63e03c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -15,10 +15,10 @@
namespace llvm {
-class AMDGPUTargetMachine;
class FunctionPass;
class GCNTargetMachine;
class ImmutablePass;
+class MachineFunctionPass;
class ModulePass;
class Pass;
class Target;
@@ -51,12 +51,12 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
-FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
+FunctionPass *createSIOptimizeVGPRLiveRangePass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
@@ -72,7 +72,10 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
+ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();
+FunctionPass *createGCNPreRAOptimizationsPass();
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
@@ -94,6 +97,8 @@ extern char &AMDGPUMachineCFGStructurizerID;
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
Pass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAttributorPass();
+void initializeAMDGPUAttributorPass(PassRegistry &);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
@@ -146,6 +151,21 @@ private:
TargetMachine &TM;
};
+void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &);
+extern char &AMDGPUReplaceLDSUseWithPointerID;
+
+struct AMDGPUReplaceLDSUseWithPointerPass
+ : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
+extern char &AMDGPULowerModuleLDSID;
+
+struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
@@ -197,14 +217,11 @@ extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowID;
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
void initializeSIPreEmitPeepholePass(PassRegistry &);
extern char &SIPreEmitPeepholeID;
-void initializeSIInsertSkipsPass(PassRegistry &);
-extern char &SIInsertSkipsPassID;
+void initializeSILateBranchLoweringPass(PassRegistry &);
+extern char &SILateBranchLoweringPassID;
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
@@ -218,9 +235,6 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
-void initializeSIAddIMGInitPass(PassRegistry &);
-extern char &SIAddIMGInitID;
-
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
@@ -271,6 +285,9 @@ ModulePass *createAMDGPUPrintfRuntimeBinding();
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
extern char &AMDGPUPrintfRuntimeBindingID;
+void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
+extern char &AMDGPUResourceUsageAnalysisID;
+
struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
@@ -287,6 +304,9 @@ struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
extern char &SIOptimizeExecMaskingPreRAID;
+void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &);
+extern char &SIOptimizeVGPRLiveRangeID;
+
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
@@ -331,12 +351,12 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
-void initializeGCNRegBankReassignPass(PassRegistry &);
-extern char &GCNRegBankReassignID;
-
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;
+void initializeGCNPreRAOptimizationsPass(PassRegistry &);
+extern char &GCNPreRAOptimizationsID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index c352c0097c5c..7991f3d2a6b2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -51,6 +51,12 @@ def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
"Most fp64 instructions are half rate instead of quarter"
>;
+def FullRate64Ops : SubtargetFeature<"full-rate-64-ops",
+ "FullRate64Ops",
+ "true",
+ "Most fp64 instructions are full rate"
+>;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
@@ -148,6 +154,12 @@ def FeatureXNACK : SubtargetFeature<"xnack",
"Enable XNACK support"
>;
+def FeatureTgSplit : SubtargetFeature<"tgsplit",
+ "EnableTgSplit",
+ "true",
+ "Enable threadgroup split execution"
+>;
+
def FeatureCuMode : SubtargetFeature<"cumode",
"EnableCuMode",
"true",
@@ -214,10 +226,28 @@ def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
"MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero"
>;
+def FeatureNSAClauseBug : SubtargetFeature<"nsa-clause-bug",
+ "HasNSAClauseBug",
+ "true",
+ "MIMG-NSA in a hard clause has unpredictable results on GFX10.1"
+>;
+
def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
"HasFlatSegmentOffsetBug",
"true",
- "GFX10 bug, inst_offset ignored in flat segment"
+ "GFX10 bug where inst_offset is ignored when flat instructions access global memory"
+>;
+
+def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug",
+ "NegativeScratchOffsetBug",
+ "true",
+ "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9"
+>;
+
+def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug",
+ "NegativeUnalignedScratchOffsetBug",
+ "true",
+ "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10"
>;
def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
@@ -272,6 +302,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"Additional instructions for GFX9+"
>;
+def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts",
+ "GFX90AInsts",
+ "true",
+ "Additional instructions for GFX90A+"
+>;
+
def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"GFX10Insts",
"true",
@@ -387,6 +423,18 @@ def FeatureDPP8 : SubtargetFeature<"dpp8",
"Support DPP8 (Data Parallel Primitives) extension"
>;
+def Feature64BitDPP : SubtargetFeature<"dpp-64bit",
+ "Has64BitDPP",
+ "true",
+ "Support DPP (Data Parallel Primitives) extension"
+>;
+
+def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
+ "HasPackedFP32Ops",
+ "true",
+ "Support packed fp32 instructions"
+>;
+
def FeatureR128A16 : SubtargetFeature<"r128-a16",
"HasR128A16",
"true",
@@ -411,6 +459,18 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
"Support NSA encoding for image instructions"
>;
+def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts",
+ "HasExtendedImageInsts",
+ "true",
+ "Support mips != 0, lod != 0, gather4, and get_lod"
+>;
+
+def FeatureGFX10_AEncoding : SubtargetFeature<"gfx10_a-encoding",
+ "GFX10_AEncoding",
+ "true",
+ "Has BVH ray tracing instructions"
+>;
+
def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
"GFX10_BEncoding",
"true",
@@ -444,7 +504,7 @@ def FeatureDot1Insts : SubtargetFeature<"dot1-insts",
def FeatureDot2Insts : SubtargetFeature<"dot2-insts",
"HasDot2Insts",
"true",
- "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+ "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions"
>;
def FeatureDot3Insts : SubtargetFeature<"dot3-insts",
@@ -471,6 +531,12 @@ def FeatureDot6Insts : SubtargetFeature<"dot6-insts",
"Has v_dot4c_i32_i8 instruction"
>;
+def FeatureDot7Insts : SubtargetFeature<"dot7-insts",
+ "HasDot7Insts",
+ "true",
+ "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+>;
+
def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"HasMAIInsts",
"true",
@@ -527,6 +593,12 @@ def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst",
"Has s_memtime instruction"
>;
+def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
+ "HasShaderCyclesRegister",
+ "true",
+ "Has SHADER_CYCLES hardware register"
+>;
+
def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
"HasMadMacF32Insts",
"true",
@@ -557,6 +629,16 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
"Does not need SW waitstates"
>;
+class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
+ "nsa-max-size-"#Value,
+ "NSAMaxSize",
+ !cast<string>(Value),
+ "The maximum non-sequential address size in VGPRs."
+>;
+
+def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>;
+def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -659,6 +741,18 @@ def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode",
" supports it"
>;
+def FeaturePackedTID : SubtargetFeature<"packed-tid",
+ "HasPackedTID",
+ "true",
+ "Workitem IDs are packed into v0 at kernel launch"
+>;
+
+def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
+ "HasArchitectedFlatScratch",
+ "true",
+ "Flat Scratch register is a readonly SPI initialized architected register"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -675,7 +769,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
- FeatureTrigReducedRange]
+ FeatureTrigReducedRange, FeatureExtendedImageInsts
+ ]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
@@ -684,7 +779,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureUnalignedBufferAccess]
+ FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess
+ ]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -697,7 +793,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
- FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess]
+ FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
+ FeatureUnalignedBufferAccess
+ ]
>;
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
@@ -712,9 +810,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
- FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
- FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureSupportsXNACK]
+ FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+ FeatureNegativeScratchOffsetBug
+ ]
>;
def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
@@ -729,9 +828,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
- FeatureVOP3Literal, FeatureDPP8,
+ FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
- FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16,
+ FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
]
>;
@@ -816,17 +915,26 @@ def FeatureISAVersion9_0_0 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureFmaMixInsts,
FeatureImageGather4D16Bug]>;
@@ -835,9 +943,13 @@ def FeatureISAVersion9_0_6 : FeatureSet<
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
+ FeatureDot7Insts,
FeatureSupportsSRAMECC,
FeatureImageGather4D16Bug]>;
@@ -846,6 +958,9 @@ def FeatureISAVersion9_0_8 : FeatureSet<
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
@@ -853,6 +968,7 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeatureDot4Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureMAIInsts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,
@@ -864,13 +980,41 @@ def FeatureISAVersion9_0_9 : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
+def FeatureISAVersion9_0_A : FeatureSet<
+ [FeatureGFX9,
+ FeatureGFX90AInsts,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot3Insts,
+ FeatureDot4Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureDot7Insts,
+ Feature64BitDPP,
+ FeaturePackedFP32Ops,
+ FeatureMAIInsts,
+ FeaturePkFmacF16Inst,
+ FeatureAtomicFaddInsts,
+ FeatureMadMacF32Insts,
+ FeatureSupportsSRAMECC,
+ FeaturePackedTID,
+ FullRate64Ops]>;
+
def FeatureISAVersion9_0_C : FeatureSet<
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureXNACK,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureMadMacF32Insts,
FeatureImageGather4D16Bug]>;
// TODO: Organize more features into groups.
@@ -884,8 +1028,10 @@ def FeatureGroup {
FeatureVcmpxExecWARHazard,
FeatureLdsBranchVmemWARHazard,
FeatureNSAtoVMEMBug,
+ FeatureNSAClauseBug,
FeatureOffset3fBug,
- FeatureFlatSegmentOffsetBug
+ FeatureFlatSegmentOffsetBug,
+ FeatureNegativeUnalignedScratchOffsetBug
];
}
@@ -895,12 +1041,12 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
- FeatureSMemTimeInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
@@ -915,13 +1061,14 @@ def FeatureISAVersion10_1_1 : FeatureSet<
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
- FeatureSMemTimeInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
@@ -936,13 +1083,32 @@ def FeatureISAVersion10_1_2 : FeatureSet<
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
+ FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
+ FeatureWavefrontSize32,
+ FeatureScalarStores,
+ FeatureScalarAtomics,
+ FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
+ FeatureLdsMisalignedBug,
+ FeatureSupportsXNACK])>;
+
+def FeatureISAVersion10_1_3 : FeatureSet<
+ !listconcat(FeatureGroup.GFX10_1_Bugs,
+ [FeatureGFX10,
+ FeatureGFX10_AEncoding,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
FeatureNSAEncoding,
+ FeatureNSAMaxSize5,
FeatureWavefrontSize32,
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
- FeatureSMemTimeInst,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
@@ -950,6 +1116,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
def FeatureISAVersion10_3_0 : FeatureSet<
[FeatureGFX10,
+ FeatureGFX10_AEncoding,
FeatureGFX10_BEncoding,
FeatureGFX10_3Insts,
FeatureLDSBankCount32,
@@ -958,8 +1125,11 @@ def FeatureISAVersion10_3_0 : FeatureSet<
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
+ FeatureDot7Insts,
FeatureNSAEncoding,
- FeatureWavefrontSize32]>;
+ FeatureNSAMaxSize13,
+ FeatureWavefrontSize32,
+ FeatureShaderCyclesRegister]>;
//===----------------------------------------------------------------------===//
@@ -1077,6 +1247,14 @@ def isGFX6GFX7GFX8GFX9 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of (not FeatureGFX10Insts))>;
+def isGFX6GFX7GFX8GFX9NotGFX90A :
+ Predicate<"!Subtarget->hasGFX90AInsts() &&"
+ "(Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+ AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>;
+
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<(all_of FeatureCIInsts)>;
@@ -1097,6 +1275,32 @@ def isGFX9Only : Predicate <
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>;
+def isGCN3ExcludingGFX90A :
+ Predicate<"Subtarget->isGCN3Encoding() && !Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+
+def isGFX90APlus :
+ Predicate<"Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
+def isNotGFX90APlus :
+ Predicate<"!Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of (not FeatureGFX90AInsts))>;
+
+def isGFX8GFX9NotGFX90A :
+ Predicate<"!Subtarget->hasGFX90AInsts() &&"
+ "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
+
+def isGFX90AOnly :
+ Predicate<"Subtarget->hasGFX90AInsts()">,
+ AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
+def isGFX908orGFX90A :
+ Predicate<"Subtarget->hasMAIInsts()">,
+ AssemblerPredicate<(all_of FeatureMAIInsts)>;
+
def isGFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
@@ -1126,6 +1330,9 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">,
+ AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>;
+
def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
@@ -1177,6 +1384,19 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">,
def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
+def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">,
+ AssemblerPredicate<(all_of Feature64BitDPP)>;
+
+def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">,
+ AssemblerPredicate<(all_of FeaturePackedFP32Ops)>;
+
+def HasFmaakFmamkF32Insts :
+ Predicate<"Subtarget->hasFmaakFmamkF32Insts()">,
+ AssemblerPredicate<(any_of FeatureGFX10Insts)>;
+
+def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">,
+ AssemblerPredicate<(all_of FeatureExtendedImageInsts)>;
+
def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
AssemblerPredicate<(all_of FeatureR128A16)>;
@@ -1238,6 +1458,9 @@ def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
AssemblerPredicate<(all_of FeatureDot6Insts)>;
+def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">,
+ AssemblerPredicate<(all_of FeatureDot7Insts)>;
+
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
@@ -1250,7 +1473,8 @@ def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">,
def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
-def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">;
+def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
+ AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
@@ -1267,9 +1491,6 @@ def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
-def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
- AssemblerPredicate<(all_of FeatureOffset3fBug)>;
-
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 0ed89e9ca8d6..88b88a04a7d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -41,24 +41,28 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
-// These arrays are indexed by address space value enum elements 0 ... to 7
-static const AliasResult ASAliasRules[8][8] = {
- /* Flat Global Region Group Constant Private Constant 32-bit Buffer Fat Ptr */
- /* Flat */ {MayAlias, MayAlias, NoAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
- /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias},
- /* Region */ {NoAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias , NoAlias, NoAlias},
- /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
- /* Constant */ {MayAlias, MayAlias, NoAlias, NoAlias , NoAlias , NoAlias , MayAlias, MayAlias},
- /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
- /* Constant 32-bit */ {MayAlias, MayAlias, NoAlias, NoAlias , MayAlias, NoAlias , NoAlias , MayAlias},
- /* Buffer Fat Ptr */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, MayAlias}
-};
-
static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range");
if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
- return MayAlias;
+ return AliasResult::MayAlias;
+
+#define ASMay AliasResult::MayAlias
+#define ASNo AliasResult::NoAlias
+ // This array is indexed by address space value enum elements 0 ... to 7
+ static const AliasResult ASAliasRules[8][8] = {
+ /* Flat Global Region Group Constant Private Const32 Buf Fat Ptr */
+ /* Flat */ {ASMay, ASMay, ASNo, ASMay, ASMay, ASMay, ASMay, ASMay},
+ /* Global */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay},
+ /* Region */ {ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo, ASNo},
+ /* Group */ {ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo},
+ /* Constant */ {ASMay, ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASMay},
+ /* Private */ {ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASNo, ASNo},
+ /* Constant 32-bit */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASMay},
+ /* Buffer Fat Ptr */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay}
+ };
+#undef ASMay
+#undef ASNo
return ASAliasRules[AS1][AS2];
}
@@ -70,7 +74,7 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
AliasResult Result = getAliasResult(asA, asB);
- if (Result == NoAlias)
+ if (Result == AliasResult::NoAlias)
return Result;
// In general, FLAT (generic) pointers could be aliased to LOCAL or PRIVATE
@@ -87,21 +91,21 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
if (asA == AMDGPUAS::FLAT_ADDRESS &&
(asB == AMDGPUAS::LOCAL_ADDRESS || asB == AMDGPUAS::PRIVATE_ADDRESS)) {
const auto *ObjA =
- getUnderlyingObject(A.Ptr->stripPointerCastsAndInvariantGroups());
+ getUnderlyingObject(A.Ptr->stripPointerCastsForAliasAnalysis());
if (const LoadInst *LI = dyn_cast<LoadInst>(ObjA)) {
// If a generic pointer is loaded from the constant address space, it
// could only be a GLOBAL or CONSTANT one as that address space is solely
// prepared on the host side, where only GLOBAL or CONSTANT variables are
// visible. Note that this even holds for regular functions.
if (LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
- return NoAlias;
+ return AliasResult::NoAlias;
} else if (const Argument *Arg = dyn_cast<Argument>(ObjA)) {
const Function *F = Arg->getParent();
switch (F->getCallingConv()) {
case CallingConv::AMDGPU_KERNEL:
// In the kernel function, kernel arguments won't alias to (local)
// variables in shared or private address space.
- return NoAlias;
+ return AliasResult::NoAlias;
default:
// TODO: In the regular function, if that local variable in the
// location B is not captured, that argument pointer won't alias to it
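
The hunk above folds the 8x8 address-space alias table into getAliasResult() and switches to the scoped AliasResult::MayAlias / AliasResult::NoAlias values. The following self-contained sketch restates the table-lookup idea with a simplified enum and only four address spaces; the names are stand-ins for LLVM's AliasResult and ASAliasRules.

  // Self-contained sketch of the address-space alias table lookup shown above.
  #include <cassert>
  #include <cstdio>

  enum class Alias { No, May };

  constexpr unsigned Flat = 0, Global = 1, Region = 2, Group = 3;

  static Alias getAliasResult(unsigned AS1, unsigned AS2) {
    // Values taken from the Flat/Global/Region/Group corner of the table above.
    static const Alias Rules[4][4] = {
        /*           Flat        Global      Region      Group     */
        /* Flat   */ {Alias::May, Alias::May, Alias::No,  Alias::May},
        /* Global */ {Alias::May, Alias::May, Alias::No,  Alias::No},
        /* Region */ {Alias::No,  Alias::No,  Alias::May, Alias::No},
        /* Group  */ {Alias::May, Alias::No,  Alias::No,  Alias::May},
    };
    if (AS1 > 3 || AS2 > 3)
      return Alias::May; // Unknown address spaces conservatively may alias.
    return Rules[AS1][AS2];
  }

  int main() {
    assert(getAliasResult(Group, Global) == Alias::No);
    assert(getAliasResult(Flat, Group) == Alias::May);
    std::puts("alias-table sketch ok");
  }
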
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 51af25050950..2af9fc955875 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -120,10 +120,10 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
for (GlobalVariable &GV : M.globals()) {
// TODO: Region address
unsigned AS = GV.getAddressSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
- continue;
-
- recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+ if ((AS == AMDGPUAS::REGION_ADDRESS) ||
+ (AS == AMDGPUAS::LOCAL_ADDRESS &&
+ !AMDGPUTargetMachine::EnableLowerModuleLDS))
+ recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index a4e72f787230..af6dfc07eb50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -25,6 +25,13 @@
using namespace llvm;
namespace {
+static constexpr StringLiteral ImplicitAttrNames[] = {
+ // X ids unnecessarily propagated to kernels.
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
@@ -194,18 +201,10 @@ static bool handleAttr(Function &Parent, const Function &Callee,
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
- // X ids unnecessarily propagated to kernels.
- static constexpr StringLiteral AttrNames[] = {
- "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
- "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
- "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
- "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-implicitarg-ptr"};
-
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
- for (StringRef AttrName : AttrNames)
+ for (StringRef AttrName : ImplicitAttrNames)
handleAttr(Parent, Callee, AttrName);
}
@@ -268,7 +267,20 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
+ bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+ CallingConv::ID CC = F.getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+
+ // If this function hasAddressTaken() = true
+ // then add all attributes corresponding to the implicit args.
+ if (CallingConvSupportsAllImplicits &&
+ F.hasAddressTaken(nullptr, true, true, true)) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ F.addFnAttr(AttrName);
+ }
+ Changed = true;
+ }
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
@@ -281,10 +293,12 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const Function *Callee =
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
- // TODO: Do something with indirect calls.
+ // Note the occurrence of an indirect call.
if (!Callee) {
- if (!CB->isInlineAsm())
+ if (!CB->isInlineAsm()) {
+ HasIndirectCall = true;
HaveCall = true;
+ }
continue;
}
@@ -351,6 +365,28 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
}
+ // This pass cannot copy attributes from callees to callers
+ // if there is an indirect call, and thus in such cases
+ // hasAddressTaken() would be false for kernels and functions
+ // making an indirect call (if they are themselves not indirectly called).
+ // We must tag all such kernels/functions with all implicit attributes
+ // for correctness.
+ // e.g.
+ // 1. Kernel K1 makes an indirect call to function F1.
+ // Without detecting an indirect call in K1, this pass will not
+ // add all implicit args to K1 (which is incorrect).
+ // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
+ // F2.
+ // Without detecting an indirect call in F1 (whose hasAddressTaken() is
+ // false), the pass will not add all implicit args to F1 (which is
+ // essential for correctness).
+ if (CallingConvSupportsAllImplicits && HasIndirectCall) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ F.addFnAttr(AttrName);
+ }
+ Changed = true;
+ }
+
return Changed;
}
@@ -367,9 +403,11 @@ bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
}
Function *F = I->getFunction();
- // Add feature attributes
- if (!F || F->isDeclaration())
+ // Ignore functions with graphics calling conventions, these are currently
+ // not allowed to have kernel arguments.
+ if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
continue;
+ // Add feature attributes
Changed |= addFeatureAttributes(*F);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index c2a4d67ea98e..7d6845b287bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,10 +14,8 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
@@ -30,8 +28,7 @@ namespace {
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
LegacyDivergenceAnalysis *DA;
- MemoryDependenceResults *MDR;
- LoopInfo *LI;
+ MemorySSA *MSSA;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isEntryFunc;
@@ -46,8 +43,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LegacyDivergenceAnalysis>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
AU.setPreservesAll();
}
@@ -61,8 +57,7 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
@@ -75,49 +70,14 @@ static void setNoClobberMetadata(Instruction *I) {
I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
}
-static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
- for (auto I : predecessors(Root))
- if (Set.insert(I))
- DFS(I, Set);
-}
-
bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
- // 1. get Loop for the Load->getparent();
- // 2. if it exists, collect all the BBs from the most outer
- // loop and check for the writes. If NOT - start DFS over all preds.
- // 3. Start DFS over all preds from the most outer loop header.
- SetVector<BasicBlock *> Checklist;
- BasicBlock *Start = Load->getParent();
- Checklist.insert(Start);
- const Value *Ptr = Load->getPointerOperand();
- const Loop *L = LI->getLoopFor(Start);
- if (L) {
- const Loop *P = L;
- do {
- L = P;
- P = P->getParentLoop();
- } while (P);
- Checklist.insert(L->block_begin(), L->block_end());
- Start = L->getHeader();
- }
-
- DFS(Start, Checklist);
- for (auto &BB : Checklist) {
- BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
- BasicBlock::iterator(Load) : BB->end();
- auto Q = MDR->getPointerDependencyFrom(
- MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
- if (Q.isClobber() || Q.isUnknown() ||
- // Store defines the load and thus clobbers it.
- (Q.isDef() && Q.getInst()->mayWriteToMemory()))
- return true;
- }
- return false;
+ const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
+ return !MSSA->isLiveOnEntryDef(MA);
}
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
if (DA->isUniform(&I))
- setUniformMetadata(I.getParent()->getTerminator());
+ setUniformMetadata(&I);
}
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
@@ -154,9 +114,9 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Idx = Constant::getIntegerValue(
Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
// Insert GEP at the entry to make it dominate all uses
- PtrI = GetElementPtrInst::Create(
- Ptr->getType()->getPointerElementType(), Ptr,
- ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+ PtrI = GetElementPtrInst::Create(I.getType(), Ptr,
+ ArrayRef<Value *>(Idx), Twine(""),
+ F->getEntryBlock().getFirstNonPHI());
}
I.replaceUsesOfWith(Ptr, PtrI);
}
@@ -177,9 +137,8 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
- MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);
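
The rewritten isClobberedInFunction() above replaces the hand-rolled loop and predecessor DFS with a single MemorySSA walker query. The fragment below, which compiles only against the LLVM headers named in the hunk, restates that query in isolation: a load is treated as unclobbered exactly when its clobbering access is MemorySSA's live-on-entry definition.

  // Fragment restating the MemorySSA-based clobber check from the hunk above.
  #include "llvm/Analysis/MemorySSA.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  static bool isClobberedBeforeLoad(MemorySSA &MSSA, LoadInst *Load) {
    // Ask the walker for the nearest access that may clobber this load; if it
    // is the live-on-entry def, nothing in the function writes the location.
    const MemoryAccess *MA = MSSA.getWalker()->getClobberingMemoryAccess(Load);
    return !MSSA.isLiveOnEntryDef(MA);
  }
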
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index fb273a1650ae..aab76d27ef11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -92,7 +92,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: {
return std::make_tuple(PrivateSegmentBuffer ? &PrivateSegmentBuffer
: nullptr,
- &AMDGPU::SGPR_128RegClass, LLT::vector(4, 32));
+ &AMDGPU::SGPR_128RegClass, LLT::fixed_vector(4, 32));
}
case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR:
return std::make_tuple(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 139ac3bab14c..e9ed45d8cd14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -143,7 +143,8 @@ struct AMDGPUFunctionArgInfo {
// Input registers for non-HSA ABI
ArgDescriptor ImplicitBufferPtr;
- // VGPRs inputs. These are always v0, v1 and v2 for entry functions.
+ // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
+ // into v0, 10 bits per dimension if packed-tid is set.
ArgDescriptor WorkItemIDX;
ArgDescriptor WorkItemIDY;
ArgDescriptor WorkItemIDZ;
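
The updated comment above notes that with packed-tid the three work-item IDs arrive packed into v0, 10 bits per dimension. A small self-contained sketch of unpacking such a value follows; the X-in-the-low-bits ordering is assumed for illustration rather than taken from this patch.

  // Sketch of unpacking packed work-item IDs (10 bits per dimension in v0).
  #include <cstdint>
  #include <cstdio>

  struct WorkItemId { uint32_t X, Y, Z; };

  static WorkItemId unpackWorkItemId(uint32_t V0) {
    // Assumed layout: X in bits [9:0], Y in [19:10], Z in [29:20].
    return {V0 & 0x3ffu, (V0 >> 10) & 0x3ffu, (V0 >> 20) & 0x3ffu};
  }

  int main() {
    // Example: X=5, Y=2, Z=1 packed into a single 32-bit value.
    uint32_t Packed = 5u | (2u << 10) | (1u << 20);
    WorkItemId Id = unpackWorkItemId(Packed);
    std::printf("x=%u y=%u z=%u\n", Id.X, Id.Y, Id.Z);
  }
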
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c655e5ec87b7..cbc4ab212566 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -39,22 +40,6 @@
using namespace llvm;
using namespace llvm::AMDGPU;
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
-static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
- "amdgpu-assume-external-call-stack-size",
- cl::desc("Assumed stack use of any external call (in bytes)"),
- cl::Hidden,
- cl::init(16384));
-
-static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
- "amdgpu-assume-dynamic-stack-object-size",
- cl::desc("Assumed extra stack use if there are any "
- "variable sized objects (in bytes)"),
- cl::Hidden,
- cl::init(4096));
-
// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
@@ -97,12 +82,14 @@ extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {
+ : AsmPrinter(TM, std::move(Streamer)) {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
if (isHsaAbiVersion2(getGlobalSTI())) {
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
- } else {
+ } else if (isHsaAbiVersion3(getGlobalSTI())) {
HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
+ } else {
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4());
}
}
}
@@ -122,34 +109,34 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
- if (isHsaAbiVersion3(getGlobalSTI())) {
- std::string ExpectedTarget;
- raw_string_ostream ExpectedTargetOS(ExpectedTarget);
- IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);
-
- getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
- }
+ // TODO: Which one is called first, emitStartOfAsmFile or
+ // emitFunctionBodyStart?
+ if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
+ initializeTargetID(M);
if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
TM.getTargetTriple().getOS() != Triple::AMDPAL)
return;
+ if (isHsaAbiVersion3Or4(getGlobalSTI()))
+ getTargetStreamer()->EmitDirectiveAMDGCNTarget();
+
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
- HSAMetadataStream->begin(M);
+ HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (isHsaAbiVersion3(getGlobalSTI()))
+ if (isHsaAbiVersion3Or4(getGlobalSTI()))
return;
- // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
+ // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
- // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
+ // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
- getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
+ getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2(
Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
@@ -159,15 +146,11 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
- isHsaAbiVersion2(getGlobalSTI())) {
- // Emit ISA Version (NT_AMD_AMDGPU_ISA).
- std::string ISAVersionString;
- raw_string_ostream ISAVersionStream(ISAVersionString);
- IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
- getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
- }
+ isHsaAbiVersion2(getGlobalSTI()))
+ getTargetStreamer()->EmitISAVersion();
// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
+ // Emit HSA Metadata (NT_AMD_HSA_METADATA).
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
HSAMetadataStream->end();
bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
@@ -192,11 +175,37 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+ const Function &F = MF->getFunction();
+
+ // TODO: Which one is called first, emitStartOfAsmFile or
+ // emitFunctionBodyStart?
+ if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
+ initializeTargetID(*F.getParent());
+
+ const auto &FunctionTargetID = STM.getTargetID();
+ // Make sure function's xnack settings are compatible with module's
+ // xnack settings.
+ if (FunctionTargetID.isXnackSupported() &&
+ FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
+ FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
+ OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
+ "' function does not match module xnack setting");
+ return;
+ }
+ // Make sure function's sramecc settings are compatible with module's
+ // sramecc settings.
+ if (FunctionTargetID.isSramEccSupported() &&
+ FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
+ FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
+ OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
+ "' function does not match module sramecc setting");
+ return;
+ }
+
if (!MFI.isEntryFunction())
return;
- const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
- const Function &F = MF->getFunction();
if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
@@ -232,26 +241,25 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
if (ReadOnlySection.getAlignment() < 64)
ReadOnlySection.setAlignment(Align(64));
- const MCSubtargetInfo &STI = MF->getSubtarget();
+ const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
SmallString<128> KernelName;
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
- STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+ STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
- IsaInfo::getNumExtraSGPRs(&STI,
+ IsaInfo::getNumExtraSGPRs(&STM,
CurrentProgramInfo.VCCUsed,
CurrentProgramInfo.FlatUsed),
- CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
- hasXNACK(STI));
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
Streamer.PopSection();
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
- isHsaAbiVersion3(getGlobalSTI())) {
+ isHsaAbiVersion3Or4(getGlobalSTI())) {
AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -322,17 +330,15 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
- CallGraphResourceInfo.clear();
-
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
// which is why this isn't done for Mesa.
const MCSubtargetInfo &STI = *getGlobalSTI();
- if (AMDGPU::isGFX10Plus(STI) &&
+ if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
(STI.getTargetTriple().getOS() == Triple::AMDHSA ||
STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
- getTargetStreamer()->EmitCodeEnd();
+ getTargetStreamer()->EmitCodeEnd(STI);
}
return AsmPrinter::doFinalization(M);
@@ -400,6 +406,9 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
const MachineFunction &MF,
const SIProgramInfo &PI) const {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const Function &F = MF.getFunction();
+
amdhsa::kernel_descriptor_t KernelDescriptor;
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
@@ -409,14 +418,24 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+
+ Align MaxKernArgAlign;
+ KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
+
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+ assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+ if (STM.hasGFX90AInsts())
+ KernelDescriptor.compute_pgm_rsrc3 =
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+
return KernelDescriptor;
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
CurrentProgramInfo = SIProgramInfo();
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
@@ -438,12 +457,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
- } else {
- auto I = CallGraphResourceInfo.insert(
- std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = I.first->second;
- assert(I.second && "should only be called once per function");
- Info = analyzeResourceUsage(MF);
}
if (STM.isAmdPalOS()) {
@@ -480,7 +493,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (!MFI->isEntryFunction()) {
OutStreamer->emitRawComment(" Function info:", false);
- SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo(&MF.getFunction());
emitCommonFunctionComments(
Info.NumVGPR,
STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
@@ -521,6 +535,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" NumVGPRsForWavesPerEU: " +
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+ if (STM.hasGFX90AInsts())
+ OutStreamer->emitRawComment(
+ " AccumOffset: " +
+ Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+
OutStreamer->emitRawComment(
" Occupancy: " +
Twine(CurrentProgramInfo.Occupancy), false);
@@ -550,6 +569,21 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
false);
+
+ assert(STM.hasGFX90AInsts() ||
+ CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+ if (STM.hasGFX90AInsts()) {
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+ Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+ Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ false);
+ }
}
if (DumpCodeInstEmitter) {
@@ -572,6 +606,36 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
+// TODO: Fold this into emitFunctionBodyStart.
+void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
+ // In the beginning all features are either 'Any' or 'NotSupported',
+ // depending on global target features. This will cover empty modules.
+ getTargetStreamer()->initializeTargetID(
+ *getGlobalSTI(), getGlobalSTI()->getFeatureString());
+
+ // If module is empty, we are done.
+ if (M.empty())
+ return;
+
+ // If the module is not empty, find the first 'Off' or 'On' setting for each
+ // feature among the functions in the module.
+ for (auto &F : M) {
+ auto &TSTargetID = getTargetStreamer()->getTargetID();
+ if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
+ (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
+ break;
+
+ const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+ const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
+ if (TSTargetID->isXnackSupported())
+ if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
+ TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
+ if (TSTargetID->isSramEccSupported())
+ if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
+ TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
+ }
+}
+
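The loop above resolves the module-level XNACK and SRAM-ECC settings from whichever function first pins them. A rough stand-alone model of that rule, for illustration only (the enum and helper below are assumptions, not the real IsaInfo::TargetIDSetting API):

enum class Setting { Unsupported, Any, Off, On };

// Only an undecided (Any) module-level setting can be narrowed; once a
// function pins a feature On or Off, later functions cannot change it.
static Setting resolve(Setting ModuleSetting, Setting FuncSetting) {
  return ModuleSetting == Setting::Any ? FuncSetting : ModuleSetting;
}

// resolve(Setting::Any, Setting::On)  == Setting::On
// resolve(Setting::On,  Setting::Off) == Setting::On
// A NotSupported feature is never consulted; the loop above skips it.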
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = STM.getInstrInfo();
@@ -593,398 +657,17 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
return CodeSize;
}
-static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
- const SIInstrInfo &TII,
- unsigned Reg) {
- for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
- if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
- return true;
- }
-
- return false;
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
- const GCNSubtarget &ST) const {
- return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
- UsesVCC, UsesFlatScratch);
-}
-
-int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
- const GCNSubtarget &ST) const {
- return std::max(NumVGPR, NumAGPR);
-}
-
-static const Function *getCalleeFunction(const MachineOperand &Op) {
- if (Op.isImm()) {
- assert(Op.getImm() == 0);
- return nullptr;
- }
-
- return cast<Function>(Op.getGlobal());
-}
-
-AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
- const MachineFunction &MF) const {
- SIFunctionResourceInfo Info;
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
-
- Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
- MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
-
- // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
- // instructions aren't used to access the scratch buffer. Inline assembly may
- // need it though.
- //
- // If we only have implicit uses of flat_scr on flat instructions, it is not
- // really needed.
- if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
- (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
- !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
- !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
- Info.UsesFlatScratch = false;
- }
-
- Info.PrivateSegmentSize = FrameInfo.getStackSize();
-
- // Assume a big number if there are any unknown sized objects.
- Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
- if (Info.HasDynamicallySizedStack)
- Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
-
- if (MFI->isStackRealigned())
- Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
-
- Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
- MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
- // If there are no calls, MachineRegisterInfo can tell us the used register
- // count easily.
- // A tail call isn't considered a call for MachineFrameInfo's purposes.
- if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
- MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestVGPRReg = Reg;
- break;
- }
- }
-
- if (ST.hasMAIInsts()) {
- MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestAGPRReg = Reg;
- break;
- }
- }
- Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestAGPRReg) + 1;
- }
-
- MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- HighestSGPRReg = Reg;
- break;
- }
- }
-
- // We found the maximum register index. They start at 0, so add one to get the
- // number of registers.
- Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestVGPRReg) + 1;
- Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
- TRI.getHWRegIndex(HighestSGPRReg) + 1;
-
- return Info;
- }
-
- int32_t MaxVGPR = -1;
- int32_t MaxAGPR = -1;
- int32_t MaxSGPR = -1;
- uint64_t CalleeFrameSize = 0;
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- // TODO: Check regmasks? Do they occur anywhere except calls?
- for (const MachineOperand &MO : MI.operands()) {
- unsigned Width = 0;
- bool IsSGPR = false;
- bool IsAGPR = false;
-
- if (!MO.isReg())
- continue;
-
- Register Reg = MO.getReg();
- switch (Reg) {
- case AMDGPU::EXEC:
- case AMDGPU::EXEC_LO:
- case AMDGPU::EXEC_HI:
- case AMDGPU::SCC:
- case AMDGPU::M0:
- case AMDGPU::SRC_SHARED_BASE:
- case AMDGPU::SRC_SHARED_LIMIT:
- case AMDGPU::SRC_PRIVATE_BASE:
- case AMDGPU::SRC_PRIVATE_LIMIT:
- case AMDGPU::SGPR_NULL:
- case AMDGPU::MODE:
- continue;
-
- case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
- llvm_unreachable("src_pops_exiting_wave_id should not be used");
-
- case AMDGPU::NoRegister:
- assert(MI.isDebugInstr() && "Instruction uses invalid noreg register");
- continue;
-
- case AMDGPU::VCC:
- case AMDGPU::VCC_LO:
- case AMDGPU::VCC_HI:
- case AMDGPU::VCC_LO_LO16:
- case AMDGPU::VCC_LO_HI16:
- case AMDGPU::VCC_HI_LO16:
- case AMDGPU::VCC_HI_HI16:
- Info.UsesVCC = true;
- continue;
-
- case AMDGPU::FLAT_SCR:
- case AMDGPU::FLAT_SCR_LO:
- case AMDGPU::FLAT_SCR_HI:
- continue;
-
- case AMDGPU::XNACK_MASK:
- case AMDGPU::XNACK_MASK_LO:
- case AMDGPU::XNACK_MASK_HI:
- llvm_unreachable("xnack_mask registers should not be used");
-
- case AMDGPU::LDS_DIRECT:
- llvm_unreachable("lds_direct register should not be used");
-
- case AMDGPU::TBA:
- case AMDGPU::TBA_LO:
- case AMDGPU::TBA_HI:
- case AMDGPU::TMA:
- case AMDGPU::TMA_LO:
- case AMDGPU::TMA_HI:
- llvm_unreachable("trap handler registers should not be used");
-
- case AMDGPU::SRC_VCCZ:
- llvm_unreachable("src_vccz register should not be used");
-
- case AMDGPU::SRC_EXECZ:
- llvm_unreachable("src_execz register should not be used");
-
- case AMDGPU::SRC_SCC:
- llvm_unreachable("src_scc register should not be used");
-
- default:
- break;
- }
-
- if (AMDGPU::SReg_32RegClass.contains(Reg) ||
- AMDGPU::SReg_LO16RegClass.contains(Reg) ||
- AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
- AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
- AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
- AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 1;
- } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 2;
- } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 2;
- } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 3;
- } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 3;
- } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 3;
- } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 4;
- } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 4;
- } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 5;
- } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 5;
- } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 5;
- } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 6;
- } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 6;
- } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 6;
- } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 8;
- } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 8;
- } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 8;
- } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
- "trap handler registers should not be used");
- IsSGPR = true;
- Width = 16;
- } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 16;
- } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 16;
- } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
- IsSGPR = true;
- Width = 32;
- } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- Width = 32;
- } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
- IsSGPR = false;
- IsAGPR = true;
- Width = 32;
- } else {
- llvm_unreachable("Unknown register class");
- }
- unsigned HWReg = TRI.getHWRegIndex(Reg);
- int MaxUsed = HWReg + Width - 1;
- if (IsSGPR) {
- MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
- } else if (IsAGPR) {
- MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
- } else {
- MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
- }
- }
-
- if (MI.isCall()) {
- // Pseudo used just to encode the underlying global. Is there a better
- // way to track this?
-
- const MachineOperand *CalleeOp
- = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
-
- const Function *Callee = getCalleeFunction(*CalleeOp);
- DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
- CallGraphResourceInfo.end();
- bool IsExternal = !Callee || Callee->isDeclaration();
- if (!IsExternal)
- I = CallGraphResourceInfo.find(Callee);
-
- if (IsExternal || I == CallGraphResourceInfo.end()) {
- // Avoid crashing on undefined behavior with an illegal call to a
- // kernel. If a callsite's calling convention doesn't match the
- // function's, it's undefined behavior. If the callsite calling
- // convention does match, that would have errored earlier.
- // FIXME: The verifier shouldn't allow this.
- if (!IsExternal &&
- AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
- report_fatal_error("invalid call to entry function");
-
- // If this is a call to an external function, we can't do much. Make
- // conservative guesses.
-
- // 48 SGPRs - vcc, - flat_scr, -xnack
- int MaxSGPRGuess =
- 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
- MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
- MaxVGPR = std::max(MaxVGPR, 23);
- MaxAGPR = std::max(MaxAGPR, 23);
-
- CalleeFrameSize = std::max(CalleeFrameSize,
- static_cast<uint64_t>(AssumedStackSizeForExternalCall));
-
- Info.UsesVCC = true;
- Info.UsesFlatScratch = ST.hasFlatAddressSpace();
- Info.HasDynamicallySizedStack = true;
- } else {
- // We force CodeGen to run in SCC order, so the callee's register
- // usage etc. should be the cumulative usage of all callees.
-
- MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
- MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
- MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
- CalleeFrameSize
- = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
- Info.UsesVCC |= I->second.UsesVCC;
- Info.UsesFlatScratch |= I->second.UsesFlatScratch;
- Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
- Info.HasRecursion |= I->second.HasRecursion;
- }
-
- // FIXME: Call site could have norecurse on it
- if (!Callee || !Callee->doesNotRecurse())
- Info.HasRecursion = true;
- }
- }
- }
-
- Info.NumExplicitSGPR = MaxSGPR + 1;
- Info.NumVGPR = MaxVGPR + 1;
- Info.NumAGPR = MaxAGPR + 1;
- Info.PrivateSegmentSize += CalleeFrameSize;
-
- return Info;
-}
-
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
- SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
ProgInfo.NumArchVGPR = Info.NumVGPR;
ProgInfo.NumAccVGPR = Info.NumAGPR;
ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
+ ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+ ProgInfo.TgSplit = STM.isTgSplitEnabled();
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
ProgInfo.VCCUsed = Info.UsesVCC;
@@ -1001,7 +684,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
+ // The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
@@ -1163,6 +846,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
+ if (STM.hasGFX90AInsts()) {
+ AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ ProgInfo.AccumOffset);
+ AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ ProgInfo.TgSplit);
+ }
+
ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU);
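For reference, the ACCUM_OFFSET value set a few lines up encodes the first AGPR granule rather than a raw VGPR count. A minimal sketch of that arithmetic (the helper name and example input are assumptions; the expression mirrors the one in getSIProgramInfo):

#include <algorithm>
#include "llvm/Support/MathExtras.h"

// Encode the GFX90A ACCUM_OFFSET field for a given architected VGPR count.
static unsigned encodeAccumOffset(unsigned NumArchVGPR) {
  // Round up to a granule of 4 VGPRs, then store "granules - 1".
  return llvm::alignTo(std::max(1u, NumArchVGPR), 4) / 4 - 1;
}

// encodeAccumOffset(10) == 2; the " AccumOffset:" comment emitted earlier
// decodes this back as (2 + 1) * 4 == 12, i.e. accumulation registers begin
// after v11.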
@@ -1262,10 +954,16 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
auto *MD = getTargetStreamer()->getPALMetadata();
const MachineFrameInfo &MFI = MF.getFrameInfo();
MD->setFunctionScratchSize(MF, MFI.getStackSize());
+
// Set compute registers
MD->setRsrc1(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
+
+ // Set optional info
+ MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
+ MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// This is supposed to be log2(Size)
@@ -1383,3 +1081,9 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
return true;
}
+
+void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AMDGPUResourceUsageAnalysis>();
+ AU.addPreserved<AMDGPUResourceUsageAnalysis>();
+ AsmPrinter::getAnalysisUsage(AU);
+}
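With the per-function bookkeeping moved out of the asm printer, resource counts now come from the standalone AMDGPUResourceUsageAnalysis pass required in getAnalysisUsage above. A minimal sketch of how another machine pass could consume it, using only the calls visible in this diff (the pass name and debug output are illustrative, not part of the patch):

// Hypothetical consumer pass; only the analysis calls are taken from the diff.
void MyResourceDumpPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.setPreservesAll();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MyResourceDumpPass::runOnMachineFunction(MachineFunction &MF) {
  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
      getAnalysis<AMDGPUResourceUsageAnalysis>().getResourceInfo(
          &MF.getFunction());
  dbgs() << MF.getName() << ": VGPRs=" << Info.NumVGPR
         << " SGPRs=" << Info.NumExplicitSGPR
         << " stack=" << Info.PrivateSegmentSize << "\n";
  return false;
}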
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 9e1e26d65d8c..d3a555bc228f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -22,6 +22,7 @@ struct amd_kernel_code_t;
namespace llvm {
class AMDGPUMachineFunction;
+struct AMDGPUResourceUsageAnalysis;
class AMDGPUTargetStreamer;
class MCCodeEmitter;
class MCOperand;
@@ -39,32 +40,17 @@ struct kernel_descriptor_t;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
- // Track resource usage for callee functions.
- struct SIFunctionResourceInfo {
- // Track the number of explicitly used VGPRs. Special registers reserved at
- // the end are tracked separately.
- int32_t NumVGPR = 0;
- int32_t NumAGPR = 0;
- int32_t NumExplicitSGPR = 0;
- uint64_t PrivateSegmentSize = 0;
- bool UsesVCC = false;
- bool UsesFlatScratch = false;
- bool HasDynamicallySizedStack = false;
- bool HasRecursion = false;
-
- int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
- int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
- };
+ void initializeTargetID(const Module &M);
+
+ AMDGPUResourceUsageAnalysis *ResourceUsage;
SIProgramInfo CurrentProgramInfo;
- DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
- SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
@@ -146,6 +132,8 @@ public:
const char *ExtraCode, raw_ostream &O) override;
protected:
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index aae2a54c198b..3e9fdcb1618e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@ private:
const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,45 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+ // Reduce within each row of 16 lanes.
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+
+ // Reduce within each pair of rows (i.e. 32 lanes).
+ assert(ST->hasPermLaneX16());
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+ if (ST->isWave32())
+ return V;
+
+ // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+ // combine them with a scalar operation.
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+ Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+}
+
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -287,10 +328,6 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
- Function *PermLaneX16 =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
for (unsigned Idx = 0; Idx < 4; Idx++) {
V = buildNonAtomicBinOp(
@@ -317,9 +354,10 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
- Value *const PermX =
- B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
- B.getFalse(), B.getFalse()});
+ assert(ST->hasPermLaneX16());
+ Value *const PermX = B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(
B, Op, V,
B.CreateCall(UpdateDPP,
@@ -327,7 +365,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
- Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+ Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {V, B.getInt32(31)});
V = buildNonAtomicBinOp(
B, Op, V,
B.CreateCall(UpdateDPP,
@@ -346,10 +385,6 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
- Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
if (ST->hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
@@ -357,6 +392,11 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
Value *Old = V;
@@ -480,6 +520,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *ExclScan = nullptr;
Value *NewV = nullptr;
+ const bool NeedResult = !I.use_empty();
+
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
@@ -489,35 +531,27 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
const AtomicRMWInst::BinOp ScanOp =
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
- NewV = buildScan(B, ScanOp, NewV, Identity);
- ExclScan = buildShiftRight(B, NewV, Identity);
-
- // Read the value from the last lane, which has accumlated the values of
- // each active lane in the wavefront. This will be our new value which we
- // will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
- CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
- CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
- NewV = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
+ if (!NeedResult && ST->hasPermLaneX16()) {
+ // On GFX10 the permlanex16 instruction helps us build a reduction without
+ // too many readlanes and writelanes, which are generally bad for
+ // performance.
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
+
+ // Read the value from the last lane, which has accumulated the values of
+ // each active lane in the wavefront. This will be our new value which we
+ // will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ assert(TyBitWidth == 32);
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, LastLaneIdx});
- } else {
- llvm_unreachable("Unhandled atomic bit width");
}
// Finally mark the readlanes in the WWM section.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
} else {
switch (Op) {
default:
@@ -583,7 +617,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// original instruction.
B.SetInsertPoint(&I);
- const bool NeedResult = !I.use_empty();
if (NeedResult) {
// Create a PHI node to get our new atomic result into the exit block.
PHINode *const PHI = B.CreatePHI(Ty, 2);
@@ -621,7 +654,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// from the first lane, to get our lane's index into the atomic result.
Value *LaneOffset = nullptr;
if (ValDivergent) {
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ LaneOffset =
+ B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
} else {
switch (Op) {
default:
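The buildReduction path introduced above combines row-level DPP steps, a permlanex16 exchange, and (for wave64) two readlanes. Below is a rough scalar model of that lane pattern for an integer add, under two assumptions not spelled out in the patch: ROW_XMASK k sources lane (id ^ k) within its row of 16, and this permlanex16 configuration exchanges the two 16-lane halves of each 32-lane group.

#include <array>
#include <cstdint>

static std::array<uint32_t, 64> reduceModel(std::array<uint32_t, 64> V) {
  // Step 1: butterfly within each row of 16 lanes (masks 1, 2, 4, 8).
  for (unsigned Mask = 1; Mask <= 8; Mask <<= 1) {
    std::array<uint32_t, 64> Step = V;
    for (unsigned Lane = 0; Lane < 64; ++Lane)
      V[Lane] = Step[Lane] + Step[(Lane & ~15u) | ((Lane ^ Mask) & 15u)];
  }
  // Step 2: permlanex16-style exchange of the two rows in each 32-lane group.
  std::array<uint32_t, 64> Rows = V;
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    V[Lane] = Rows[Lane] + Rows[Lane ^ 16u];
  // Step 3 (wave64 only): readlane 0 and 32, combine, and broadcast.
  uint32_t Total = V[0] + V[32];
  V.fill(Total);
  return V; // every lane now holds the full-wavefront sum
}

The real code additionally supplies the operation's identity for the DPP bound-control lanes and produces a single uniform SSA value rather than a broadcast; the model only illustrates the lane traffic.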
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
new file mode 100644
index 000000000000..61b1d22edc33
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -0,0 +1,528 @@
+//===- AMDGPUAttributor.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+
+#define DEBUG_TYPE "amdgpu-attributor"
+
+using namespace llvm;
+
+static constexpr StringLiteral ImplicitAttrNames[] = {
+ // X ids unnecessarily propagated to kernels.
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};
+
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
+ bool &IsQueuePtr) {
+ switch (ID) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-item-id-x";
+ case Intrinsic::amdgcn_workgroup_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-group-id-x";
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ return "amdgpu-work-item-id-y";
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ return "amdgpu-work-item-id-z";
+ case Intrinsic::amdgcn_workgroup_id_y:
+ case Intrinsic::r600_read_tgid_y:
+ return "amdgpu-work-group-id-y";
+ case Intrinsic::amdgcn_workgroup_id_z:
+ case Intrinsic::r600_read_tgid_z:
+ return "amdgpu-work-group-id-z";
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return "amdgpu-dispatch-ptr";
+ case Intrinsic::amdgcn_dispatch_id:
+ return "amdgpu-dispatch-id";
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ return "amdgpu-kernarg-segment-ptr";
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return "amdgpu-implicitarg-ptr";
+ case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private:
+ // TODO: Does not require queue ptr on gfx9+
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ IsQueuePtr = true;
+ return "amdgpu-queue-ptr";
+ default:
+ return "";
+ }
+}
+
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+ return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool isDSAddress(const Constant *C) {
+ const GlobalValue *GV = dyn_cast<GlobalValue>(C);
+ if (!GV)
+ return false;
+ unsigned AS = GV->getAddressSpace();
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
+}
+
+class AMDGPUInformationCache : public InformationCache {
+public:
+ AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
+ BumpPtrAllocator &Allocator,
+ SetVector<Function *> *CGSCC, TargetMachine &TM)
+ : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
+ TargetMachine &TM;
+
+ enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };
+
+ /// Check if the subtarget has aperture regs.
+ bool hasApertureRegs(Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.hasApertureRegs();
+ }
+
+private:
+ /// Check if the ConstantExpr \p CE requires queue ptr attribute.
+ static bool visitConstExpr(const ConstantExpr *CE) {
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+ return castRequiresQueuePtr(SrcAS);
+ }
+ return false;
+ }
+
+ /// Get the constant access bitmap for \p C.
+ uint8_t getConstantAccess(const Constant *C) {
+ auto It = ConstantStatus.find(C);
+ if (It != ConstantStatus.end())
+ return It->second;
+
+ uint8_t Result = 0;
+ if (isDSAddress(C))
+ Result = DS_GLOBAL;
+
+ if (const auto *CE = dyn_cast<ConstantExpr>(C))
+ if (visitConstExpr(CE))
+ Result |= ADDR_SPACE_CAST;
+
+ for (const Use &U : C->operands()) {
+ const auto *OpC = dyn_cast<Constant>(U);
+ if (!OpC)
+ continue;
+
+ Result |= getConstantAccess(OpC);
+ }
+ return Result;
+ }
+
+public:
+ /// Returns true if \p Fn needs a queue ptr attribute because of \p C.
+ bool needsQueuePtr(const Constant *C, Function &Fn) {
+ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
+ bool HasAperture = hasApertureRegs(Fn);
+
+ // No need to explore the constants.
+ if (!IsNonEntryFunc && HasAperture)
+ return false;
+
+ uint8_t Access = getConstantAccess(C);
+
+ // We need to trap on DS globals in non-entry functions.
+ if (IsNonEntryFunc && (Access & DS_GLOBAL))
+ return true;
+
+ return !HasAperture && (Access & ADDR_SPACE_CAST);
+ }
+
+private:
+ /// Used to determine if the Constant needs a queue ptr attribute.
+ DenseMap<const Constant *, uint8_t> ConstantStatus;
+};
+
+struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDAttributes &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName().
+ const std::string getName() const override { return "AAAMDAttributes"; }
+
+ /// See AbstractAttribute::getIdAddr().
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDAttributes.
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ virtual const DenseSet<StringRef> &getAttributes() const = 0;
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+const char AAAMDAttributes::ID = 0;
+
+struct AAAMDWorkGroupSize
+ : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName().
+ const std::string getName() const override { return "AAAMDWorkGroupSize"; }
+
+ /// See AbstractAttribute::getIdAddr().
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDAttributes.
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+const char AAAMDWorkGroupSize::ID = 0;
+
+struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
+ AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
+ : AAAMDWorkGroupSize(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ CallingConv::ID CC = F->getCallingConv();
+
+ if (CC != CallingConv::AMDGPU_KERNEL)
+ return;
+
+ bool InitialValue = false;
+ if (F->hasFnAttribute("uniform-work-group-size"))
+ InitialValue = F->getFnAttribute("uniform-work-group-size")
+ .getValueAsString()
+ .equals("true");
+
+ if (InitialValue)
+ indicateOptimisticFixpoint();
+ else
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ auto CheckCallSite = [&](AbstractCallSite CS) {
+ Function *Caller = CS.getInstruction()->getFunction();
+ LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
+ << "->" << getAssociatedFunction()->getName() << "\n");
+
+ const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
+ *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+
+ Change = Change | clampStateAndIndicateChange(this->getState(),
+ CallerInfo.getState());
+
+ return true;
+ };
+
+ bool AllCallSitesKnown = true;
+ if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+ indicatePessimisticFixpoint();
+
+ return Change;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+ AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
+ getAssumed() ? "true" : "false"));
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ bool isValidState() const override {
+ // This state is always valid, even when the state is false.
+ return true;
+ }
+
+ const std::string getAsStr() const override {
+ return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
+ llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
+}
+
+struct AAAMDAttributesFunction : public AAAMDAttributes {
+ AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
+ : AAAMDAttributes(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ CallingConv::ID CC = F->getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+
+ // Don't add attributes to intrinsics.
+ if (F->isIntrinsic()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // Ignore functions with graphics calling conventions; these are currently
+ // not allowed to have kernel arguments.
+ if (AMDGPU::isGraphics(F->getCallingConv())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ for (StringRef Attr : ImplicitAttrNames) {
+ if (F->hasFnAttribute(Attr))
+ Attributes.insert(Attr);
+ }
+
+ // TODO: We shouldn't need this in the future.
+ if (CallingConvSupportsAllImplicits &&
+ F->hasAddressTaken(nullptr, true, true, true)) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ Attributes.insert(AttrName);
+ }
+ }
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
+ CallingConv::ID CC = F->getCallingConv();
+ bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+ auto AddAttribute = [&](StringRef AttrName) {
+ if (Attributes.insert(AttrName).second)
+ Change = ChangeStatus::CHANGED;
+ };
+
+ // Check for Intrinsics and propagate attributes.
+ const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+ *this, this->getIRPosition(), DepClassTy::REQUIRED);
+
+ // We have to assume that we can reach a function with these attributes.
+ // We do not consider inline assembly as an unknown callee.
+ if (CallingConvSupportsAllImplicits && AAEdges.hasNonAsmUnknownCallee()) {
+ for (StringRef AttrName : ImplicitAttrNames) {
+ AddAttribute(AttrName);
+ }
+ }
+
+ bool NeedsQueuePtr = false;
+ bool HasCall = false;
+ for (Function *Callee : AAEdges.getOptimisticEdges()) {
+ Intrinsic::ID IID = Callee->getIntrinsicID();
+ if (IID != Intrinsic::not_intrinsic) {
+ if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+ AddAttribute("amdgpu-kernarg-segment-ptr");
+ continue;
+ }
+
+ bool NonKernelOnly = false;
+ StringRef AttrName =
+ intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);
+
+ if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
+ AddAttribute(AttrName);
+
+ continue;
+ }
+
+ HasCall = true;
+ const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
+ *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+ const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
+ // Propagate implicit attributes from called function.
+ for (StringRef AttrName : ImplicitAttrNames)
+ if (CalleeAttributes.count(AttrName))
+ AddAttribute(AttrName);
+ }
+
+ HasCall |= AAEdges.hasUnknownCallee();
+ if (!IsNonEntryFunc && HasCall)
+ AddAttribute("amdgpu-calls");
+
+ // Check the function body.
+ auto CheckAlloca = [&](Instruction &I) {
+ AddAttribute("amdgpu-stack-objects");
+ return false;
+ };
+
+ bool UsedAssumedInformation = false;
+ A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca},
+ UsedAssumedInformation);
+
+ // If we found that we need amdgpu-queue-ptr, nothing else to do.
+ if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) {
+ AddAttribute("amdgpu-queue-ptr");
+ return Change;
+ }
+
+ auto CheckAddrSpaceCasts = [&](Instruction &I) {
+ unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
+ if (castRequiresQueuePtr(SrcAS)) {
+ NeedsQueuePtr = true;
+ return false;
+ }
+ return true;
+ };
+
+ bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
+
+ // `checkForAllInstructions` is much cheaper than going through all
+ // instructions, so try it first.
+
+ // amdgpu-queue-ptr is not needed if aperture regs are present.
+ if (!HasApertureRegs)
+ A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
+ {Instruction::AddrSpaceCast},
+ UsedAssumedInformation);
+
+ // If we found that we need amdgpu-queue-ptr, nothing else to do.
+ if (NeedsQueuePtr) {
+ AddAttribute("amdgpu-queue-ptr");
+ return Change;
+ }
+
+ if (!IsNonEntryFunc && HasApertureRegs)
+ return Change;
+
+ for (BasicBlock &BB : *F) {
+ for (Instruction &I : BB) {
+ for (const Use &U : I.operands()) {
+ if (const auto *C = dyn_cast<Constant>(U)) {
+ if (InfoCache.needsQueuePtr(C, *F)) {
+ AddAttribute("amdgpu-queue-ptr");
+ return Change;
+ }
+ }
+ }
+ }
+ }
+
+ return Change;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+ for (StringRef AttrName : Attributes)
+ AttrList.push_back(Attribute::get(Ctx, AttrName));
+
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ const std::string getAsStr() const override {
+ return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
+ }
+
+ const DenseSet<StringRef> &getAttributes() const override {
+ return Attributes;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+private:
+ DenseSet<StringRef> Attributes;
+};
+
+AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
+ llvm_unreachable("AAAMDAttributes is only valid for function position");
+}
+
+class AMDGPUAttributor : public ModulePass {
+public:
+ AMDGPUAttributor() : ModulePass(ID) {}
+
+ /// doInitialization - Cache the TargetMachine from the TargetPassConfig;
+ /// it is needed to query per-function subtarget information.
+ bool doInitialization(Module &) override {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ report_fatal_error("TargetMachine is required");
+
+ TM = &TPC->getTM<TargetMachine>();
+ return false;
+ }
+
+ bool runOnModule(Module &M) override {
+ SetVector<Function *> Functions;
+ AnalysisGetter AG;
+ for (Function &F : M)
+ Functions.insert(&F);
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ for (Function &F : M) {
+ A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
+ }
+
+ ChangeStatus Change = A.run();
+ return Change == ChangeStatus::CHANGED;
+ }
+
+ StringRef getPassName() const override { return "AMDGPU Attributor"; }
+ TargetMachine *TM;
+ static char ID;
+};
+
+char AMDGPUAttributor::ID = 0;
+
+Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
+INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)
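The new pass only manifests string attributes on the IR; downstream code reads them back through the ordinary attribute API. A small illustrative sketch (the helper names are assumptions, and the pass itself must run inside a codegen pipeline where TargetPassConfig is available, as doInitialization above requires):

#include "llvm/IR/Function.h"

// Query two of the attributes the attributor may have added.
static bool needsQueuePtr(const llvm::Function &F) {
  return F.hasFnAttribute("amdgpu-queue-ptr");
}

static bool hasUniformWorkGroupSize(const llvm::Function &F) {
  return F.getFnAttribute("uniform-work-group-size")
      .getValueAsString()
      .equals("true");
}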
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 852a05b3c181..b9faad453aba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -29,44 +29,39 @@ using namespace llvm;
namespace {
-struct AMDGPUValueHandler : public CallLowering::ValueHandler {
- AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
- MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
- : ValueHandler(IsIncoming, B, MRI, AssignFn) {}
-
- /// Wrapper around extendRegister to ensure we extend to a full 32-bit
- /// register.
- Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
- if (VA.getLocVT().getSizeInBits() < 32) {
- // 16-bit types are reported as legal for 32-bit registers. We need to
- // extend and do a 32-bit copy to avoid the verifier complaining about it.
- return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
- }
-
- return extendRegister(ValVReg, VA);
+/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
+static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
+ Register ValVReg, CCValAssign &VA) {
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to
+ // extend and do a 32-bit copy to avoid the verifier complaining about it.
+ return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
}
-};
-struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
+ return Handler.extendRegister(ValVReg, VA);
+}
+
+struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder MIB)
+ : OutgoingValueHandler(B, MRI), MIB(MIB) {}
MachineInstrBuilder MIB;
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
llvm_unreachable("not implemented");
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
llvm_unreachable("not implemented");
}
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
- Register ExtReg = extendRegisterMin32(ValVReg, VA);
+ Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
// If this is a scalar return, insert a readfirstlane just in case the value
// ends up in a VGPR.
@@ -83,27 +78,23 @@ struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
-
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info,
- ISD::ArgFlagsTy Flags,
- CCState &State) override {
- return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
- }
};
-struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
+struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
uint64_t StackUsed = 0;
- AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : AMDGPUValueHandler(true, B, MRI, AssignFn) {}
+ AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
+ : IncomingValueHandler(B, MRI) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- int FI = MFI.CreateFixedObject(Size, Offset, true);
+
+ // Byval is assumed to be writable memory, but other stack passed arguments
+ // are not.
+ const bool IsImmutable = !Flags.isByVal();
+ int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
auto AddrReg = MIRBuilder.buildFrameIndex(
LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
@@ -119,35 +110,24 @@ struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
// 16-bit types are reported as legal for 32-bit registers. We need to do
// a 32-bit copy, and truncate to avoid the verifier complaining about it.
auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
+
+ // If we have signext/zeroext, it applies to the whole 32-bit register
+ // before truncation.
+ auto Extended =
+ buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
+ MIRBuilder.buildTrunc(ValVReg, Extended);
return;
}
- switch (VA.getLocInfo()) {
- case CCValAssign::LocInfo::SExt:
- case CCValAssign::LocInfo::ZExt:
- case CCValAssign::LocInfo::AExt: {
- auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
- MIRBuilder.buildTrunc(ValVReg, Copy);
- break;
- }
- default:
- MIRBuilder.buildCopy(ValVReg, PhysReg);
- break;
- }
+ IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
- // The reported memory location may be wider than the value.
- const LLT RegTy = MRI.getType(ValVReg);
- MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
-
- // FIXME: Get alignment
auto MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -159,9 +139,8 @@ struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
};
struct FormalArgHandler : public AMDGPUIncomingArgHandler {
- FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn)
- : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}
+ FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
+ : AMDGPUIncomingArgHandler(B, MRI) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -170,8 +149,8 @@ struct FormalArgHandler : public AMDGPUIncomingArgHandler {
struct CallReturnHandler : public AMDGPUIncomingArgHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ MachineInstrBuilder MIB)
+ : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -180,10 +159,7 @@ struct CallReturnHandler : public AMDGPUIncomingArgHandler {
MachineInstrBuilder MIB;
};
-struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
- MachineInstrBuilder MIB;
- CCAssignFn *AssignFnVarArg;
-
+struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
/// For tail calls, the byte offset of the call's argument area from the
/// callee's. Unused elsewhere.
int FPDiff;
@@ -195,20 +171,23 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
- CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
bool IsTailCall = false, int FPDiff = 0)
- : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
- AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
- }
+ : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
+ IsTailCall(IsTailCall) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
- MachinePointerInfo &MPO) override {
+ MachinePointerInfo &MPO,
+ ISD::ArgFlagsTy Flags) override {
MachineFunction &MF = MIRBuilder.getMF();
const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
const LLT S32 = LLT::scalar(32);
if (IsTailCall) {
- llvm_unreachable("implement me");
+ Offset += FPDiff;
+ int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+ auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
+ MPO = MachinePointerInfo::getFixedStack(MF, FI);
+ return FIReg.getReg(0);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -226,35 +205,29 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
- Register ExtReg = extendRegisterMin32(ValVReg, VA);
+ Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
- void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
uint64_t LocMemOffset = VA.getLocMemOffset();
const auto &ST = MF.getSubtarget<GCNSubtarget>();
auto MMO = MF.getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, Size,
- commonAlignment(ST.getStackAlignment(), LocMemOffset));
+ MPO, MachineMemOperand::MOStore, MemTy,
+ commonAlignment(ST.getStackAlignment(), LocMemOffset));
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
- void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
- uint64_t MemSize, MachinePointerInfo &MPO,
- CCValAssign &VA) override {
+ void assignValueToAddress(const CallLowering::ArgInfo &Arg,
+ unsigned ValRegIndex, Register Addr, LLT MemTy,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
- ? extendRegister(Arg.Regs[0], VA)
- : Arg.Regs[0];
-
- // If we extended the value type we might need to adjust the MMO's
- // Size. This happens if ComputeValueVTs widened a small type value to a
- // legal register type (e.g. s8->s16)
- const LLT RegTy = MRI.getType(ValVReg);
- MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
- assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
+ ? extendRegister(Arg.Regs[ValRegIndex], VA)
+ : Arg.Regs[ValRegIndex];
+ assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
}
};
}
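The handler changes above follow a CallLowering interface change: assignValueToAddress now receives the memory type as an LLT instead of a byte size, and that LLT flows directly into the machine memory operand. A minimal sketch under the same API (the helper and its arguments are illustrative, not part of the patch):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Load a 16-bit stack-passed value through an LLT-typed MMO, mirroring the
// MF.getMachineMemOperand(MPO, Flags, MemTy, Align) overload used above.
static void loadS16FromStack(llvm::MachineIRBuilder &B, llvm::Register Dst,
                             llvm::Register Addr,
                             llvm::MachinePointerInfo MPO) {
  llvm::MachineFunction &MF = B.getMF();
  auto *MMO = MF.getMachineMemOperand(MPO, llvm::MachineMemOperand::MOLoad,
                                      llvm::LLT::scalar(16), llvm::Align(2));
  B.buildLoad(Dst, Addr, *MMO);
}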
@@ -277,149 +250,6 @@ static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
}
}
-// FIXME: This should move to generic code.
-void AMDGPUCallLowering::splitToValueTypes(MachineIRBuilder &B,
- const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- CallingConv::ID CallConv) const {
- const SITargetLowering &TLI = *getTLI<SITargetLowering>();
- LLVMContext &Ctx = OrigArg.Ty->getContext();
-
- SmallVector<EVT, 4> SplitVTs;
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
-
- assert(OrigArg.Regs.size() == SplitVTs.size());
-
- if (SplitVTs.size() == 0)
- return;
-
- if (SplitVTs.size() == 1) {
- // No splitting to do, but we want to replace the original type (e.g. [1 x
- // double] -> double).
- SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags[0], OrigArg.IsFixed);
- return;
- }
-
- // Create one ArgInfo for each virtual register in the original ArgInfo.
- assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
-
- bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
- OrigArg.Ty, CallConv, false);
- for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
- Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
- SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
- OrigArg.IsFixed);
- if (NeedsRegBlock)
- SplitArgs.back().Flags[0].setInConsecutiveRegs();
- }
-
- SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
-}
-
-void AMDGPUCallLowering::processSplitArgs(
- MachineIRBuilder &B, const ArgInfo &OrigArg,
- const SmallVectorImpl<ArgInfo> &SplitArg,
- SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
- CallingConv::ID CallConv, bool IsOutgoing,
- SplitArgTy PerformArgSplit) const {
- LLVMContext &Ctx = OrigArg.Ty->getContext();
- const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-
- // FIXME: This is mostly nasty pre-processing before handleAssignments. Most
- // of this should be performed by handleAssignments.
-
- for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) {
- const ArgInfo &CurSplitArg = SplitArg[SplitIdx];
- Register Reg = OrigArg.Regs[SplitIdx];
- EVT VT = EVT::getEVT(CurSplitArg.Ty);
- LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL);
-
- unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
- MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
-
- if (NumParts == 1) {
- // No splitting to do, but we want to replace the original type (e.g. [1 x
- // double] -> double).
- SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags,
- OrigArg.IsFixed);
- continue;
- }
-
- SmallVector<Register, 8> SplitRegs;
- Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
- LLT PartLLT = getLLTForType(*PartTy, DL);
- MachineRegisterInfo &MRI = *B.getMRI();
-
- // FIXME: Should we be reporting all of the part registers for a single
- // argument, and let handleAssignments take care of the repacking?
- for (unsigned i = 0; i < NumParts; ++i) {
- Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
- SplitRegs.push_back(PartReg);
- SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
- }
-
- PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
- }
-}
-
-// TODO: Move to generic code
-static void unpackRegsToOrigType(MachineIRBuilder &B,
- ArrayRef<Register> DstRegs,
- Register SrcReg,
- const CallLowering::ArgInfo &Info,
- LLT SrcTy,
- LLT PartTy) {
- assert(DstRegs.size() > 1 && "Nothing to unpack");
-
- const unsigned PartSize = PartTy.getSizeInBits();
-
- if (SrcTy.isVector() && !PartTy.isVector() &&
- PartSize > SrcTy.getElementType().getSizeInBits()) {
- // Vector was scalarized, and the elements extended.
- auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
- for (int i = 0, e = DstRegs.size(); i != e; ++i)
- B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
- return;
- }
-
- LLT GCDTy = getGCDType(SrcTy, PartTy);
- if (GCDTy == PartTy) {
- // If this already evenly divisible, we can create a simple unmerge.
- B.buildUnmerge(DstRegs, SrcReg);
- return;
- }
-
- MachineRegisterInfo &MRI = *B.getMRI();
- LLT DstTy = MRI.getType(DstRegs[0]);
- LLT LCMTy = getLCMType(SrcTy, PartTy);
-
- const unsigned LCMSize = LCMTy.getSizeInBits();
- const unsigned DstSize = DstTy.getSizeInBits();
- const unsigned SrcSize = SrcTy.getSizeInBits();
-
- Register UnmergeSrc = SrcReg;
- if (LCMSize != SrcSize) {
- // Widen to the common type.
- Register Undef = B.buildUndef(SrcTy).getReg(0);
- SmallVector<Register, 8> MergeParts(1, SrcReg);
- for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
- MergeParts.push_back(Undef);
-
- UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
- }
-
- // Unmerge to the original registers and pad with dead defs.
- SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
- for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
- Size += DstSize) {
- UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
- }
-
- B.buildUnmerge(UnmergeResults, UnmergeSrc);
-}
-
bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
@@ -458,18 +288,12 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
assert(VRegs.size() == SplitEVTs.size() &&
"For each split Type there should be exactly one VReg.");
- // We pre-process the return value decomposed into EVTs.
- SmallVector<ArgInfo, 8> PreSplitRetInfos;
-
- // Further processing is applied to split the arguments from PreSplitRetInfos
- // into 32-bit pieces in SplitRetInfos before passing off to
- // handleAssignments.
SmallVector<ArgInfo, 8> SplitRetInfos;
for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
EVT VT = SplitEVTs[i];
Register Reg = VRegs[i];
- ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
+ ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
if (VT.isScalarInteger()) {
@@ -497,23 +321,15 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
}
- splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC);
-
- // FIXME: This splitting should mostly be done by handleAssignments
- processSplitArgs(B, RetInfo,
- PreSplitRetInfos, SplitRetInfos, DL, CC, true,
- [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
- LLT PartLLT, int VTSplitIdx) {
- unpackRegsToOrigType(B, Regs, SrcReg,
- PreSplitRetInfos[VTSplitIdx], LLTy,
- PartLLT);
- });
- PreSplitRetInfos.clear();
+ splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
}
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
- AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
- return handleAssignments(B, SplitRetInfos, RetHandler);
+
+ OutgoingValueAssigner Assigner(AssignFn);
+ AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
+ return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
+ CC, F.isVarArg());
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
@@ -568,7 +384,6 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
}
void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
- Type *ParamTy,
uint64_t Offset) const {
MachineFunction &MF = B.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -582,26 +397,45 @@ void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}
-void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
- uint64_t Offset, Align Alignment,
- Register DstReg) const {
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
+ uint64_t Offset,
+ Align Alignment) const {
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
- Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
- lowerParameterPtr(PtrReg, B, ParamTy, Offset);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- TypeSize, Alignment);
+ SmallVector<ArgInfo, 32> SplitArgs;
+ SmallVector<uint64_t> FieldOffsets;
+ splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);
+
+ unsigned Idx = 0;
+ for (ArgInfo &SplitArg : SplitArgs) {
+ Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+ lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);
+
+ LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
+ if (SplitArg.Flags[0].isPointer()) {
+ // Compensate for losing pointeriness in splitValueTypes.
+ LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
+ ArgTy.getScalarSizeInBits());
+ ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
+ : PtrTy;
+ }
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));
- B.buildLoad(DstReg, PtrReg, *MMO);
+ assert(SplitArg.Regs.size() == 1);
+
+ B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
+ ++Idx;
+ }
}
// Allocate special inputs passed in user SGPRs.
@@ -665,9 +499,10 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-
const DataLayout &DL = F.getParent()->getDataLayout();
+ Info->allocateModuleLDSGlobal(F.getParent());
+
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -706,24 +541,19 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
assert(VRegs[i].size() == 1 &&
"expected only one register for byval pointers");
if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
- lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
+ lowerParameterPtr(VRegs[i][0], B, ArgOffset);
} else {
const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
- lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
+ lowerParameterPtr(PtrReg, B, ArgOffset);
B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
}
} else {
- ArrayRef<Register> OrigArgRegs = VRegs[i];
- Register ArgReg =
- OrigArgRegs.size() == 1
- ? OrigArgRegs[0]
- : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
-
- lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
- if (OrigArgRegs.size() > 1)
- unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+ ArgInfo OrigArg(VRegs[i], Arg, i);
+ const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
+ setArgFlags(OrigArg, OrigArgIdx, DL, F);
+ lowerParameter(B, OrigArg, ArgOffset, Alignment);
}
++i;
@@ -734,117 +564,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
return true;
}
-/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
-static MachineInstrBuilder mergeVectorRegsToResultRegs(
- MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
- MachineRegisterInfo &MRI = *B.getMRI();
- LLT LLTy = MRI.getType(DstRegs[0]);
- LLT PartLLT = MRI.getType(SrcRegs[0]);
-
- // Deal with v3s16 split into v2s16
- LLT LCMTy = getLCMType(LLTy, PartLLT);
- if (LCMTy == LLTy) {
- // Common case where no padding is needed.
- assert(DstRegs.size() == 1);
- return B.buildConcatVectors(DstRegs[0], SrcRegs);
- }
-
- const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
- Register Undef = B.buildUndef(PartLLT).getReg(0);
-
- // Build vector of undefs.
- SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
-
- // Replace the first sources with the real registers.
- std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
-
- auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
- int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
-
- SmallVector<Register, 8> PadDstRegs(NumDst);
- std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
-
- // Create the excess dead defs for the unmerge.
- for (int I = DstRegs.size(); I != NumDst; ++I)
- PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
-
- return B.buildUnmerge(PadDstRegs, Widened);
-}
-
-// TODO: Move this to generic code
-static void packSplitRegsToOrigType(MachineIRBuilder &B,
- ArrayRef<Register> OrigRegs,
- ArrayRef<Register> Regs,
- LLT LLTy,
- LLT PartLLT) {
- MachineRegisterInfo &MRI = *B.getMRI();
-
- if (!LLTy.isVector() && !PartLLT.isVector()) {
- assert(OrigRegs.size() == 1);
- LLT OrigTy = MRI.getType(OrigRegs[0]);
-
- unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
- if (SrcSize == OrigTy.getSizeInBits())
- B.buildMerge(OrigRegs[0], Regs);
- else {
- auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
- B.buildTrunc(OrigRegs[0], Widened);
- }
-
- return;
- }
-
- if (LLTy.isVector() && PartLLT.isVector()) {
- assert(OrigRegs.size() == 1);
- assert(LLTy.getElementType() == PartLLT.getElementType());
- mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
- return;
- }
-
- assert(LLTy.isVector() && !PartLLT.isVector());
-
- LLT DstEltTy = LLTy.getElementType();
-
- // Pointer information was discarded. We'll need to coerce some register types
- // to avoid violating type constraints.
- LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
-
- assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
-
- if (DstEltTy == PartLLT) {
- // Vector was trivially scalarized.
-
- if (RealDstEltTy.isPointer()) {
- for (Register Reg : Regs)
- MRI.setType(Reg, RealDstEltTy);
- }
-
- B.buildBuildVector(OrigRegs[0], Regs);
- } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
- // Deal with vector with 64-bit elements decomposed to 32-bit
- // registers. Need to create intermediate 64-bit elements.
- SmallVector<Register, 8> EltMerges;
- int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
-
- assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
-
- for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
- auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
- // Fix the type in case this is really a vector of pointers.
- MRI.setType(Merge.getReg(0), RealDstEltTy);
- EltMerges.push_back(Merge.getReg(0));
- Regs = Regs.drop_front(PartsPerElt);
- }
-
- B.buildBuildVector(OrigRegs[0], EltMerges);
- } else {
- // Vector was split, and elements promoted to a wider type.
- LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
- auto BV = B.buildBuildVector(BVType, Regs);
- B.buildTrunc(OrigRegs[0], BV);
- }
-}
-
bool AMDGPUCallLowering::lowerFormalArguments(
MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
FunctionLoweringInfo &FLI) const {
@@ -867,6 +586,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
+ Info->allocateModuleLDSGlobal(F.getParent());
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -885,7 +605,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
- SmallVector<ArgInfo, 8> SplitArg;
SmallVector<ArgInfo, 32> SplitArgs;
unsigned Idx = 0;
unsigned PSInputNum = 0;
@@ -931,23 +650,11 @@ bool AMDGPUCallLowering::lowerFormalArguments(
}
}
- ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
setArgFlags(OrigArg, OrigArgIdx, DL, F);
- SplitArg.clear();
- splitToValueTypes(B, OrigArg, SplitArg, DL, CC);
-
- processSplitArgs(B, OrigArg, SplitArg, SplitArgs, DL, CC, false,
- // FIXME: We should probably be passing multiple registers
- // to handleAssignments to do this
- [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy,
- LLT PartLLT, int VTSplitIdx) {
- assert(DstReg == VRegs[Idx][VTSplitIdx]);
- packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
- LLTy, PartLLT);
- });
-
+ splitToValueTypes(OrigArg, SplitArgs, DL, CC);
++Idx;
}
@@ -1004,10 +711,16 @@ bool AMDGPUCallLowering::lowerFormalArguments(
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
- FormalArgHandler Handler(B, MRI, AssignFn);
- if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
+ IncomingValueAssigner Assigner(AssignFn);
+ if (!determineAssignments(Assigner, SplitArgs, CCInfo))
+ return false;
+
+ FormalArgHandler Handler(B, MRI);
+ if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
+ uint64_t StackOffset = Assigner.StackOffset;
+
if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
@@ -1022,6 +735,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
+ // When we tail call, we need to check if the callee's arguments will fit on
+ // the caller's stack. So, whenever we lower formal arguments, we should keep
+ // track of this information, since we might lower a tail call in this
+ // function later.
+ Info->setBytesInStackArgArea(StackOffset);
+
// Move back to the end of the basic block.
B.setMBB(MBB);
@@ -1184,7 +903,7 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
- return AMDGPU::SI_CALL;
+ return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
}
// Add operands to call instruction to track the callee.
@@ -1208,6 +927,317 @@ static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
return true;
}
+bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const {
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+ // If the calling conventions match, then everything must be the same.
+ if (CalleeCC == CallerCC)
+ return true;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+ // Make sure that the caller and callee preserve all of the same registers.
+ auto TRI = ST.getRegisterInfo();
+
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ return false;
+
+ // Check if the caller and callee will handle arguments in the same way.
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ CCAssignFn *CalleeAssignFnFixed;
+ CCAssignFn *CalleeAssignFnVarArg;
+ std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
+ getAssignFnsForCC(CalleeCC, TLI);
+
+ CCAssignFn *CallerAssignFnFixed;
+ CCAssignFn *CallerAssignFnVarArg;
+ std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
+ getAssignFnsForCC(CallerCC, TLI);
+
+ // FIXME: We are not accounting for potential differences in implicitly passed
+ // inputs, but only the fixed ABI is supported now anyway.
+ IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
+ CalleeAssignFnVarArg);
+ IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
+ CallerAssignFnVarArg);
+ return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
+}
+
+bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ // If there are no outgoing arguments, then we are done.
+ if (OutArgs.empty())
+ return true;
+
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ // We have outgoing arguments. Make sure that we can tail call with them.
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+ OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
+
+ if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
+ LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+ return false;
+ }
+
+ // Make sure that they can fit on the caller's stack.
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+ LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+ return false;
+ }
+
+ // Verify that the parameters in callee-saved registers match.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::AMDGPU_Gfx:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
+ MachineIRBuilder &B, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
+ // Must pass all target-independent checks in order to tail call optimize.
+ if (!Info.IsTailCall)
+ return false;
+
+ MachineFunction &MF = B.getMF();
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ // Kernels aren't callable and don't have a live-in return address, so it
+ // doesn't make sense to do a tail call with entry functions.
+ if (!CallerPreserved)
+ return false;
+
+ if (!mayTailCallThisCC(CalleeCC)) {
+ LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
+ return false;
+ }
+
+ if (any_of(CallerF.args(), [](const Argument &A) {
+ return A.hasByValAttr() || A.hasSwiftErrorAttr();
+ })) {
+ LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
+ "or swifterror arguments\n");
+ return false;
+ }
+
+ // If we have -tailcallopt, then we're done.
+ if (MF.getTarget().Options.GuaranteedTailCallOpt)
+ return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+
+ // Verify that the incoming and outgoing arguments from the callee are
+ // safe to tail call.
+ if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Caller and callee have incompatible calling conventions.\n");
+ return false;
+ }
+
+ if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
+ return true;
+}
+
+// Insert outgoing implicit arguments for a call, by inserting copies to the
+// implicit argument registers and adding the necessary implicit uses to the
+// call instruction.
+void AMDGPUCallLowering::handleImplicitCallArguments(
+ MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
+ const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+ ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
+ if (!ST.enableFlatScratch()) {
+ // Insert copies for the SRD. In the HSA case, this should be an identity
+ // copy.
+ auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
+ FuncInfo.getScratchRSrcReg());
+ MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+ }
+
+ for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
+ MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
+ CallInst.addReg(ArgReg.first, RegState::Implicit);
+ }
+}
+
+bool AMDGPUCallLowering::lowerTailCall(
+ MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
+ // True when we're tail calling, but without -tailcallopt.
+ bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+
+ // Find out which ABI gets to decide where things go.
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ MachineInstrBuilder CallSeqStart;
+ if (!IsSibCall)
+ CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
+
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ if (!addCallTargetOperands(MIB, MIRBuilder, Info))
+ return false;
+
+ // Byte offset for the tail call. When we are sibcalling, this will always
+ // be 0.
+ MIB.addImm(0);
+
+ // Tell the call which registers are clobbered.
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
+ MIB.addRegMask(Mask);
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0.
+ int FPDiff = 0;
+
+ // This will be 0 for sibcalls, potentially nonzero for tail calls produced
+ // by -tailcallopt. For sibcalls, the memory operands for the call are
+ // already available in the caller's incoming argument space.
+ unsigned NumBytes = 0;
+ if (!IsSibCall) {
+ // We aren't sibcalling, so we need to compute FPDiff. We need to do this
+ // before handling assignments, because FPDiff must be known for memory
+ // arguments.
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
+
+ // FIXME: Not accounting for callee implicit inputs
+ OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
+ if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
+ return false;
+
+ // The callee will pop the argument stack as a tail call. Thus, we must
+ // keep it 16-byte aligned.
+ NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment());
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(isAligned(ST.getStackAlignment(), FPDiff) &&
+ "unaligned stack on tail call");
+ }
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
+
+ // We could pass MIB and directly add the implicit uses to the call
+ // now. However, as an aesthetic choice, place implicit argument operands
+ // after the ordinary user argument registers.
+ SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
+
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ // With a fixed ABI, allocate fixed registers before user arguments.
+ if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
+ return false;
+ }
+
+ OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
+
+ if (!determineAssignments(Assigner, OutArgs, CCInfo))
+ return false;
+
+ // Do the actual argument marshalling.
+ AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
+ if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
+ return false;
+
+ handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+
+ // If we have -tailcallopt, we need to adjust the stack. We'll do the call
+ // sequence start and end here.
+ if (!IsSibCall) {
+ MIB->getOperand(1).setImm(FPDiff);
+ CallSeqStart.addImm(NumBytes).addImm(0);
+ // End the call sequence *before* emitting the call. Normally, we would
+ // tidy the frame up after the call. However, here, we've laid out the
+ // parameters so that when SP is reset, they will be in the correct
+ // location.
+ MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
+ }
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ // If the callee is a register, it is used by a target-specific instruction
+ // and must therefore have a register class matching that instruction's
+ // constraint.
+
+ // FIXME: We should define regbankselectable call instructions to handle
+ // divergent call targets.
+ if (MIB->getOperand(0).isReg()) {
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
+ MIB->getDesc(), MIB->getOperand(0), 0));
+ }
+
+ MF.getFrameInfo().setHasTailCall();
+ Info.LoweredTailCall = true;
+ return true;
+}
+
bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Info.IsVarArg) {
@@ -1223,39 +1253,24 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI = MF.getRegInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- CallingConv::ID CallConv = F.getCallingConv();
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- CallConv != CallingConv::AMDGPU_Gfx) {
+ Info.CallConv != CallingConv::AMDGPU_Gfx) {
LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
return false;
}
- if (AMDGPU::isShader(CallConv)) {
- LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
- return false;
- }
-
SmallVector<ArgInfo, 8> OutArgs;
+ for (auto &OrigArg : Info.OrigArgs)
+ splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
- SmallVector<ArgInfo, 8> SplitArg;
- for (auto &OrigArg : Info.OrigArgs) {
- splitToValueTypes(MIRBuilder, OrigArg, SplitArg, DL, Info.CallConv);
-
- processSplitArgs(
- MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true,
- // FIXME: We should probably be passing multiple registers to
- // handleAssignments to do this
- [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
- int VTSplitIdx) {
- unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
- });
-
- SplitArg.clear();
- }
+ SmallVector<ArgInfo, 8> InArgs;
+ if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
+ splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
// If we can lower as a tail call, do that instead.
- bool CanTailCallOpt = false;
+ bool CanTailCallOpt =
+ isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
// We must emit a tail call if we have musttail.
if (Info.IsMustTailCall && !CanTailCallOpt) {
@@ -1263,6 +1278,9 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
+ if (CanTailCallOpt)
+ return lowerTailCall(MIRBuilder, Info, OutArgs);
+
// Find out which ABI gets to decide where things go.
CCAssignFn *AssignFnFixed;
CCAssignFn *AssignFnVarArg;
@@ -1295,7 +1313,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1303,26 +1322,18 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Do the actual argument marshalling.
SmallVector<Register, 8> PhysRegs;
- AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg, false);
- if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
+
+ OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
+ if (!determineAssignments(Assigner, OutArgs, CCInfo))
+ return false;
+
+ AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
+ if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
return false;
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!ST.enableFlatScratch()) {
- // Insert copies for the SRD. In the HSA case, this should be an identity
- // copy.
- auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
- MFI->getScratchRSrcReg());
- MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
- }
-
- for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
- MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
- MIB.addReg(ArgReg.first, RegState::Implicit);
- }
+ handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -1340,55 +1351,32 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1));
}
- auto OrigInsertPt = MIRBuilder.getInsertPt();
-
// Now we can add the actual call instruction to the correct position.
MIRBuilder.insertInstr(MIB);
- // Insert this now to give us an anchor point for managing the insert point.
- MachineInstrBuilder CallSeqEnd =
- MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);
-
- SmallVector<ArgInfo, 8> InArgs;
- if (!Info.CanLowerReturn) {
- insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
- Info.DemoteRegister, Info.DemoteStackIndex);
- } else if (!Info.OrigRet.Ty->isVoidTy()) {
- SmallVector<ArgInfo, 8> PreSplitRetInfos;
-
- splitToValueTypes(
- MIRBuilder, Info.OrigRet, PreSplitRetInfos/*InArgs*/, DL, Info.CallConv);
-
- processSplitArgs(MIRBuilder, Info.OrigRet,
- PreSplitRetInfos, InArgs/*SplitRetInfos*/, DL, Info.CallConv, false,
- [&](ArrayRef<Register> Regs, Register DstReg,
- LLT LLTy, LLT PartLLT, int VTSplitIdx) {
- assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
- packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
- Regs, LLTy, PartLLT);
- });
- }
-
- // Make sure the raw argument copies are inserted before the marshalling to
- // the original types.
- MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);
-
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
Info.IsVarArg);
- CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
- if (!handleAssignments(MIRBuilder, InArgs, Handler))
+ IncomingValueAssigner Assigner(RetAssignFn);
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB);
+ if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
+ Info.CallConv, Info.IsVarArg))
return false;
}
uint64_t CalleePopBytes = NumBytes;
- CallSeqEnd.addImm(0)
+
+ MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
+ .addImm(0)
.addImm(CalleePopBytes);
- // Restore the insert point to after the call sequence.
- MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
+ if (!Info.CanLowerReturn) {
+ insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
+ Info.DemoteRegister, Info.DemoteStackIndex);
+ }
+
return true;
}
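
For the -tailcallopt path in lowerTailCall above, the FPDiff bookkeeping reduces to plain arithmetic on the caller's reusable incoming-argument area and the callee's aligned outgoing-argument size. The standalone C++ sketch below illustrates that arithmetic only; the constant values and the local alignToPow2 helper are assumptions made for the example, not part of the patch.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

// Round Size up to the next multiple of Align (a power of two), mirroring
// what alignTo does for the outgoing argument area.
static uint64_t alignToPow2(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) & ~(Align - 1);
}

int main() {
  const uint64_t StackAlign = 16;        // assumed stack alignment
  const uint64_t NumReusableBytes = 32;  // caller's incoming stack-arg area
  const uint64_t RawOutgoingBytes = 40;  // callee's stack args before alignment

  // The callee pops its argument stack on a tail call, so keep it aligned.
  uint64_t NumBytes = alignToPow2(RawOutgoingBytes, StackAlign); // 48

  // Negative when the tail call needs more stack than the caller already has.
  int64_t FPDiff = int64_t(NumReusableBytes) - int64_t(NumBytes); // -16

  // Both quantities are multiples of the stack alignment, so FPDiff is too.
  assert(FPDiff % int64_t(StackAlign) == 0);
  std::cout << "NumBytes=" << NumBytes << " FPDiff=" << FPDiff << "\n";
  return 0;
}
```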
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 1312388e4a38..569c6d75204d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -19,28 +19,16 @@
namespace llvm {
class AMDGPUTargetLowering;
+class GCNSubtarget;
class MachineInstrBuilder;
+class SIMachineFunctionInfo;
class AMDGPUCallLowering final : public CallLowering {
- void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
+ void lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
uint64_t Offset) const;
- void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
- Align Alignment, Register DstReg) const;
-
- /// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
-
- void splitToValueTypes(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, CallingConv::ID CallConv) const;
-
- void processSplitArgs(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
- const SmallVectorImpl<ArgInfo> &SplitArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, CallingConv::ID CallConv,
- bool IsOutgoing,
- SplitArgTy PerformArgSplit) const;
+ void lowerParameter(MachineIRBuilder &B, ArgInfo &AI, uint64_t Offset,
+ Align Alignment) const;
bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
SmallVectorImpl<BaseArgInfo> &Outs,
@@ -68,6 +56,29 @@ public:
SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
CallLoweringInfo &Info) const;
+ bool
+ doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info,
+ MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const;
+
+ bool
+ areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
+
+ /// Returns true if the call can be lowered as a tail call.
+ bool
+ isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
+
+ void handleImplicitCallArguments(
+ MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
+ const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
+ ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;
+
+ bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 250c42776297..90b52395b76c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -34,16 +34,13 @@ def CC_SI_Gfx : CallingConv<[
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
]>>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
- CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
- CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
- CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
- CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
- CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
]>;
def RetCC_SI_Gfx : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
// 0-3 are reserved for the stack buffer descriptor
// 32 is reserved for the stack pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
@@ -74,14 +71,6 @@ def RetCC_SI_Gfx : CallingConv<[
VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
]>>>,
-
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
- CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
- CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
- CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
- CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
- CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;
def CC_SI_SHADER : CallingConv<[
@@ -118,6 +107,7 @@ def CC_SI_SHADER : CallingConv<[
]>;
def RetCC_SI_Shader : CallingConv<[
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
CCIfType<[i32, i16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -175,6 +165,10 @@ def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
(sequence "VGPR%u", 248, 255))
>;
+def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs<
+ (sequence "AGPR%u", 32, 255)
+>;
+
def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@@ -184,6 +178,13 @@ def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
(sequence "VGPR%u", 0, 255)
>;
+def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs<
+ (sequence "AGPR%u", 0, 255)
+>;
+def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs<
+ (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs)
+>;
+
// Just to get the regmask, not for calling convention purposes.
def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
(add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
@@ -193,6 +194,10 @@ def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;
+def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
+ (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
+>;
+
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
@@ -205,13 +210,7 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
- CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
- CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
- CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
- CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
- CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
]>;
// Calling convention for leaf functions
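
The new CCIfExtend<CCPromoteToType<i32>> rules in RetCC_SI_Gfx and RetCC_SI_Shader promote small scalar returns to a full 32-bit register slot. A rough C++ model of that promotion and the caller-side truncation is sketched below; the helper names are invented for illustration, and the sign/zero choice stands in for the signext/zeroext return attributes.

```cpp
#include <cstdint>
#include <iostream>

// Model of promoting a small (i16) return value into a 32-bit register slot.
// SignExt mimics the 'signext' attribute; without it, filling only the low
// 16 bits is enough because the caller truncates the value back anyway.
static uint32_t promoteReturnToI32(int16_t Val, bool SignExt) {
  return SignExt ? uint32_t(int32_t(Val))    // sign-extend to 32 bits
                 : uint32_t(uint16_t(Val));  // keep only the low 16 bits
}

static int16_t readBackI16(uint32_t RegVal) {
  return int16_t(uint16_t(RegVal));          // caller truncates back to i16
}

int main() {
  int16_t Ret = -5;
  uint32_t Reg = promoteReturnToI32(Ret, /*SignExt=*/true);
  std::cout << readBackI16(Reg) << "\n";     // prints -5 either way
  return 0;
}
```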
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 2556996df97f..60e79c2c6c2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -200,6 +201,7 @@ public:
AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
bool visitFDiv(BinaryOperator &I);
+ bool visitXor(BinaryOperator &I);
bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);
@@ -807,9 +809,34 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
return !!NewFDiv;
}
+bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
+ // Match the Xor instruction, its type and its operands
+ IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
+ ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
+ return visitBinaryOperator(I);
+
+ // Check that the call is to the amdgcn_class intrinsic and that it has
+ // only one use.
+ if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
+ !IntrinsicCall->hasOneUse())
+ return visitBinaryOperator(I);
+
+ // Invert the class mask passed as the second argument of the intrinsic call.
+ ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
+ if (!Arg)
+ return visitBinaryOperator(I);
+
+ IntrinsicCall->setOperand(
+ 1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
+ I.replaceAllUsesWith(IntrinsicCall);
+ I.eraseFromParent();
+ return true;
+}
+
static bool hasUnsafeFPMath(const Function &F) {
Attribute Attr = F.getFnAttribute("unsafe-fp-math");
- return Attr.getValueAsString() == "true";
+ return Attr.getValueAsBool();
}
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
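
The visitXor change above folds a "not" of llvm.amdgcn.class into the intrinsic by complementing its 10-bit class mask (the `^ 0x3ff`). A small standalone C++ sketch of that mask arithmetic follows; the specific bit assignments are the commonly documented v_cmp_class layout and should be treated as an assumption of this sketch, not something the patch defines.

```cpp
#include <cstdint>
#include <iostream>

// Assumed 10-bit class mask layout (one bit per floating-point class).
enum ClassBits : uint32_t {
  SNan = 1u << 0, QNan = 1u << 1,
  NegInf = 1u << 2, NegNormal = 1u << 3, NegSubnormal = 1u << 4,
  NegZero = 1u << 5, PosZero = 1u << 6,
  PosSubnormal = 1u << 7, PosNormal = 1u << 8, PosInf = 1u << 9,
};

// not(class(x, Mask)) == class(x, Mask complemented within the low 10 bits).
static uint32_t invertClassMask(uint32_t Mask) { return Mask ^ 0x3ffu; }

int main() {
  uint32_t IsNan = SNan | QNan;                              // 0x003
  std::cout << std::hex << invertClassMask(IsNan) << "\n";   // 0x3fc: not NaN
  return 0;
}
```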
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a8399176bb4a..c6273adca50f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,25 +37,54 @@ def cvt_f32_ubyteN : GICombineRule<
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
+
+def clamp_i64_to_i16 : GICombineRule<
+ (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
+ (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
+ [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
+ (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
+
+def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">;
+
+def int_minmax_to_med3 : GICombineRule<
+ (defs root:$min_or_max, med3_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SMAX,
+ G_SMIN,
+ G_UMAX,
+ G_UMIN):$min_or_max,
+ [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+
+def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
+
+def remove_fcanonicalize : GICombineRule<
+ (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize,
+ [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
+ (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
+
// Combines which should only apply on SI/VI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
-
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
- "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
+ "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+ let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
}
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,
- uchar_to_float, cvt_f32_ubyteN]> {
+ uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];
}
def AMDGPURegBankCombinerHelper : GICombinerHelper<
- "AMDGPUGenRegBankCombinerHelper", []> {
+ "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+ let StateClass = "AMDGPURegBankCombinerHelperState";
+ let AdditionalArguments = [];
}
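
The new int_minmax_to_med3 rule rewrites clamp-shaped min/max chains into the target's median-of-three operation (the clamp_i64_to_i16 rule is the analogous trunc-of-clamp pattern). The scalar identity it relies on can be checked directly; the sketch below is an illustrative model, not the combiner's actual matching code.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

// Median of three values.
static int32_t med3(int32_t A, int32_t B, int32_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

// The min/max chain the combiner matches: clamp X into [Lo, Hi] with Lo <= Hi.
static int32_t clampViaMinMax(int32_t X, int32_t Lo, int32_t Hi) {
  return std::min(std::max(X, Lo), Hi);
}

int main() {
  // For Lo <= Hi, min(max(X, Lo), Hi) == med3(X, Lo, Hi) for every X.
  for (int32_t X = -50; X <= 50; ++X)
    assert(clampViaMinMax(X, -16, 15) == med3(X, -16, 15));
  std::cout << "clamp/med3 identity holds on the tested range\n";
  return 0;
}
```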
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
index 041d6deef243..87b459f7b1e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H
+
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
@@ -14,3 +17,5 @@ namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation();
} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUEXPORTCLUSTERING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index bba03736d01a..521c8f261a00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -70,10 +70,10 @@ def gi_smrd_sgpr :
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
- GIComplexPatternEquiv<FLATOffset>;
-def gi_flat_offset_signed :
- GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
- GIComplexPatternEquiv<FLATOffsetSigned>;
+ GIComplexPatternEquiv<FlatOffset>;
+def gi_global_offset :
+ GIComplexOperandMatcher<s64, "selectGlobalOffset">,
+ GIComplexPatternEquiv<GlobalOffset>;
def gi_global_saddr :
GIComplexOperandMatcher<s64, "selectGlobalSAddr">,
GIComplexPatternEquiv<GlobalSAddr>;
@@ -86,7 +86,7 @@ def gi_mubuf_scratch_offen :
GIComplexPatternEquiv<MUBUFScratchOffen>;
def gi_flat_scratch_offset :
- GIComplexOperandMatcher<s32, "selectFlatOffsetSigned">,
+ GIComplexOperandMatcher<s32, "selectScratchOffset">,
GIComplexPatternEquiv<ScratchOffset>;
def gi_flat_scratch_saddr :
@@ -113,14 +113,6 @@ def gi_mubuf_offset :
GIComplexOperandMatcher<s64, "selectMUBUFOffset">,
GIComplexPatternEquiv<MUBUFOffset>;
-def gi_mubuf_addr64_atomic :
- GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">,
- GIComplexPatternEquiv<MUBUFAddr64Atomic>;
-
-def gi_mubuf_offset_atomic :
- GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">,
- GIComplexPatternEquiv<MUBUFOffsetAtomic>;
-
def gi_smrd_buffer_imm :
GIComplexOperandMatcher<s64, "selectSMRDBufferImm">,
GIComplexPatternEquiv<SMRDBufferImm>;
@@ -136,6 +128,8 @@ def gi_smrd_buffer_imm32 :
def : GINodeEquiv<G_LOAD, AMDGPUld_glue> {
let CheckMMOIsNonAtomic = 1;
+ let IfSignExtend = G_SEXTLOAD;
+ let IfZeroExtend = G_ZEXTLOAD;
}
def : GINodeEquiv<G_STORE, AMDGPUst_glue> {
@@ -174,6 +168,10 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
+def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
+def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
+def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
+
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
@@ -216,6 +214,8 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
@@ -302,16 +302,16 @@ foreach Ty = [i64, p0, p1, p4] in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
}
-def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">,
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i32timm>;
-def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">,
+def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i16timm>;
-def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm8">,
+def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i8timm>;
-def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">,
+def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm">,
GISDNodeXFormEquiv<as_i1timm>;
def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
@@ -323,17 +323,14 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
GISDNodeXFormEquiv<IMMPopCount>;
-def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">,
- GISDNodeXFormEquiv<extract_glc>;
-
-def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">,
- GISDNodeXFormEquiv<extract_slc>;
-
-def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
- GISDNodeXFormEquiv<extract_dlc>;
+def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
+ GISDNodeXFormEquiv<extract_cpol>;
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
GISDNodeXFormEquiv<extract_swz>;
+def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
+ GISDNodeXFormEquiv<set_glc>;
+
def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index b3bafc5b2720..cabdc6998011 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -41,6 +41,20 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
return std::make_pair(Def->getOperand(1).getReg(), Offset);
}
+ // Handle G_PTRTOINT (G_PTR_ADD base, const) case
+ if (Def->getOpcode() == TargetOpcode::G_PTRTOINT) {
+ MachineInstr *Base;
+ if (mi_match(Def->getOperand(1).getReg(), MRI,
+ m_GPtrAdd(m_MInstr(Base), m_ICst(Offset)))) {
+ // If the base was an integer converted to a pointer, simply return the
+ // integer and the offset.
+ if (Base->getOpcode() == TargetOpcode::G_INTTOPTR)
+ return std::make_pair(Base->getOperand(1).getReg(), Offset);
+
+ // Register returned here will be of pointer type.
+ return std::make_pair(Base->getOperand(0).getReg(), Offset);
+ }
+ }
+
return std::make_pair(Reg, 0);
}
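
The new G_PTRTOINT case in getBaseWithConstantOffset lets a constant offset be peeled off even when the address round-trips through pointer casts, so the offset can later fold into an instruction's immediate field. A toy C++ model of that decomposition is sketched below; the Node type and the function name are invented for illustration and only mimic the shape of the MIR walk.

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

// Toy expression nodes standing in for the generic MIR the helper walks.
struct Node {
  enum Kind { Int, IntToPtr, PtrAdd, PtrToInt } K;
  const Node *A;  // first operand (or null)
  const Node *B;  // second operand for PtrAdd (or null)
  int64_t C;      // constant payload when K == Int
};

// Mirrors the added case: look through ptrtoint(ptr_add(base, C)). If the
// base itself came from inttoptr, hand back the original integer value.
static std::pair<const Node *, int64_t> baseWithConstantOffset(const Node &N) {
  if (N.K == Node::PtrToInt && N.A && N.A->K == Node::PtrAdd && N.A->B &&
      N.A->B->K == Node::Int) {
    const Node *Base = N.A->A;
    int64_t Off = N.A->B->C;
    if (Base->K == Node::IntToPtr)
      return {Base->A, Off};  // strip the int->ptr conversion as well
    return {Base, Off};       // base stays pointer-typed
  }
  return {&N, 0};             // no constant offset recognized
}

int main() {
  Node X{Node::Int, nullptr, nullptr, 0};       // some integer value
  Node P{Node::IntToPtr, &X, nullptr, 0};       // P = inttoptr X
  Node C16{Node::Int, nullptr, nullptr, 16};    // constant 16
  Node Add{Node::PtrAdd, &P, &C16, 0};          // Add = ptr_add P, 16
  Node Cast{Node::PtrToInt, &Add, nullptr, 0};  // Cast = ptrtoint Add
  std::pair<const Node *, int64_t> R = baseWithConstantOffset(Cast);
  std::cout << (R.first == &X) << " " << R.second << "\n";  // prints "1 16"
  return 0;
}
```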
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 404e0fcd1166..14d3a3fb7997 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 39f9092ce77c..8eeda7b67b73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -226,8 +226,8 @@ MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
void MetadataStreamerV2::emitVersion() {
auto &Version = HSAMetadata.mVersion;
- Version.push_back(VersionMajor);
- Version.push_back(VersionMinor);
+ Version.push_back(VersionMajorV2);
+ Version.push_back(VersionMinorV2);
}
void MetadataStreamerV2::emitPrintf(const Module &Mod) {
@@ -435,7 +435,8 @@ bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
}
-void MetadataStreamerV2::begin(const Module &Mod) {
+void MetadataStreamerV2::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
emitVersion();
emitPrintf(Mod);
}
@@ -608,8 +609,8 @@ MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
void MetadataStreamerV3::emitVersion() {
auto Version = HSAMetadataDoc->getArrayNode();
- Version.push_back(Version.getDocument()->getNode(VersionMajor));
- Version.push_back(Version.getDocument()->getNode(VersionMinor));
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV3));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV3));
getRootMetadata("amdhsa.version") = Version;
}
@@ -881,7 +882,8 @@ bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
-void MetadataStreamerV3::begin(const Module &Mod) {
+void MetadataStreamerV3::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
emitVersion();
emitPrintf(Mod);
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
@@ -921,6 +923,30 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
Kernels.push_back(Kern);
}
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV4
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV4::emitVersion() {
+ auto Version = HSAMetadataDoc->getArrayNode();
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
+ getRootMetadata("amdhsa.version") = Version;
+}
+
+void MetadataStreamerV4::emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID) {
+ getRootMetadata("amdhsa.target") =
+ HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
+}
+
+void MetadataStreamerV4::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
+ emitVersion();
+ emitTargetID(TargetID);
+ emitPrintf(Mod);
+ getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
+}
+
} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1c6db14b85cd..4824b4cf37c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"
@@ -40,7 +41,8 @@ public:
virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
- virtual void begin(const Module &Mod) = 0;
+ virtual void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) = 0;
virtual void end() = 0;
@@ -48,8 +50,9 @@ public:
const SIProgramInfo &ProgramInfo) = 0;
};
-class MetadataStreamerV3 final : public MetadataStreamer {
-private:
+// TODO: Rename MetadataStreamerV3 -> MetadataStreamerMsgPackV3.
+class MetadataStreamerV3 : public MetadataStreamer {
+protected:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();
@@ -108,7 +111,8 @@ public:
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
- void begin(const Module &Mod) override;
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
void end() override;
@@ -116,6 +120,21 @@ public:
const SIProgramInfo &ProgramInfo) override;
};
+// TODO: Rename MetadataStreamerV4 -> MetadataStreamerMsgPackV4.
+class MetadataStreamerV4 final : public MetadataStreamerV3 {
+ void emitVersion();
+
+ void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
+
+public:
+ MetadataStreamerV4() = default;
+ ~MetadataStreamerV4() = default;
+
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
+};
+
+// TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2.
class MetadataStreamerV2 final : public MetadataStreamer {
private:
Metadata HSAMetadata;
@@ -172,7 +191,8 @@ public:
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
- void begin(const Module &Mod) override;
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
void end() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 340f4ac6f57a..a3106ded1e38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -107,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool EnableLateStructurizeCFG;
+ // Instructions that will be lowered with a final instruction that zeros the
+ // high result bits.
+ bool fp16SrcZerosHighBits(unsigned Opc) const;
+
public:
explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
@@ -188,15 +192,9 @@ private:
SDValue &Offset1, unsigned Size) const;
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
+ SDValue &Idxen, SDValue &Addr64) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const;
- bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
- SDValue &SLC) const;
+ SDValue &SOffset, SDValue &Offset) const;
bool SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
@@ -204,17 +202,17 @@ private:
SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
- SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- template <bool IsSigned>
+ bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, uint64_t FlatVariant) const;
bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
+ bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
@@ -322,6 +320,16 @@ static SDValue stripBitcast(SDValue Val) {
// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
In = stripBitcast(In);
+
+ if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+ if (!Idx->isOne())
+ return false;
+ Out = In.getOperand(0);
+ return true;
+ }
+ }
+
if (In.getOpcode() != ISD::TRUNCATE)
return false;
@@ -341,6 +349,13 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+ if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
+ return In.getOperand(0);
+ }
+ }
+
if (In.getOpcode() == ISD::TRUNCATE) {
SDValue Src = In.getOperand(0);
if (Src.getValueType().getSizeInBits() == 32)
@@ -391,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+ // XXX - only need to list legal operations.
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCANONICALIZE:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // FABS is lowered to a bit operation, but it's an AND, which clears the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LDEXP:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+ case ISD::FP_ROUND:
+ // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+ // high bits on gfx9.
+ // TODO: If we had the source node we could see if the source was fma/mad
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case ISD::FMA:
+ case ISD::FMAD:
+ case AMDGPUISD::DIV_FIXUP:
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
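A note on the fp16SrcZerosHighBits() helper added above: the payoff of knowing that the selected instruction already clears bits 31:16 is that a 16-bit result can be packed into the low half of a 32-bit value without an explicit mask. A standalone sketch of that bit-level argument in plain C++, independent of the DAG machinery; how the predicate is actually consumed lies outside this hunk, so treat the packing use case as an assumption for illustration:

  #include <cstdint>

  // If the producer of Lo16 is one of the opcodes for which
  // fp16SrcZerosHighBits() returns true, bits 31:16 of Lo16 are already zero
  // and the AND below is redundant; otherwise it must be kept.
  uint32_t packHalves(uint32_t Lo16, uint32_t Hi16, bool LoHighBitsKnownZero) {
    if (!LoHighBitsKnownZero)
      Lo16 &= 0xffffu;             // defensive mask of the high half
    return Lo16 | (Hi16 << 16);    // assumes Hi16 itself fits in 16 bits
  }
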
@@ -1374,13 +1451,10 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64,
- SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const {
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &Offen, SDValue &Idxen,
+ SDValue &Addr64) const {
// Subtarget prefers to use flat instruction
// FIXME: This should be a pattern predicate and not reach here
if (Subtarget->useFlatForGlobal())
@@ -1388,14 +1462,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDLoc DL(Addr);
- if (!GLC.getNode())
- GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- if (!SLC.getNode())
- SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
- DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1472,9 +1538,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE,
- SDValue &DLC, SDValue &SWZ) const {
+ SDValue &Offset) const {
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
@@ -1482,8 +1546,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (!Subtarget->hasAddr64())
return false;
- if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC, SWZ))
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1500,21 +1563,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return false;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset,
- SDValue &SLC) const {
- SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE, DLC, SWZ;
-
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
- auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
- return PSV && PSV->isStack();
-}
-
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
SDLoc DL(N);
@@ -1551,13 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
- // In a call sequence, stores to the argument stack area are relative to the
- // stack pointer.
- const MachinePointerInfo &PtrInfo
- = cast<MemSDNode>(Parent)->getPointerInfo();
- SOffset = isStackPtrRelative(PtrInfo)
- ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
- : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
return true;
}
@@ -1600,44 +1642,65 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
return true;
}
+static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
+ if (Val.getOpcode() != ISD::CopyFromReg)
+ return false;
+ auto RC =
+ TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
+ return RC && TRI.isSGPRClass(RC);
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
- ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
- if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
- return false;
-
- SDLoc DL(Addr);
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SDLoc DL(Addr);
- SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ // CopyFromReg <sgpr>
+ if (IsCopyFromSGPR(*TRI, Addr)) {
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
+ }
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
+ ConstantSDNode *CAddr;
+ if (Addr.getOpcode() == ISD::ADD) {
+ // Add (CopyFromReg <sgpr>) <constant>
+ CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
+ return false;
+ if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
+ return false;
- // FIXME: Get from MachinePointerInfo? We should only be using the frame
- // offset if we know this is in a call sequence.
- SOffset = isStackPtrRelative(PtrInfo)
- ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
- : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SOffset = Addr.getOperand(0);
+ } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
+ SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
+ // <constant>
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ } else {
+ return false;
+ }
+
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &SOffset, SDValue &Offset,
- SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const {
+ SDValue &SOffset, SDValue &Offset
+ ) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC, SWZ))
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1656,21 +1719,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return false;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &Soffset, SDValue &Offset
- ) const {
- SDValue GLC, SLC, TFE, DLC, SWZ;
-
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &Soffset, SDValue &Offset,
- SDValue &SLC) const {
- SDValue GLC, TFE, DLC, SWZ;
-
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
@@ -1685,24 +1733,25 @@ static MemSDNode* findMemSDNode(SDNode *N) {
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
-template <bool IsSigned>
-bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset) const {
+bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
+ SDValue &VAddr, SDValue &Offset,
+ uint64_t FlatVariant) const {
int64_t OffsetVal = 0;
unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (Subtarget->hasFlatInstOffsets() &&
- (!Subtarget->hasFlatSegmentOffsetBug() ||
- AS != AMDGPUAS::FLAT_ADDRESS)) {
+ bool CanHaveFlatSegmentOffsetBug =
+ Subtarget->hasFlatSegmentOffsetBug() &&
+ FlatVariant == SIInstrFlags::FLAT &&
+ (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
+
+ if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1)) {
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+ int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
Addr = N0;
OffsetVal = COffsetVal;
} else {
@@ -1719,8 +1768,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDLoc DL(N);
uint64_t RemainderOffset;
- std::tie(OffsetVal, RemainderOffset)
- = TII->splitFlatOffset(COffsetVal, AS, IsSigned);
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
SDValue AddOffsetLo =
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
@@ -1777,6 +1826,25 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
+}
+
+bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
+ SIInstrFlags::FlatScratch);
+}
+
// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
if (Op.getOpcode() != ISD::ZERO_EXTEND)
@@ -1802,126 +1870,144 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
- } else if (!LHS->isDivergent() && COffsetVal > 0) {
- SDLoc SL(N);
- // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
- // (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset)
- = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);
-
- if (isUInt<32>(RemainderOffset)) {
- SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
- CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
- VOffset = SDValue(VMov, 0);
- SAddr = LHS;
- Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
- return true;
+ } else if (!LHS->isDivergent()) {
+ if (COffsetVal > 0) {
+ SDLoc SL(N);
+ // saddr + large_offset -> saddr +
+ // (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+
+ if (isUInt<32>(RemainderOffset)) {
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ SAddr = LHS;
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
}
+
+ // We are adding a 64-bit SGPR and a constant. If the constant bus limit
+ // is 1, we would need 1 or 2 extra moves for each half of the constant,
+ // so it is better to do a scalar add and then issue a single VALU
+ // instruction to materialize zero. Otherwise it takes fewer instructions
+ // to perform VALU adds with immediates or inline literals.
+ unsigned NumLiterals =
+ !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
+ !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
+ if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+ return false;
}
}
// Match the variable offset.
- if (Addr.getOpcode() != ISD::ADD) {
- if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
- isa<ConstantSDNode>(Addr))
- return false;
-
- // It's cheaper to materialize a single 32-bit zero for vaddr than the two
- // moves required to copy a 64-bit SGPR to VGPR.
- SAddr = Addr;
- SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
- CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
- VOffset = SDValue(VMov, 0);
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
- return true;
- }
+ if (Addr.getOpcode() == ISD::ADD) {
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
- LHS = Addr.getOperand(0);
- RHS = Addr.getOperand(1);
+ if (!LHS->isDivergent()) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ SAddr = LHS;
+ VOffset = ZextRHS;
+ }
+ }
- if (!LHS->isDivergent()) {
- // add (i64 sgpr), (zero_extend (i32 vgpr))
- if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
- SAddr = LHS;
- VOffset = ZextRHS;
+ if (!SAddr && !RHS->isDivergent()) {
+ // add (zero_extend (i32 vgpr)), (i64 sgpr)
+ if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ SAddr = RHS;
+ VOffset = ZextLHS;
+ }
}
- }
- if (!SAddr && !RHS->isDivergent()) {
- // add (zero_extend (i32 vgpr)), (i64 sgpr)
- if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
- SAddr = RHS;
- VOffset = ZextLHS;
+ if (SAddr) {
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
}
}
- if (!SAddr)
+ if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+ isa<ConstantSDNode>(Addr))
return false;
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ SAddr = Addr;
+ SDNode *VMov =
+ CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+ CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
return true;
}
+static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
+ if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
+ SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+ } else if (SAddr.getOpcode() == ISD::ADD &&
+ isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
+ // Materialize this into a scalar move for scalar address to avoid
+ // readfirstlane.
+ auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
+ MVT::i32, TFI, SAddr.getOperand(1)),
+ 0);
+ }
+
+ return SAddr;
+}
+
// Match (32-bit SGPR base) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
- SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
SDValue &SAddr,
SDValue &Offset) const {
if (Addr->isDivergent())
return false;
- SAddr = Addr;
+ SDLoc DL(Addr);
+
int64_t COffsetVal = 0;
if (CurDAG->isBaseWithConstantOffset(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
+ } else {
+ SAddr = Addr;
}
- if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
- SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
- } else if (SAddr.getOpcode() == ISD::ADD &&
- isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
- // Materialize this into a scalar move for scalar address to avoid
- // readfirstlane.
- auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
- SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
- FI->getValueType(0));
- SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
- MVT::i32, TFI, SAddr.getOperand(1)),
- 0);
- }
+ SAddr = SelectSAddrFI(CurDAG, SAddr);
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- int64_t RemainderOffset = COffsetVal;
- int64_t ImmField = 0;
- const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
- // Use signed division by a power of two to truncate towards 0.
- int64_t D = 1LL << (NumBits - 1);
- RemainderOffset = (COffsetVal / D) * D;
- ImmField = COffsetVal - RemainderOffset;
-
- assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
- assert(RemainderOffset + ImmField == COffsetVal);
+ if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
- COffsetVal = ImmField;
+ COffsetVal = SplitImmOffset;
- SDLoc DL(N);
SDValue AddOffset =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
- SAddr, AddOffset), 0);
+ SAddr.getOpcode() == ISD::TargetFrameIndex
+ ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
+ : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
+ SAddr, AddOffset),
+ 0);
}
- Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
return true;
}
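For the offset splitting used above in both the global and scratch paths (splitFlatOffset), the code removed in this hunk spells out one way to compute the split: truncate the offset toward zero at a power-of-two boundary determined by the width of the signed immediate field. A self-contained C++ sketch of that arithmetic, with the field width passed in as an assumption (the real width depends on the subtarget and the FLAT variant, and splitFlatOffset() may differ in details such as unsigned offset fields):

  #include <cstdint>
  #include <utility>

  // Returns {Imm, Remainder} with Off == Imm + Remainder, where Imm fits in a
  // NumBits-bit signed immediate and Remainder is folded into the base address.
  std::pair<int64_t, int64_t> splitOffset(int64_t Off, unsigned NumBits) {
    int64_t D = int64_t(1) << (NumBits - 1);
    int64_t Remainder = (Off / D) * D;   // signed division truncates toward 0
    int64_t Imm = Off - Remainder;
    return {Imm, Remainder};
  }

  // Example: splitOffset(0x12345, 13) yields Imm = 0x345 and Remainder = 0x12000.
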
@@ -2364,35 +2450,32 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
MachineSDNode *CmpSwap = nullptr;
if (Subtarget->hasAddr64()) {
- SDValue SRsrc, VAddr, SOffset, Offset, SLC;
+ SDValue SRsrc, VAddr, SOffset, Offset;
- if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
+ if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
SDValue CmpVal = Mem->getOperand(2);
- SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
+ SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
// XXX - Do we care about glue operands?
- SDValue Ops[] = {
- CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
- };
+ SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
+ Mem->getChain()};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
}
}
if (!CmpSwap) {
- SDValue SRsrc, SOffset, Offset, SLC;
- if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+ SDValue SRsrc, SOffset, Offset;
+ if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
SDValue CmpVal = Mem->getOperand(2);
- SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
- SDValue Ops[] = {
- CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
- };
+ SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
+ SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
}
@@ -2623,7 +2706,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
Opcode = AMDGPU::SOFT_WQM;
break;
case Intrinsic::amdgcn_wwm:
- Opcode = AMDGPU::WWM;
+ case Intrinsic::amdgcn_strict_wwm:
+ Opcode = AMDGPU::STRICT_WWM;
+ break;
+ case Intrinsic::amdgcn_strict_wqm:
+ Opcode = AMDGPU::STRICT_WQM;
break;
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
@@ -2773,18 +2860,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
if (isExtractHiElt(Hi, Hi))
Mods |= SISrcMods::OP_SEL_1;
+ unsigned VecSize = Src.getValueSizeInBits();
Lo = stripExtractLoElt(Lo);
Hi = stripExtractLoElt(Hi);
+ if (Lo.getValueSizeInBits() > VecSize) {
+ Lo = CurDAG->getTargetExtractSubreg(
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Lo);
+ }
+
+ if (Hi.getValueSizeInBits() > VecSize) {
+ Hi = CurDAG->getTargetExtractSubreg(
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Hi);
+ }
+
+ assert(Lo.getValueSizeInBits() <= VecSize &&
+ Hi.getValueSizeInBits() <= VecSize);
+
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
- Src = Lo;
+ if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
+ Src = Lo;
+ } else {
+ assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
+
+ SDLoc SL(In);
+ SDValue Undef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
+ Lo.getValueType()), 0);
+ auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
+ : AMDGPU::SReg_64RegClassID;
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(RC, SL, MVT::i32),
+ Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
+
+ Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
+ Src.getValueType(), Ops), 0);
+ }
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
+ if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
+ uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
+ .bitcastToAPInt().getZExtValue();
+ if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
+ Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+ }
+
Mods = VecMods;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b4b4776ad39..d68488ccb342 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -78,6 +78,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
@@ -99,9 +105,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
@@ -173,12 +185,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
@@ -198,6 +212,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v5f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::STORE, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
@@ -219,6 +239,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::STORE, MVT::v4i64, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
@@ -261,6 +287,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
+
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
@@ -325,8 +356,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -335,6 +372,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
@@ -343,6 +384,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
@@ -412,8 +455,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
- };
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
@@ -454,8 +496,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
- };
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
@@ -505,6 +546,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -846,9 +893,9 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64 ||
- (Subtarget->has16BitInsts() && VT == MVT::f16) ||
- (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
+ // Report this based on the end legalized type.
+ VT = VT.getScalarType();
+ return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -1257,8 +1304,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
- case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
- case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
@@ -1304,7 +1352,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isModuleEntryFunction()) {
+ if (!MFI->isModuleEntryFunction() &&
+ !GV->getName().equals("llvm.amdgcn.module.lds")) {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
@@ -1368,6 +1417,14 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
+
+ // For these types, we have some TableGen patterns except if the index is 1
+ if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
+ (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
+ Start != 1)
+ return Op;
+
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
@@ -2579,33 +2636,77 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
return LowerINT_TO_FP64(Op, DAG, true);
}
-SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
- SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+ assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
- SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
- MVT::f64);
- SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
- MVT::f64);
- // TODO: Should this propagate fast-math-flags?
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
+ // The basic idea of converting a floating point number into a pair of 32-bit
+ // integers is illustrated as follows:
+ //
+ // tf := trunc(val);
+ // hif := floor(tf * 2^-32);
+ // lof := tf - hif * 2^32; // lof is always positive due to floor.
+ // hi := fptoi(hif);
+ // lo := fptoi(lof);
+ //
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
+ SDValue Sign;
+ if (Signed && SrcVT == MVT::f32) {
+ // However, a 32-bit floating point number has only a 23-bit mantissa,
+ // which is not enough to hold all the significant bits of `lof` if val is
+ // negative. To avoid losing precision, we take the absolute value after
+ // truncating and flip the result back based on the original signedness.
+ Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
+ DAG.getConstant(31, SL, MVT::i32));
+ Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
+ }
- SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
+ SDValue K0, K1;
+ if (SrcVT == MVT::f64) {
+ K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
+ SL, SrcVT);
+ K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
+ SL, SrcVT);
+ } else {
+ K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
+ SrcVT);
+ K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
+ SrcVT);
+ }
+ // TODO: Should this propagate fast-math-flags?
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
+ SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
- SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
+ SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
- SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
- MVT::i32, FloorMul);
+ SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT,
+ SL, MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
- SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
+ DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
+
+ if (Signed && SrcVT == MVT::f32) {
+ assert(Sign);
+ // Flip the result based on the signedness, which is either all 0s or 1s.
+ Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
+ DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
+ // r := xor(r, sign) - sign;
+ Result =
+ DAG.getNode(ISD::SUB, SL, MVT::i64,
+ DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
+ }
- return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
+ return Result;
}
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
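To make the commented algorithm in LowerFP_TO_INT64 above concrete, here is a minimal scalar model of the unsigned path in plain C++, using double arithmetic in place of the FTRUNC/FMUL/FFLOOR/FMA nodes. It assumes 0 <= Val < 2^64; the signed f32 path additionally applies the sign-mask flip shown in the hunk.

  #include <cmath>
  #include <cstdint>

  uint64_t fpToUint64Model(double Val) {
    double T  = std::trunc(Val);                   // tf  := trunc(val)
    double Hi = std::floor(T * 0x1p-32);           // hif := floor(tf * 2^-32)
    double Lo = std::fma(Hi, -0x1p32, T);          // lof := tf - hif * 2^32 (>= 0)
    uint32_t HiBits = static_cast<uint32_t>(Hi);   // hi  := fptoui(hif)
    uint32_t LoBits = static_cast<uint32_t>(Lo);   // lo  := fptoui(lof)
    return (uint64_t(HiBits) << 32) | LoBits;
  }

  // For example, fpToUint64Model(4294967301.0), i.e. 2^32 + 5, returns 0x100000005.
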
@@ -2707,44 +2808,37 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}
-SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ unsigned OpOpcode = Op.getOpcode();
+ EVT SrcVT = Src.getValueType();
+ EVT DestVT = Op.getValueType();
- // TODO: Factor out code common with LowerFP_TO_UINT.
+ // Will be selected natively
+ if (SrcVT == MVT::f16 && DestVT == MVT::i16)
+ return Op;
- EVT SrcVT = Src.getValueType();
- if (SrcVT == MVT::f16 ||
- (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
+ // Promote i16 to i32
+ if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
SDLoc DL(Op);
- SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
}
- if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
- return LowerFP64_TO_INT(Op, DAG, true);
-
- return SDValue();
-}
-
-SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
-
- // TODO: Factor out code common with LowerFP_TO_SINT.
-
- EVT SrcVT = Src.getValueType();
if (SrcVT == MVT::f16 ||
(SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
SDLoc DL(Op);
- SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+ unsigned Ext =
+ OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
}
- if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
- return LowerFP64_TO_INT(Op, DAG, false);
+ if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
+ return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
return SDValue();
}
@@ -2787,8 +2881,8 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
}
-static SDValue simplifyI24(SDNode *Node24,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue simplifyMul24(SDNode *Node24,
+ TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
@@ -2890,9 +2984,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
- LN->getMemOperand()->getFlags(),
- &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
SDValue Ops[2];
if (VT.isVector())
@@ -2946,9 +3039,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
- SN->getMemOperand()->getFlags(),
- &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3010,7 +3102,7 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
switch (IID) {
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
- return simplifyI24(N, DCI);
+ return simplifyMul24(N, DCI);
case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
@@ -3312,6 +3404,13 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ if (!N->isDivergent())
+ return SDValue();
+
unsigned Size = VT.getSizeInBits();
if (VT.isVector() || Size > 64)
return SDValue();
@@ -3362,6 +3461,15 @@ SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
if (!Subtarget->hasMulI24() || VT.isVector())
return SDValue();
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a
+ // VALU op anyway).
+ if (Subtarget->hasSMulHi() && !N->isDivergent())
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -3386,6 +3494,15 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
return SDValue();
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a
+ // VALU op anyway).
+ if (Subtarget->hasSMulHi() && !N->isDivergent())
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -3985,11 +4102,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24:
case AMDGPUISD::MULHI_I24:
- case AMDGPUISD::MULHI_U24: {
- if (SDValue V = simplifyI24(N, DCI))
- return V;
- return SDValue();
- }
+ case AMDGPUISD::MULHI_U24:
+ return simplifyMul24(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::FNEG:
@@ -4159,8 +4273,13 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
+ // Stores to the argument stack area are relative to the stack pointer.
+ SDValue SP =
+ DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
+ Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
MachineMemOperand::MODereferenceable);
return Store;
@@ -4297,7 +4416,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
- NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -4350,6 +4468,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
@@ -4425,8 +4545,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
}
- case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT: {
+ case AMDGPUISD::FP_TO_FP16: {
unsigned BitWidth = Known.getBitWidth();
// High bits are zero.
@@ -4573,7 +4692,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::BUFFER_LOAD_USHORT:
return 16;
case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT:
return 16;
default:
return 1;
@@ -4727,3 +4845,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::None;
}
}
+
+bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
+ unsigned Opc, LLT Ty1, LLT Ty2) const {
+ return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index ce3618f83130..e61021d451f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -64,10 +64,9 @@ protected:
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
@@ -328,6 +327,9 @@ public:
}
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1,
+ LLT Ty2) const override;
};
namespace AMDGPUISD {
@@ -458,9 +460,6 @@ enum NodeType : unsigned {
// are known 0.
FP_TO_FP16,
- // Wrapper around fp16 results that are known to zero the high bits.
- FP16_ZEXT,
-
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
@@ -523,6 +522,8 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_FMIN,
+ BUFFER_ATOMIC_FMAX,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 894677ec68b6..0f9cb712f820 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
-def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
@@ -213,6 +212,8 @@ def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
SDTIntToFPOp, []>;
+def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
+ AMDGPUIntPackOp, []>;
// urecip - This operation is a helper for integer division, it returns the
// result of 1 / a as a fractional unsigned integer.
@@ -311,7 +312,7 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
SDTCisInt<4>]>,
[]>;
-def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
@@ -461,3 +462,7 @@ def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp)
def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc),
[(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),
(AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;
+
+def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
+ (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bd577a6fb8c5..323aaaf70cd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -18,6 +18,7 @@
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -59,11 +60,13 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
-void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
- CodeGenCoverage &CoverageInfo) {
+void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
+ CodeGenCoverage &CoverageInfo,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
MRI = &MF.getRegInfo();
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- InstructionSelector::setupMF(MF, KB, CoverageInfo);
+ InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
bool AMDGPUInstructionSelector::isVCC(Register Reg,
@@ -136,20 +139,29 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const TargetRegisterClass *SrcRC
= TRI.getConstrainedRegClassForOperand(Src, *MRI);
- Register MaskedReg = MRI->createVirtualRegister(SrcRC);
+ Optional<ValueAndVReg> ConstVal =
+ getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
+ if (ConstVal) {
+ unsigned MovOpc =
+ STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
+ .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
+ } else {
+ Register MaskedReg = MRI->createVirtualRegister(SrcRC);
- // We can't trust the high bits at this point, so clear them.
+ // We can't trust the high bits at this point, so clear them.
- // TODO: Skip masking high bits if def is known boolean.
+ // TODO: Skip masking high bits if def is known boolean.
- unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
- AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
- BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
- .addImm(1)
- .addReg(SrcReg);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
- .addImm(0)
- .addReg(MaskedReg);
+ unsigned AndOpc =
+ TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+ BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+ .addImm(1)
+ .addReg(SrcReg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+ .addImm(0)
+ .addReg(MaskedReg);
+ }
if (!MRI->getRegClassOrNull(SrcReg))
MRI->setRegClass(SrcReg, SrcRC);
@@ -578,7 +590,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
return true;
const LLT S32 = LLT::scalar(32);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
Register Dst = MI.getOperand(0).getReg();
if (MRI->getType(Dst) != V2S16)
@@ -743,6 +755,30 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register OffsetReg = MI.getOperand(2).getReg();
+ Register WidthReg = MI.getOperand(3).getReg();
+
+ assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
+ "scalar BFX instructions are expanded in regbankselect");
+ assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
+ "64-bit vector BFX instructions are expanded in regbankselect");
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
+ unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
+ .addReg(SrcReg)
+ .addReg(OffsetReg)
+ .addReg(WidthReg);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
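selectG_SBFX_UBFX above maps the generic bitfield-extract opcodes straight onto V_BFE_I32_e64 / V_BFE_U32_e64. For reference, a sketch of what those extracts compute, assuming the conventional offset/width semantics with in-range operands (Offset + Width <= 32):

#include <cstdint>

// Unsigned bitfield extract: Width bits of Src starting at bit Offset,
// zero-extended into the result.
uint32_t ubfx(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  uint32_t Mask = Width < 32 ? ((1u << Width) - 1) : ~0u;
  return (Src >> Offset) & Mask;
}

// Signed variant: the same field, sign-extended from its top bit.
int32_t sbfx(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  uint32_t Field = ubfx(Src, Offset, Width);
  uint32_t SignBit = 1u << (Width - 1);
  return (int32_t)((Field ^ SignBit) - SignBit);
}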
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
if (STI.getLDSBankCount() != 16)
return selectImpl(MI, *CoverageInfo);
@@ -916,8 +952,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return constrainCopyLikeIntrin(I, AMDGPU::WQM);
case Intrinsic::amdgcn_softwqm:
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+ case Intrinsic::amdgcn_strict_wwm:
case Intrinsic::amdgcn_wwm:
- return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
+ case Intrinsic::amdgcn_strict_wqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
case Intrinsic::amdgcn_writelane:
return selectWritelane(I);
case Intrinsic::amdgcn_div_scale:
@@ -1375,7 +1414,24 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
if (HasVSrc) {
Register VSrc = MI.getOperand(1).getReg();
- MIB.addReg(VSrc);
+
+ if (STI.needsAlignedVGPRs()) {
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ MIB.addReg(NewVR, 0, AMDGPU::sub0);
+ MIB.addReg(NewVR, RegState::Implicit);
+ } else {
+ MIB.addReg(VSrc);
+ }
+
if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
return false;
}
@@ -1446,24 +1502,6 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
return TexFailCtrl == 0;
}
-static bool parseCachePolicy(uint64_t Value,
- bool *GLC, bool *SLC, bool *DLC) {
- if (GLC) {
- *GLC = (Value & 0x1) ? 1 : 0;
- Value &= ~(uint64_t)0x1;
- }
- if (SLC) {
- *SLC = (Value & 0x2) ? 1 : 0;
- Value &= ~(uint64_t)0x2;
- }
- if (DLC) {
- *DLC = (Value & 0x4) ? 1 : 0;
- Value &= ~(uint64_t)0x4;
- }
-
- return Value == 0;
-}
-
bool AMDGPUInstructionSelector::selectImageIntrinsic(
MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
MachineBasicBlock *MBB = MI.getParent();
@@ -1504,8 +1542,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
const bool IsA16 = (Flags & 1) != 0;
const bool IsG16 = (Flags & 2) != 0;
- // A16 implies 16 bit gradients
- if (IsA16 && !IsG16)
+  // A16 implies 16-bit gradients if the subtarget doesn't support G16
+ if (IsA16 && !STI.hasG16() && !IsG16)
return false;
unsigned DMask = 0;
@@ -1589,21 +1627,11 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
// TODO: Check this in verifier.
assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
- bool GLC = false;
- bool SLC = false;
- bool DLC = false;
- if (BaseOpcode->Atomic) {
- GLC = true; // TODO no-return optimization
- if (!parseCachePolicy(
- MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
- &SLC, IsGFX10Plus ? &DLC : nullptr))
- return false;
- } else {
- if (!parseCachePolicy(
- MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
- &SLC, IsGFX10Plus ? &DLC : nullptr))
- return false;
- }
+ unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
+ if (BaseOpcode->Atomic)
+ CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ if (CPol & ~AMDGPU::CPol::ALL)
+ return false;
int NumVAddrRegs = 0;
int NumVAddrDwords = 0;
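The deleted parseCachePolicy() decoded GLC (bit 0), SLC (bit 1) and DLC (bit 2) into separate flags; the replacement keeps the intrinsic's immediate as one cache-policy ("cpol") bitmask, ORs in GLC for atomics, and rejects anything outside the known bits. A small sketch of that check; the bit values come from the deleted helper, and the exact AMDGPU::CPol enumerators are assumed:

#include <cstdint>

// Assumed bit layout, matching the removed parseCachePolicy(): GLC, SLC, DLC.
enum CPolBits : unsigned { GLC = 0x1, SLC = 0x2, DLC = 0x4, ALL = GLC | SLC | DLC };

// Mirrors the new code path: atomics force GLC (TODO: no-return optimization),
// and any unknown policy bit makes selection fail.
bool normalizeCachePolicy(unsigned &CPol, bool IsAtomic) {
  if (IsAtomic)
    CPol |= GLC;
  return (CPol & ~unsigned(ALL)) == 0;
}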
@@ -1661,8 +1689,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
MIB.addDef(TmpReg);
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
- .addReg(TmpReg, RegState::Kill, SubReg);
+ if (!MRI->use_empty(VDataOut)) {
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+ }
} else {
MIB.addDef(VDataOut); // vdata output
@@ -1689,11 +1719,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (IsGFX10Plus)
MIB.addImm(DimInfo->Encoding);
MIB.addImm(Unorm);
- if (IsGFX10Plus)
- MIB.addImm(DLC);
- MIB.addImm(GLC);
- MIB.addImm(SLC);
+ MIB.addImm(CPol);
MIB.addImm(IsA16 && // a16 or r128
STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
if (IsGFX10Plus)
@@ -1706,6 +1733,38 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (BaseOpcode->HasD16)
MIB.addImm(IsD16 ? -1 : 0);
+ if (IsTexFail) {
+ // An image load instruction with TFE/LWE only conditionally writes to its
+ // result registers. Initialize them to zero so that we always get well
+ // defined result values.
+ assert(VDataOut && !VDataIn);
+ Register Tied = MRI->cloneVirtualRegister(VDataOut);
+ Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
+ .addImm(0);
+ auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
+ if (STI.usePRTStrictNull()) {
+ // With enable-prt-strict-null enabled, initialize all result registers to
+ // zero.
+ auto RegSeq =
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+ for (auto Sub : Parts)
+ RegSeq.addReg(Zero).addImm(Sub);
+ } else {
+ // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
+ // result register.
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ auto RegSeq =
+ BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
+ for (auto Sub : Parts.drop_back(1))
+ RegSeq.addReg(Undef).addImm(Sub);
+ RegSeq.addReg(Zero).addImm(Parts.back());
+ }
+ MIB.addReg(Tied, RegState::Implicit);
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ }
+
MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
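The new IsTexFail block gives TFE/LWE image loads a fully initialized tied input, since the hardware only conditionally writes the result registers. A sketch of the per-32-bit-part initial values it builds (std::nullopt stands in for an IMPLICIT_DEF part; plain C++, not MIR):

#include <cstdint>
#include <optional>
#include <vector>

// Initial value of each 32-bit result part before the image load executes.
std::vector<std::optional<uint32_t>> tfeInitParts(unsigned NumParts,
                                                  bool StrictNull) {
  std::vector<std::optional<uint32_t>> Parts(NumParts);
  if (StrictNull) {
    for (auto &P : Parts)
      P = 0;            // every data part and the TFE/LWE word start at zero
  } else {
    Parts.back() = 0;   // only the extra TFE/LWE status word is zeroed
  }
  return Parts;
}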
@@ -1733,7 +1792,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
- return selectGlobalAtomicFaddIntrinsic(I);
+ return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
default: {
return selectImpl(I, *CoverageInfo);
}
@@ -1848,7 +1907,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
- if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -2336,6 +2395,13 @@ void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
MachineInstr &I) const {
+ if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
+ const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS)
+ return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
+ }
+
initM0(I);
return selectImpl(I, *CoverageInfo);
}
@@ -2386,8 +2452,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
MIB.addImm(0);
MIB.addImm(Offset);
- MIB.addImm(1); // glc
- MIB.addImm(0); // slc
+ MIB.addImm(AMDGPU::CPol::GLC);
MIB.cloneMemRefs(MI);
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
@@ -2772,7 +2837,7 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
Register Src1Reg = MI.getOperand(2).getReg();
ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
return false;
@@ -2895,6 +2960,8 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
+ if (STI.hasGFX90AInsts())
+ return selectImpl(MI, *CoverageInfo);
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2951,7 +3018,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
- Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
.addReg(VIndex.getReg())
.addImm(AMDGPU::sub0)
@@ -2968,7 +3035,7 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
I.add(MI.getOperand(2)); // rsrc
I.add(SOffset);
I.addImm(Offset);
- renderExtractSLC(I, MI, 7);
+ I.addImm(MI.getOperand(7).getImm()); // cpol
I.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -2976,8 +3043,14 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
return true;
}
-bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
- MachineInstr &MI) const{
+bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
+ MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
+
+ if (STI.hasGFX90AInsts()) {
+    // gfx90a adds return versions of the global atomic fadd instructions, so
+    // no special handling is required.
+ return selectImpl(MI, *CoverageInfo);
+ }
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2994,16 +3067,16 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
// FIXME: This is only needed because tablegen requires number of dst operands
// in match and replace pattern to be the same. Otherwise patterns can be
// exported from SDag path.
- auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
+ auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
- Register Data = MI.getOperand(3).getReg();
+ Register Data = DataOp.getReg();
const unsigned Opc = MRI->getType(Data).isVector() ?
AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
.addReg(Addr.first)
.addReg(Data)
.addImm(Addr.second)
- .addImm(0) // SLC
+ .addImm(0) // cpol
.cloneMemRefs(MI);
MI.eraseFromParent();
@@ -3140,6 +3213,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectBVHIntrinsic(I);
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
+ case AMDGPU::G_SBFX:
+ case AMDGPU::G_UBFX:
+ return selectG_SBFX_UBFX(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3282,7 +3358,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
// It's possible to see an f32 fneg here, but unlikely.
// TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::vector(2, 16)) {
+ MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = MI->getOperand(1).getReg();
MI = MRI.getVRegDef(Src);
@@ -3408,9 +3484,9 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
}};
}
-template <bool Signed>
std::pair<Register, int>
-AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
+ uint64_t FlatVariant) const {
MachineInstr *MI = Root.getParent();
auto Default = std::make_pair(Root.getReg(), 0);
@@ -3426,7 +3502,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
- if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
+ if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
return Default;
return std::make_pair(PtrBase, ConstOffset);
@@ -3434,7 +3510,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
- auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
@@ -3443,8 +3519,18 @@ AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
- auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
+AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
+ auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
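selectFlatOffsetImpl now takes the flat instruction variant (FLAT, global or scratch) whose offset-encoding rules should be checked, instead of the old Signed template parameter, and the renderer helpers above just pass the matching SIInstrFlags value. A minimal model of the fold it performs, with the legality predicate left abstract (names are illustrative, not the real API):

#include <cstdint>
#include <utility>

using PtrAndOffset = std::pair<unsigned /*vreg*/, int>;

// If the folded constant is not encodable for the requested flat variant,
// fall back to the original, un-split pointer with a zero immediate, which is
// what the Default pair in the real code expresses.
PtrAndOffset foldFlatOffset(unsigned RootReg, unsigned PtrBase, int64_t Offset,
                            bool OffsetLegalForVariant) {
  if (Offset != 0 && OffsetLegalForVariant)
    return {PtrBase, static_cast<int>(Offset)};
  return {RootReg, 0};
}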
@@ -3483,39 +3569,56 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
- if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
- } else if (ConstOffset > 0) {
+ } else {
auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
if (!PtrBaseDef)
return None;
if (isSGPR(PtrBaseDef->Reg)) {
- // Offset is too large.
- //
- // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
- // + (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset)
- = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
-
- if (isUInt<32>(RemainderOffset)) {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- Register HighBits
- = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
- HighBits)
- .addImm(RemainderOffset);
-
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
- }};
+ if (ConstOffset > 0) {
+ // Offset is too large.
+ //
+ // saddr + large_offset -> saddr +
+ // (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
+ ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+
+ if (isUInt<32>(RemainderOffset)) {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register HighBits =
+ MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ HighBits)
+ .addImm(RemainderOffset);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(HighBits);
+ }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ }};
+ }
}
+
+      // We are adding a 64-bit SGPR and a constant. If the constant bus limit
+      // is 1, we would need 1 or 2 extra moves for each half of the constant,
+      // so it is better to do a scalar add and then issue a single VALU
+      // instruction to materialize zero. Otherwise it takes fewer instructions
+      // to perform VALU adds with immediates or inline literals.
+ unsigned NumLiterals =
+ !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
+ !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
+ if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+ return None;
}
}
}
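Two things happen in the rewritten SGPR-base path above: a positive offset that is too large for the encoding is split into an encodable immediate plus a VGPR remainder, and a heuristic counts how many 32-bit halves of the offset would need literal operands before giving up on the SGPR-base form. A sketch of both, assuming MaxOffset is a contiguous low-bit mask as the comment implies (the real split lives in TII.splitFlatOffset):

#include <cstdint>
#include <utility>

// saddr + large_offset  ==  saddr + voffset + imm, with imm encodable.
// Assumes Off > 0 and MaxOffset of the form 2^k - 1.
std::pair<int64_t, int64_t> splitLargeOffset(int64_t Off, int64_t MaxOffset) {
  int64_t Imm = Off & MaxOffset;        // stays in the instruction
  int64_t Remainder = Off & ~MaxOffset; // materialized into a VGPR
  return {Imm, Remainder};
}

// NumLiterals is how many 32-bit halves of the offset are not inline
// constants. If the constant bus can feed more operands than that, plain
// VALU adds are cheaper, so the SGPR-base form is rejected (return None).
bool rejectSGPRBaseForm(unsigned ConstantBusLimit, unsigned NumLiterals) {
  return ConstantBusLimit > NumLiterals;
}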
@@ -3525,57 +3628,50 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
return None;
// Match the variable offset.
- if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
- // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
- // drop this.
- if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
- AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
- return None;
-
- // It's cheaper to materialize a single 32-bit zero for vaddr than the two
- // moves required to copy a 64-bit SGPR to VGPR.
- const Register SAddr = AddrDef->Reg;
- if (!isSGPR(SAddr))
- return None;
-
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
- VOffset)
- .addImm(0);
-
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
- [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
- }};
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+ // Look through the SGPR->VGPR copy.
+ Register SAddr =
+ getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+
+ if (SAddr && isSGPR(SAddr)) {
+ Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+
+ // It's possible voffset is an SGPR here, but the copy to VGPR will be
+ // inserted later.
+ if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ }}};
+ }
+ }
}
- // Look through the SGPR->VGPR copy.
- Register SAddr =
- getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
- if (!SAddr || !isSGPR(SAddr))
+ // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+ // drop this.
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+ AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
return None;
- Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // It's possible voffset is an SGPR here, but the copy to VGPR will be
- // inserted later.
- Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
- if (!VOffset)
- return None;
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+ .addImm(0);
- return {{[=](MachineInstrBuilder &MIB) { // saddr
- MIB.addReg(SAddr);
- },
- [=](MachineInstrBuilder &MIB) { // voffset
- MIB.addReg(VOffset);
- },
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- }}};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
}
InstructionSelector::ComplexRendererFns
@@ -3590,7 +3686,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0 &&
- TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
}
@@ -3624,9 +3721,9 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
const DebugLoc &DL = I.getDebugLoc();
SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
- .addFrameIndex(FI)
- .addReg(RHSDef->Reg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
+ .addFrameIndex(FI)
+ .addReg(RHSDef->Reg);
}
}
@@ -3639,11 +3736,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}};
}
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
- auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
- return PSV && PSV->isStack();
-}
-
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
@@ -3685,23 +3777,19 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
Optional<int> FI;
Register VAddr = Root.getReg();
if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
- if (isBaseWithConstantOffset(Root, *MRI)) {
- const MachineOperand &LHS = RootDef->getOperand(1);
- const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t PossibleOffset =
- RHSDef->getOperand(1).getCImm()->getSExtValue();
- if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
- (!STI.privateMemoryResourceIsRangeChecked() ||
- KnownBits->signBitIsZero(LHS.getReg()))) {
- if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
- FI = LHSDef->getOperand(1).getIndex();
- else
- VAddr = LHS.getReg();
- Offset = PossibleOffset;
- }
+ Register PtrBase;
+ int64_t ConstOffset;
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
+ if (ConstOffset != 0) {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
+ (!STI.privateMemoryResourceIsRangeChecked() ||
+ KnownBits->signBitIsZero(PtrBase))) {
+ const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
+ if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
+ FI = PtrBaseDef->getOperand(1).getIndex();
+ else
+ VAddr = PtrBase;
+ Offset = ConstOffset;
}
} else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
FI = RootDef->getOperand(1).getIndex();
@@ -3769,18 +3857,13 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
const MachineFunction *MF = MBB->getParent();
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
- const MachineMemOperand *MMO = *MI->memoperands_begin();
- const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
},
[=](MachineInstrBuilder &MIB) { // soffset
- if (isStackPtrRelative(PtrInfo))
- MIB.addReg(Info->getStackPtrOffsetReg());
- else
- MIB.addImm(0);
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
@@ -4130,10 +4213,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
},
- addZeroImm, // glc
- addZeroImm, // slc
+ addZeroImm, // cpol
addZeroImm, // tfe
- addZeroImm, // dlc
addZeroImm // swz
}};
}
@@ -4158,11 +4239,9 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- addZeroImm, // glc
- addZeroImm, // slc
+ addZeroImm, // cpol
addZeroImm, // tfe
- addZeroImm, // dlc
- addZeroImm // swz
+ addZeroImm, // swz
}};
}
@@ -4194,7 +4273,9 @@ AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
},
- addZeroImm // slc
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(AMDGPU::CPol::GLC); // cpol
+ }
}};
}
@@ -4218,7 +4299,7 @@ AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- addZeroImm // slc
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
}};
}
@@ -4308,32 +4389,25 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
-void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
+void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
+ MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
}
-void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
- assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
-}
-
-void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
+void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}
-void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
+void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+ MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
}
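Taken together, these renderers imply a packed auxiliary operand for the buffer intrinsics: the low bits carry the cache policy and bit 3 is the swizzle flag, with renderSetGLC forcing GLC for atomics that return a value. A tiny sketch of that layout; the concrete AMDGPU::CPol values are assumptions, only the bit positions follow from the old and new renderers:

// Assumed aux-operand layout: bits 0..2 = GLC/SLC/DLC, bit 3 = swz.
unsigned extractCPol(unsigned Aux) { return Aux & 0x7; }      // CPol::ALL (assumed)
unsigned extractSwz(unsigned Aux)  { return (Aux >> 3) & 1; }
unsigned setGLC(unsigned CPol)     { return CPol | 0x1; }     // CPol::GLC (assumed)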
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index d70f18098cd7..cb05a1cb6369 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -36,6 +36,8 @@ struct ImageDimIntrinsicInfo;
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class AMDGPUTargetMachine;
+class BlockFrequencyInfo;
+class ProfileSummaryInfo;
class GCNSubtarget;
class MachineInstr;
class MachineIRBuilder;
@@ -45,6 +47,7 @@ class RegisterBank;
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
+class TargetRegisterClass;
class AMDGPUInstructionSelector final : public InstructionSelector {
private:
@@ -59,8 +62,9 @@ public:
bool select(MachineInstr &I) override;
static const char *getName();
- void setupMF(MachineFunction &MF, GISelKnownBits &KB,
- CodeGenCoverage &CoverageInfo) override;
+ void setupMF(MachineFunction &MF, GISelKnownBits *KB,
+ CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) override;
private:
struct GEPInfo {
@@ -105,6 +109,7 @@ private:
bool selectG_PTR_ADD(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
+ bool selectG_SBFX_UBFX(MachineInstr &I) const;
bool selectInterpP1F16(MachineInstr &MI) const;
bool selectWritelane(MachineInstr &MI) const;
@@ -143,7 +148,8 @@ private:
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
- bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
+ bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
+ MachineOperand &DataOp) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
@@ -187,14 +193,15 @@ private:
InstructionSelector::ComplexRendererFns
selectSmrdSgpr(MachineOperand &Root) const;
- template <bool Signed>
- std::pair<Register, int>
- selectFlatOffsetImpl(MachineOperand &Root) const;
+ std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root,
+ uint64_t FlatVariant) const;
InstructionSelector::ComplexRendererFns
selectFlatOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectFlatOffsetSigned(MachineOperand &Root) const;
+ selectGlobalOffset(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
@@ -274,26 +281,6 @@ private:
void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
- void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
- void renderTruncTImm8(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
- void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
- void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const {
- renderTruncTImm(MIB, MI, OpIdx);
- }
-
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
@@ -302,14 +289,13 @@ private:
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
- void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
- void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
- void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
+ void renderExtractCPol(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 8ef9c99e8b35..119c4089d6c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -83,8 +83,7 @@ def FalsePredicate : Predicate<"false">;
// Add a predicate to the list if it does not already exist, deduplicating it.
class PredConcat<list<Predicate> lst, Predicate pred> {
list<Predicate> ret =
- !listconcat([pred], !filter(item, lst,
- !ne(!cast<string>(item), !cast<string>(pred))));
+ !listconcat([pred], !filter(item, lst, !ne(item, pred)));
}
class PredicateControl {
@@ -185,6 +184,28 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
}];
}
+class is_canonicalized<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
+
+ return Lowering.isCanonicalized(*CurDAG, N->getOperand(0)) &&
+ Lowering.isCanonicalized(*CurDAG, N->getOperand(1));
+ }]> {
+
+  // TODO: Improve the Legalizer for g_build_vector in GlobalISel to match
+  // this class.
+ let GISelPredicateCode = [{
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
+ TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
+ }];
+}
+
+
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
@@ -596,12 +617,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt rc:$addr)
>;
-// fshr pattern
-class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
- (fshr i32:$src0, i32:$src1, i32:$src2),
- (BIT_ALIGN $src0, $src1, $src2)
->;
-
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 8aea33cf289d..4971b010870d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -165,10 +165,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
auto *NewPtr = IRB.CreateBitCast(
- IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
- Offset - Adjust),
+ IRB.CreateConstGEP1_64(
+ IRB.getInt8Ty(),
+ IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
+ Offset - Adjust),
Int32PtrTy);
- LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+ LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
NewLd->copyMetadata(LI);
NewLd->setMetadata(LLVMContext::MD_range, nullptr);
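The updated calls spell out the GEP element type and the load result type instead of letting IRBuilder infer them from the pointer's pointee type, and use an address-space-correct cast for the base. A self-contained sketch of the same pattern; the helper name and parameters are illustrative:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Build an i32 load at Base + Offset bytes, with every type stated
// explicitly so nothing is inferred from the pointer's pointee type.
static LoadInst *loadI32AtByteOffset(IRBuilder<> &IRB, Value *Base,
                                     uint64_t Offset, unsigned AS) {
  PointerType *Int8PtrTy = Type::getInt8PtrTy(IRB.getContext(), AS);
  PointerType *Int32PtrTy = Type::getInt32PtrTy(IRB.getContext(), AS);
  Value *I8Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy);
  Value *GEP = IRB.CreateConstGEP1_64(IRB.getInt8Ty(), I8Ptr, Offset);
  Value *Ptr = IRB.CreateBitCast(GEP, Int32PtrTy);
  return IRB.CreateAlignedLoad(IRB.getInt32Ty(), Ptr, Align(4));
}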
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9f359c232981..c1a9b30a509e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -18,7 +18,9 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -47,7 +49,7 @@ static constexpr unsigned MaxRegisterSize = 1024;
static LLT getPow2VectorType(LLT Ty) {
unsigned NElts = Ty.getNumElements();
unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
- return Ty.changeNumElements(Pow2NElts);
+ return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
// Round the number of bits to the next power of two bits
@@ -93,7 +95,8 @@ static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getElementType();
- return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
+ return std::make_pair(TypeIdx,
+ LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
};
}
@@ -104,7 +107,9 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
unsigned Size = Ty.getSizeInBits();
unsigned Pieces = (Size + 63) / 64;
unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
- return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
+ return std::make_pair(
+ TypeIdx,
+ LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
};
}
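This and the following hunks are part of the LLT API migration: fixed-length vectors are now built with LLT::fixed_vector, and scalarOrVector takes an ElementCount, collapsing to a plain scalar when the count is one. A minimal sketch of the pattern; the header paths are assumed for this LLVM revision:

#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

static void lltExamples() {
  LLT V2S16 = LLT::fixed_vector(2, 16); // <2 x s16>, explicitly fixed-length
  // scalarOrVector with a count of one degrades to the scalar type itself.
  LLT JustS32 =
      LLT::scalarOrVector(ElementCount::getFixed(1), LLT::scalar(32));
  (void)V2S16;
  (void)JustS32;
}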
@@ -122,7 +127,7 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
assert(EltSize < 32);
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
- return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
+ return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
};
}
@@ -136,7 +141,7 @@ static LLT getBitcastRegisterType(const LLT Ty) {
return LLT::scalar(Size);
}
- return LLT::scalarOrVector(Size / 32, 32);
+ return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
@@ -151,7 +156,8 @@ static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
assert(Size % 32 == 0);
- return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
+ return std::make_pair(
+ TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
};
}
@@ -220,11 +226,13 @@ static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
};
}
-static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
+// If we have a truncating store or an extending load with a data size larger
+// than 32-bits, we need to reduce to a 32-bit type.
+static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
- Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
+ Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
};
}
@@ -257,15 +265,14 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
}
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
- const LegalityQuery &Query,
- unsigned Opcode) {
+ const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
// Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
- const bool IsLoad = Opcode != AMDGPU::G_STORE;
+ const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
@@ -273,6 +280,10 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return false;
+ // Do not handle extending vector loads.
+ if (Ty.isVector() && MemSize != RegSize)
+ return false;
+
// TODO: We should be able to widen loads if the alignment is high enough, but
// we also need to modify the memory access size.
#if 0
@@ -341,33 +352,37 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
return EltSize != 32 && EltSize != 64;
}
-static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
- unsigned Opcode) {
+static bool isLoadStoreLegal(const GCNSubtarget &ST,
+                             const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
- return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
+ return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
!loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
- const unsigned MemSizeInBits) {
+ const LLT MemTy) {
+ const unsigned MemSizeInBits = MemTy.getSizeInBits();
const unsigned Size = Ty.getSizeInBits();
- if (Size != MemSizeInBits)
- return Size <= 32 && Ty.isVector();
+ if (Size != MemSizeInBits)
+ return Size <= 32 && Ty.isVector();
if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
return true;
- return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+
+ // Don't try to handle bitcasting vector ext loads for now.
+ return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
+ (Size <= 32 || isRegisterSize(Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
}
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes, not the size of the result register.
-static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
+static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
unsigned AlignInBits, unsigned AddrSpace,
unsigned Opcode) {
+ unsigned SizeInBits = MemoryTy.getSizeInBits();
// We don't want to widen cases that are naturally legal.
if (isPowerOf2_32(SizeInBits))
return false;
@@ -403,7 +418,7 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
return false;
- return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
+ return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
Query.MMODescrs[0].AlignInBits,
Query.Types[1].getAddressSpace(), Opcode);
}
@@ -427,35 +442,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT S512 = LLT::scalar(512);
const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
- const LLT V2S8 = LLT::vector(2, 8);
- const LLT V2S16 = LLT::vector(2, 16);
- const LLT V4S16 = LLT::vector(4, 16);
-
- const LLT V2S32 = LLT::vector(2, 32);
- const LLT V3S32 = LLT::vector(3, 32);
- const LLT V4S32 = LLT::vector(4, 32);
- const LLT V5S32 = LLT::vector(5, 32);
- const LLT V6S32 = LLT::vector(6, 32);
- const LLT V7S32 = LLT::vector(7, 32);
- const LLT V8S32 = LLT::vector(8, 32);
- const LLT V9S32 = LLT::vector(9, 32);
- const LLT V10S32 = LLT::vector(10, 32);
- const LLT V11S32 = LLT::vector(11, 32);
- const LLT V12S32 = LLT::vector(12, 32);
- const LLT V13S32 = LLT::vector(13, 32);
- const LLT V14S32 = LLT::vector(14, 32);
- const LLT V15S32 = LLT::vector(15, 32);
- const LLT V16S32 = LLT::vector(16, 32);
- const LLT V32S32 = LLT::vector(32, 32);
-
- const LLT V2S64 = LLT::vector(2, 64);
- const LLT V3S64 = LLT::vector(3, 64);
- const LLT V4S64 = LLT::vector(4, 64);
- const LLT V5S64 = LLT::vector(5, 64);
- const LLT V6S64 = LLT::vector(6, 64);
- const LLT V7S64 = LLT::vector(7, 64);
- const LLT V8S64 = LLT::vector(8, 64);
- const LLT V16S64 = LLT::vector(16, 64);
+ const LLT V2S8 = LLT::fixed_vector(2, 8);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ const LLT V4S16 = LLT::fixed_vector(4, 16);
+
+ const LLT V2S32 = LLT::fixed_vector(2, 32);
+ const LLT V3S32 = LLT::fixed_vector(3, 32);
+ const LLT V4S32 = LLT::fixed_vector(4, 32);
+ const LLT V5S32 = LLT::fixed_vector(5, 32);
+ const LLT V6S32 = LLT::fixed_vector(6, 32);
+ const LLT V7S32 = LLT::fixed_vector(7, 32);
+ const LLT V8S32 = LLT::fixed_vector(8, 32);
+ const LLT V9S32 = LLT::fixed_vector(9, 32);
+ const LLT V10S32 = LLT::fixed_vector(10, 32);
+ const LLT V11S32 = LLT::fixed_vector(11, 32);
+ const LLT V12S32 = LLT::fixed_vector(12, 32);
+ const LLT V13S32 = LLT::fixed_vector(13, 32);
+ const LLT V14S32 = LLT::fixed_vector(14, 32);
+ const LLT V15S32 = LLT::fixed_vector(15, 32);
+ const LLT V16S32 = LLT::fixed_vector(16, 32);
+ const LLT V32S32 = LLT::fixed_vector(32, 32);
+
+ const LLT V2S64 = LLT::fixed_vector(2, 64);
+ const LLT V3S64 = LLT::fixed_vector(3, 64);
+ const LLT V4S64 = LLT::fixed_vector(4, 64);
+ const LLT V5S64 = LLT::fixed_vector(5, 64);
+ const LLT V6S64 = LLT::fixed_vector(6, 64);
+ const LLT V7S64 = LLT::fixed_vector(7, 64);
+ const LLT V8S64 = LLT::fixed_vector(8, 64);
+ const LLT V16S64 = LLT::fixed_vector(16, 64);
std::initializer_list<LLT> AllS32Vectors =
{V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
@@ -495,8 +510,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
- setAction({G_BRCOND, S1}, Legal); // VCC branches
- setAction({G_BRCOND, S32}, Legal); // SCC branches
+ // s1 for VCC branches, s32 for SCC branches.
+ getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
@@ -579,11 +594,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
}
- getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
- .customFor({S32, S64})
- .clampScalar(0, S32, S64)
- .widenScalarToNextPow2(0, 32)
- .scalarize(0);
+ getActionDefinitionsBuilder(
+ {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
+ .customFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(0, 32)
+ .scalarize(0);
auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
.legalFor({S32})
@@ -643,7 +659,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16);
- setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+ getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
// If the amount is divergent, we have to do a wave reduction to get the
// maximum value, so this is expanded during RegBankSelect.
@@ -653,7 +669,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));
- setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+ getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
@@ -809,7 +825,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
- .customFor({{S64, S64}})
+ .customFor({{S64, S32}, {S64, S64}})
.narrowScalarFor({{S64, S16}}, changeTo(0, S32));
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
@@ -817,6 +833,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPToI.minScalar(1, S32);
FPToI.minScalar(0, S32)
+ .widenScalarToNextPow2(0, 32)
.scalarize(0)
.lower();
@@ -935,10 +952,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
+ // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
+ // RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
- .legalFor({S32})
- .clampScalar(0, S32, S32)
- .scalarize(0);
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0);
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_BSWAP)
@@ -951,7 +971,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16, V2S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
@@ -960,7 +980,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
} else {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
.widenScalarToNextPow2(0)
.minScalar(0, S16)
@@ -979,7 +999,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32})
.minScalar(0, S32)
.widenScalarToNextPow2(0)
@@ -1029,7 +1049,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT DstTy = Query.Types[0];
// Split vector extloads.
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
if (MemSize < DstTy.getSizeInBits())
@@ -1078,35 +1098,35 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &Actions = getActionDefinitionsBuilder(Op);
// Explicitly list some common cases.
// TODO: Does this help compile time at all?
- Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
- {V2S32, GlobalPtr, 64, GlobalAlign32},
- {V4S32, GlobalPtr, 128, GlobalAlign32},
- {S64, GlobalPtr, 64, GlobalAlign32},
- {V2S64, GlobalPtr, 128, GlobalAlign32},
- {V2S16, GlobalPtr, 32, GlobalAlign32},
- {S32, GlobalPtr, 8, GlobalAlign8},
- {S32, GlobalPtr, 16, GlobalAlign16},
-
- {S32, LocalPtr, 32, 32},
- {S64, LocalPtr, 64, 32},
- {V2S32, LocalPtr, 64, 32},
- {S32, LocalPtr, 8, 8},
- {S32, LocalPtr, 16, 16},
- {V2S16, LocalPtr, 32, 32},
-
- {S32, PrivatePtr, 32, 32},
- {S32, PrivatePtr, 8, 8},
- {S32, PrivatePtr, 16, 16},
- {V2S16, PrivatePtr, 32, 32},
-
- {S32, ConstantPtr, 32, GlobalAlign32},
- {V2S32, ConstantPtr, 64, GlobalAlign32},
- {V4S32, ConstantPtr, 128, GlobalAlign32},
- {S64, ConstantPtr, 64, GlobalAlign32},
- {V2S32, ConstantPtr, 32, GlobalAlign32}});
+ Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
+ {V2S32, GlobalPtr, V2S32, GlobalAlign32},
+ {V4S32, GlobalPtr, V4S32, GlobalAlign32},
+ {S64, GlobalPtr, S64, GlobalAlign32},
+ {V2S64, GlobalPtr, V2S64, GlobalAlign32},
+ {V2S16, GlobalPtr, V2S16, GlobalAlign32},
+ {S32, GlobalPtr, S8, GlobalAlign8},
+ {S32, GlobalPtr, S16, GlobalAlign16},
+
+ {S32, LocalPtr, S32, 32},
+ {S64, LocalPtr, S64, 32},
+ {V2S32, LocalPtr, V2S32, 32},
+ {S32, LocalPtr, S8, 8},
+ {S32, LocalPtr, S16, 16},
+ {V2S16, LocalPtr, S32, 32},
+
+ {S32, PrivatePtr, S32, 32},
+ {S32, PrivatePtr, S8, 8},
+ {S32, PrivatePtr, S16, 16},
+ {V2S16, PrivatePtr, S32, 32},
+
+ {S32, ConstantPtr, S32, GlobalAlign32},
+ {V2S32, ConstantPtr, V2S32, GlobalAlign32},
+ {V4S32, ConstantPtr, V4S32, GlobalAlign32},
+ {S64, ConstantPtr, S64, GlobalAlign32},
+ {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
Actions.legalIf(
[=](const LegalityQuery &Query) -> bool {
- return isLoadStoreLegal(ST, Query, Op);
+ return isLoadStoreLegal(ST, Query);
});
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
@@ -1125,7 +1145,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
return shouldBitcastLoadStoreType(ST, Query.Types[0],
- Query.MMODescrs[0].SizeInBits);
+ Query.MMODescrs[0].MemoryTy);
}, bitcastToRegisterType(0));
if (!IsStore) {
@@ -1148,7 +1168,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PtrTy = Query.Types[1];
const unsigned DstSize = DstTy.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
// Split extloads.
if (DstSize > MemSize)
@@ -1196,16 +1216,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: 3 element stores scalarized on SI
// Split if it's too large for the address space.
- if (Query.MMODescrs[0].SizeInBits > MaxSize) {
+ unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ if (MemSize > MaxSize) {
unsigned NumElts = DstTy.getNumElements();
unsigned EltSize = EltTy.getSizeInBits();
if (MaxSize % EltSize == 0) {
return std::make_pair(
- 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
+ 0, LLT::scalarOrVector(
+ ElementCount::getFixed(MaxSize / EltSize), EltTy));
}
- unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
+ unsigned NumPieces = MemSize / MaxSize;
// FIXME: Refine when odd breakdowns handled
// The scalars will need to be re-legalized.
@@ -1213,12 +1235,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
NumElts % NumPieces != 0)
return std::make_pair(0, EltTy);
- return std::make_pair(0,
- LLT::vector(NumElts / NumPieces, EltTy));
+ return std::make_pair(
+ 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
}
// FIXME: We could probably handle weird extending loads better.
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
if (DstTy.getSizeInBits() > MemSize)
return std::make_pair(0, EltTy);
@@ -1230,48 +1251,58 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// should be OK, since the new parts will be further legalized.
unsigned FloorSize = PowerOf2Floor(DstSize);
return std::make_pair(
- 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
+ 0, LLT::scalarOrVector(
+ ElementCount::getFixed(FloorSize / EltSize), EltTy));
}
// Need to split because of alignment.
unsigned Align = Query.MMODescrs[0].AlignInBits;
if (EltSize > Align &&
(EltSize / Align < DstTy.getNumElements())) {
- return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
+ return std::make_pair(
+ 0, LLT::fixed_vector(EltSize / Align, EltTy));
}
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
.lowerIfMemSizeNotPow2()
- .minScalar(0, S32);
-
- if (IsStore)
- Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
-
- Actions
- .widenScalarToNextPow2(0)
- .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
- .lower();
+ .minScalar(0, S32)
+ .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
+ .widenScalarToNextPow2(0)
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+ .lower();
}
+ // FIXME: Unaligned accesses not lowered.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
- {S32, GlobalPtr, 16, 2 * 8},
- {S32, LocalPtr, 8, 8},
- {S32, LocalPtr, 16, 16},
- {S32, PrivatePtr, 8, 8},
- {S32, PrivatePtr, 16, 16},
- {S32, ConstantPtr, 8, 8},
- {S32, ConstantPtr, 16, 2 * 8}});
+ .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
+ {S32, GlobalPtr, S16, 2 * 8},
+ {S32, LocalPtr, S8, 8},
+ {S32, LocalPtr, S16, 16},
+ {S32, PrivatePtr, S8, 8},
+ {S32, PrivatePtr, S16, 16},
+ {S32, ConstantPtr, S8, 8},
+ {S32, ConstantPtr, S16, 2 * 8}})
+ .legalIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return isLoadStoreLegal(ST, Query);
+ });
+
if (ST.hasFlatAddressSpace()) {
ExtLoads.legalForTypesWithMemDesc(
- {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
+ {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
}
+ // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
+ // 64-bits.
+ //
+ // TODO: Should generalize bitcast action into coerce, which will also cover
+ // inserting addrspacecasts.
+ ExtLoads.customIf(typeIs(1, Constant32Ptr));
+
ExtLoads.clampScalar(0, S32, S32)
.widenScalarToNextPow2(0)
- .unsupportedIfMemSizeNotPow2()
.lower();
auto &Atomics = getActionDefinitionsBuilder(
@@ -1286,10 +1317,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomics()) {
- getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
- .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+ Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasGFX90AInsts())
+ Atomic.legalFor({{S64, LocalPtr}});
}
+ if (ST.hasAtomicFaddInsts())
+ Atomic.legalFor({{S32, GlobalPtr}});
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
@@ -1302,19 +1337,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Condition should be s32 for scalar, s1 for vector.
getActionDefinitionsBuilder(G_SELECT)
- .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
- GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
- LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
- .clampScalar(0, S16, S64)
- .scalarize(1)
- .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .fewerElementsIf(numElementsNotEven(0), scalarize(0))
- .clampMaxNumElements(0, S32, 2)
- .clampMaxNumElements(0, LocalPtr, 2)
- .clampMaxNumElements(0, PrivatePtr, 2)
- .scalarize(0)
- .widenScalarToNextPow2(0)
- .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
+ .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
+ LocalPtr, FlatPtr, PrivatePtr,
+ LLT::fixed_vector(2, LocalPtr),
+ LLT::fixed_vector(2, PrivatePtr)},
+ {S1, S32})
+ .clampScalar(0, S16, S64)
+ .scalarize(1)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .fewerElementsIf(numElementsNotEven(0), scalarize(0))
+ .clampMaxNumElements(0, S32, 2)
+ .clampMaxNumElements(0, LocalPtr, 2)
+ .clampMaxNumElements(0, PrivatePtr, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0)
+ .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
// TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
// be more flexible with the shift amount type.
@@ -1393,7 +1430,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
return std::make_pair(
- VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
+ VecTypeIdx,
+ LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
})
.clampScalar(EltTypeIdx, S32, S64)
.clampScalar(VecTypeIdx, S32, S64)
@@ -1590,17 +1628,44 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();
+  // TODO: Only try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
.legalFor({{S32, S32}})
+ .lowerFor({{V2S16, V2S16}})
+ .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
.scalarize(0)
.lower();
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder(G_FSHL)
+ .lowerFor({{V2S16, V2S16}})
+ .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
+ .scalarize(0)
+ .lower();
+ } else {
+ getActionDefinitionsBuilder(G_FSHL)
+ .scalarize(0)
+ .lower();
+ }
+
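The funnel-shift rules above lower the v2s16 forms and scalarize everything else onto the 32-bit operation. For reference, a sketch of the scalar semantics being legalized (the generic fshl/fshr definition for 32 bits, not backend code); G_FSHR on s32 stays legal above, which lines up with the hardware bit-align operation:

#include <cstdint>

// Conceptually concatenate the two operands and shift by Amt mod 32.
uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  uint64_t Concat = ((uint64_t)Hi << 32) | Lo;
  return (uint32_t)((Concat << (Amt & 31)) >> 32); // top 32 bits
}

uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  uint64_t Concat = ((uint64_t)Hi << 32) | Lo;
  return (uint32_t)(Concat >> (Amt & 31));         // low 32 bits
}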
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
getActionDefinitionsBuilder(G_FENCE)
.alwaysLegal();
+ getActionDefinitionsBuilder({G_SMULO, G_UMULO})
+ .scalarize(0)
+ .minScalar(0, S32)
+ .lower();
+
+ getActionDefinitionsBuilder({G_SBFX, G_UBFX})
+ .legalFor({{S32, S32}, {S64, S32}})
+ .clampScalar(1, S32, S32)
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(0)
+ .scalarize(0);
+
getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
@@ -1614,16 +1679,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_SADDO, G_SSUBO,
// TODO: Implement
- G_FMINIMUM, G_FMAXIMUM,
- G_FSHL
- }).lower();
+ G_FMINIMUM, G_FMAXIMUM}).lower();
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
- computeTables();
+ getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -1668,6 +1731,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
case TargetOpcode::G_GLOBAL_VALUE:
return legalizeGlobalValue(MI, MRI, B);
case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
return legalizeLoad(Helper, MI);
case TargetOpcode::G_FMAD:
return legalizeFMad(MI, MRI, B);
@@ -1675,10 +1740,12 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFDIV(MI, MRI, B);
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
- return legalizeUDIV_UREM(MI, MRI, B);
+ case TargetOpcode::G_UDIVREM:
+ return legalizeUnsignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
- return legalizeSDIV_SREM(MI, MRI, B);
+ case TargetOpcode::G_SDIVREM:
+ return legalizeSignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
case TargetOpcode::G_FLOG:
@@ -1751,7 +1818,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
PtrInfo,
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- 4, commonAlignment(Align(64), StructOffset));
+ LLT::scalar(32), commonAlignment(Align(64), StructOffset));
Register LoadAddr;
@@ -2021,9 +2088,10 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
-bool AMDGPULegalizerInfo::legalizeFPTOI(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool Signed) const {
+bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool Signed) const {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -2031,24 +2099,57 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+ const LLT SrcLT = MRI.getType(Src);
+ assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
unsigned Flags = MI.getFlags();
- auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
- auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
- auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
+ // The basic idea of converting a floating point number into a pair of 32-bit
+ // integers is illustrated as follows:
+ //
+ // tf := trunc(val);
+ // hif := floor(tf * 2^-32);
+ // lof := tf - hif * 2^32; // lof is always positive due to floor.
+ // hi := fptoi(hif);
+ // lo := fptoi(lof);
+ //
+ auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
+ MachineInstrBuilder Sign;
+ if (Signed && SrcLT == S32) {
+ // However, a 32-bit floating point number has only a 23-bit mantissa,
+ // which is not enough to hold all the significant bits of `lof` if val is
+ // negative. To avoid the loss of precision, we need to take the absolute
+ // value after truncating and flip the result back based on the original
+ // signedness.
+ Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
+ Trunc = B.buildFAbs(S32, Trunc, Flags);
+ }
+ MachineInstrBuilder K0, K1;
+ if (SrcLT == S64) {
+ K0 = B.buildFConstant(S64,
+ BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
+ K1 = B.buildFConstant(S64,
+ BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
+ } else {
+ K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
+ K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
+ }
- auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
- auto FloorMul = B.buildFFloor(S64, Mul, Flags);
- auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
+ auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
+ auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
+ auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
- auto Hi = Signed ?
- B.buildFPTOSI(S32, FloorMul) :
- B.buildFPTOUI(S32, FloorMul);
+ auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
+ : B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
- B.buildMerge(Dst, { Lo, Hi });
+ if (Signed && SrcLT == S32) {
+ // Flip the result based on the signedness, which is either all 0s or 1s.
+ Sign = B.buildMerge(S64, {Sign, Sign});
+ // r := xor({lo, hi}, sign) - sign;
+ B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
+ } else
+ B.buildMerge(Dst, {Lo, Hi});
MI.eraseFromParent();
return true;
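
The tf/hif/lof recipe in the comment of legalizeFPTOI above can be checked with a minimal stand-alone C++ sketch of the same arithmetic for the unsigned f64 case; the function name, the hex float constants 0x1p-32/-0x1p32, and the test value are illustrative and not part of the patch. The signed f32 path additionally takes the absolute value after the trunc and applies the xor/sub sign flip shown at the end of the function.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar model of the split described in the comment: tf, hif, lof, hi, lo.
static uint64_t fptoui64ViaPair(double Val) {
  double Tf  = std::trunc(Val);            // tf  := trunc(val)
  double Hif = std::floor(Tf * 0x1p-32);   // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1p32, Tf); // lof := tf - hif * 2^32 (>= 0)
  uint64_t Hi = (uint64_t)(uint32_t)Hif;   // hi  := fptoui(hif)
  uint64_t Lo = (uint32_t)Lof;             // lo  := fptoui(lof)
  return (Hi << 32) | Lo;                  // merge {lo, hi}
}

int main() {
  std::printf("%llu\n", (unsigned long long)fptoui64ViaPair(12345678901.75));
  return 0;
}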
@@ -2141,7 +2242,7 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
bool AMDGPULegalizerInfo::legalizeShuffleVector(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
Register Dst = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(1).getReg();
@@ -2258,7 +2359,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isModuleEntryFunction()) {
+ if (!MFI->isModuleEntryFunction() &&
+ !GV->getName().equals("llvm.amdgcn.module.lds")) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
@@ -2334,11 +2436,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
+ LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
MachinePointerInfo::getGOT(MF),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- 8 /*Size*/, Align(8));
+ LoadTy, Align(8));
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
@@ -2355,7 +2458,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
static LLT widenToNextPowerOf2(LLT Ty) {
if (Ty.isVector())
- return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
+ return Ty.changeElementCount(
+ ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}
@@ -2378,17 +2482,21 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
return true;
}
+ if (MI.getOpcode() != AMDGPU::G_LOAD)
+ return false;
+
Register ValReg = MI.getOperand(0).getReg();
LLT ValTy = MRI.getType(ValReg);
MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned ValSize = ValTy.getSizeInBits();
- const unsigned MemSize = 8 * MMO->getSize();
+ const LLT MemTy = MMO->getMemoryType();
const Align MemAlign = MMO->getAlign();
+ const unsigned MemSize = MemTy.getSizeInBits();
const unsigned AlignInBits = 8 * MemAlign.value();
// Widen non-power-of-2 loads to the alignment if needed
- if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
+ if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
const unsigned WideMemSize = PowerOf2Ceil(MemSize);
// This was already the correct extending load result type, so just adjust
@@ -2472,7 +2580,7 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
"this should not have been custom lowered");
LLT ValTy = MRI.getType(CmpVal);
- LLT VecTy = LLT::vector(2, ValTy);
+ LLT VecTy = LLT::fixed_vector(2, ValTy);
Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
@@ -2624,7 +2732,7 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
const LLT S32 = LLT::scalar(32);
- assert(MRI.getType(Dst) == LLT::vector(2, 16));
+ assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
Register Src0 = MI.getOperand(1).getReg();
Register Src1 = MI.getOperand(2).getReg();
@@ -2762,11 +2870,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
return false;
}
-void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
- Register DstReg,
- Register X,
- Register Y,
- bool IsDiv) const {
+void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
+ Register DstDivReg,
+ Register DstRemReg,
+ Register X,
+ Register Y) const {
const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
@@ -2792,28 +2900,17 @@ void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
// First quotient/remainder refinement.
auto One = B.buildConstant(S32, 1);
auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
- if (IsDiv)
+ if (DstDivReg)
Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
// Second quotient/remainder refinement.
Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
- if (IsDiv)
- B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
- else
- B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
-}
+ if (DstDivReg)
+ B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
-bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
- const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
- Register DstReg = MI.getOperand(0).getReg();
- Register Num = MI.getOperand(1).getReg();
- Register Den = MI.getOperand(2).getReg();
- legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
- MI.eraseFromParent();
- return true;
+ if (DstRemReg)
+ B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
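
A hedged scalar restatement of the two refinement rounds rewritten above, assuming the initial quotient estimate undershoots the true quotient by no more than two (which is what two correction rounds can repair); the helper name and the driver in main are illustrative only.

#include <cassert>
#include <cstdint>

// Two conditional correction rounds, mirroring the ICMP_UGE/select pairs in
// legalizeUnsignedDIV_REM32Impl. On entry R is the remainder for the current
// estimate Q; each round adds at most one to Q and subtracts Y from R once.
static void refineDivRem32(uint32_t Y, uint32_t &Q, uint32_t &R,
                           bool WantDiv, bool WantRem) {
  for (int Round = 0; Round < 2; ++Round) {
    bool Cond = R >= Y;
    if (WantDiv)
      Q = Cond ? Q + 1 : Q;
    if (WantRem || Round == 0)   // R feeds the second round even if only the
      R = Cond ? R - Y : R;      // quotient is requested
  }
}

int main() {
  uint32_t Q = 13, R = 100 - 13 * 7; // estimate undershoots 100/7 by one
  refineDivRem32(7, Q, R, /*WantDiv=*/true, /*WantRem=*/true);
  assert(Q == 14 && R == 2);         // exact quotient and remainder
  return 0;
}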
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
@@ -2859,11 +2956,11 @@ static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
-void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
- Register DstReg,
- Register Numer,
- Register Denom,
- bool IsDiv) const {
+void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
+ Register DstDivReg,
+ Register DstRemReg,
+ Register Numer,
+ Register Denom) const {
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT S1 = LLT::scalar(1);
@@ -2959,57 +3056,74 @@ void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
// endif C6
// endif C3
- if (IsDiv) {
+ if (DstDivReg) {
auto Sel1 = B.buildSelect(
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
- B.buildSelect(DstReg,
- B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
- } else {
+ B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ Sel1, MulHi3);
+ }
+
+ if (DstRemReg) {
auto Sel2 = B.buildSelect(
S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
- B.buildSelect(DstReg,
- B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
+ B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
+ Sel2, Sub1);
}
}
-bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register DstDivReg, DstRemReg;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case AMDGPU::G_UDIV: {
+ DstDivReg = MI.getOperand(0).getReg();
+ break;
+ }
+ case AMDGPU::G_UREM: {
+ DstRemReg = MI.getOperand(0).getReg();
+ break;
+ }
+ case AMDGPU::G_UDIVREM: {
+ DstDivReg = MI.getOperand(0).getReg();
+ DstRemReg = MI.getOperand(1).getReg();
+ break;
+ }
+ }
+
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
- Register DstReg = MI.getOperand(0).getReg();
- Register Num = MI.getOperand(1).getReg();
- Register Den = MI.getOperand(2).getReg();
- LLT Ty = MRI.getType(DstReg);
+ const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
+ Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
+ Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
if (Ty == S32)
- legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
+ legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
else if (Ty == S64)
- legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
+ legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
else
return false;
MI.eraseFromParent();
return true;
-
}
-bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
- Register DstReg = MI.getOperand(0).getReg();
- const LLT Ty = MRI.getType(DstReg);
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
if (Ty != S32 && Ty != S64)
return false;
- const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
-
- Register LHS = MI.getOperand(1).getReg();
- Register RHS = MI.getOperand(2).getReg();
+ const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
+ Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
+ Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
@@ -3021,20 +3135,45 @@ bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
- Register UDivRem = MRI.createGenericVirtualRegister(Ty);
+ Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case AMDGPU::G_SDIV: {
+ DstDivReg = MI.getOperand(0).getReg();
+ TmpDivReg = MRI.createGenericVirtualRegister(Ty);
+ break;
+ }
+ case AMDGPU::G_SREM: {
+ DstRemReg = MI.getOperand(0).getReg();
+ TmpRemReg = MRI.createGenericVirtualRegister(Ty);
+ break;
+ }
+ case AMDGPU::G_SDIVREM: {
+ DstDivReg = MI.getOperand(0).getReg();
+ DstRemReg = MI.getOperand(1).getReg();
+ TmpDivReg = MRI.createGenericVirtualRegister(Ty);
+ TmpRemReg = MRI.createGenericVirtualRegister(Ty);
+ break;
+ }
+ }
+
if (Ty == S32)
- legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
+ legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
else
- legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
+ legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
- Register Sign;
- if (IsDiv)
- Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
- else
- Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
+ if (DstDivReg) {
+ auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
+ auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
+ B.buildSub(DstDivReg, SignXor, Sign);
+ }
- UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
- B.buildSub(DstReg, UDivRem, Sign);
+ if (DstRemReg) {
+ auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
+ auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
+ B.buildSub(DstRemReg, SignXor, Sign);
+ }
MI.eraseFromParent();
return true;
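
Both destinations above use the same conditional-negate idiom, r := (u ^ sign) - sign, where sign is either all zeros or all ones. A tiny illustrative sketch follows; the names are assumptions, not from the patch.

#include <cassert>
#include <cstdint>

// Sign comes from an arithmetic shift of the sign bit, so it is either all
// zeros (0) or all ones (-1); (V ^ Sign) - Sign negates V iff Sign is -1.
int64_t flipBySign(int64_t V, int64_t Sign) {
  return (V ^ Sign) - Sign;
}

int main() {
  assert(flipBySign(5, 0) == 5);
  assert(flipBySign(5, -1) == -5);
  return 0;
}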
@@ -3511,18 +3650,21 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
-std::tuple<Register, unsigned, unsigned>
+std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const {
const unsigned MaxImm = 4095;
Register BaseReg;
- unsigned TotalConstOffset;
+ unsigned ImmOffset;
const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo &MRI = *B.getMRI();
- std::tie(BaseReg, TotalConstOffset) =
- AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
+ std::tie(BaseReg, ImmOffset) =
+ AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
- unsigned ImmOffset = TotalConstOffset;
+ // If BaseReg is a pointer, convert it to int.
+ if (MRI.getType(BaseReg).isPointer())
+ BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
// If the immediate value is too big for the immoffset field, put the value
// and -4096 into the immoffset field so that the value that is copied/added
@@ -3550,7 +3692,32 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
if (!BaseReg)
BaseReg = B.buildConstant(S32, 0).getReg(0);
- return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+ return std::make_pair(BaseReg, ImmOffset);
+}
+
+/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
+void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
+ Register VOffset, Register SOffset,
+ unsigned ImmOffset, Register VIndex,
+ MachineRegisterInfo &MRI) const {
+ Optional<ValueAndVReg> MaybeVOffsetVal =
+ getConstantVRegValWithLookThrough(VOffset, MRI);
+ Optional<ValueAndVReg> MaybeSOffsetVal =
+ getConstantVRegValWithLookThrough(SOffset, MRI);
+ Optional<ValueAndVReg> MaybeVIndexVal =
+ getConstantVRegValWithLookThrough(VIndex, MRI);
+ // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
+ // update the MMO with that offset. The stride is unknown so we can only do
+ // this if VIndex is constant 0.
+ if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
+ MaybeVIndexVal->Value == 0) {
+ uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
+ MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
+ MMO->setOffset(TotalOffset);
+ } else {
+ // We don't have a constant combined offset to use in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ }
}
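
A hedged restatement of the rule updateBufferMMO encodes, with std::optional standing in for "has a known constant value"; the function name and types are illustrative only.

#include <cassert>
#include <cstdint>
#include <optional>

// The combined offset is only known when voffset, soffset and vindex are all
// constant and the (unknown-stride) vindex is zero; otherwise drop the value.
std::optional<uint64_t> combinedBufferOffset(std::optional<uint64_t> VOff,
                                             std::optional<uint64_t> SOff,
                                             std::optional<uint64_t> VIdx,
                                             unsigned ImmOff) {
  if (VOff && SOff && VIdx && *VIdx == 0)
    return *VOff + *SOff + ImmOff;
  return std::nullopt;
}

int main() {
  assert(*combinedBufferOffset(16, 4, 0, 8) == 28);
  assert(!combinedBufferOffset(16, std::nullopt, 0, 8).has_value());
  return 0;
}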
/// Handle register layout difference for f16 images for some subtargets.
@@ -3572,7 +3739,8 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
int NumElts = StoreVT.getNumElements();
- return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
+ .getReg(0);
}
if (ImageStore && ST.hasImageStoreD16Bug()) {
@@ -3581,7 +3749,8 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
Reg = B.buildBitcast(S32, Reg).getReg(0);
PackedRegs.push_back(Reg);
PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
+ return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
+ .getReg(0);
}
if (StoreVT.getNumElements() == 3) {
@@ -3590,18 +3759,19 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
- Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
- return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
+ Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
+ return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
}
if (StoreVT.getNumElements() == 4) {
SmallVector<Register, 4> PackedRegs;
- Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
+ Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
auto Unmerge = B.buildUnmerge(S32, Reg);
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
PackedRegs.push_back(Unmerge.getReg(I));
PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
- return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
+ return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
+ .getReg(0);
}
llvm_unreachable("invalid data type");
@@ -3651,7 +3821,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
const int MemSize = MMO->getSize();
unsigned ImmOffset;
- unsigned TotalOffset;
// The typed intrinsics add an immediate after the registers.
const unsigned NumVIndexOps = IsTyped ? 8 : 7;
@@ -3663,6 +3832,8 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
if (HasVIndex) {
VIndex = MI.getOperand(3).getReg();
OpOffset = 1;
+ } else {
+ VIndex = B.buildConstant(S32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -3676,9 +3847,8 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+ updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
unsigned Opc;
if (IsTyped) {
@@ -3701,9 +3871,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
}
}
- if (!VIndex)
- VIndex = B.buildConstant(S32, 0).getReg(0);
-
auto MIB = B.buildInstr(Opc)
.addUse(VData) // vdata
.addUse(RSrc) // rsrc
@@ -3730,7 +3897,7 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
bool IsTyped) const {
// FIXME: Verifier should enforce 1 MMO for these intrinsics.
MachineMemOperand *MMO = *MI.memoperands_begin();
- const int MemSize = MMO->getSize();
+ const LLT MemTy = MMO->getMemoryType();
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
@@ -3746,6 +3913,8 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
if (HasVIndex) {
VIndex = MI.getOperand(3).getReg();
OpOffset = 1;
+ } else {
+ VIndex = B.buildConstant(S32, 0).getReg(0);
}
Register VOffset = MI.getOperand(3 + OpOffset).getReg();
@@ -3759,16 +3928,14 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
unsigned ImmOffset;
- unsigned TotalOffset;
LLT Ty = MRI.getType(Dst);
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
const bool Unpacked = ST.hasUnpackedD16VMem();
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+ updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
unsigned Opc;
@@ -3779,11 +3946,11 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
} else {
- switch (MemSize) {
- case 1:
+ switch (MemTy.getSizeInBits()) {
+ case 8:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
break;
- case 2:
+ case 16:
Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
break;
default:
@@ -3794,7 +3961,8 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
Register LoadDstReg;
- bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
+ bool IsExtLoad =
+ (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
LLT UnpackedTy = Ty.changeElementSize(32);
if (IsExtLoad)
@@ -3804,9 +3972,6 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
else
LoadDstReg = Dst;
- if (!VIndex)
- VIndex = B.buildConstant(S32, 0).getReg(0);
-
auto MIB = B.buildInstr(Opc)
.addDef(LoadDstReg) // vdata
.addUse(RSrc) // rsrc
@@ -3898,9 +4063,16 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+ case Intrinsic::amdgcn_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -3940,6 +4112,8 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
if (HasVIndex) {
VIndex = MI.getOperand(4 + OpOffset).getReg();
++OpOffset;
+ } else {
+ VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
}
Register VOffset = MI.getOperand(4 + OpOffset).getReg();
@@ -3949,13 +4123,8 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
MachineMemOperand *MMO = *MI.memoperands_begin();
unsigned ImmOffset;
- unsigned TotalOffset;
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
-
- if (!VIndex)
- VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+ updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
@@ -3980,14 +4149,16 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
return true;
}
-/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
+/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
-static void packImageA16AddressToDwords(
- MachineIRBuilder &B, MachineInstr &MI,
- SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
- const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
+static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
+ SmallVectorImpl<Register> &PackedAddrs,
+ unsigned ArgOffset,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ bool IsA16, bool IsG16) {
const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ auto EndIdx = Intr->VAddrEnd;
for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
@@ -3996,7 +4167,10 @@ static void packImageA16AddressToDwords(
Register AddrReg = SrcOp.getReg();
- if (I < Intr->GradientStart) {
+ if ((I < Intr->GradientStart) ||
+ (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
+ (I >= Intr->CoordStart && !IsA16)) {
+ // Handle any gradient or coordinate operands that should not be packed
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
} else {
@@ -4041,16 +4215,16 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
int NumAddrRegs = AddrRegs.size();
if (NumAddrRegs != 1) {
- // Round up to 8 elements for v5-v7
- // FIXME: Missing intermediate sized register classes and instructions.
- if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
+ // Above 8 elements round up to next power of 2 (i.e. 16).
+ if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
auto Undef = B.buildUndef(S32);
AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
NumAddrRegs = RoundedNumRegs;
}
- auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
+ auto VAddr =
+ B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
}
@@ -4091,7 +4265,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
unsigned DMask = 0;
@@ -4146,7 +4320,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (BaseOpcode->AtomicX2) {
Register VData1 = MI.getOperand(3).getReg();
// The two values are packed in one register.
- LLT PackedTy = LLT::vector(2, Ty);
+ LLT PackedTy = LLT::fixed_vector(2, Ty);
auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
MI.getOperand(2).setReg(Concat.getReg(0));
MI.getOperand(3).setReg(AMDGPU::NoRegister);
@@ -4194,35 +4368,30 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
// Rewrite the addressing register layout before doing anything else.
- if (IsA16 || IsG16) {
- if (IsA16) {
- // Target must support the feature and gradients need to be 16 bit too
- if (!ST.hasA16() || !IsG16)
- return false;
- } else if (!ST.hasG16())
- return false;
+ if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
+ // 16-bit gradients are supported, but they are tied to the A16 control,
+ // so both gradients and addresses must be 16-bit.
+ return false;
+ }
+ if (IsA16 && !ST.hasA16()) {
+ // A16 not supported
+ return false;
+ }
+
+ if (IsA16 || IsG16) {
if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
- // Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
- packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
- PackEndIdx);
-
- if (!IsA16) {
- // Add uncompressed address
- for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
- int AddrReg = MI.getOperand(ArgOffset + I).getReg();
- assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
- PackedRegs.push_back(AddrReg);
- }
- }
+
+ packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
+ IsG16);
// See also below in the non-a16 branch
- const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
+ const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
+ PackedRegs.size() <= ST.getNSAMaxSize();
if (!UseNSA && PackedRegs.size() > 1) {
- LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
+ LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
PackedRegs.resize(1);
@@ -4256,7 +4425,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
+ const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
+ CorrectedNumVAddrs <= ST.getNSAMaxSize();
if (!UseNSA && Intr->NumVAddrs > 1)
convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
@@ -4299,7 +4469,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return false;
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
- const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
+ const LLT AdjustedTy =
+ Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
@@ -4313,15 +4484,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT RegTy;
if (IsD16 && ST.hasUnpackedD16VMem()) {
- RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
- TFETy = LLT::vector(AdjustedNumElts + 1, 32);
+ RoundedTy =
+ LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
+ TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
RegTy = S32;
} else {
unsigned EltSize = EltTy.getSizeInBits();
unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
- RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
- TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
+ RoundedTy = LLT::scalarOrVector(
+ ElementCount::getFixed(RoundedSize / EltSize), EltSize);
+ TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
}
@@ -4435,10 +4608,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
// Deal with the one annoying legal case.
- const LLT V3S16 = LLT::vector(3, 16);
+ const LLT V3S16 = LLT::fixed_vector(3, 16);
if (Ty == V3S16) {
padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
- auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
+ auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
return true;
}
@@ -4460,7 +4633,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
Observer.changingInstr(MI);
- if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
+ if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
Dst = MI.getOperand(0).getReg();
@@ -4502,27 +4675,55 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
- if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !ST.isTrapHandlerEnabled()) {
- B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
- } else {
- // Pass queue pointer to trap handler as input, and insert trap instruction
- // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
- MachineRegisterInfo &MRI = *B.getMRI();
+ if (!ST.isTrapHandlerEnabled() ||
+ ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
+ return legalizeTrapEndpgm(MI, MRI, B);
+
+ if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return legalizeTrapHsaQueuePtr(MI, MRI, B);
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return ST.supportsGetDoorbellID() ?
+ legalizeTrapHsa(MI, MRI, B) :
+ legalizeTrapHsaQueuePtr(MI, MRI, B);
+ }
+ }
- Register LiveIn =
- MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
- return false;
+ llvm_unreachable("Unknown trap handler");
+}
- Register SGPR01(AMDGPU::SGPR0_SGPR1);
- B.buildCopy(SGPR01, LiveIn);
- B.buildInstr(AMDGPU::S_TRAP)
- .addImm(GCNSubtarget::TrapIDLLVMTrap)
- .addReg(SGPR01, RegState::Implicit);
- }
+bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ // Pass queue pointer to trap handler as input, and insert trap instruction
+ // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+ Register LiveIn =
+ MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+ if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
+ return false;
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
+ B.buildCopy(SGPR01, LiveIn);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
+ .addReg(SGPR01, RegState::Implicit);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapHsa(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
MI.eraseFromParent();
return true;
}
@@ -4531,8 +4732,8 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
// If this is a non-HSA path or the trap handler is disabled, report a
// warning accordingly.
- if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !ST.isTrapHandlerEnabled()) {
+ if (!ST.isTrapHandlerEnabled() ||
+ ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
"debugtrap handler not supported",
MI.getDebugLoc(), DS_Warning);
@@ -4540,7 +4741,8 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
Ctx.diagnose(NoTrap);
} else {
// Insert debug-trap instruction
- B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
}
MI.eraseFromParent();
@@ -4561,6 +4763,14 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
Register RayInvDir = MI.getOperand(6).getReg();
Register TDescr = MI.getOperand(7).getReg();
+ if (!ST.hasGFX10_AEncoding()) {
+ DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
+ "intrinsic not supported on subtarget",
+ MI.getDebugLoc());
+ B.getMF().getFunction().getContext().diagnose(BadIntrin);
+ return false;
+ }
+
bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
@@ -4810,6 +5020,11 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_atomic_inc:
return legalizeAtomicIncDec(MI, B, true);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 87e8b2128a25..d4fefd89b487 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -99,25 +99,19 @@ public:
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
- bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
- void legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
- Register DstReg, Register Num, Register Den,
- bool IsRem) const;
- bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
- bool legalizeSDIV_SREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg,
+ Register DstRemReg, Register Num,
+ Register Den) const;
- void legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
- Register DstReg, Register Numer, Register Denom,
- bool IsDiv) const;
+ void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg,
+ Register DstRemReg, Register Numer,
+ Register Denom) const;
- bool legalizeUDIV_UREM64(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
- bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -148,8 +142,11 @@ public:
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;
- std::tuple<Register, unsigned, unsigned>
- splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+ std::pair<Register, unsigned> splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const;
+ void updateBufferMMO(MachineMemOperand *MMO, Register VOffset,
+ Register SOffset, unsigned ImmOffset, Register VIndex,
+ MachineRegisterInfo &MRI) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg, bool ImageStore = false) const;
@@ -183,6 +180,12 @@ public:
bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 6b7f57252b7a..1ee6933bd7ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -476,7 +477,7 @@ bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
return true;
const Function *F = CI->getParent()->getParent();
Attribute Attr = F->getFnAttribute("unsafe-fp-math");
- return Attr.getValueAsString() == "true";
+ return Attr.getValueAsBool();
}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
@@ -1369,9 +1370,9 @@ bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
StringRef CPU = TM->getTargetCPU();
StringRef Features = TM->getTargetFeatureString();
- if ((CPU.empty() || CPU.equals_lower("generic")) &&
+ if ((CPU.empty() || CPU.equals_insensitive("generic")) &&
(Features.empty() ||
- Features.find_lower("wavefrontsize") == StringRef::npos))
+ Features.find_insensitive("wavefrontsize") == StringRef::npos))
return false;
Function *F = CI->getParent()->getParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 646087cdb7db..32262ea75fd3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -19,10 +19,16 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+static cl::opt<bool> EnableOCLManglingMismatchWA(
+ "amdgpu-enable-ocl-mangling-mismatch-workaround", cl::init(true),
+ cl::ReallyHidden,
+ cl::desc("Enable the workaround for OCL name mangling mismatch."));
+
namespace {
enum EManglingParam {
@@ -826,7 +832,8 @@ public:
unsigned AS = UseAddrSpace
? AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind)
: 0;
- if (AS != 0) os << "U3AS" << AS;
+ if (EnableOCLManglingMismatchWA || AS != 0)
+ os << "U3AS" << AS;
Ptr = p;
p.PtrKind = 0;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 8fb4f93fd4b3..0f157e53c3db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 9ab6a5246ce5..08a1b970648d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -67,7 +67,7 @@ static bool processUse(CallInst *CI) {
const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
const bool HasUniformWorkGroupSize =
- F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
+ F->getFnAttribute("uniform-work-group-size").getValueAsBool();
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
return false;
@@ -249,9 +249,9 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
}
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
- "AMDGPU IR optimizations", false, false)
-INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
- false, false)
+ "AMDGPU Kernel Attributes", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+ "AMDGPU Kernel Attributes", false, false)
char AMDGPULowerKernelAttributes::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
new file mode 100644
index 000000000000..70ecea8dbc3e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -0,0 +1,400 @@
+//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile time offsets from that
+// known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are passed
+// through unchanged for simplification or error diagnostics in later passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by a kernel is cheap and conservative, so this may allocate
+// a variable in every kernel when it was not strictly necessary to do so.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
+//
+// NOTE: Because this pass packs LDS (which may be large) directly into a
+// struct type, it can end up allocating a large amount of memory for the
+// struct instance within every kernel. Hence, before running this pass, it is
+// advisable to run the "amdgpu-replace-lds-use-with-pointer" pass, which
+// replaces LDS uses within non-kernel functions by pointers and thereby
+// minimizes the unnecessary per-kernel allocation of LDS memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-module-lds"
+
+using namespace llvm;
+
+static cl::opt<bool> SuperAlignLDSGlobals(
+ "amdgpu-super-align-lds-globals",
+ cl::desc("Increase alignment of LDS if it is not on align boundary"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class AMDGPULowerModuleLDS : public ModulePass {
+
+ static void removeFromUsedList(Module &M, StringRef Name,
+ SmallPtrSetImpl<Constant *> &ToRemove) {
+ GlobalVariable *GV = M.getNamedGlobal(Name);
+ if (!GV || ToRemove.empty()) {
+ return;
+ }
+
+ SmallVector<Constant *, 16> Init;
+ auto *CA = cast<ConstantArray>(GV->getInitializer());
+ for (auto &Op : CA->operands()) {
+ // ModuleUtils::appendToUsed only inserts Constants
+ Constant *C = cast<Constant>(Op);
+ if (!ToRemove.contains(C->stripPointerCasts())) {
+ Init.push_back(C);
+ }
+ }
+
+ if (Init.size() == CA->getNumOperands()) {
+ return; // none to remove
+ }
+
+ GV->eraseFromParent();
+
+ for (Constant *C : ToRemove) {
+ C->removeDeadConstantUsers();
+ }
+
+ if (!Init.empty()) {
+ ArrayType *ATy =
+ ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
+ GV =
+ new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, Init), Name);
+ GV->setSection("llvm.metadata");
+ }
+ }
+
+ static void
+ removeFromUsedLists(Module &M,
+ const std::vector<GlobalVariable *> &LocalVars) {
+ SmallPtrSet<Constant *, 32> LocalVarsSet;
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
+ LocalVarsSet.insert(C);
+ }
+ }
+ removeFromUsedList(M, "llvm.used", LocalVarsSet);
+ removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
+ }
+
+ static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
+ GlobalVariable *SGV) {
+ // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
+ // that might call a function which accesses a field within it. This is
+ // presently approximated to 'all kernels' if there are any such functions
+ // in the module. This implicit use is reified as an explicit use here so
+ // that later passes, specifically PromoteAlloca, account for the required
+ // memory without any knowledge of this transform.
+
+ // An operand bundle on llvm.donothing works because the call instruction
+ // survives until after the last pass that needs to account for LDS. It is
+ // better than inline asm as the latter survives until the end of codegen. A
+ // totally robust solution would be a function with the same semantics as
+ // llvm.donothing that takes a pointer to the instance and is lowered to a
+ // no-op after LDS is allocated, but that is not presently necessary.
+
+ LLVMContext &Ctx = Func->getContext();
+
+ Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
+
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+
+ Function *Decl =
+ Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
+
+ Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
+ SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+
+ Builder.CreateCall(FTy, Decl, {},
+ {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
+ "");
+ }
+
+private:
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+public:
+ static char ID;
+
+ AMDGPULowerModuleLDS() : ModulePass(ID) {
+ initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ UsedList = AMDGPU::getUsedList(M);
+
+ bool Changed = processUsedLDS(M);
+
+ for (Function &F : M.functions()) {
+ // Only lower compute kernels' LDS.
+ if (!AMDGPU::isKernel(F.getCallingConv()))
+ continue;
+ Changed |= processUsedLDS(M, &F);
+ }
+
+ UsedList.clear();
+ return Changed;
+ }
+
+private:
+ bool processUsedLDS(Module &M, Function *F = nullptr) {
+ LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+
+ // Find variables to move into new struct instance
+ std::vector<GlobalVariable *> FoundLocalVars =
+ AMDGPU::findVariablesToLower(M, F);
+
+ if (FoundLocalVars.empty()) {
+ // No variables to rewrite, no changes made.
+ return false;
+ }
+
+ // Increase the alignment of LDS globals if necessary to maximise the chance
+ // that we can use aligned LDS instructions to access them.
+ if (SuperAlignLDSGlobals) {
+ for (auto *GV : FoundLocalVars) {
+ Align Alignment = AMDGPU::getAlign(DL, GV);
+ TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+ if (GVSize > 8) {
+ // We might want to use a b96 or b128 load/store
+ Alignment = std::max(Alignment, Align(16));
+ } else if (GVSize > 4) {
+ // We might want to use a b64 load/store
+ Alignment = std::max(Alignment, Align(8));
+ } else if (GVSize > 2) {
+ // We might want to use a b32 load/store
+ Alignment = std::max(Alignment, Align(4));
+ } else if (GVSize > 1) {
+ // We might want to use a b16 load/store
+ Alignment = std::max(Alignment, Align(2));
+ }
+
+ GV->setAlignment(Alignment);
+ }
+ }
+
+ SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+ LayoutFields.reserve(FoundLocalVars.size());
+ for (GlobalVariable *GV : FoundLocalVars) {
+ OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
+ AMDGPU::getAlign(DL, GV));
+ LayoutFields.emplace_back(F);
+ }
+
+ performOptimizedStructLayout(LayoutFields);
+
+ std::vector<GlobalVariable *> LocalVars;
+ LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+ {
+ // This usually won't need to insert any padding, perhaps avoid the alloc
+ uint64_t CurrentOffset = 0;
+ for (size_t I = 0; I < LayoutFields.size(); I++) {
+ GlobalVariable *FGV = static_cast<GlobalVariable *>(
+ const_cast<void *>(LayoutFields[I].Id));
+ Align DataAlign = LayoutFields[I].Alignment;
+
+ uint64_t DataAlignV = DataAlign.value();
+ if (uint64_t Rem = CurrentOffset % DataAlignV) {
+ uint64_t Padding = DataAlignV - Rem;
+
+ // Append an array of padding bytes to meet the requested alignment.
+ // Note: (o + (a - (o % a))) % a == 0
+ //       (offset + Padding) % align == 0
+
+ Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+ LocalVars.push_back(new GlobalVariable(
+ M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+ "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+ false));
+ CurrentOffset += Padding;
+ }
+
+ LocalVars.push_back(FGV);
+ CurrentOffset += LayoutFields[I].Size;
+ }
+ }
+
+ std::vector<Type *> LocalVarTypes;
+ LocalVarTypes.reserve(LocalVars.size());
+ std::transform(
+ LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+ [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+ std::string VarName(
+ F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
+ : "llvm.amdgcn.module.lds");
+ StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
+
+ Align StructAlign =
+ AMDGPU::getAlign(DL, LocalVars[0]);
+
+ GlobalVariable *SGV = new GlobalVariable(
+ M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+ VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+ false);
+ SGV->setAlignment(StructAlign);
+ if (!F) {
+ appendToCompilerUsed(
+ M, {static_cast<GlobalValue *>(
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+ }
+
+ // The verifier rejects used lists containing an inttoptr of a constant
+ // so remove the variables from these lists before replaceAllUsesWith
+ removeFromUsedLists(M, LocalVars);
+
+ // Replace uses of ith variable with a constantexpr to the ith field of the
+ // instance that will be allocated by AMDGPUMachineFunction
+ Type *I32 = Type::getInt32Ty(Ctx);
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ GlobalVariable *GV = LocalVars[I];
+ Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
+ if (F) {
+ // Replace all constant uses with instructions if they belong to the
+ // current kernel.
+ for (User *U : make_early_inc_range(GV->users())) {
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+ AMDGPU::replaceConstantUsesInFunction(C, F);
+ }
+
+ GV->removeDeadConstantUsers();
+
+ GV->replaceUsesWithIf(GEP, [F](Use &U) {
+ Instruction *I = dyn_cast<Instruction>(U.getUser());
+ return I && I->getFunction() == F;
+ });
+ } else {
+ GV->replaceAllUsesWith(GEP);
+ }
+ if (GV->use_empty()) {
+ UsedList.erase(GV);
+ GV->eraseFromParent();
+ }
+
+ uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
+ Align A = commonAlignment(StructAlign, Off);
+ refineUsesAlignment(GEP, A, DL);
+ }
+
+ // Mark kernels with asm that reads the address of the allocated structure
+ // This is not necessary for lowering. This lets other passes, specifically
+ // PromoteAlloca, accurately calculate how much LDS will be used by the
+ // kernel after lowering.
+ if (!F) {
+ IRBuilder<> Builder(Ctx);
+ SmallPtrSet<Function *, 32> Kernels;
+ for (auto &I : M.functions()) {
+ Function *Func = &I;
+ if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
+ markUsedByKernel(Builder, Func, SGV);
+ Kernels.insert(Func);
+ }
+ }
+ }
+ return true;
+ }
+
+ void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
+ unsigned MaxDepth = 5) {
+ if (!MaxDepth || A == 1)
+ return;
+
+ for (User *U : Ptr->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(U)) {
+ LI->setAlignment(std::max(A, LI->getAlign()));
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getPointerOperand() == Ptr)
+ SI->setAlignment(std::max(A, SI->getAlign()));
+ continue;
+ }
+ if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
+ // No atomicrmw operation can currently work on pointers, but check anyway
+ // in case that changes, or in case we are processing a ConstantExpr.
+ if (AI->getPointerOperand() == Ptr)
+ AI->setAlignment(std::max(A, AI->getAlign()));
+ continue;
+ }
+ if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
+ if (AI->getPointerOperand() == Ptr)
+ AI->setAlignment(std::max(A, AI->getAlign()));
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt Off(BitWidth, 0);
+ if (GEP->getPointerOperand() == Ptr &&
+ GEP->accumulateConstantOffset(DL, Off)) {
+ Align GA = commonAlignment(A, Off.getLimitedValue());
+ refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+ }
+ continue;
+ }
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getOpcode() == Instruction::BitCast ||
+ I->getOpcode() == Instruction::AddrSpaceCast)
+ refineUsesAlignment(I, A, DL, MaxDepth - 1);
+ }
+ }
+ }
+};
+
+} // namespace
+char AMDGPULowerModuleLDS::ID = 0;
+
+char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
+ "Lower uses of LDS variables from non-kernel functions", false,
+ false)
+
+ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
+ return new AMDGPULowerModuleLDS();
+}
+
+PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index a8cba3f5cc5c..3dd27f1996d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -41,9 +41,6 @@ class AMDGPUMCInstLower {
const TargetSubtargetInfo &ST;
const AsmPrinter &AP;
- const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const;
-
public:
AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST,
const AsmPrinter &AP);
@@ -95,54 +92,21 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
}
}
-const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
- const MachineBasicBlock &SrcBB,
- const MachineOperand &MO) const {
- const MCExpr *DestBBSym
- = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
- const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
-
- // FIXME: The first half of this assert should be removed. This should
- // probably be PC relative instead of using the source block symbol, and
- // therefore the indirect branch expansion should use a bundle.
- assert(
- skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() ==
- AMDGPU::S_GETPC_B64 &&
- ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
-
- // s_getpc_b64 returns the address of next instruction.
- const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
- SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
-
- if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD)
- return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
-
- assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD);
- return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
-}
-
bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
MCOperand &MCOp) const {
switch (MO.getType()) {
default:
- llvm_unreachable("unknown operand type");
+ break;
case MachineOperand::MO_Immediate:
MCOp = MCOperand::createImm(MO.getImm());
return true;
case MachineOperand::MO_Register:
MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
return true;
- case MachineOperand::MO_MachineBasicBlock: {
- if (MO.getTargetFlags() != 0) {
- MCOp = MCOperand::createExpr(
- getLongBranchBlockExpr(*MO.getParent()->getParent(), MO));
- } else {
- MCOp = MCOperand::createExpr(
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::createExpr(
MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
- }
-
return true;
- }
case MachineOperand::MO_GlobalAddress: {
const GlobalValue *GV = MO.getGlobal();
SmallString<128> SymbolName;
@@ -168,7 +132,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
case MachineOperand::MO_RegisterMask:
// Regmasks are like implicit defs.
return false;
+ case MachineOperand::MO_MCSymbol:
+ if (MO.getTargetFlags() == SIInstrInfo::MO_FAR_BRANCH_OFFSET) {
+ MCSymbol *Sym = MO.getMCSymbol();
+ MCOp = MCOperand::createExpr(Sym->getVariableValue());
+ return true;
+ }
+ break;
}
+ llvm_unreachable("unknown operand type");
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
@@ -274,24 +246,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
++I;
}
} else {
- // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are
+ // We don't want these pseudo instructions encoded. They are
// placeholder terminator instructions and should only be printed as
// comments.
- if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
- if (isVerbose()) {
- SmallVector<char, 16> BBStr;
- raw_svector_ostream Str(BBStr);
-
- const MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
- const MCSymbolRefExpr *Expr
- = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- Expr->print(Str, MAI);
- OutStreamer->emitRawComment(Twine(" mask branch ") + BBStr);
- }
-
- return;
- }
-
if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
if (isVerbose())
OutStreamer->emitRawComment(" return to shader part epilog");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index b6a69b2819ee..697513b5db7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1419,11 +1419,7 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices,
unsigned Index) {
- for (auto i : PHIRegionIndices) {
- if (i == Index)
- return true;
- }
- return false;
+ return llvm::is_contained(PHIRegionIndices, Index);
}
bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 717145b7af53..0c743a77092c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -28,12 +28,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
const Function &F = MF.getFunction();
Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
- MemoryBound = MemBoundAttr.isStringAttribute() &&
- MemBoundAttr.getValueAsString() == "true";
+ MemoryBound = MemBoundAttr.getValueAsBool();
Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
- WaveLimiter = WaveLimitAttr.isStringAttribute() &&
- WaveLimitAttr.getValueAsString() == "true";
+ WaveLimiter = WaveLimitAttr.getValueAsBool();
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
@@ -64,6 +62,18 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
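+// Reserve space for the module-scope LDS struct, if one exists, before any
+// other LDS allocations so that it is always placed at offset zero.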
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+ if (isModuleEntryFunction()) {
+ const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+ if (GV) {
+ unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+ (void)Offset;
+ assert(Offset == 0 &&
+ "Module LDS expected to be allocated before other LDS");
+ }
+ }
+}
+
void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
const GlobalVariable &GV) {
assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 07cac776082d..10ff50040c6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -94,6 +94,7 @@ public:
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+ void allocateModuleLDSGlobal(const Module *M);
Align getDynLDSAlign() const { return DynLDSAlign; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
index 82c6d75bb060..ad198a301dbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H
+
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
@@ -17,3 +20,5 @@ namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation();
} // llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMACROFUSION_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
index 756bc948b1dd..8af7979dba8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -26,22 +26,6 @@ const char SectionName[] = ".note";
const char NoteNameV2[] = "AMD";
const char NoteNameV3[] = "AMDGPU";
-// TODO: Remove this file once we drop code object v2.
-enum NoteType{
- NT_AMDGPU_HSA_RESERVED_0 = 0,
- NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
- NT_AMDGPU_HSA_HSAIL = 2,
- NT_AMDGPU_HSA_ISA = 3,
- NT_AMDGPU_HSA_PRODUCER = 4,
- NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
- NT_AMDGPU_HSA_EXTENSION = 6,
- NT_AMDGPU_HSA_RESERVED_7 = 7,
- NT_AMDGPU_HSA_RESERVED_8 = 8,
- NT_AMDGPU_HSA_RESERVED_9 = 9,
- NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
- NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
-};
-
} // End namespace ElfNote
} // End namespace AMDGPU
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 2f6220e425cc..2aa02299ecdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
@@ -208,19 +209,22 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
for (auto &B : F) {
LastAccess = MemAccessInfo();
for (auto &I : B) {
- if (getMemoryInstrPtr(&I)) {
+ if (const Value *Ptr = getMemoryInstrPtr(&I)) {
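+ // Cost each memory access by the number of 32-bit dwords it touches
+ // rather than counting one unit per instruction.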
+ unsigned Size = divideCeil(
+ Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ 32);
if (isIndirectAccess(&I))
- ++FI.IAMInstCount;
+ FI.IAMInstCost += Size;
if (isLargeStride(&I))
- ++FI.LSMInstCount;
- ++FI.MemInstCount;
- ++FI.InstCount;
+ FI.LSMInstCost += Size;
+ FI.MemInstCost += Size;
+ FI.InstCost += Size;
continue;
}
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration()) {
- ++FI.InstCount;
+ ++FI.InstCost;
continue;
}
if (&F == Callee) // Handle immediate recursion
@@ -230,10 +234,10 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
if (Loc == FIM.end())
continue;
- FI.MemInstCount += Loc->second.MemInstCount;
- FI.InstCount += Loc->second.InstCount;
- FI.IAMInstCount += Loc->second.IAMInstCount;
- FI.LSMInstCount += Loc->second.LSMInstCount;
+ FI.MemInstCost += Loc->second.MemInstCost;
+ FI.InstCost += Loc->second.InstCost;
+ FI.IAMInstCost += Loc->second.IAMInstCost;
+ FI.LSMInstCost += Loc->second.LSMInstCost;
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
TargetLoweringBase::AddrMode AM;
auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
@@ -243,9 +247,9 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
GEP->getPointerAddressSpace()))
// Offset will likely be folded into load or store
continue;
- ++FI.InstCount;
+ ++FI.InstCost;
} else {
- ++FI.InstCount;
+ ++FI.InstCost;
}
}
}
@@ -263,11 +267,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
- LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
<< '\n'
- << " IAMInst: " << Info->IAMInstCount << '\n'
- << " LSMInst: " << Info->LSMInstCount << '\n'
- << " TotalInst: " << Info->InstCount << '\n');
+ << " IAMInst cost: " << Info->IAMInstCost << '\n'
+ << " LSMInst cost: " << Info->LSMInstCost << '\n'
+ << " TotalInst cost: " << Info->InstCost << '\n');
if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
@@ -285,13 +289,12 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
- return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+ return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
- return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
- FI.LSMInstCount * LSWeight) *
- 100 / FI.InstCount) > LimitWaveThresh;
+ return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
+ FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 99dbf5080741..31ff80f5f431 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -37,12 +37,11 @@ public:
bool needsWaveLimiter(const Function *F) const;
struct FuncInfo {
- unsigned MemInstCount;
- unsigned InstCount;
- unsigned IAMInstCount; // Indirect access memory instruction count
- unsigned LSMInstCount; // Large stride memory instruction count
- FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
- LSMInstCount(0) {}
+ unsigned MemInstCost;
+ unsigned InstCost;
+ unsigned IAMInstCost; // Indirect access memory instruction cost
+ unsigned LSMInstCost; // Large stride memory instruction cost
+ FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
};
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 09e2c762abdb..728be811afae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -66,6 +66,8 @@ public:
bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
void applyCvtF32UByteN(MachineInstr &MI,
const CvtF32UByteMatchInfo &MatchInfo);
+
+ bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -245,6 +247,14 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
MI.eraseFromParent();
}
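+// Match a G_FCANONICALIZE whose source is already in canonical form; such a
+// canonicalize is a no-op, so the combine can replace it with its source
+// register.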
+bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
+ MachineInstr &MI, Register &Reg) {
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+ Reg = MI.getOperand(1).getReg();
+ return TLI->isCanonicalized(Reg, MF);
+}
+
class AMDGPUPostLegalizerCombinerHelperState {
protected:
CombinerHelper &Helper;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index e4b628bf6b23..13f09ab8f164 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -12,6 +12,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPULegalizerInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -26,6 +29,141 @@
using namespace llvm;
using namespace MIPatternMatch;
+class AMDGPUPreLegalizerCombinerHelper {
+protected:
+ MachineIRBuilder &B;
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ CombinerHelper &Helper;
+
+public:
+ AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+
+ struct ClampI64ToI16MatchInfo {
+ int64_t Cmp1 = 0;
+ int64_t Cmp2 = 0;
+ Register Origin;
+ };
+
+ bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF,
+ ClampI64ToI16MatchInfo &MatchInfo);
+
+ void applyClampI64ToI16(MachineInstr &MI,
+ const ClampI64ToI16MatchInfo &MatchInfo);
+};
+
+bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
+ ClampI64ToI16MatchInfo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
+
+ // Try to find a pattern where an i64 value should get clamped to short.
+ const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
+ if (SrcType != LLT::scalar(64))
+ return false;
+
+ const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+ if (DstType != LLT::scalar(16))
+ return false;
+
+ Register Base;
+
+ auto IsApplicableForCombine = [&MatchInfo]() -> bool {
+ const auto Cmp1 = MatchInfo.Cmp1;
+ const auto Cmp2 = MatchInfo.Cmp2;
+ const auto Diff = std::abs(Cmp2 - Cmp1);
+
+ // If the difference between both comparison values is 0 or 1, there is no
+ // need to clamp.
+ if (Diff == 0 || Diff == 1)
+ return false;
+
+ const int64_t Min = std::numeric_limits<int16_t>::min();
+ const int64_t Max = std::numeric_limits<int16_t>::max();
+
+ // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
+ return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
+ (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
+ };
+
+ // Try to match a combination of min / max MIR opcodes.
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+ if (mi_match(Base, MRI,
+ m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+ return IsApplicableForCombine();
+ }
+ }
+
+ if (mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+ if (mi_match(Base, MRI,
+ m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+ return IsApplicableForCombine();
+ }
+ }
+
+ return false;
+}
+
+// We want to find the combination of instructions that is generated when an
+// i64 value gets clamped to i16.
+// The corresponding pattern is:
+//   G_SMIN / G_SMAX on i64 feeding a G_TRUNC to i16.
+// This can be written more efficiently as:
+//   v_cvt_pk_i16_i32 v0, v0, v1
+//   v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
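+// For example (illustrative MIR; register names are arbitrary):
+//   %lo:_(s64) = G_CONSTANT i64 -32768
+//   %hi:_(s64) = G_CONSTANT i64 32767
+//   %max:_(s64) = G_SMAX %x, %lo
+//   %min:_(s64) = G_SMIN %max, %hi
+//   %r:_(s16) = G_TRUNC %min
+// is rewritten into a G_AMDGPU_CVT_PK_I16_I32 of the two halves of %x, a
+// bitcast, a G_AMDGPU_SMED3 against the clamp bounds, and a trunc of the
+// result.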
+void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
+ MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+
+ Register Src = MatchInfo.Origin;
+ assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
+ LLT::scalar(64));
+ const LLT S32 = LLT::scalar(32);
+
+ B.setMBB(*MI.getParent());
+ B.setInstrAndDebugLoc(MI);
+
+ auto Unmerge = B.buildUnmerge(S32, Src);
+
+ assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
+
+ const LLT V2S16 = LLT::fixed_vector(2, 16);
+ auto CvtPk =
+ B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
+ {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
+
+ auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
+ auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
+ auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
+ auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
+
+ auto Bitcast = B.buildBitcast({S32}, CvtPk);
+
+ auto Med3 = B.buildInstr(
+ AMDGPU::G_AMDGPU_SMED3, {S32},
+ {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
+ MI.getFlags());
+
+ B.buildTrunc(MI.getOperand(0).getReg(), Med3);
+
+ MI.eraseFromParent();
+}
+
+class AMDGPUPreLegalizerCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+ AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
+
+public:
+ AMDGPUPreLegalizerCombinerHelperState(
+ CombinerHelper &Helper,
+ AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
+ : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
+};
+
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -59,12 +197,16 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, KB, MDT);
- AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+ AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
+ AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
+ PreLegalizerHelper);
if (Generated.tryCombineAll(Observer, MI, B, Helper))
return true;
switch (MI.getOpcode()) {
+ case TargetOpcode::G_MEMCPY_INLINE:
+ return Helper.tryEmitMemcpyInline(MI);
case TargetOpcode::G_CONCAT_VECTORS:
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
@@ -109,6 +251,9 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
}
+
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -130,8 +275,13 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
F.hasMinSize(), KB, MDT);
+ // Enable CSE.
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
+
Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+ return C.combineMachineInstrs(MF, CSEInfo);
}
char AMDGPUPreLegalizerCombiner::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index c8bd9b96b44f..7b6959b56145 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -323,7 +323,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
Type *SizetTy = Type::getInt32Ty(Ctx);
Type *Tys_alloc[1] = {SizetTy};
- Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1);
+ Type *I8Ty = Type::getInt8Ty(Ctx);
+ Type *I8Ptr = PointerType::get(I8Ty, 1);
FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false);
FunctionCallee PrintfAllocFn =
M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr);
@@ -355,9 +356,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
// basicblock splits after buffer overflow check
//
ConstantPointerNull *zeroIntPtr =
- ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1));
- ICmpInst *cmp =
- dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
+ ConstantPointerNull::get(PointerType::get(I8Ty, 1));
+ auto *cmp = cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
if (!CI->use_empty()) {
Value *result =
Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res");
@@ -371,13 +371,9 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
// store unique printf id in the buffer
//
- SmallVector<Value *, 1> ZeroIdxList;
- ConstantInt *zeroInt =
- ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10));
- ZeroIdxList.push_back(zeroInt);
-
GetElementPtrInst *BufferIdx = GetElementPtrInst::Create(
- nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch);
+ I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID",
+ Brnch);
Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
Value *id_gep_cast =
@@ -385,14 +381,11 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
- SmallVector<Value *, 2> FourthIdxList;
- ConstantInt *fourInt =
- ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10));
-
- FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id
+ // 1st 4 bytes hold the printf_id
// the following GEP is the buffer pointer
- BufferIdx = GetElementPtrInst::Create(nullptr, pcall, FourthIdxList,
- "PrintBuffGep", Brnch);
+ BufferIdx = GetElementPtrInst::Create(
+ I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 4)), "PrintBuffGep",
+ Brnch);
Type *Int32Ty = Type::getInt32Ty(Ctx);
Type *Int64Ty = Type::getInt64Ty(Ctx);
@@ -533,7 +526,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
(void)StBuff;
if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands())
break;
- BufferIdx = GetElementPtrInst::Create(nullptr, BufferIdx, BuffOffset,
+ BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset,
"PrintBuffNextPtr", Brnch);
LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
<< *BufferIdx << '\n');
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2a6ea838efc0..3f1f21a33f7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@ public:
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
- "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions to kernels before promoting allocas so that
+// the amount of LDS still available can be estimated accurately.
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
"AMDGPU promote alloca to vector", false, false)
@@ -656,6 +661,11 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
continue;
}
+ // Do not promote vector/aggregate type instructions. It is hard to track
+ // their users.
+ if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
+ return false;
+
if (!User->getType()->isPointerTy())
continue;
@@ -943,13 +953,15 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
+ SmallVector<IntrinsicInst *> DeferredIntrs;
+
for (Value *V : WorkList) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
Value *Src0 = CI->getOperand(0);
- Type *EltTy = Src0->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::getWithSamePointeeType(
+ cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);
if (isa<ConstantPointerNull>(CI->getOperand(0)))
CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -965,8 +977,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (isa<AddrSpaceCastInst>(V))
continue;
- Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::getWithSamePointeeType(
+ cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
@@ -997,22 +1009,13 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// These intrinsics are for address space 0 only
Intr->eraseFromParent();
continue;
- case Intrinsic::memcpy: {
- MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
- Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
- MemCpy->getRawSource(), MemCpy->getSourceAlign(),
- MemCpy->getLength(), MemCpy->isVolatile());
- Intr->eraseFromParent();
- continue;
- }
- case Intrinsic::memmove: {
- MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
- Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
- MemMove->getRawSource(), MemMove->getSourceAlign(),
- MemMove->getLength(), MemMove->isVolatile());
- Intr->eraseFromParent();
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ // These have two pointer operands. If the second pointer also needs to
+ // be replaced, defer processing of these intrinsics until all other
+ // values have been processed.
+ DeferredIntrs.push_back(Intr);
continue;
- }
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(
@@ -1032,11 +1035,11 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
continue;
case Intrinsic::objectsize: {
Value *Src = Intr->getOperand(0);
- Type *SrcTy = Src->getType()->getPointerElementType();
- Function *ObjectSize = Intrinsic::getDeclaration(Mod,
- Intrinsic::objectsize,
- { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
- );
+ Function *ObjectSize = Intrinsic::getDeclaration(
+ Mod, Intrinsic::objectsize,
+ {Intr->getType(),
+ PointerType::getWithSamePointeeType(
+ cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});
CallInst *NewCall = Builder.CreateCall(
ObjectSize,
@@ -1050,6 +1053,27 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
+
+ for (IntrinsicInst *Intr : DeferredIntrs) {
+ Builder.SetInsertPoint(Intr);
+ Intrinsic::ID ID = Intr->getIntrinsicID();
+ assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
+
+ MemTransferInst *MI = cast<MemTransferInst>(Intr);
+ auto *B =
+ Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
+ MI->getRawSource(), MI->getSourceAlign(),
+ MI->getLength(), MI->isVolatile());
+
+ for (unsigned I = 1; I != 3; ++I) {
+ if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
+ B->addDereferenceableAttr(I, Bytes);
+ }
+ }
+
+ Intr->eraseFromParent();
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index cd71c7a16c73..0e4c26170a8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -249,7 +249,11 @@ bool AMDGPUPropagateAttributes::process() {
if (!I)
continue;
CallBase *CI = dyn_cast<CallBase>(I);
- if (!CI)
+ // Only propagate attributes if F is the called function. Specifically,
+ // do not propagate attributes if F is passed as an argument.
+ // FIXME: handle bitcasted callee, e.g.
+ // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)()
+ if (!CI || CI->getCalledOperand() != &F)
continue;
Function *Caller = CI->getCaller();
if (!Caller || !Visited.insert(CI).second)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index d644c0319286..4e12e5cd8f65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -13,7 +13,9 @@
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -27,6 +29,126 @@
using namespace llvm;
using namespace MIPatternMatch;
+class AMDGPURegBankCombinerHelper {
+protected:
+ MachineIRBuilder &B;
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ const RegisterBankInfo &RBI;
+ const TargetRegisterInfo &TRI;
+ CombinerHelper &Helper;
+
+public:
+ AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+ : B(B), MF(B.getMF()), MRI(*B.getMRI()),
+ RBI(*MF.getSubtarget().getRegBankInfo()),
+ TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){};
+
+ bool isVgprRegBank(Register Reg);
+
+ struct MinMaxMedOpc {
+ unsigned Min, Max, Med;
+ };
+
+ struct Med3MatchInfo {
+ unsigned Opc;
+ Register Val0, Val1, Val2;
+ };
+
+ MinMaxMedOpc getMinMaxPair(unsigned Opc);
+
+ template <class m_Cst>
+ bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
+ Register &Val, Register &K0, Register &K1);
+
+ bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+ void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
+};
+
+bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) {
+ return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
+}
+
+AMDGPURegBankCombinerHelper::MinMaxMedOpc
+AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unsupported opcode");
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_SMIN:
+ return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3};
+ case AMDGPU::G_UMAX:
+ case AMDGPU::G_UMIN:
+ return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3};
+ }
+}
+
+template <class m_Cst>
+bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MinMaxMedOpc MMMOpc, Register &Val,
+ Register &K0, Register &K1) {
+ // 4 operand commutes of: min(max(Val, K0), K1).
+ // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
+ // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
+ // 4 operand commutes of: max(min(Val, K1), K0).
+ // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
+ // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
+ return mi_match(
+ MI, MRI,
+ m_any_of(
+ m_CommutativeBinOp(
+ MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)),
+ m_Cst(K1)),
+ m_CommutativeBinOp(
+ MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)),
+ m_Cst(K0))));
+}
+
+bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
+ MachineInstr &MI, Med3MatchInfo &MatchInfo) {
+ Register Dst = MI.getOperand(0).getReg();
+ if (!isVgprRegBank(Dst))
+ return false;
+
+ if (MRI.getType(Dst).isVector())
+ return false;
+
+ MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
+ Register Val, K0, K1;
+ // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
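+ // For example (illustrative): smin(smax(%v, 0), 255), where K0 = 0 and
+ // K1 = 255 with K0 <= K1, becomes G_AMDGPU_SMED3 %v, 0, 255.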
+ if (!matchMed<ICstRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
+ return false;
+
+ const APInt &K0_Imm = getConstantIntVRegVal(K0, MRI)->getValue();
+ const APInt &K1_Imm = getConstantIntVRegVal(K1, MRI)->getValue();
+ if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0_Imm.sgt(K1_Imm))
+ return false;
+ if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0_Imm.ugt(K1_Imm))
+ return false;
+
+ MatchInfo = {OpcodeTriple.Med, Val, K0, K1};
+ return true;
+}
+
+void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
+ Med3MatchInfo &MatchInfo) {
+ B.setInstrAndDebugLoc(MI);
+ B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
+ {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags());
+ MI.eraseFromParent();
+}
+
+class AMDGPURegBankCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+ AMDGPURegBankCombinerHelper &RegBankHelper;
+
+public:
+ AMDGPURegBankCombinerHelperState(CombinerHelper &Helper,
+ AMDGPURegBankCombinerHelper &RegBankHelper)
+ : Helper(Helper), RegBankHelper(RegBankHelper) {}
+};
#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenRegBankGICombiner.inc"
@@ -62,9 +184,11 @@ bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, KB, MDT);
- AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg);
+ AMDGPURegBankCombinerHelper RegBankHelper(B, Helper);
+ AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper,
+ RegBankHelper);
- if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ if (Generated.tryCombineAll(Observer, MI, B))
return true;
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 502356d4f9a4..0e4005627e02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -443,9 +443,8 @@ static bool isScalarLoadLegal(const MachineInstr &MI) {
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-
- // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
- return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+ // Require 4-byte alignment.
+ return MMO->getAlign() >= Align(4) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
@@ -591,21 +590,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
- case TargetOpcode::G_SMIN:
- case TargetOpcode::G_SMAX:
- case TargetOpcode::G_UMIN:
- case TargetOpcode::G_UMAX: {
- static const OpRegBankEntry<3> Table[2] = {
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
-
- // Scalar requires cmp+select, and extends if 16-bit.
- // FIXME: Should there be separate costs for 32 and 16-bit
- { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
- };
-
- const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
- return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
- }
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
@@ -691,12 +675,13 @@ static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
static LLT getHalfSizedType(LLT Ty) {
if (Ty.isVector()) {
- assert(Ty.getNumElements() % 2 == 0);
- return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
+ assert(Ty.getElementCount().isKnownMultipleOf(2));
+ return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
+ Ty.getElementType());
}
- assert(Ty.getSizeInBits() % 2 == 0);
- return LLT::scalar(Ty.getSizeInBits() / 2);
+ assert(Ty.getScalarSizeInBits() % 2 == 0);
+ return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
@@ -1139,8 +1124,8 @@ static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
unsigned FirstPartNumElts = FirstSize / EltSize;
unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
- return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
- LLT::scalarOrVector(RemainderElts, EltTy)};
+ return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
+ LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
}
static LLT widen96To128(LLT Ty) {
@@ -1149,7 +1134,7 @@ static LLT widen96To128(LLT Ty) {
LLT EltTy = Ty.getElementType();
assert(128 % EltTy.getSizeInBits() == 0);
- return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
+ return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
@@ -1160,34 +1145,61 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;
- const RegisterBank *PtrBank =
- OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- if (PtrBank == &AMDGPU::SGPRRegBank) {
- // If the pointer is an SGPR, we ordinarily have nothing to do.
- if (LoadSize != 96)
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::SGPRRegBank) {
+ // 32-bit and 96-bit SGPR loads need special handling; otherwise there is
+ // nothing to do.
+ if (LoadSize != 32 && LoadSize != 96)
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned MemSize = 8 * MMO->getSize();
+ // Properly aligned 8-bit or 16-bit scalar loads may be widened to 32 bits.
+ // Check whether we need to widen the memory access: such loads have a load
+ // size of 32 but a memory access size of less than 32.
+ if (LoadSize == 32 &&
+ (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
+ return false;
+
Register PtrReg = MI.getOperand(1).getReg();
- // 96-bit loads are only available for vector loads. We need to split this
- // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, O);
- if (MMO->getAlign() < Align(16)) {
- LLT Part64, Part32;
- std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
- auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
- auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
-
- auto Undef = B.buildUndef(LoadTy);
- auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
- B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ if (LoadSize == 32) {
+ // This is an extending load from a sub-dword size. Widen the memory
+ // access size to 4 bytes and clear the extra high bits appropriately.
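+ // For example (illustrative), a G_ZEXTLOAD of 2 bytes into an s32 becomes
+ // a 4-byte load followed by clearing bits 16 and above of the result.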
+ const LLT S32 = LLT::scalar(32);
+ if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
+ // Must extend the sign bit into higher bits for a G_SEXTLOAD
+ auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
+ B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
+ } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
+ auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
+ B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
+ } else
+ // We do not need to touch the higher bits for regular loads.
+ B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
} else {
- LLT WiderTy = widen96To128(LoadTy);
- auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
- B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ // 96-bit loads are only available for vector loads. We need to split this
+ // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
+ // load).
+ if (MMO->getAlign() < Align(16)) {
+ LLT Part64, Part32;
+ std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
+ auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
+ auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
+
+ auto Undef = B.buildUndef(LoadTy);
+ auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
+ B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ } else {
+ LLT WiderTy = widen96To128(LoadTy);
+ auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
+ B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ }
}
MI.eraseFromParent();
@@ -1345,8 +1357,8 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
uint32_t SOffset, ImmOffset;
- if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- &RBI.Subtarget, Alignment)) {
+ if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+ &RBI.Subtarget, Alignment)) {
if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
VOffsetReg = Base;
SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
@@ -1366,7 +1378,8 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
}
// Handle the variable sgpr + vgpr case.
- if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
+ MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
+ if (Add && (int)Offset >= 0) {
Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
@@ -1519,8 +1532,8 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
return true;
}
-bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
- const OperandsMapper &OpdMapper, bool Signed) const {
+bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
+ bool Signed) const {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
@@ -1532,19 +1545,69 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
const LLT S32 = LLT::scalar(32);
+ unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
+ Register SrcReg = MI.getOperand(FirstOpnd).getReg();
+ Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
+ Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
+
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank) {
if (Ty == S32)
return true;
- // TODO: 64-bit version is scalar only, so we need to expand this.
- return false;
- }
+ // There are no 64-bit VGPR bitfield extract instructions, so the operation
+ // is expanded into a sequence of instructions that implements it.
+ ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, ApplyBank);
+
+ const LLT S64 = LLT::scalar(64);
+ // Shift the source operand so that extracted bits start at bit 0.
+ auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+ : B.buildLShr(S64, SrcReg, OffsetReg);
+ auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+ // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+ // if the width is a constant.
+ if (auto ConstWidth = getConstantVRegValWithLookThrough(WidthReg, MRI)) {
+ // Depending on the width, perform the extract on either the low or the
+ // high 32 bits of the shifted source.
+ auto Zero = B.buildConstant(S32, 0);
+ auto WidthImm = ConstWidth->Value.getZExtValue();
+ if (WidthImm <= 32) {
+ // Use bitfield extract on the lower 32-bit source, and then sign-extend
+ // or clear the upper 32-bits.
+ auto Extract =
+ Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+ auto Extend =
+ Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+ B.buildMerge(DstReg, {Extract, Extend});
+ } else {
+ // Use bitfield extract on upper 32-bit source, and combine with lower
+ // 32-bit source.
+ auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+ auto Extract =
+ Signed
+ ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+ : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+ B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
+ }
+ MI.eraseFromParent();
+ return true;
+ }
- Register SrcReg = MI.getOperand(2).getReg();
- Register OffsetReg = MI.getOperand(3).getReg();
- Register WidthReg = MI.getOperand(4).getReg();
+ // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+ // operations.
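+ // The final shift is arithmetic for the signed form and logical otherwise.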
+ auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+ auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+ if (Signed)
+ B.buildAShr(S64, SignBit, ExtShift);
+ else
+ B.buildLShr(S64, SignBit, ExtShift);
+ MI.eraseFromParent();
+ return true;
+ }
// The scalar form packs the offset and width in a single operand.
@@ -1576,32 +1639,19 @@ bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
return true;
}
-// FIXME: Duplicated from LegalizerHelper
-static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
- switch (Opc) {
- case TargetOpcode::G_SMIN:
- return CmpInst::ICMP_SLT;
- case TargetOpcode::G_SMAX:
- return CmpInst::ICMP_SGT;
- case TargetOpcode::G_UMIN:
- return CmpInst::ICMP_ULT;
- case TargetOpcode::G_UMAX:
- return CmpInst::ICMP_UGT;
- default:
- llvm_unreachable("not in integer min/max");
- }
-}
-
-static unsigned minMaxToExtend(unsigned Opc) {
+// Return a suitable opcode for extending the operands of Opc when widening.
+static unsigned getExtendOp(unsigned Opc) {
switch (Opc) {
+ case TargetOpcode::G_ASHR:
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
return TargetOpcode::G_SEXT;
+ case TargetOpcode::G_LSHR:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX:
return TargetOpcode::G_ZEXT;
default:
- llvm_unreachable("not in integer min/max");
+ return TargetOpcode::G_ANYEXT;
}
}
@@ -1628,30 +1678,6 @@ unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}
-static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
- CmpInst::Predicate Pred,
- Register Dst, Register Src0,
- Register Src1) {
- const LLT CmpType = LLT::scalar(32);
- auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
- return B.buildSelect(Dst, Cmp, Src0, Src1);
-}
-
-// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
-void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
- MachineInstr &MI) const {
- Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
-
- const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
- MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
-
- Register CmpReg = Sel->getOperand(1).getReg();
- B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
- MI.eraseFromParent();
-}
-
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static bool substituteSimpleCopyRegs(
@@ -1688,7 +1714,7 @@ Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
const LLT S32 = LLT::scalar(32);
int NumElts = StoreVT.getNumElements();
- return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
}
static std::pair<Register, unsigned>
@@ -1754,17 +1780,14 @@ static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
}
-static unsigned extractGLC(unsigned CachePolicy) {
- return CachePolicy & 1;
+static unsigned extractCPol(unsigned CachePolicy) {
+ return CachePolicy & AMDGPU::CPol::ALL;
}
-static unsigned extractSLC(unsigned CachePolicy) {
- return (CachePolicy >> 1) & 1;
+static unsigned extractSWZ(unsigned CachePolicy) {
+ return (CachePolicy >> 3) & 1;
}
-static unsigned extractDLC(unsigned CachePolicy) {
- return (CachePolicy >> 2) & 1;
-}
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
@@ -1830,10 +1853,9 @@ AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
MIB.addUse(RSrc)
.addUse(SOffset)
.addImm(ImmOffset)
- .addImm(extractGLC(CachePolicy))
- .addImm(extractSLC(CachePolicy))
+ .addImm(extractCPol(CachePolicy))
.addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractDLC(CachePolicy))
+ .addImm(extractSWZ(CachePolicy))
.cloneMemRefs(MI);
// FIXME: We need a way to report failure from applyMappingImpl.
@@ -2006,6 +2028,22 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
return true;
}
+// Insert a cross regbank copy for a register if it already has a bank that
+// differs from the one we want to set.
+static Register constrainRegToBank(MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, Register &Reg,
+ const RegisterBank &Bank) {
+ const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
+ if (CurrBank && *CurrBank != Bank) {
+ Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
+ MRI.setRegBank(Copy, Bank);
+ return Copy;
+ }
+
+ MRI.setRegBank(Reg, Bank);
+ return Reg;
+}
+
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
MachineInstr &MI, MachineRegisterInfo &MRI,
const OperandsMapper &OpdMapper) const {
@@ -2069,17 +2107,18 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
for (unsigned L = 0; L < NumLanes; ++L) {
- auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
- UnmergeToEltTy.getReg(I * NumLanes + L));
+ Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
+ Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
+ Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
- for (unsigned N : { 0, 2, 3 })
- MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+ Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
+ MRI.setRegBank(Select, DstBank);
- Ops[I * NumLanes + L] = S->getOperand(0).getReg();
+ Ops[I * NumLanes + L] = Select;
}
}
- LLT MergeTy = LLT::vector(Ops.size(), EltTy);
+ LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
B.buildBuildVector(MI.getOperand(0), Ops);
} else {
@@ -2336,18 +2375,40 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
+ case AMDGPU::G_ABS: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
+
+ // There is no VALU abs instruction, so we need to replace it with a sub
+ // and max combination.
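+ // LegalizerHelper::lowerAbsToMaxNeg rewrites abs(x) as max(x, 0 - x).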
+ if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
+ MachineFunction *MF = MI.getParent()->getParent();
+ ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, Apply);
+ LegalizerHelper Helper(*MF, Apply, B);
+
+ if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
+ llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
+ return;
+ }
+ LLVM_FALLTHROUGH;
+ }
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
- case AMDGPU::G_ASHR: {
+ case AMDGPU::G_ASHR:
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
- if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
+ if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
break;
const RegisterBank *DstBank =
@@ -2365,10 +2426,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register WideSrc0Lo, WideSrc0Hi;
Register WideSrc1Lo, WideSrc1Hi;
+ unsigned ExtendOp = getExtendOp(MI.getOpcode());
std::tie(WideSrc0Lo, WideSrc0Hi)
- = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
+ = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
std::tie(WideSrc1Lo, WideSrc1Hi)
- = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
+ = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
@@ -2390,73 +2452,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
- case AMDGPU::G_SMIN:
- case AMDGPU::G_SMAX:
- case AMDGPU::G_UMIN:
- case AMDGPU::G_UMAX: {
- Register DstReg = MI.getOperand(0).getReg();
- const RegisterBank *DstBank =
- OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
- if (DstBank == &AMDGPU::VGPRRegBank)
- break;
-
- MachineFunction *MF = MI.getParent()->getParent();
- MachineIRBuilder B(MI);
-
- // Turn scalar min/max into a compare and select.
- LLT Ty = MRI.getType(DstReg);
- const LLT S32 = LLT::scalar(32);
- const LLT S16 = LLT::scalar(16);
- const LLT V2S16 = LLT::vector(2, 16);
-
- if (Ty == V2S16) {
- ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- B.setChangeObserver(ApplySALU);
-
- // Need to widen to s32, and expand as cmp + select, and avoid producing
- // illegal vector extends or unmerges that would need further
- // legalization.
- //
- // TODO: Should we just readfirstlane? That should probably be handled
- // with a UniformVGPR register bank that wouldn't need special
- // consideration here.
-
- Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
-
- Register WideSrc0Lo, WideSrc0Hi;
- Register WideSrc1Lo, WideSrc1Hi;
-
- unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
-
- std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
- std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
-
- Register Lo = MRI.createGenericVirtualRegister(S32);
- Register Hi = MRI.createGenericVirtualRegister(S32);
- const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
- buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
- buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
-
- B.buildBuildVectorTrunc(Dst, {Lo, Hi});
- MI.eraseFromParent();
- } else if (Ty == S16) {
- ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- B.setChangeObserver(ApplySALU);
- LegalizerHelper Helper(*MF, ApplySALU, B);
-
- // Need to widen to s32, and expand as cmp + select.
- if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
- llvm_unreachable("widenScalar should have succeeded");
-
- // FIXME: This is relying on widenScalar leaving MI in place.
- lowerScalarMinMax(B, MI);
- } else
- lowerScalarMinMax(B, MI);
-
- return;
- }
case AMDGPU::G_SEXT_INREG: {
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
if (SrcRegs.empty())
@@ -2496,6 +2491,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_CTPOP:
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_CTLZ_ZERO_UNDEF:
case AMDGPU::G_CTTZ_ZERO_UNDEF: {
const RegisterBank *DstBank =
@@ -2605,7 +2601,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::vector(2, 16))
+ if (DstTy != LLT::fixed_vector(2, 16))
break;
assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
@@ -2737,7 +2733,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(DstTy.getSizeInBits() == 64);
- LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+ LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
@@ -2854,7 +2850,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
- LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
+ LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@@ -2953,7 +2949,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
}
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
@@ -3012,10 +3010,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_sbfe:
- applyMappingBFEIntrinsic(OpdMapper, true);
+ applyMappingBFE(OpdMapper, true);
return;
case Intrinsic::amdgcn_ubfe:
- applyMappingBFEIntrinsic(OpdMapper, false);
+ applyMappingBFE(OpdMapper, false);
return;
case Intrinsic::amdgcn_ballot:
// Use default handling and insert copy to vcc source.
@@ -3107,6 +3105,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_DYN_STACKALLOC:
applyMappingDynStackAlloc(MI, OpdMapper, MRI);
return;
+ case AMDGPU::G_SBFX:
+ applyMappingBFE(OpdMapper, /*Signed*/ true);
+ return;
+ case AMDGPU::G_UBFX:
+ applyMappingBFE(OpdMapper, /*Signed*/ false);
+ return;
default:
break;
}
@@ -3579,7 +3583,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX:
+ case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
+ case AMDGPU::G_SBFX:
+ case AMDGPU::G_UBFX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
@@ -3621,6 +3628,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
+ case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
+ case AMDGPU::G_AMDGPU_SMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
@@ -3679,7 +3688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- if (DstTy == LLT::vector(2, 16)) {
+ if (DstTy == LLT::fixed_vector(2, 16)) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
@@ -3706,10 +3715,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_BITCAST:
case AMDGPU::G_INTTOPTR:
case AMDGPU::G_PTRTOINT:
- case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -3919,7 +3928,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
- case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -4033,6 +4044,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_u8_f32:
case Intrinsic::amdgcn_alignbit:
case Intrinsic::amdgcn_alignbyte:
+ case Intrinsic::amdgcn_perm:
case Intrinsic::amdgcn_fdot2:
case Intrinsic::amdgcn_sdot2:
case Intrinsic::amdgcn_udot2:
@@ -4052,7 +4064,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_update_dpp:
case Intrinsic::amdgcn_mov_dpp8:
case Intrinsic::amdgcn_mov_dpp:
+ case Intrinsic::amdgcn_strict_wwm:
case Intrinsic::amdgcn_wwm:
+ case Intrinsic::amdgcn_strict_wqm:
case Intrinsic::amdgcn_wqm:
case Intrinsic::amdgcn_softwqm:
case Intrinsic::amdgcn_set_inactive:
@@ -4176,7 +4190,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
- case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
+ case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
+ case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
+ case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
+ case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
+ case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -4250,6 +4271,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_csub:
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -4306,6 +4332,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
+ case Intrinsic::amdgcn_live_mask: {
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
+ }
+ case Intrinsic::amdgcn_wqm_demote:
case Intrinsic::amdgcn_kill: {
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
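The register-bank hunks above switch every LLT::vector(N, Bits) call over to LLT::fixed_vector(N, Bits), making the fixed-length nature of the type explicit. A minimal standalone sketch of the renamed constructor, not part of this patch and with the include path assumed for this LLVM revision:

    #include "llvm/Support/LowLevelTypeImpl.h"
    using namespace llvm;

    // Both spellings describe the packed <2 x s16> type used throughout the
    // AMDGPU register-bank code; only the constructor name changed.
    static bool isPackedV2S16(LLT Ty) {
      const LLT V2S16 = LLT::fixed_vector(2, 16); // 2 fixed elements, 16 bits each
      return Ty == V2S16;
    }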
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 1c1441729e30..7e051e4a5424 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -81,10 +81,7 @@ public:
MachineRegisterInfo &MRI, int RSrcIdx) const;
bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
- bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
- bool Signed) const;
-
- void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
+ bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 6c70b53b23c1..50999a4802b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
+ [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
- [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
+ [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
new file mode 100644
index 000000000000..dabb4d006d99
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -0,0 +1,460 @@
+//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces all the uses of LDS within non-kernel functions by their
+// corresponding pointer counterparts.
+//
+// The main motivation behind this pass is to *prevent* the subsequent LDS
+// lowering pass from directly packing LDS (assume large LDS) into a struct
+// type, which would otherwise allocate huge memory for the struct instance
+// within every kernel.
+//
+// A brief sketch of the algorithm implemented in this pass is as below:
+//
+// 1. Collect all the LDS defined in the module which qualify for pointer
+// replacement; call this the LDSGlobals set.
+//
+// 2. Collect all the reachable callees for each kernel defined in the module;
+// call this the KernelToCallees map.
+//
+// 3. FOR (each global GV from LDSGlobals set) DO
+// LDSUsedNonKernels = Collect all non-kernel functions which use GV.
+// FOR (each kernel K in KernelToCallees map) DO
+// ReachableCallees = KernelToCallees[K]
+// ReachableAndLDSUsedCallees =
+// SetIntersect(LDSUsedNonKernels, ReachableCallees)
+// IF (ReachableAndLDSUsedCallees is not empty) THEN
+// Pointer = Create a pointer to point to GV, if not already created.
+// Initialize Pointer to point to GV within kernel K.
+// ENDIF
+// ENDFOR
+// Replace all uses of GV within non-kernel functions by Pointer.
+// ENDFOR
+//
+// LLVM IR example:
+//
+// Input IR:
+//
+// @lds = internal addrspace(3) global [4 x i32] undef, align 16
+//
+// define internal void @f0() {
+// entry:
+// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds,
+// i32 0, i32 0
+// ret void
+// }
+//
+// define protected amdgpu_kernel void @k0() {
+// entry:
+// call void @f0()
+// ret void
+// }
+//
+// Output IR:
+//
+// @lds = internal addrspace(3) global [4 x i32] undef, align 16
+// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
+//
+// define internal void @f0() {
+// entry:
+// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2
+// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
+// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
+// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2,
+// i32 0, i32 0
+// ret void
+// }
+//
+// define protected amdgpu_kernel void @k0() {
+// entry:
+// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16),
+// i16 addrspace(3)* @lds.ptr, align 2
+// call void @f0()
+// ret void
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
+
+using namespace llvm;
+
+namespace {
+
+class ReplaceLDSUseImpl {
+ Module &M;
+ LLVMContext &Ctx;
+ const DataLayout &DL;
+ Constant *LDSMemBaseAddr;
+
+ DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
+ DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels;
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
+ DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers;
+ DenseMap<Function *, BasicBlock *> KernelToInitBB;
+ DenseMap<Function *, DenseMap<GlobalVariable *, Value *>>
+ FunctionToLDSToReplaceInst;
+
+ // Collect the LDS globals whose uses need to be replaced by a pointer.
+ std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
+ // Collect the LDS globals which require module lowering.
+ std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M);
+
+ // Remove LDS which don't qualify for replacement.
+ LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(),
+ [&](GlobalVariable *GV) {
+ return shouldIgnorePointerReplacement(GV);
+ }),
+ LDSGlobals.end());
+
+ return LDSGlobals;
+ }
+
+ // Returns true if the uses of the given LDS global within non-kernel
+ // functions should be kept as they are, without pointer replacement.
+ bool shouldIgnorePointerReplacement(GlobalVariable *GV) {
+ // LDS whose size is very small and doesn't exceed the pointer size is not
+ // worth replacing.
+ if (DL.getTypeAllocSize(GV->getValueType()) <= 2)
+ return true;
+
+ // LDS which is not used from non-kernel function scope, or which is used
+ // from global scope, does not qualify for replacement.
+ LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
+ return LDSToNonKernels[GV].empty();
+
+ // FIXME: When GV is used within all (or most) of the kernels, it does not
+ // make sense to create a pointer for it.
+ }
+
+ // Insert a new global LDS pointer which points to the LDS.
+ GlobalVariable *createLDSPointer(GlobalVariable *GV) {
+ // If an LDS pointer which points to this LDS has already been created,
+ // return it.
+ auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr));
+ if (!PointerEntry.second)
+ return PointerEntry.first->second;
+
+ // We need to create a new LDS pointer which points to the LDS.
+ //
+ // Each CU owns at most 64K of LDS memory, so an LDS address ranges from 0 to
+ // 2^16 - 1. Hence a 16-bit pointer is enough to hold the LDS address.
+ auto *I16Ty = Type::getInt16Ty(Ctx);
+ GlobalVariable *LDSPointer = new GlobalVariable(
+ M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty),
+ GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal,
+ AMDGPUAS::LOCAL_ADDRESS);
+
+ LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer));
+
+ // Mark that an associated LDS pointer has been created for this LDS.
+ LDSToPointer[GV] = LDSPointer;
+
+ return LDSPointer;
+ }
+
+ // Split the entry basic block in such a way that only lane 0 of each wave
+ // does the LDS pointer initialization, and return the newly created basic
+ // block.
+ BasicBlock *activateLaneZero(Function *K) {
+ // If the entry basic block of kernel K has already been split, then return
+ // the previously created basic block.
+ auto BasicBlockEntry = KernelToInitBB.insert(std::make_pair(K, nullptr));
+ if (!BasicBlockEntry.second)
+ return BasicBlockEntry.first->second;
+
+ // Split entry basic block of kernel K.
+ auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt()));
+ IRBuilder<> Builder(EI);
+
+ Value *Mbcnt =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {Builder.getInt32(-1), Builder.getInt32(0)});
+ Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0));
+ Instruction *WB = cast<Instruction>(
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
+
+ BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent();
+
+ // Mark that the entry basic block of kernel K has been split.
+ KernelToInitBB[K] = NBB;
+
+ return NBB;
+ }
+
+ // Within the given kernel, initialize the given LDS pointer to point to the
+ // given LDS.
+ void initializeLDSPointer(Function *K, GlobalVariable *GV,
+ GlobalVariable *LDSPointer) {
+ // If LDS pointer is already initialized within K, then nothing to do.
+ auto PointerEntry = KernelToLDSPointers.insert(
+ std::make_pair(K, SmallPtrSet<GlobalVariable *, 8>()));
+ if (!PointerEntry.second)
+ if (PointerEntry.first->second.contains(LDSPointer))
+ return;
+
+ // Insert instructions at EI which initialize the LDS pointer to point to the
+ // LDS within kernel K.
+ //
+ // That is, convert the address of GV to an i16 value, and then store this
+ // value into LDSPointer, which is of type i16*.
+ auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
+ IRBuilder<> Builder(EI);
+ Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)),
+ LDSPointer);
+
+ // Mark that the LDS pointer has been initialized within kernel K.
+ KernelToLDSPointers[K].insert(LDSPointer);
+ }
+
+ // We have created an LDS pointer for the LDS, and initialized it to point to
+ // the LDS within all relevant kernels. Now replace all the uses of the LDS
+ // within non-kernel functions by the LDS pointer.
+ void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) {
+ SmallVector<User *, 8> LDSUsers(GV->users());
+ for (auto *U : LDSUsers) {
+ // When `U` is a constant expression, it is possible that the same constant
+ // expression exists within multiple instructions, and within multiple
+ // non-kernel functions. Collect all those non-kernel functions and all
+ // those instructions within which `U` exists.
+ auto FunctionToInsts =
+ AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/);
+
+ for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end();
+ FI != FE; ++FI) {
+ Function *F = FI->first;
+ auto &Insts = FI->second;
+ for (auto *I : Insts) {
+ // If `U` is a constant expression, then we need to break the
+ // associated instruction into a set of separate instructions by
+ // converting constant expressions into instructions.
+ SmallPtrSet<Instruction *, 8> UserInsts;
+
+ if (U == I) {
+ // `U` is an instruction, conversion from constant expression to
+ // set of instructions is *not* required.
+ UserInsts.insert(I);
+ } else {
+ // `U` is a constant expression, convert it into corresponding set
+ // of instructions.
+ auto *CE = cast<ConstantExpr>(U);
+ convertConstantExprsToInstructions(I, CE, &UserInsts);
+ }
+
+ // Go through all the user instructions; if the LDS exists within them as an
+ // operand, then replace it with the replacement instruction.
+ for (auto *II : UserInsts) {
+ auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer);
+ II->replaceUsesOfWith(GV, ReplaceInst);
+ }
+ }
+ }
+ }
+ }
+
+ // Create a set of replacement instructions which together replace the LDS
+ // within non-kernel function F by accessing the LDS indirectly using the LDS
+ // pointer.
+ Value *getReplacementInst(Function *F, GlobalVariable *GV,
+ GlobalVariable *LDSPointer) {
+ // If the instruction which replaces the LDS within F has already been
+ // created, then return it.
+ auto LDSEntry = FunctionToLDSToReplaceInst.insert(
+ std::make_pair(F, DenseMap<GlobalVariable *, Value *>()));
+ if (!LDSEntry.second) {
+ auto ReplaceInstEntry =
+ LDSEntry.first->second.insert(std::make_pair(GV, nullptr));
+ if (!ReplaceInstEntry.second)
+ return ReplaceInstEntry.first->second;
+ }
+
+ // Get the instruction insertion point at the beginning of the entry block of
+ // the current non-kernel function.
+ auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
+ IRBuilder<> Builder(EI);
+
+ // Insert the required set of instructions which replace the LDS within F.
+ auto *V = Builder.CreateBitCast(
+ Builder.CreateGEP(
+ Builder.getInt8Ty(), LDSMemBaseAddr,
+ Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)),
+ GV->getType());
+
+ // Mark that the replacement instruction which replaces the LDS within F has
+ // been created.
+ FunctionToLDSToReplaceInst[F][GV] = V;
+
+ return V;
+ }
+
+public:
+ ReplaceLDSUseImpl(Module &M)
+ : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {
+ LDSMemBaseAddr = Constant::getIntegerValue(
+ PointerType::get(Type::getInt8Ty(M.getContext()),
+ AMDGPUAS::LOCAL_ADDRESS),
+ APInt(32, 0));
+ }
+
+ // Entry-point function which interfaces ReplaceLDSUseImpl with the outside
+ // of the class.
+ bool replaceLDSUse();
+
+private:
+ // For a given LDS from the collected LDS globals set, replace its non-kernel
+ // function scope uses by a pointer.
+ bool replaceLDSUse(GlobalVariable *GV);
+};
+
+// For a given LDS from the collected LDS globals set, replace its non-kernel
+// function scope uses by a pointer.
+bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
+ // Holds all those non-kernel functions within which LDS is being accessed.
+ SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV];
+
+ // The LDS pointer which points to LDS and replaces all the uses of LDS.
+ GlobalVariable *LDSPointer = nullptr;
+
+ // Traverse through each kernel K and, if required, initialize the LDS
+ // pointer to point to the LDS within K.
+ for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE;
+ ++KI) {
+ Function *K = KI->first;
+ SmallPtrSet<Function *, 8> Callees = KI->second;
+
+ // Compute the callees that are both reachable from kernel K and use the LDS.
+ set_intersect(Callees, LDSAccessors);
+
+ // None of the LDS-accessing non-kernel functions is reachable from kernel K.
+ // Hence, there is no need to initialize the LDS pointer within kernel K.
+ if (Callees.empty())
+ continue;
+
+ // We have found callees of kernel K that are reachable and use the LDS, so
+ // we need to initialize the LDS pointer within kernel K and replace the LDS
+ // uses within those callees by the LDS pointer.
+ //
+ // But first, check whether the LDS pointer has already been created; if not,
+ // create one.
+ LDSPointer = createLDSPointer(GV);
+
+ // Initialize LDS pointer to point to LDS within kernel K.
+ initializeLDSPointer(K, GV, LDSPointer);
+ }
+
+ // We have not found reachable LDS-using callees for any of the kernels, and
+ // hence we have not created an LDS pointer.
+ if (!LDSPointer)
+ return false;
+
+ // We have created an LDS pointer for the LDS, and initialized it to point to
+ // the LDS within all relevant kernels. Now replace all the uses of the LDS
+ // within non-kernel functions by the LDS pointer.
+ replaceLDSUseByPointer(GV, LDSPointer);
+
+ return true;
+}
+
+// Entry-point function which interfaces ReplaceLDSUseImpl with the outside of
+// the class.
+bool ReplaceLDSUseImpl::replaceLDSUse() {
+ // Collect the LDS globals whose uses need to be replaced by a pointer.
+ std::vector<GlobalVariable *> LDSGlobals =
+ collectLDSRequiringPointerReplace();
+
+ // No LDS to pointer-replace. Nothing to do.
+ if (LDSGlobals.empty())
+ return false;
+
+ // Collect reachable callee set for each kernel defined in the module.
+ AMDGPU::collectReachableCallees(M, KernelToCallees);
+
+ if (KernelToCallees.empty()) {
+ // Either the module does not have any kernel definitions, or none of the
+ // kernels has a call to non-kernel functions, or we could not resolve any of
+ // the call sites to proper non-kernel functions because of situations like
+ // inline asm calls. Nothing to replace.
+ return false;
+ }
+
+ // For every LDS from the collected LDS globals set, replace its non-kernel
+ // function scope uses by a pointer.
+ bool Changed = false;
+ for (auto *GV : LDSGlobals)
+ Changed |= replaceLDSUse(GV);
+
+ return Changed;
+}
+
+class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
+public:
+ static char ID;
+
+ AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
+ initializeAMDGPUReplaceLDSUseWithPointerPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+} // namespace
+
+char AMDGPUReplaceLDSUseWithPointer::ID = 0;
+char &llvm::AMDGPUReplaceLDSUseWithPointerID =
+ AMDGPUReplaceLDSUseWithPointer::ID;
+
+INITIALIZE_PASS_BEGIN(
+ AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
+ "Replace within non-kernel function use of LDS with pointer",
+ false /*only look at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(
+ AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
+ "Replace within non-kernel function use of LDS with pointer",
+ false /*only look at the cfg*/, false /*analysis pass*/)
+
+bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
+ ReplaceLDSUseImpl LDSUseReplacer{M};
+ return LDSUseReplacer.replaceLDSUse();
+}
+
+ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
+ return new AMDGPUReplaceLDSUseWithPointer();
+}
+
+PreservedAnalyses
+AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
+ ReplaceLDSUseImpl LDSUseReplacer{M};
+ LDSUseReplacer.replaceLDSUse();
+ return PreservedAnalyses::all();
+}
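The replacement scheme above boils down to one extra indirection: each kernel stores a 16-bit offset of the LDS variable into @lds.ptr, and every non-kernel access rebuilds the address from the LDS base (address 0 of the local address space) plus that offset. A rough host-side C++ analogy, with hypothetical names and without AMDGPU address spaces, is:

    #include <cstdint>

    // Hypothetical stand-ins: 'lds_base' models address 0 of the 64 KiB local
    // memory window, and 'lds_offset' models the i16 slot (@lds.ptr) that lane 0
    // of each wave initializes with ptrtoint(@lds).
    extern char lds_base[1 << 16];
    extern uint16_t lds_offset;

    // What a non-kernel function effectively does instead of referencing @lds
    // directly: load the 16-bit offset, add it to the LDS base, and cast the
    // result to the original LDS type.
    static int *replacedLDSAccess() {
      return reinterpret_cast<int *>(lds_base + lds_offset);
    }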
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
new file mode 100644
index 000000000000..ef46e53b7460
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -0,0 +1,514 @@
+//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+/// The results of this analysis are used to fill the register usage, flat
+/// usage, etc. into hardware registers.
+///
+/// The analysis takes callees into account. E.g. if a function A that needs 10
+/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
+/// will return 20.
+/// It is assumed that an indirect call can go into any function except
+/// hardware-entrypoints. Therefore the register usage of functions with
+/// indirect calls is estimated as the maximum of all non-entrypoint functions
+/// in the module.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+#define DEBUG_TYPE "amdgpu-resource-usage"
+
+char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
+char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
+
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+ "amdgpu-assume-external-call-stack-size",
+ cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
+ cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+ "amdgpu-assume-dynamic-stack-object-size",
+ cl::desc("Assumed extra stack use if there are any "
+ "variable sized objects (in bytes)"),
+ cl::Hidden, cl::init(4096));
+
+INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
+ "Function register usage analysis", true, true)
+
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+ if (Op.isImm()) {
+ assert(Op.getImm() == 0);
+ return nullptr;
+ }
+
+ return cast<Function>(Op.getGlobal());
+}
+
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, unsigned Reg) {
+ for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+ if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+ return true;
+ }
+
+ return false;
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
+ const GCNSubtarget &ST) const {
+ return NumExplicitSGPR +
+ IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
+ ST.getTargetID().isXnackOnOrAny());
+}
+
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ if (ST.hasGFX90AInsts() && NumAGPR)
+ return alignTo(NumVGPR, 4) + NumAGPR;
+ return std::max(NumVGPR, NumAGPR);
+}
+
+bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ bool HasIndirectCall = false;
+
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+
+ auto CI = CallGraphResourceInfo.insert(
+ std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = CI.first->second;
+ assert(CI.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF, TM);
+ HasIndirectCall |= Info.HasIndirectCall;
+ }
+
+ if (HasIndirectCall)
+ propagateIndirectCallRegisterUsage();
+
+ return false;
+}
+
+AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
+ const MachineFunction &MF, const TargetMachine &TM) const {
+ SIFunctionResourceInfo Info;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
+ MRI.isLiveIn(MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
+
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer. Inline assembly may
+ // need it though.
+ //
+ // If we only have implicit uses of flat_scr on flat instructions, it is not
+ // really needed.
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+ (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+ Info.UsesFlatScratch = false;
+ }
+
+ Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+ // Assume a big number if there are any unknown sized objects.
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ if (Info.HasDynamicallySizedStack)
+ Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
+
+ Info.UsesVCC =
+ MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+ // A tail call isn't considered a call for MachineFrameInfo's purposes.
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
+ }
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestAGPRReg) + 1;
+ }
+
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get
+ // the number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
+ ? 0
+ : TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
+ }
+
+ int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
+ int32_t MaxSGPR = -1;
+ uint64_t CalleeFrameSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: Check regmasks? Do they occur anywhere except calls?
+ for (const MachineOperand &MO : MI.operands()) {
+ unsigned Width = 0;
+ bool IsSGPR = false;
+ bool IsAGPR = false;
+
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ case AMDGPU::M0_LO16:
+ case AMDGPU::M0_HI16:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::MODE:
+ continue;
+
+ case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+ llvm_unreachable("src_pops_exiting_wave_id should not be used");
+
+ case AMDGPU::NoRegister:
+ assert(MI.isDebugInstr() &&
+ "Instruction uses invalid noreg register");
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
+ Info.UsesVCC = true;
+ continue;
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ continue;
+
+ case AMDGPU::XNACK_MASK:
+ case AMDGPU::XNACK_MASK_LO:
+ case AMDGPU::XNACK_MASK_HI:
+ llvm_unreachable("xnack_mask registers should not be used");
+
+ case AMDGPU::LDS_DIRECT:
+ llvm_unreachable("lds_direct register should not be used");
+
+ case AMDGPU::TBA:
+ case AMDGPU::TBA_LO:
+ case AMDGPU::TBA_HI:
+ case AMDGPU::TMA:
+ case AMDGPU::TMA_LO:
+ case AMDGPU::TMA_HI:
+ llvm_unreachable("trap handler registers should not be used");
+
+ case AMDGPU::SRC_VCCZ:
+ llvm_unreachable("src_vccz register should not be used");
+
+ case AMDGPU::SRC_EXECZ:
+ llvm_unreachable("src_execz register should not be used");
+
+ case AMDGPU::SRC_SCC:
+ llvm_unreachable("src_scc register should not be used");
+
+ default:
+ break;
+ }
+
+ if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 1;
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 2;
+ } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 3;
+ } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 4;
+ } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 4;
+ } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 5;
+ } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
+ } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 7;
+ } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 7;
+ } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 7;
+ } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
+ IsSGPR = true;
+ Width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 16;
+ } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 16;
+ } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 32;
+ } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 32;
+ } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 32;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ unsigned HWReg = TRI.getHWRegIndex(Reg);
+ int MaxUsed = HWReg + Width - 1;
+ if (IsSGPR) {
+ MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+ } else {
+ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+ }
+ }
+
+ if (MI.isCall()) {
+ // Pseudo used just to encode the underlying global. Is there a better
+ // way to track this?
+
+ const MachineOperand *CalleeOp =
+ TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+
+ const Function *Callee = getCalleeFunction(*CalleeOp);
+ DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+ CallGraphResourceInfo.end();
+
+ // Avoid crashing on undefined behavior with an illegal call to a
+ // kernel. If a callsite's calling convention doesn't match the
+ // function's, it's undefined behavior. If the callsite calling
+ // convention does match, that would have errored earlier.
+ if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+ report_fatal_error("invalid call to entry function");
+
+ bool IsIndirect = !Callee || Callee->isDeclaration();
+ if (!IsIndirect)
+ I = CallGraphResourceInfo.find(Callee);
+
+ if (IsIndirect || I == CallGraphResourceInfo.end()) {
+ CalleeFrameSize =
+ std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
+ // Register usage of indirect calls gets handled later
+ Info.UsesVCC = true;
+ Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+ Info.HasDynamicallySizedStack = true;
+ Info.HasIndirectCall = true;
+ } else {
+ // We force CodeGen to run in SCC order, so the callee's register
+ // usage etc. should be the cumulative usage of all callees.
+ MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+ MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
+ CalleeFrameSize =
+ std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+ Info.UsesVCC |= I->second.UsesVCC;
+ Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+ Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+ Info.HasRecursion |= I->second.HasRecursion;
+ Info.HasIndirectCall |= I->second.HasIndirectCall;
+ }
+
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse())
+ Info.HasRecursion = true;
+ }
+ }
+ }
+
+ Info.NumExplicitSGPR = MaxSGPR + 1;
+ Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
+ Info.PrivateSegmentSize += CalleeFrameSize;
+
+ return Info;
+}
+
+void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
+ // Collect the maximum number of registers from non-hardware-entrypoints.
+ // All these functions are potential targets for indirect calls.
+ int32_t NonKernelMaxSGPRs = 0;
+ int32_t NonKernelMaxVGPRs = 0;
+ int32_t NonKernelMaxAGPRs = 0;
+
+ for (const auto &I : CallGraphResourceInfo) {
+ if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
+ auto &Info = I.getSecond();
+ NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
+ NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
+ NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
+ }
+ }
+
+ // Add register usage for functions with indirect calls.
+ // For calls to unknown functions, we assume the maximum register usage of
+ // all non-hardware-entrypoints in the current module.
+ for (auto &I : CallGraphResourceInfo) {
+ auto &Info = I.getSecond();
+ if (Info.HasIndirectCall) {
+ Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
+ Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
+ Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+ }
+ }
+}
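The VGPR accounting in getTotalNumVGPRs depends on the subtarget: with gfx90a instructions the AGPRs are allocated after the VGPRs and the VGPR count is rounded up to a multiple of 4; otherwise the larger of the two counts is used. A standalone sketch mirroring that logic (the free function name is illustrative only):

    #include <algorithm>
    #include <cstdint>

    // Mirrors SIFunctionResourceInfo::getTotalNumVGPRs from the file above.
    int32_t totalNumVGPRs(bool HasGFX90AInsts, int32_t NumVGPR, int32_t NumAGPR) {
      if (HasGFX90AInsts && NumAGPR)
        return ((NumVGPR + 3) & ~3) + NumAGPR; // alignTo(NumVGPR, 4) + NumAGPR
      return std::max(NumVGPR, NumAGPR);
    }
    // Example: NumVGPR = 10, NumAGPR = 6 gives 12 + 6 = 18 with gfx90a insts,
    // and max(10, 6) = 10 otherwise.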
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
new file mode 100644
index 000000000000..832e8119e444
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -0,0 +1,79 @@
+//===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes how many registers and other resources are used by
+/// functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
+
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/ValueMap.h"
+
+namespace llvm {
+
+class GCNSubtarget;
+class MachineFunction;
+class TargetMachine;
+
+struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass {
+ static char ID;
+
+public:
+ // Track resource usage for callee functions.
+ struct SIFunctionResourceInfo {
+ // Track the number of explicitly used VGPRs. Special registers reserved at
+ // the end are tracked separately.
+ int32_t NumVGPR = 0;
+ int32_t NumAGPR = 0;
+ int32_t NumExplicitSGPR = 0;
+ uint64_t PrivateSegmentSize = 0;
+ bool UsesVCC = false;
+ bool UsesFlatScratch = false;
+ bool HasDynamicallySizedStack = false;
+ bool HasRecursion = false;
+ bool HasIndirectCall = false;
+
+ int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+ int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+ };
+
+ AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {}
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ bool doInitialization(CallGraph &CG) override {
+ CallGraphResourceInfo.clear();
+ return CallGraphSCCPass::doInitialization(CG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+
+ const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
+ auto Info = CallGraphResourceInfo.find(F);
+ assert(Info != CallGraphResourceInfo.end() &&
+ "Failed to find resource info for function");
+ return Info->getSecond();
+ }
+
+private:
+ SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF,
+ const TargetMachine &TM) const;
+ void propagateIndirectCallRegisterUsage();
+
+ DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
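A consumer pass that lists AMDGPUResourceUsageAnalysis in its getAnalysisUsage can then look up the per-function totals. A minimal sketch of such a query, assuming only the interface declared above (the free function is hypothetical):

    #include "AMDGPUResourceUsageAnalysis.h"
    #include "GCNSubtarget.h"
    #include "llvm/CodeGen/MachineFunction.h"

    using namespace llvm;

    static void queryResourceUsage(const AMDGPUResourceUsageAnalysis &RUA,
                                   const MachineFunction &MF) {
      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
      const auto &Info = RUA.getResourceInfo(&MF.getFunction());
      // The totals fold in VCC / flat-scratch SGPRs and gfx90a AGPR packing.
      int32_t SGPRs = Info.getTotalNumSGPRs(ST);
      int32_t VGPRs = Info.getTotalNumVGPRs(ST);
      (void)SGPRs;
      (void)VGPRs;
    }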
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index fd65727f04d4..afe016731395 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -199,6 +199,12 @@ def : SourceOfDivergence<int_r600_read_tidig_z>;
def : SourceOfDivergence<int_amdgcn_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
@@ -226,6 +232,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -240,9 +248,12 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
@@ -274,6 +285,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
+def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f1a7d7463676..0c5020dccecd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -98,12 +98,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Disable mutually exclusive bits.
- if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
- if (FS.find_lower("wavefrontsize16") == StringRef::npos)
+ if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
+ if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
FullFS += "-wavefrontsize16,";
- if (FS.find_lower("wavefrontsize32") == StringRef::npos)
+ if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
FullFS += "-wavefrontsize32,";
- if (FS.find_lower("wavefrontsize64") == StringRef::npos)
+ if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
FullFS += "-wavefrontsize64,";
}
@@ -163,6 +163,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
WavefrontSizeLog2 = 5;
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
TargetID.setTargetIDFromFeaturesString(FS);
@@ -176,6 +177,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
TargetTriple(TT),
+ GCN3Encoding(false),
Has16BitInsts(false),
HasMadMixInsts(false),
HasMadMacF32Insts(false),
@@ -184,6 +186,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasVOP3PInsts(false),
HasMulI24(true),
HasMulU24(true),
+ HasSMulHi(false),
HasInv2PiInlineImm(false),
HasFminFmaxLegacy(true),
EnablePromoteAlloca(false),
@@ -194,7 +197,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const GCNTargetMachine &TM) :
+ const GCNTargetMachine &TM)
+ : // clang-format off
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
AMDGPUSubtarget(TT),
TargetTriple(TT),
@@ -207,6 +211,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FastFMAF32(false),
FastDenormalF32(false),
HalfRate64Ops(false),
+ FullRate64Ops(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
@@ -216,6 +221,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasApertureRegs(false),
SupportsXNACK(false),
EnableXNACK(false),
+ EnableTgSplit(false),
EnableCuMode(false),
TrapHandler(false),
@@ -227,14 +233,16 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DumpCode(false),
FP64(false),
- GCN3Encoding(false),
CIInsts(false),
GFX8Insts(false),
GFX9Insts(false),
+ GFX90AInsts(false),
GFX10Insts(false),
GFX10_3Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
+ NegativeScratchOffsetBug(false),
+ NegativeUnalignedScratchOffsetBug(false),
HasSMemRealTime(false),
HasIntClamp(false),
HasFmaMixInsts(false),
@@ -249,10 +257,15 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAOutModsVOPC(false),
HasDPP(false),
HasDPP8(false),
+ Has64BitDPP(false),
+ HasPackedFP32Ops(false),
+ HasExtendedImageInsts(false),
HasR128A16(false),
HasGFX10A16(false),
HasG16(false),
HasNSAEncoding(false),
+ NSAMaxSize(0),
+ GFX10_AEncoding(false),
GFX10_BEncoding(false),
HasDLInsts(false),
HasDot1Insts(false),
@@ -261,6 +274,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasDot4Insts(false),
HasDot5Insts(false),
HasDot6Insts(false),
+ HasDot7Insts(false),
HasMAIInsts(false),
HasPkFmacF16Inst(false),
HasAtomicFaddInsts(false),
@@ -270,6 +284,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasVscnt(false),
HasGetWaveIdInst(false),
HasSMemTimeInst(false),
+ HasShaderCyclesRegister(false),
HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
@@ -278,12 +293,14 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FlatGlobalInsts(false),
FlatScratchInsts(false),
ScalarFlatScratchInsts(false),
+ HasArchitectedFlatScratch(false),
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
HasMFMAInlineLiteralBug(false),
UnalignedBufferAccess(false),
UnalignedDSAccess(false),
+ HasPackedTID(false),
ScalarizeGlobal(false),
@@ -294,6 +311,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasVcmpxExecWARHazard(false),
HasLdsBranchVmemWARHazard(false),
HasNSAtoVMEMBug(false),
+ HasNSAClauseBug(false),
HasOffset3fBug(false),
HasFlatSegmentOffsetBug(false),
HasImageStoreD16Bug(false),
@@ -303,6 +321,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
+ // clang-format on
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
@@ -313,7 +332,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
}
bool GCNSubtarget::enableFlatScratch() const {
- return EnableFlatScratch && hasFlatScratchInsts();
+ return flatScratchIsArchitected() ||
+ (EnableFlatScratch && hasFlatScratchInsts());
}
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
@@ -336,6 +356,105 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
return 2;
}
+/// This list was mostly derived from experimentation.
+bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
+ switch (Opcode) {
+ case AMDGPU::V_CVT_F16_F32_e32:
+ case AMDGPU::V_CVT_F16_F32_e64:
+ case AMDGPU::V_CVT_F16_U16_e32:
+ case AMDGPU::V_CVT_F16_U16_e64:
+ case AMDGPU::V_CVT_F16_I16_e32:
+ case AMDGPU::V_CVT_F16_I16_e64:
+ case AMDGPU::V_RCP_F16_e64:
+ case AMDGPU::V_RCP_F16_e32:
+ case AMDGPU::V_RSQ_F16_e64:
+ case AMDGPU::V_RSQ_F16_e32:
+ case AMDGPU::V_SQRT_F16_e64:
+ case AMDGPU::V_SQRT_F16_e32:
+ case AMDGPU::V_LOG_F16_e64:
+ case AMDGPU::V_LOG_F16_e32:
+ case AMDGPU::V_EXP_F16_e64:
+ case AMDGPU::V_EXP_F16_e32:
+ case AMDGPU::V_SIN_F16_e64:
+ case AMDGPU::V_SIN_F16_e32:
+ case AMDGPU::V_COS_F16_e64:
+ case AMDGPU::V_COS_F16_e32:
+ case AMDGPU::V_FLOOR_F16_e64:
+ case AMDGPU::V_FLOOR_F16_e32:
+ case AMDGPU::V_CEIL_F16_e64:
+ case AMDGPU::V_CEIL_F16_e32:
+ case AMDGPU::V_TRUNC_F16_e64:
+ case AMDGPU::V_TRUNC_F16_e32:
+ case AMDGPU::V_RNDNE_F16_e64:
+ case AMDGPU::V_RNDNE_F16_e32:
+ case AMDGPU::V_FRACT_F16_e64:
+ case AMDGPU::V_FRACT_F16_e32:
+ case AMDGPU::V_FREXP_MANT_F16_e64:
+ case AMDGPU::V_FREXP_MANT_F16_e32:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e64:
+ case AMDGPU::V_FREXP_EXP_I16_F16_e32:
+ case AMDGPU::V_LDEXP_F16_e64:
+ case AMDGPU::V_LDEXP_F16_e32:
+ case AMDGPU::V_LSHLREV_B16_e64:
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_ADD_U16_e64:
+ case AMDGPU::V_ADD_U16_e32:
+ case AMDGPU::V_SUB_U16_e64:
+ case AMDGPU::V_SUB_U16_e32:
+ case AMDGPU::V_SUBREV_U16_e64:
+ case AMDGPU::V_SUBREV_U16_e32:
+ case AMDGPU::V_MUL_LO_U16_e64:
+ case AMDGPU::V_MUL_LO_U16_e32:
+ case AMDGPU::V_ADD_F16_e64:
+ case AMDGPU::V_ADD_F16_e32:
+ case AMDGPU::V_SUB_F16_e64:
+ case AMDGPU::V_SUB_F16_e32:
+ case AMDGPU::V_SUBREV_F16_e64:
+ case AMDGPU::V_SUBREV_F16_e32:
+ case AMDGPU::V_MUL_F16_e64:
+ case AMDGPU::V_MUL_F16_e32:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F16_e32:
+ case AMDGPU::V_MIN_F16_e64:
+ case AMDGPU::V_MIN_F16_e32:
+ case AMDGPU::V_MAX_U16_e64:
+ case AMDGPU::V_MAX_U16_e32:
+ case AMDGPU::V_MIN_U16_e64:
+ case AMDGPU::V_MIN_U16_e32:
+ case AMDGPU::V_MAX_I16_e64:
+ case AMDGPU::V_MAX_I16_e32:
+ case AMDGPU::V_MIN_I16_e64:
+ case AMDGPU::V_MIN_I16_e32:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return getGeneration() <= AMDGPUSubtarget::GFX9;
+ case AMDGPU::V_MAD_F16_e64:
+ case AMDGPU::V_MADAK_F16:
+ case AMDGPU::V_MADMK_F16:
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_FMAMK_F16:
+ case AMDGPU::V_FMAAK_F16:
+ case AMDGPU::V_MAD_U16_e64:
+ case AMDGPU::V_MAD_I16_e64:
+ case AMDGPU::V_FMA_F16_e64:
+ case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_e32:
+ case AMDGPU::V_DIV_FIXUP_F16_e64:
+ // In gfx9, the preferred handling of the unused high 16 bits changed. Most
+ // instructions maintain the legacy behavior of zeroing. Some instructions
+ // changed to preserving the high bits.
+ return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case AMDGPU::V_MAD_MIXLO_F16:
+ case AMDGPU::V_MAD_MIXHI_F16:
+ default:
+ return false;
+ }
+}
+
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {
if (NWaves == 1)
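The table in zeroesHigh16BitsOfDest above is only a query; a consumer such as a peephole that wants to drop a redundant clear of a 16-bit result's upper half would use it roughly as in the sketch below. The helper name is hypothetical and the includes assume the usual AMDGPU backend headers; this is an illustration, not part of the patch.

#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Hypothetical helper: an explicit clear of the high 16 bits of Def's
// destination (e.g. a V_AND_B32 with 0xffff) can be dropped when the
// defining opcode already writes zeroes there on this subtarget.
static bool canOmitHighHalfClear(const GCNSubtarget &ST,
                                 const MachineInstr &Def) {
  return ST.zeroesHigh16BitsOfDest(Def.getOpcode());
}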
@@ -681,12 +800,12 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
-unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+unsigned
+GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
- if (MFI.hasFlatScratchInit()) {
+ if (HasFlatScratchInit) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@@ -698,6 +817,28 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
+unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
+}
+
+unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
+ // The logic used here to detect whether the function has flat scratch init
+ // is slightly different from how the SIMachineFunctionInfo constructor
+ // derives it. We don't consult the amdgpu-calls and amdgpu-stack-objects
+ // attributes or isAmdHsaOrMesa here, as it doesn't really matter for this
+ // query.
+ // TODO: Factor this derivation logic out into one common function in the
+ // backend to avoid duplication.
+ bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
+ bool FunctionHasFlatScratchInit = false;
+ if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
+ enableFlatScratch()) {
+ FunctionHasFlatScratchInit = true;
+ }
+ return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
+}
+
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
@@ -711,13 +852,11 @@ unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
return Occupancy;
}
-unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
+unsigned GCNSubtarget::getBaseMaxNumSGPRs(
+ const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
+ unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
// Compute maximum number of SGPRs function can use using default/requested
// minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
@@ -728,7 +867,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
F, "amdgpu-num-sgpr", MaxNumSGPRs);
// Make sure requested value does not violate subtarget's specifications.
- if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+ if (Requested && (Requested <= ReservedNumSGPRs))
Requested = 0;
// If more SGPRs are required to support the input user/system SGPRs,
@@ -738,7 +877,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
// of reserved special registers in total. Theoretically you could re-use
// the last input registers for these special registers, but this would
// require a lot of complexity to deal with the weird aliasing.
- unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+ unsigned InputNumSGPRs = PreloadedSGPRs;
if (Requested && Requested < InputNumSGPRs)
Requested = InputNumSGPRs;
@@ -757,17 +896,43 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
if (hasSGPRInitBug())
MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
- return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
- MaxAddressableNumSGPRs);
+ return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}
-unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
+ getReservedNumSGPRs(MF));
+}
+
+static unsigned getMaxNumPreloadedSGPRs() {
+ // Max number of user SGPRs
+ unsigned MaxUserSGPRs = 4 + // private segment buffer
+ 2 + // Dispatch ptr
+ 2 + // queue ptr
+ 2 + // kernel segment ptr
+ 2 + // dispatch ID
+ 2 + // flat scratch init
+ 2; // Implicit buffer ptr
+ // Max number of system SGPRs
+ unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
+ 1 + // WorkGroupIDY
+ 1 + // WorkGroupIDZ
+ 1 + // WorkGroupInfo
+ 1; // private segment wave byte offset
+ return MaxUserSGPRs + MaxSystemSGPRs;
+}
+
+unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
+ return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
+ getReservedNumSGPRs(F));
+}
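Plugging in the numbers from getMaxNumPreloadedSGPRs: 4 + 2 + 2 + 2 + 2 + 2 + 2 = 16 user SGPRs and 1 + 1 + 1 + 1 + 1 = 5 system SGPRs, so the Function-based query assumes a worst case of 21 preloaded SGPRs. A trivial standalone check of that arithmetic (illustrative only, not part of the patch):

static_assert(4 + 2 + 2 + 2 + 2 + 2 + 2 == 16, "max user SGPRs");
static_assert(16 + 5 == 21, "worst-case preloaded SGPRs assumed by "
                            "getMaxNumSGPRs(const Function &)");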
+unsigned GCNSubtarget::getBaseMaxNumVGPRs(
+ const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
// Compute maximum number of VGPRs function can use using default/requested
// minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
// Check if maximum number of VGPRs was explicitly requested using
@@ -776,6 +941,9 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
unsigned Requested = AMDGPU::getIntegerAttribute(
F, "amdgpu-num-vgpr", MaxNumVGPRs);
+ if (hasGFX90AInsts())
+ Requested *= 2;
+
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
@@ -791,6 +959,16 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return MaxNumVGPRs;
}
+unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
+ return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+}
+
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+ const Function &F = MF.getFunction();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+}
+
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
int UseOpIdx, SDep &Dep) const {
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ba3a8acae551..b160cdf3a97a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -45,6 +45,7 @@ private:
Triple TargetTriple;
protected:
+ bool GCN3Encoding;
bool Has16BitInsts;
bool HasMadMixInsts;
bool HasMadMacF32Insts;
@@ -53,6 +54,7 @@ protected:
bool HasVOP3PInsts;
bool HasMulI24;
bool HasMulU24;
+ bool HasSMulHi;
bool HasInv2PiInlineImm;
bool HasFminFmaxLegacy;
bool EnablePromoteAlloca;
@@ -124,6 +126,10 @@ public:
return TargetTriple.getArch() == Triple::amdgcn;
}
+ bool isGCN3Encoding() const {
+ return GCN3Encoding;
+ }
+
bool has16BitInsts() const {
return Has16BitInsts;
}
@@ -156,6 +162,10 @@ public:
return HasMulU24;
}
+ bool hasSMulHi() const {
+ return HasSMulHi;
+ }
+
bool hasInv2PiInlineImm() const {
return HasInv2PiInlineImm;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce7c82e2a88a..e4485f87fb79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -32,6 +32,8 @@
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
@@ -52,6 +54,115 @@
using namespace llvm;
+namespace {
+class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
+public:
+ SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+ : RegisterRegAllocBase(N, D, C) {}
+};
+
+class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
+public:
+ VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
+ : RegisterRegAllocBase(N, D, C) {}
+};
+
+static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) {
+ return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) {
+ return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
+}
+
+
+/// -{sgpr|vgpr}-regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
+static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
+
+static SGPRRegisterRegAlloc
+defaultSGPRRegAlloc("default",
+ "pick SGPR register allocator based on -O option",
+ useDefaultRegisterAllocator);
+
+static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<SGPRRegisterRegAlloc>>
+SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use for SGPRs"));
+
+static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<VGPRRegisterRegAlloc>>
+VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use for VGPRs"));
+
+
+static void initializeDefaultSGPRRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = SGPRRegAlloc;
+ SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
+ }
+}
+
+static void initializeDefaultVGPRRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = VGPRRegAlloc;
+ VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
+ }
+}
+
+static FunctionPass *createBasicSGPRRegisterAllocator() {
+ return createBasicRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createGreedySGPRRegisterAllocator() {
+ return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+}
+
+static FunctionPass *createFastSGPRRegisterAllocator() {
+ return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+static FunctionPass *createBasicVGPRRegisterAllocator() {
+ return createBasicRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createGreedyVGPRRegisterAllocator() {
+ return createGreedyRegisterAllocator(onlyAllocateVGPRs);
+}
+
+static FunctionPass *createFastVGPRRegisterAllocator() {
+ return createFastRegisterAllocator(onlyAllocateVGPRs, true);
+}
+
+static SGPRRegisterRegAlloc basicRegAllocSGPR(
+ "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc greedyRegAllocSGPR(
+ "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
+
+static SGPRRegisterRegAlloc fastRegAllocSGPR(
+ "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+
+
+static VGPRRegisterRegAlloc basicRegAllocVGPR(
+ "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc greedyRegAllocVGPR(
+ "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+
+static VGPRRegisterRegAlloc fastRegAllocVGPR(
+ "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+}
+
+
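These registries mirror the generic -regalloc machinery, but one per register class, so the SGPR and VGPR allocators can be chosen independently through the -sgpr-regalloc and -vgpr-regalloc flags using the names registered above (default, basic, greedy, fast). As a hedged sketch of how one more allocator could be hooked into the same registry from this file, following the basic/greedy/fast pattern (the "example-basic" name and factory below are made up for illustration):

// Illustrative only: one more SGPR allocator entry reusing the in-tree
// filter predicate and the basic allocator.
static FunctionPass *createExampleSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static SGPRRegisterRegAlloc exampleRegAllocSGPR(
    "example-basic", "example SGPR register allocator",
    createExampleSGPRRegisterAllocator);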
static cl::opt<bool> EnableR600StructurizeCFG(
"r600-ir-structurize",
cl::desc("Use StructurizeCFG IR pass"),
@@ -162,6 +273,11 @@ static cl::opt<bool> EnableRegReassign(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> OptVGPRLiveRange(
+ "amdgpu-opt-vgpr-liverange",
+ cl::desc("Enable VGPR liverange optimizations for if-else structure"),
+ cl::init(true), cl::Hidden);
+
// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
"amdgpu-atomic-optimizations",
@@ -193,6 +309,21 @@ static cl::opt<bool> EnableStructurizerWorkarounds(
cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableLDSReplaceWithPointer(
+ "amdgpu-enable-lds-replace-with-pointer",
+ cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
+ cl::Hidden);
+
+static cl::opt<bool, true> EnableLowerModuleLDS(
+ "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
+ cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnablePreRAOptimizations(
+ "amdgpu-enable-pre-ra-optimizations",
+ cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -215,9 +346,11 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIOptimizeExecMaskingPreRAPass(*PR);
+ initializeSIOptimizeVGPRLiveRangePass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUFixFunctionBitcastsPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPUAttributorPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
@@ -228,12 +361,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
+ initializeAMDGPURegBankCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
+ initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
+ initializeAMDGPULowerModuleLDSPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
@@ -242,9 +378,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
- initializeSIRemoveShortExecBranchesPass(*PR);
initializeSIPreEmitPeepholePass(*PR);
- initializeSIInsertSkipsPass(*PR);
+ initializeSILateBranchLoweringPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
@@ -256,9 +391,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
- initializeGCNRegBankReassignPass(*PR);
+ initializeAMDGPUResourceUsageAnalysisPass(*PR);
initializeGCNNSAReassignPass(*PR);
- initializeSIAddIMGInitPass(*PR);
+ initializeGCNPreRAOptimizationsPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -388,6 +523,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
@@ -408,6 +544,7 @@ static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV))
return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
+ GV.removeDeadConstantUsers();
return !GV.use_empty();
}
@@ -480,8 +617,7 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
AAM.registerFunctionAnalysis<AMDGPUAA>();
}
-void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) {
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineParsingCallback(
[this](StringRef PassName, ModulePassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
@@ -501,6 +637,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
PM.addPass(AMDGPUAlwaysInlinePass());
return true;
}
+ if (PassName == "amdgpu-replace-lds-use-with-pointer") {
+ PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
+ return true;
+ }
+ if (PassName == "amdgpu-lower-module-lds") {
+ PM.addPass(AMDGPULowerModuleLDSPass());
+ return true;
+ }
return false;
});
PB.registerPipelineParsingCallback(
@@ -530,7 +674,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
return true;
}
-
return false;
});
@@ -546,16 +689,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
return false;
});
- PB.registerPipelineStartEPCallback([this, DebugPassManager](
- ModulePassManager &PM,
- PassBuilder::OptimizationLevel Level) {
- FunctionPassManager FPM(DebugPassManager);
- FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
- FPM.addPass(AMDGPUUseNativeCallsPass());
- if (EnableLibCallSimplify && Level != PassBuilder::OptimizationLevel::O0)
- FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
- PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
- });
+ PB.registerPipelineStartEPCallback(
+ [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ FunctionPassManager FPM;
+ FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
+ FPM.addPass(AMDGPUUseNativeCallsPass());
+ if (EnableLibCallSimplify &&
+ Level != PassBuilder::OptimizationLevel::O0)
+ FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ });
PB.registerPipelineEarlySimplificationEPCallback(
[this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
@@ -577,12 +720,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
});
PB.registerCGSCCOptimizerLateEPCallback(
- [this, DebugPassManager](CGSCCPassManager &PM,
- PassBuilder::OptimizationLevel Level) {
+ [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
if (Level == PassBuilder::OptimizationLevel::O0)
return;
- FunctionPassManager FPM(DebugPassManager);
+ FunctionPassManager FPM;
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
@@ -732,6 +874,9 @@ public:
// anything.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
+ // Garbage collection is not supported.
+ disablePass(&GCLoweringID);
+ disablePass(&ShadowStackGCLoweringID);
}
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
@@ -754,6 +899,19 @@ public:
bool addGCPasses() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
+
+ /// Check if a pass is enabled given \p Opt option. The option always
+ /// overrides the default if explicitly used. Otherwise its default is
+ /// used, provided the pass is meant to run at an optimization level of at
+ /// least \p Level.
+ bool isPassEnabled(const cl::opt<bool> &Opt,
+ CodeGenOpt::Level Level = CodeGenOpt::Default) const {
+ if (Opt.getNumOccurrences())
+ return Opt;
+ if (TM->getOptLevel() < Level)
+ return false;
+ return Opt;
+ }
};
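isPassEnabled folds an explicit command-line override together with an opt-level gate. A minimal standalone model of that policy follows; FakeOpt is a stand-in for cl::opt<bool> so the sketch compiles on its own, and the function is an illustration rather than the actual implementation.

// Stand-in for cl::opt<bool>: whether the flag appeared on the command line,
// and the value it currently holds (its default if it never appeared).
struct FakeOpt {
  unsigned NumOccurrences = 0;
  bool Value = false;
};

// An explicit flag always wins; otherwise the default only applies when the
// compiler's opt level is at least the pass's minimum level.
static bool isPassEnabledModel(const FakeOpt &Opt, int OptLevel, int MinLevel) {
  if (Opt.NumOccurrences)
    return Opt.Value;
  if (OptLevel < MinLevel)
    return false;
  return Opt.Value;
}

For example, a pass queried with CodeGenOpt::Less keeps its default behavior at -O1 and above but is forced off at -O0 unless its flag is given explicitly.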
std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
@@ -803,9 +961,18 @@ public:
bool addLegalizeMachineIR() override;
void addPreRegBankSelect() override;
bool addRegBankSelect() override;
+ void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
+
+ FunctionPass *createSGPRAllocPass(bool Optimized);
+ FunctionPass *createVGPRAllocPass(bool Optimized);
+ FunctionPass *createRegAllocPass(bool Optimized) override;
+
+ bool addRegAssignAndRewriteFast() override;
+ bool addRegAssignAndRewriteOptimized() override;
+
void addPreRegAlloc() override;
bool addPreRewrite() override;
void addPostRegAlloc() override;
@@ -856,9 +1023,6 @@ void AMDGPUPassConfig::addIRPasses() {
// A call to propagate attributes pass in the backend in case opt was not run.
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
- addPass(createAtomicExpandPass());
-
-
addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
@@ -878,14 +1042,28 @@ void AMDGPUPassConfig::addIRPasses() {
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
- if (TM.getOptLevel() > CodeGenOpt::None) {
+ // Can increase the LDS used by a kernel, so this runs before PromoteAlloca.
+ if (EnableLowerModuleLDS) {
+ // The pass "amdgpu-replace-lds-use-with-pointer" needs to run before the
+ // "amdgpu-lower-module-lds" pass, and it is only useful to run it when
+ // "amdgpu-lower-module-lds" is enabled.
+ if (EnableLDSReplaceWithPointer)
+ addPass(createAMDGPUReplaceLDSUseWithPointerPass());
+
+ addPass(createAMDGPULowerModuleLDSPass());
+ }
+
+ if (TM.getOptLevel() > CodeGenOpt::None)
addPass(createInferAddressSpacesPass());
+
+ addPass(createAtomicExpandPass());
+
+ if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createAMDGPUPromoteAlloca());
if (EnableSROA)
addPass(createSROAPass());
-
- if (EnableScalarIRPasses)
+ if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
if (EnableAMDGPUAliasAnalysis) {
@@ -896,11 +1074,11 @@ void AMDGPUPassConfig::addIRPasses() {
AAR.addAAResult(WrapperPass->getResult());
}));
}
- }
- if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
- // TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass());
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass());
+ }
}
TargetPassConfig::addIRPasses();
@@ -917,7 +1095,7 @@ void AMDGPUPassConfig::addIRPasses() {
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
+ if (isPassEnabled(EnableScalarIRPasses))
addEarlyCSEOrGVNPass();
}
@@ -929,11 +1107,9 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
- addPass(&AMDGPUPerfHintAnalysisID);
-
TargetPassConfig::addCodeGenPrepare();
- if (EnableLoadStoreVectorizer)
+ if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(createLoadStoreVectorizerPass());
// LowerSwitch pass may introduce unreachable blocks that can
@@ -944,7 +1120,8 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
}
bool AMDGPUPassConfig::addPreISel() {
- addPass(createFlattenCFGPass());
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createFlattenCFGPass());
return false;
}
@@ -1014,13 +1191,15 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- addPass(createAMDGPULateCodeGenPreparePass());
- if (EnableAtomicOptimizations) {
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createAMDGPULateCodeGenPreparePass());
+
+ if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
addPass(createAMDGPUAtomicOptimizerPass());
}
- // FIXME: We need to run a pass to propagate the attributes when calls are
- // supported.
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createSinkingPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
@@ -1032,13 +1211,15 @@ bool GCNPassConfig::addPreISel() {
}
addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
}
- addPass(createSinkingPass());
addPass(createAMDGPUAnnotateUniformValues());
if (!LateCFGStructurize) {
addPass(createSIAnnotateControlFlowPass());
}
addPass(createLCSSAPass());
+ if (TM->getOptLevel() > CodeGenOpt::Less)
+ addPass(&AMDGPUPerfHintAnalysisID);
+
return false;
}
@@ -1055,15 +1236,14 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SIFoldOperandsID);
if (EnableDPPCombine)
addPass(&GCNDPPCombineID);
- addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
- if (EnableSDWAPeephole) {
+ if (isPassEnabled(EnableSDWAPeephole)) {
addPass(&SIPeepholeSDWAID);
addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&SIFoldOperandsID);
- addPass(&DeadMachineInstructionElimID);
}
+ addPass(&DeadMachineInstructionElimID);
addPass(createSIShrinkInstructionsPass());
}
@@ -1079,7 +1259,6 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
- addPass(createSIAddIMGInitPass());
return false;
}
@@ -1109,12 +1288,13 @@ bool GCNPassConfig::addRegBankSelect() {
return false;
}
+void GCNPassConfig::addPreGlobalInstructionSelect() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPURegBankCombiner(IsOptNone));
+}
+
bool GCNPassConfig::addGlobalInstructionSelect() {
- addPass(new InstructionSelect());
- // TODO: Fix instruction selection to do the right thing for image
- // instructions with tfe or lwe in the first place, instead of running a
- // separate pass to fix them up?
- addPass(createSIAddIMGInitPass());
+ addPass(new InstructionSelect(getOptLevel()));
return false;
}
@@ -1147,8 +1327,21 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
- insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+ if (isPassEnabled(EnablePreRAOptimizations))
+ insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
+
+ // This is not an essential optimization and it has a noticeable impact on
+ // compilation time, so we only enable it from O2.
+ if (TM->getOptLevel() > CodeGenOpt::Less)
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+
+ // FIXME: when an instruction inside a bundle has a killed operand, only the
+ // BUNDLE instruction seems to appear as the kill of that register in
+ // LiveVariables. This triggers a verifier failure, so we should fix it and
+ // enable the verifier.
+ if (OptVGPRLiveRange)
+ insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
@@ -1161,10 +1354,81 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}
bool GCNPassConfig::addPreRewrite() {
- if (EnableRegReassign) {
+ if (EnableRegReassign)
addPass(&GCNNSAReassignID);
- addPass(&GCNRegBankReassignID);
- }
+ return true;
+}
+
+FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
+ initializeDefaultSGPRRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ if (Optimized)
+ return createGreedyRegisterAllocator(onlyAllocateSGPRs);
+
+ return createFastRegisterAllocator(onlyAllocateSGPRs, false);
+}
+
+FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
+ initializeDefaultVGPRRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ if (Optimized)
+ return createGreedyVGPRRegisterAllocator();
+
+ return createFastVGPRRegisterAllocator();
+}
+
+FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
+ llvm_unreachable("should not be used");
+}
+
+static const char RegAllocOptNotSupportedMessage[] =
+ "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
+
+bool GCNPassConfig::addRegAssignAndRewriteFast() {
+ if (!usingDefaultRegAlloc())
+ report_fatal_error(RegAllocOptNotSupportedMessage);
+
+ addPass(createSGPRAllocPass(false));
+
+ // Equivalent of PEI for SGPRs.
+ addPass(&SILowerSGPRSpillsID);
+
+ addPass(createVGPRAllocPass(false));
+ return true;
+}
+
+bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
+ if (!usingDefaultRegAlloc())
+ report_fatal_error(RegAllocOptNotSupportedMessage);
+
+ addPass(createSGPRAllocPass(true));
+
+ // Commit allocated register changes. This is mostly necessary because too
+ // many things rely on the use lists of the physical registers, such as the
+ // verifier. This is only necessary with allocators which use LiveIntervals,
+ // since FastRegAlloc does the replacements itself.
+ addPass(createVirtRegRewriter(false));
+
+ // Equivalent of PEI for SGPRs.
+ addPass(&SILowerSGPRSpillsID);
+
+ addPass(createVGPRAllocPass(true));
+
+ addPreRewrite();
+ addPass(&VirtRegRewriterID);
+
return true;
}
@@ -1173,9 +1437,6 @@ void GCNPassConfig::addPostRegAlloc() {
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
-
- // Equivalent of PEI for SGPRs.
- addPass(&SILowerSGPRSpillsID);
}
void GCNPassConfig::addPreSched2() {
@@ -1185,15 +1446,18 @@ void GCNPassConfig::addPreSched2() {
void GCNPassConfig::addPreEmitPass() {
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
- addPass(createSIShrinkInstructionsPass());
+
+ if (TM->getOptLevel() > CodeGenOpt::None)
+ addPass(createSIShrinkInstructionsPass());
+
addPass(createSIModeRegisterPass());
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIInsertHardClausesID);
- addPass(&SIRemoveShortExecBranchesID);
- addPass(&SIInsertSkipsPassID);
- addPass(&SIPreEmitPeepholeID);
+ addPass(&SILateBranchLoweringPassID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -1217,8 +1481,8 @@ yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- return new yaml::SIMachineFunctionInfo(*MFI,
- *MF.getSubtarget().getRegisterInfo());
+ return new yaml::SIMachineFunctionInfo(
+ *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}
bool GCNTargetMachine::parseMachineFunctionInfo(
@@ -1229,7 +1493,8 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MachineFunction &MF = PFS.MF;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- MFI->initializeBaseYamlFields(YamlMFI);
+ if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
+ return true;
if (MFI->Occupancy == 0) {
// Fixup the subtarget dependent default value.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 95aefa23c24c..1bfe026d080c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@ public:
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
static bool EnableFixedFunctionABI;
+ static bool EnableLowerModuleLDS;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -51,8 +52,7 @@ public:
void adjustPassManager(PassManagerBuilder &) override;
- void registerPassBuilderCallbacks(PassBuilder &PB,
- bool DebugPassManager) override;
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
void registerDefaultAliasAnalyses(AAManager &) override;
/// Get the integer value of a null pointer in the given address space.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7b8a79640bb2..63f449f7a726 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
@@ -39,7 +40,7 @@ static cl::opt<unsigned> UnrollThresholdLocal(
static cl::opt<unsigned> UnrollThresholdIf(
"amdgpu-unroll-threshold-if",
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
- cl::init(150), cl::Hidden);
+ cl::init(200), cl::Hidden);
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
@@ -106,6 +107,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;
+ // A conditional branch in a loop back edge needs 3 additional exec
+ // manipulations on average.
+ UP.BEInsns += 3;
+
// TODO: Do we want runtime unrolling?
// Maximum alloca size that can fit in registers. Reserve 16 registers.
@@ -310,8 +315,17 @@ unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
return getHardwareNumberOfRegisters(false) / NumVGPRs;
}
-unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
- return 32;
+TypeSize
+GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
+ }
+ llvm_unreachable("Unsupported register kind");
}
unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
@@ -321,7 +335,9 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
- return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+ return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+ : 1;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -495,14 +511,12 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
}
-int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Opd1Info,
- TTI::OperandValueKind Opd2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+InstructionCost GCNTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
// FIXME: We're having to query the throughput cost so that the basic
@@ -518,7 +532,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
bool IsFloat = Ty->isFPOrFPVectorTy();
// Assume that floating point arithmetic operations cost twice as much as
@@ -542,12 +556,13 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// similarly to what getCastInstrCost() does.
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
- unsigned Cost = getArithmeticInstrCost(
+ InstructionCost Cost = getArithmeticInstrCost(
Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return getScalarizationOverhead(VTy, Args) + Num * Cost;
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
}
// We don't know anything about this scalar instruction.
@@ -555,7 +570,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
}
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// Because we don't have any legal vector operations, but the legal types, we
@@ -628,6 +643,8 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
LLVM_FALLTHROUGH;
case ISD::FADD:
case ISD::FSUB:
+ if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
+ NElts = (NElts + 1) / 2;
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost(CostKind);
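Reading off the packed-FP32 change above: with hasPackedFP32Ops(), an f32 FADD/FSUB over four lanes is costed as NElts = (4 + 1) / 2 = 2 packed operations rather than four scalar ones, before the result is scaled by LT.first and the per-instruction rate.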
@@ -713,8 +730,9 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
}
}
-int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
if (ICA.getID() == Intrinsic::fabs)
return 0;
@@ -731,45 +749,34 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
- Type *RetTy = ICA.getReturnType();
- unsigned VF = ICA.getVectorFactor().getFixedValue();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
- assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
- SmallVector<Type *, 4> Types;
- for (const Value *Op : Args) {
- Type *OpTy = Op->getType();
- assert(VF == 1 || !OpTy->isVectorTy());
- Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
- }
-
- if (VF > 1 && !RetTy->isVoidTy())
- RetTy = FixedVectorType::get(RetTy, VF);
// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
- unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
- if (RetVF > 1 || VF > 1) {
+ InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+ if (RetVF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
- ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+ ScalarizationCost +=
+ getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
}
- IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
- ScalarizationCost, I);
+ IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
+ ScalarizationCost);
return getIntrinsicInstrCost(Attrs, CostKind);
}
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
unsigned NElts = LT.second.isVector() ?
LT.second.getVectorNumElements() : 1;
@@ -779,69 +786,96 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost(CostKind);
- if (ST->has16BitInsts() && SLT == MVT::f16)
+ if ((ST->has16BitInsts() && SLT == MVT::f16) ||
+ (ST->hasPackedFP32Ops() && SLT == MVT::f32))
NElts = (NElts + 1) / 2;
// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost(CostKind);
- if (ICA.getID() == Intrinsic::fma) {
+
+ switch (ICA.getID()) {
+ case Intrinsic::fma:
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
+ break;
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
+ if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
+ NElts = 1;
+ break;
}
return LT.first * NElts * InstRate;
}
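For the saturating add/subtract intrinsics, when the legalized type is one of the packed 16-bit vectors (v2i16 or v4i16) the element count is collapsed to one, so on subtargets where those types are legal a uadd_sat over <2 x i16> is costed as LT.first * 1 * InstRate, i.e. a single quarter-rate instruction rather than two.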
-unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind) {
- if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
- return Opcode == Instruction::PHI ? 0 : 1;
-
- // XXX - For some reason this isn't called for switch.
+InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ assert((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ const bool SCost =
+ (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
+ const int CBrCost = SCost ? 5 : 7;
switch (Opcode) {
- case Instruction::Br:
+ case Instruction::Br: {
+ // Branch instruction takes about 4 slots on gfx900.
+ auto BI = dyn_cast_or_null<BranchInst>(I);
+ if (BI && BI->isUnconditional())
+ return SCost ? 1 : 4;
+ // Suppose a conditional branch takes an additional 3 exec-manipulation
+ // instructions on average.
+ return CBrCost;
+ }
+ case Instruction::Switch: {
+ auto SI = dyn_cast_or_null<SwitchInst>(I);
+ // Each case (including default) takes 1 cmp + 1 cbr instructions in
+ // average.
+ return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
+ }
case Instruction::Ret:
- return 10;
- default:
- return BaseT::getCFInstrCost(Opcode, CostKind);
+ return SCost ? 1 : 10;
}
+ return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
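Working the new control-flow costs through for the throughput/latency cost kinds (SCost is false, so CBrCost is 7): an unconditional branch costs 4, a conditional branch 7, a ret 10, and a switch costs (NumCases + 1) * (7 + 1), so a three-case switch comes out at 4 * 8 = 32. For the code-size and size-and-latency kinds the same shapes cost 1, 5, 1, and (NumCases + 1) * 6 respectively.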
-int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- bool IsPairwise,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (TTI::requiresOrderedReduction(FMF))
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
// 16-bit types only).
- if (IsPairwise ||
- !ST->hasVOP3PInsts() ||
- OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
+ if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getFullRateInstrCost();
}
-int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
// 16-bit types only).
- if (IsPairwise ||
- !ST->hasVOP3PInsts() ||
- OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
- CostKind);
+ if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost(CostKind);
}
-int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
+InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::InsertElement: {
@@ -1096,8 +1130,10 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
}
-unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
- int Index, VectorType *SubTp) {
+InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *VT, ArrayRef<int> Mask,
+ int Index, VectorType *SubTp) {
+ Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
@@ -1115,7 +1151,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
}
}
- return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
@@ -1141,9 +1177,15 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
if (!CallerMode.isInlineCompatible(CalleeMode))
return false;
+ if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
+ Callee->hasFnAttribute(Attribute::InlineHint))
+ return true;
+
// Hack to make compile times reasonable.
- if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
- // Single BB does not increase total BB amount, thus subtract 1.
+ if (InlineMaxBB) {
+ // Single BB does not increase total BB amount.
+ if (Callee->size() == 1)
+ return true;
size_t BBSize = Caller->size() + Callee->size() - 1;
return BBSize <= InlineMaxBB;
}
@@ -1192,8 +1234,10 @@ void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
}
int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
- return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
- : getQuarterRateInstrCost(CostKind);
+ return ST->hasFullRate64Ops()
+ ? getFullRateInstrCost()
+ : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
}
R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
@@ -1209,8 +1253,9 @@ unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
return getHardwareNumberOfRegisters(Vec);
}
-unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
- return 32;
+TypeSize
+R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ return TypeSize::getFixed(32);
}
unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
@@ -1265,8 +1310,9 @@ unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
-unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
- TTI::TargetCostKind CostKind) {
+InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
return Opcode == Instruction::PHI ? 0 : 1;
@@ -1276,12 +1322,12 @@ unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
case Instruction::Ret:
return 10;
default:
- return BaseT::getCFInstrCost(Opcode, CostKind);
+ return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
}
-int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
- unsigned Index) {
+InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::InsertElement: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b29c94180fb8..37c0756eb7a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -25,6 +25,7 @@
namespace llvm {
class AMDGPUTargetLowering;
+class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
@@ -120,7 +121,7 @@ public:
unsigned getHardwareNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(unsigned RCID) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -152,7 +153,7 @@ public:
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
- int getArithmeticInstrCost(
+ InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
@@ -162,12 +163,14 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
- int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
bool isAlwaysUniform(const Value *V) const;
@@ -194,10 +197,11 @@ public:
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const;
- unsigned getVectorSplitCost() { return 0; }
+ InstructionCost getVectorSplitCost() { return 0; }
- unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
- VectorType *SubTp);
+ InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
@@ -207,17 +211,15 @@ public:
int getInlinerVectorBonusPercent() { return 0; }
- int getArithmeticReductionCost(
- unsigned Opcode,
- VectorType *Ty,
- bool IsPairwise,
+ InstructionCost getArithmeticReductionCost(
+ unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
- int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind);
- int getMinMaxReductionCost(
- VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+ InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(
+ VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
@@ -242,7 +244,7 @@ public:
TTI::PeelingPreferences &PP);
unsigned getHardwareNumberOfRegisters(bool Vec) const;
unsigned getNumberOfRegisters(bool Vec) const;
- unsigned getRegisterBitWidth(bool Vector) const;
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
@@ -252,8 +254,10 @@ public:
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 84d72e1b579f..4e3d5fdc012d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
@@ -54,6 +54,9 @@ using namespace llvm;
namespace {
class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+private:
+ const TargetTransformInfo *TTI = nullptr;
+
public:
static char ID; // Pass identification, replacement for typeid
@@ -63,6 +66,9 @@ public:
// We can preserve non-critical-edgeness when we unify function exit nodes
void getAnalysisUsage(AnalysisUsage &AU) const override;
+ BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
+ ArrayRef<BasicBlock *> ReturningBlocks,
+ StringRef Name);
bool runOnFunction(Function &F) override;
};
@@ -110,12 +116,9 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
BasicBlock &BB) {
- SmallVector<BasicBlock *, 8> Stack;
+ SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
SmallPtrSet<BasicBlock *, 8> Visited;
- for (BasicBlock *Pred : predecessors(&BB))
- Stack.push_back(Pred);
-
while (!Stack.empty()) {
BasicBlock *Top = Stack.pop_back_val();
if (!DA.isUniform(Top->getTerminator()))
@@ -130,49 +133,15 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
return true;
}
-static void removeDoneExport(Function &F) {
- ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
- if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
- Intrin->setArgOperand(6, BoolFalse); // done
- } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
- Intrin->setArgOperand(4, BoolFalse); // done
- }
- }
- }
- }
-}
-
-static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
- ArrayRef<BasicBlock *> ReturningBlocks,
- bool InsertExport,
- const TargetTransformInfo &TTI,
- StringRef Name) {
+BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+ Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
+ StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add a PHI
// nodes (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
IRBuilder<> B(NewRetBlock);
- if (InsertExport) {
- // Ensure that there's only one "done" export in the shader by removing the
- // "done" bit set on the original final export. More than one "done" export
- // can lead to undefined behavior.
- removeDoneExport(F);
-
- Value *Undef = UndefValue::get(B.getFloatTy());
- B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
- {
- B.getInt32(AMDGPU::Exp::ET_NULL),
- B.getInt32(0), // enabled channels
- Undef, Undef, Undef, Undef, // values
- B.getTrue(), // done
- B.getTrue(), // valid mask
- });
- }
-
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
B.CreateRetVoid();
@@ -180,7 +149,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
// If the function doesn't return void... add a PHI node to the block...
PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
"UnifiedRetVal");
- assert(!InsertExport);
B.CreateRet(PN);
}
@@ -206,7 +174,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
for (BasicBlock *BB : ReturningBlocks) {
// Cleanup possible branch to unconditional branch to the return.
- simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
+ simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
SimplifyCFGOptions().bonusInstThreshold(2));
}
@@ -220,25 +188,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- // If there's only one exit, we don't need to do anything, unless this is a
- // pixel shader and that exit is an infinite loop, since we still have to
- // insert an export in that case.
- if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+ // If there's only one exit, we don't need to do anything.
+ if (PDT.root_size() <= 1)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
- SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
- bool InsertExport = false;
-
bool Changed = false;
std::vector<DominatorTree::UpdateType> Updates;
@@ -246,8 +210,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
- else
- UniformlyReachedRetBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
@@ -259,36 +221,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
- // For pixel shaders, the producer guarantees that an export is
- // executed before each return instruction. However, if there is an
- // infinite loop and we insert a return ourselves, we need to uphold
- // that guarantee by inserting a null export. This can happen e.g. in
- // an infinite loop with kill instructions, which is supposed to
- // terminate. However, we don't need to do this if there is a non-void
- // return value, since then there is an epilog afterwards which will
- // still export.
- //
- // Note: In the case where only some threads enter the infinite loop,
- // this can result in the null export happening redundantly after the
- // original exports. However, The last "real" export happens after all
- // the threads that didn't enter an infinite loop converged, which
- // means that the only extra threads to execute the null export are
- // threads that entered the infinite loop, and they only could've
- // exited through being killed which sets their exec bit to 0.
- // Therefore, unless there's an actual infinite loop, which can have
- // invalid results, or there's a kill after the last export, which we
- // assume the frontend won't do, this export will have the same exec
- // mask as the last "real" export, and therefore the valid mask will be
- // overwritten with the same value and will still be correct. Also,
- // even though this forces an extra unnecessary export wait, we assume
- // that this happens rare enough in practice to that we don't have to
- // worry about performance.
- if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
- RetTy->isVoidTy()) {
- InsertExport = true;
- }
-
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
@@ -380,23 +312,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
if (ReturningBlocks.empty())
return Changed; // No blocks return
- if (ReturningBlocks.size() == 1 && !InsertExport)
+ if (ReturningBlocks.size() == 1)
return Changed; // Already has a single return block
- const TargetTransformInfo &TTI
- = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- // Unify returning blocks. If we are going to insert the export it is also
- // necessary to include blocks that are uniformly reached, because in addition
- // to inserting the export the "done" bits on existing exports will be cleared
- // and we do not want to end up with the normal export in a non-unified,
- // uniformly reached block with the "done" bit cleared.
- auto BlocksToUnify = std::move(ReturningBlocks);
- if (InsertExport) {
- llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
- }
-
- unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
- "UnifiedReturnBlock");
+ unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}
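// Illustrative sketch of the transform above (block names are hypothetical):
// two divergently reached returns are rewritten to branch into one new block,
// and simplifyCFG then tidies up the resulting branches.
//
//   ret.a:                        ret.a:
//     ret void                      br label %UnifiedReturnBlock
//   ret.b:              ==>       ret.b:
//     ret void                      br label %UnifiedReturnBlock
//                                 UnifiedReturnBlock:
//                                   ret void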
diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index b9a8c6bd005d..56befe4ed0d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -171,7 +171,7 @@ protected:
static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
for (MachineLoop::iterator iter = LoopInfo.begin(),
iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
- (*iter)->print(dbgs(), 0);
+ (*iter)->print(dbgs());
}
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index af4a47935e3f..00032c7d4ea5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -11,6 +11,7 @@
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -113,9 +114,7 @@ public:
ImmTyInstOffset,
ImmTyOffset0,
ImmTyOffset1,
- ImmTyDLC,
- ImmTyGLC,
- ImmTySLC,
+ ImmTyCPol,
ImmTySWZ,
ImmTyTFE,
ImmTyD16,
@@ -299,6 +298,8 @@ public:
return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
}
+ bool isVRegWithInputMods() const;
+
bool isSDWAOperand(MVT type) const;
bool isSDWAFP16Operand() const;
bool isSDWAFP32Operand() const;
@@ -336,12 +337,7 @@ public:
bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
- bool isDLC() const { return isImmTy(ImmTyDLC); }
- bool isGLC() const { return isImmTy(ImmTyGLC); }
- // "GLC_1" is a MatchClass of the GLC_1 operand with the default and forced
- // value of the GLC operand.
- bool isGLC_1() const { return isImmTy(ImmTyGLC); }
- bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isCPol() const { return isImmTy(ImmTyCPol); }
bool isSWZ() const { return isImmTy(ImmTySWZ); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
@@ -449,6 +445,26 @@ public:
return isSSrcF16();
}
+ bool isSSrcV2FP32() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcF32();
+ }
+
+ bool isSCSrcV2FP32() const {
+ llvm_unreachable("cannot happen");
+ return isSCSrcF32();
+ }
+
+ bool isSSrcV2INT32() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcB32();
+ }
+
+ bool isSCSrcV2INT32() const {
+ llvm_unreachable("cannot happen");
+ return isSCSrcB32();
+ }
+
bool isSSrcOrLdsB32() const {
return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) ||
isLiteralImm(MVT::i32) || isExpr();
@@ -502,6 +518,22 @@ public:
return isVSrcB16() || isLiteralImm(MVT::v2i16);
}
+ bool isVCSrcV2FP32() const {
+ return isVCSrcF64();
+ }
+
+ bool isVSrcV2FP32() const {
+ return isVSrcF64() || isLiteralImm(MVT::v2f32);
+ }
+
+ bool isVCSrcV2INT32() const {
+ return isVCSrcB64();
+ }
+
+ bool isVSrcV2INT32() const {
+ return isVSrcB64() || isLiteralImm(MVT::v2i32);
+ }
+
bool isVSrcF32() const {
return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr();
}
@@ -542,6 +574,102 @@ public:
return isVISrcF16() || isVISrcB32();
}
+ bool isVISrc_64B64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64);
+ }
+
+ bool isVISrc_64F64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f64);
+ }
+
+ bool isVISrc_64V2FP32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_64V2INT32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_256B64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64);
+ }
+
+ bool isVISrc_256F64() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64);
+ }
+
+ bool isVISrc_128B16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16);
+ }
+
+ bool isVISrc_128V2B16() const {
+ return isVISrc_128B16();
+ }
+
+ bool isVISrc_128B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_128F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_256V2FP32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_256V2INT32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_512B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_512B16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i16);
+ }
+
+ bool isVISrc_512V2B16() const {
+ return isVISrc_512B16();
+ }
+
+ bool isVISrc_512F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_512F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_512V2F16() const {
+ return isVISrc_512F16() || isVISrc_512B32();
+ }
+
+ bool isVISrc_1024B32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i32);
+ }
+
+ bool isVISrc_1024B16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i16);
+ }
+
+ bool isVISrc_1024V2B16() const {
+ return isVISrc_1024B16();
+ }
+
+ bool isVISrc_1024F32() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f32);
+ }
+
+ bool isVISrc_1024F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_1024V2F16() const {
+ return isVISrc_1024F16() || isVISrc_1024B32();
+ }
+
bool isAISrcB32() const {
return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32);
}
@@ -566,6 +694,14 @@ public:
return isAISrcF16() || isAISrcB32();
}
+ bool isAISrc_64B64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::i64);
+ }
+
+ bool isAISrc_64F64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::f64);
+ }
+
bool isAISrc_128B32() const {
return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32);
}
@@ -590,6 +726,22 @@ public:
return isAISrc_128F16() || isAISrc_128B32();
}
+ bool isVISrc_128F16() const {
+ return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f16);
+ }
+
+ bool isVISrc_128V2F16() const {
+ return isVISrc_128F16() || isVISrc_128B32();
+ }
+
+ bool isAISrc_256B64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::i64);
+ }
+
+ bool isAISrc_256F64() const {
+ return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::f64);
+ }
+
bool isAISrc_512B32() const {
return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32);
}
@@ -837,9 +989,7 @@ public:
case ImmTyInstOffset: OS << "InstOffset"; break;
case ImmTyOffset0: OS << "Offset0"; break;
case ImmTyOffset1: OS << "Offset1"; break;
- case ImmTyDLC: OS << "DLC"; break;
- case ImmTyGLC: OS << "GLC"; break;
- case ImmTySLC: OS << "SLC"; break;
+ case ImmTyCPol: OS << "CPol"; break;
case ImmTySWZ: OS << "SWZ"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
@@ -1021,6 +1171,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool ForcedDPP = false;
bool ForcedSDWA = false;
KernelScopeInfo KernelScope;
+ unsigned CPolSeen;
/// @name Auto-generated Match Functions
/// {
@@ -1061,7 +1212,8 @@ private:
bool ParseDirectiveHSACodeObjectISA();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
- bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
+ // TODO: Possibly make subtargetHasRegister const.
+ bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo);
bool ParseDirectiveAMDGPUHsaKernel();
bool ParseDirectiveISAVersion();
@@ -1105,7 +1257,7 @@ private:
bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex,
unsigned RegWidth);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic, bool IsAtomicReturn, bool IsLds = false);
+ bool IsAtomic, bool IsLds = false);
void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded);
@@ -1140,7 +1292,7 @@ public:
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
@@ -1157,7 +1309,7 @@ public:
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
- if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
initializeGprCountSymbol(IS_VGPR);
initializeGprCountSymbol(IS_SGPR);
} else
@@ -1165,10 +1317,6 @@ public:
}
}
- bool hasXNACK() const {
- return AMDGPU::hasXNACK(getSTI());
- }
-
bool hasMIMG_R128() const {
return AMDGPU::hasMIMG_R128(getSTI());
}
@@ -1181,6 +1329,8 @@ public:
return AMDGPU::hasGFX10A16(getSTI());
}
+ bool hasG16() const { return AMDGPU::hasG16(getSTI()); }
+
bool isSI() const {
return AMDGPU::isSI(getSTI());
}
@@ -1197,6 +1347,10 @@ public:
return AMDGPU::isGFX9(getSTI());
}
+ bool isGFX90A() const {
+ return AMDGPU::isGFX90A(getSTI());
+ }
+
bool isGFX9Plus() const {
return AMDGPU::isGFX9Plus(getSTI());
}
@@ -1219,6 +1373,10 @@ public:
return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
}
+ bool hasArchitectedFlatScratch() const {
+ return getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+ }
+
bool hasSGPR102_SGPR103() const {
return !isVI() && !isGFX9();
}
@@ -1294,8 +1452,9 @@ public:
bool (*ConvertResult)(int64_t&) = nullptr);
OperandMatchResultTy
- parseNamedBit(const char *Name, OperandVector &Operands,
+ parseNamedBit(StringRef Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ OperandMatchResultTy parseCPol(OperandVector &Operands);
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
StringRef &Value,
SMLoc &StringLoc);
@@ -1379,14 +1538,19 @@ private:
bool validateMIMGAddrSize(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
- bool validateLdsDirect(const MCInst &Inst);
+ bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
+ bool validateDPP(const MCInst &Inst, const OperandVector &Operands);
bool validateVccOperand(unsigned Reg) const;
bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands);
bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
+ bool validateAGPRLdSt(const MCInst &Inst) const;
+ bool validateVGPRAlign(const MCInst &Inst) const;
+ bool validateGWS(const MCInst &Inst, const OperandVector &Operands);
bool validateDivScale(const MCInst &Inst);
bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands,
const SMLoc &IDLoc);
+ Optional<StringRef> validateLdsDirect(const MCInst &Inst);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -1403,6 +1567,7 @@ private:
bool isId(const AsmToken &Token, const StringRef Id) const;
bool isToken(const AsmToken::TokenKind Kind) const;
bool trySkipId(const StringRef Id);
+ bool trySkipId(const StringRef Pref, const StringRef Id);
bool trySkipId(const StringRef Id, const AsmToken::TokenKind Kind);
bool trySkipToken(const AsmToken::TokenKind Kind);
bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
@@ -1420,6 +1585,8 @@ private:
void lex();
public:
+ void onBeginOfFile() override;
+
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
OperandMatchResultTy parseOptionalOpr(OperandVector &Operands);
@@ -1451,16 +1618,12 @@ public:
OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands);
int64_t parseGPRIdxMacro();
- void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
- void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
- void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
- void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); }
+ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false); }
+ void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true); }
+ void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, true); }
void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
- AMDGPUOperand::Ptr defaultDLC() const;
- AMDGPUOperand::Ptr defaultGLC() const;
- AMDGPUOperand::Ptr defaultGLC_1() const;
- AMDGPUOperand::Ptr defaultSLC() const;
+ AMDGPUOperand::Ptr defaultCPol() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMEMOffset() const;
@@ -1474,6 +1637,8 @@ public:
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
@@ -1482,6 +1647,9 @@ public:
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands);
+ void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands);
+
+ bool parseDimId(unsigned &Encoding);
OperandMatchResultTy parseDim(OperandVector &Operands);
OperandMatchResultTy parseDPP8(OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
@@ -1551,11 +1719,16 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return &APFloat::IEEEdouble();
case AMDGPU::OPERAND_REG_IMM_INT16:
case AMDGPU::OPERAND_REG_IMM_FP16:
@@ -1715,7 +1888,8 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// literal goes into the lower half and the upper half is zero. We also
// require that the literal may be losslessly converted to f16.
MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
- (type == MVT::v2i16)? MVT::i16 : type;
+ (type == MVT::v2i16)? MVT::i16 :
+ (type == MVT::v2f32)? MVT::f32 : type;
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
@@ -1725,6 +1899,13 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
+bool AMDGPUOperand::isVRegWithInputMods() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ // GFX90A allows DPP on 64-bit operands.
+ (isRegClass(AMDGPU::VReg_64RegClassID) &&
+ AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]);
+}
+
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
if (AsmParser->isVI())
return isVReg32();
@@ -1751,8 +1932,9 @@ bool AMDGPUOperand::isSDWAInt32Operand() const {
}
bool AMDGPUOperand::isBoolReg() const {
- return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) ||
- (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32());
+ auto FB = AsmParser->getFeatureBits();
+ return isReg() && ((FB[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) ||
+ (FB[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32()));
}
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
@@ -1806,6 +1988,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1849,7 +2032,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16: {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -1881,6 +2068,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1897,6 +2088,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
setImmKindConst();
@@ -2000,6 +2192,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::VReg_128RegClassID;
case 5: return AMDGPU::VReg_160RegClassID;
case 6: return AMDGPU::VReg_192RegClassID;
+ case 7: return AMDGPU::VReg_224RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
case 32: return AMDGPU::VReg_1024RegClassID;
@@ -2022,6 +2215,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::SGPR_128RegClassID;
case 5: return AMDGPU::SGPR_160RegClassID;
case 6: return AMDGPU::SGPR_192RegClassID;
+ case 7: return AMDGPU::SGPR_224RegClassID;
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
@@ -2034,6 +2228,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 4: return AMDGPU::AReg_128RegClassID;
case 5: return AMDGPU::AReg_160RegClassID;
case 6: return AMDGPU::AReg_192RegClassID;
+ case 7: return AMDGPU::AReg_224RegClassID;
case 8: return AMDGPU::AReg_256RegClassID;
case 16: return AMDGPU::AReg_512RegClassID;
case 32: return AMDGPU::AReg_1024RegClassID;
@@ -2529,7 +2724,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
return nullptr;
}
- if (isHsaAbiVersion3(&getSTI())) {
+ if (isHsaAbiVersion3Or4(&getSTI())) {
if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
@@ -3200,7 +3395,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) {
return true;
unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx);
- unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0;
+ unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0;
unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf;
if (DMask == 0)
DMask = 1;
@@ -3230,6 +3425,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+ int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16);
assert(VAddr0Idx != -1);
assert(SrsrcIdx != -1);
@@ -3241,22 +3437,26 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
unsigned Dim = Inst.getOperand(DimIdx).getImm();
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
bool IsNSA = SrsrcIdx - VAddr0Idx > 1;
- unsigned VAddrSize =
+ unsigned ActualAddrSize =
IsNSA ? SrsrcIdx - VAddr0Idx
: AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;
+ bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm());
+
+ unsigned ExpectedAddrSize =
+ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
- unsigned AddrSize = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
- (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
if (!IsNSA) {
- if (AddrSize > 8)
- AddrSize = 16;
- else if (AddrSize > 4)
- AddrSize = 8;
+ if (ExpectedAddrSize > 8)
+ ExpectedAddrSize = 16;
+
+ // Allow oversized 8 VGPR vaddr when only 5/6/7 VGPRs are required.
+ // This provides backward compatibility for assembly created
+ // before 160b/192b/224b types were directly supported.
+ if (ActualAddrSize == 8 && (ExpectedAddrSize >= 5 && ExpectedAddrSize <= 7))
+ return true;
}
- return VAddrSize == AddrSize;
+ return ActualAddrSize == ExpectedAddrSize;
}
bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
@@ -3298,6 +3498,29 @@ bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
}
+bool AMDGPUAsmParser::validateMIMGMSAA(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ return true;
+
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+
+ if (!BaseOpcode->MSAA)
+ return true;
+
+ int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
+ assert(DimIdx != -1);
+
+ unsigned Dim = Inst.getOperand(DimIdx).getImm();
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
+
+ return DimInfo->MSAA;
+}
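// For example (assembly spelling is illustrative, not taken from this patch):
// an image_msaa_load written with dim:SQ_RSRC_IMG_2D_MSAA satisfies this
// check, while the same instruction with dim:SQ_RSRC_IMG_2D is rejected with
// "invalid dim; must be MSAA type".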
+
static bool IsMovrelsSDWAOpcode(const unsigned Opcode)
{
switch (Opcode) {
@@ -3559,7 +3782,7 @@ static bool IsRevOpcode(const unsigned Opcode)
}
}
-bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
+Optional<StringRef> AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
using namespace SIInstrFlags;
const unsigned Opcode = Inst.getOpcode();
@@ -3567,33 +3790,29 @@ bool AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) {
// lds_direct register is defined so that it can be used
// with 9-bit operands only. Ignore encodings which do not accept these.
- if ((Desc.TSFlags & (VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA)) == 0)
- return true;
+ const auto Enc = VOP1 | VOP2 | VOP3 | VOPC | VOP3P | SIInstrFlags::SDWA;
+ if ((Desc.TSFlags & Enc) == 0)
+ return None;
- const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
- const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
- const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+ for (auto SrcName : {OpName::src0, OpName::src1, OpName::src2}) {
+ auto SrcIdx = getNamedOperandIdx(Opcode, SrcName);
+ if (SrcIdx == -1)
+ break;
+ const auto &Src = Inst.getOperand(SrcIdx);
+ if (Src.isReg() && Src.getReg() == LDS_DIRECT) {
- const int SrcIndices[] = { Src1Idx, Src2Idx };
+ if (isGFX90A())
+ return StringRef("lds_direct is not supported on this GPU");
- // lds_direct cannot be specified as either src1 or src2.
- for (int SrcIdx : SrcIndices) {
- if (SrcIdx == -1) break;
- const MCOperand &Src = Inst.getOperand(SrcIdx);
- if (Src.isReg() && Src.getReg() == LDS_DIRECT) {
- return false;
+ if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA))
+ return StringRef("lds_direct cannot be used with this instruction");
+
+ if (SrcName != OpName::src0)
+ return StringRef("lds_direct may be used as src0 only");
}
}
- if (Src0Idx == -1)
- return true;
-
- const MCOperand &Src = Inst.getOperand(Src0Idx);
- if (!Src.isReg() || Src.getReg() != LDS_DIRECT)
- return true;
-
- // lds_direct is specified as src0. Check additional limitations.
- return (Desc.TSFlags & SIInstrFlags::SDWA) == 0 && !IsRevOpcode(Opcode);
+ return None;
}
SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const {
@@ -3624,7 +3843,7 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
// For FLAT segment the offset must be positive;
// MSB is ignored and forced to zero.
- if (TSFlags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)) {
+ if (TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)) {
unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), true);
if (!isIntN(OffsetSize, Op.getImm())) {
Error(getFlatOffsetLoc(Operands),
@@ -3733,6 +3952,28 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
+ const OperandVector &Operands) {
+ const unsigned Opc = Inst.getOpcode();
+ int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
+ if (DppCtrlIdx < 0)
+ return true;
+ unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+
+ if (!AMDGPU::isLegal64BitDPPControl(DppCtrl)) {
+ // DPP64 is supported for row_newbcast only.
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx >= 0 &&
+ getMRI()->getSubReg(Inst.getOperand(Src0Idx).getReg(), AMDGPU::sub1)) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
+ Error(S, "64 bit dpp only supports row_newbcast");
+ return false;
+ }
+ }
+
+ return true;
+}
+
// Check if VCC register matches wavefront size
bool AMDGPUAsmParser::validateVccOperand(unsigned Reg) const {
auto FB = getFeatureBits();
@@ -3802,18 +4043,148 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst,
return true;
}
+// Returns -1 if not a register, 0 if VGPR and 1 if AGPR.
+static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx,
+ const MCRegisterInfo *MRI) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), NameIdx);
+ if (OpIdx < 0)
+ return -1;
+
+ const MCOperand &Op = Inst.getOperand(OpIdx);
+ if (!Op.isReg())
+ return -1;
+
+ unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ auto Reg = Sub ? Sub : Op.getReg();
+ const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
+ return AGPR32.contains(Reg) ? 1 : 0;
+}
+
+bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF |
+ SIInstrFlags::MTBUF | SIInstrFlags::MIMG |
+ SIInstrFlags::DS)) == 0)
+ return true;
+
+ uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata;
+
+ const MCRegisterInfo *MRI = getMRI();
+ int DstAreg = IsAGPROperand(Inst, AMDGPU::OpName::vdst, MRI);
+ int DataAreg = IsAGPROperand(Inst, DataNameIdx, MRI);
+
+ if ((TSFlags & SIInstrFlags::DS) && DataAreg >= 0) {
+ int Data2Areg = IsAGPROperand(Inst, AMDGPU::OpName::data1, MRI);
+ if (Data2Areg >= 0 && Data2Areg != DataAreg)
+ return false;
+ }
+
+ auto FB = getFeatureBits();
+ if (FB[AMDGPU::FeatureGFX90AInsts]) {
+ if (DataAreg < 0 || DstAreg < 0)
+ return true;
+ return DstAreg == DataAreg;
+ }
+
+ return DstAreg < 1 && DataAreg < 1;
+}
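// Illustrative reading of the rule above (operand choices are hypothetical):
// a DS op whose data0 is a VGPR and data1 an AGPR is always rejected; on
// FeatureGFX90AInsts targets dst and data must additionally be both-VGPR or
// both-AGPR, and on older targets any AGPR data/dst operand here is diagnosed
// as "agpr loads and stores not supported on this GPU".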
+
+bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const {
+ auto FB = getFeatureBits();
+ if (!FB[AMDGPU::FeatureGFX90AInsts])
+ return true;
+
+ const MCRegisterInfo *MRI = getMRI();
+ const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+ const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID);
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ if (!Sub)
+ continue;
+
+ if (VGPR32.contains(Sub) && ((Sub - AMDGPU::VGPR0) & 1))
+ return false;
+ if (AGPR32.contains(Sub) && ((Sub - AMDGPU::AGPR0) & 1))
+ return false;
+ }
+
+ return true;
+}
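// For example (register choices are illustrative): on gfx90a a 64-bit operand
// written as v[2:3] or a[4:5] passes, while v[1:2] starts at an odd register
// and is rejected with "vgpr tuples must be 64 bit aligned".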
+
+// gfx90a has an undocumented limitation:
+// DS_GWS opcodes must use even aligned registers.
+bool AMDGPUAsmParser::validateGWS(const MCInst &Inst,
+ const OperandVector &Operands) {
+ if (!getFeatureBits()[AMDGPU::FeatureGFX90AInsts])
+ return true;
+
+ int Opc = Inst.getOpcode();
+ if (Opc != AMDGPU::DS_GWS_INIT_vi && Opc != AMDGPU::DS_GWS_BARRIER_vi &&
+ Opc != AMDGPU::DS_GWS_SEMA_BR_vi)
+ return true;
+
+ const MCRegisterInfo *MRI = getMRI();
+ const MCRegisterClass &VGPR32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+ int Data0Pos =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0);
+ assert(Data0Pos != -1);
+ auto Reg = Inst.getOperand(Data0Pos).getReg();
+ auto RegIdx = Reg - (VGPR32.contains(Reg) ? AMDGPU::VGPR0 : AMDGPU::AGPR0);
+ if (RegIdx & 1) {
+ SMLoc RegLoc = getRegLoc(Reg, Operands);
+ Error(RegLoc, "vgpr must be even aligned");
+ return false;
+ }
+
+ return true;
+}
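// For example (assembly spelling is illustrative): on gfx90a a ds_gws_init
// whose data operand lives in v1 is diagnosed with "vgpr must be even
// aligned", whereas v0 or v2 satisfies the restriction.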
+
bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
const OperandVector &Operands,
const SMLoc &IDLoc) {
- int GLCPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
- AMDGPU::OpName::glc1);
- if (GLCPos != -1) {
- // -1 is set by GLC_1 default operand. In all cases "glc" must be present
- // in the asm string, and the default value means it is not present.
- if (Inst.getOperand(GLCPos).getImm() == -1) {
+ int CPolPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolPos == -1)
+ return true;
+
+ unsigned CPol = Inst.getOperand(CPolPos).getImm();
+
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & (SIInstrFlags::SMRD)) &&
+ (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC))) {
+ Error(IDLoc, "invalid cache policy for SMRD instruction");
+ return false;
+ }
+
+ if (isGFX90A() && (CPol & CPol::SCC)) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]);
+ Error(S, "scc is not supported on this GPU");
+ return false;
+ }
+
+ if (!(TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet)))
+ return true;
+
+ if (TSFlags & SIInstrFlags::IsAtomicRet) {
+ if (!(TSFlags & SIInstrFlags::MIMG) && !(CPol & CPol::GLC)) {
Error(IDLoc, "instruction must use glc");
return false;
}
+ } else {
+ if (CPol & CPol::GLC) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("glc")]);
+ Error(S, "instruction must not use glc");
+ return false;
+ }
}
return true;
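// Illustrative consequence of these checks (assembly spelling assumed): a
// returning non-MIMG atomic must carry glc ("instruction must use glc"),
// its no-return form must omit it ("instruction must not use glc"), and an
// SMRD load may only use the glc/dlc cache-policy bits.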
@@ -3822,9 +4193,8 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
const SMLoc &IDLoc,
const OperandVector &Operands) {
- if (!validateLdsDirect(Inst)) {
- Error(getRegLoc(AMDGPU::LDS_DIRECT, Operands),
- "invalid use of lds_direct");
+ if (auto ErrMsg = validateLdsDirect(Inst)) {
+ Error(getRegLoc(LDS_DIRECT, Operands), *ErrMsg);
return false;
}
if (!validateSOPLiteral(Inst)) {
@@ -3851,6 +4221,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid op_sel operand");
return false;
}
+ if (!validateDPP(Inst, Operands)) {
+ return false;
+ }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
@@ -3861,6 +4234,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "dim modifier is required on this GPU");
return false;
}
+ if (!validateMIMGMSAA(Inst)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
+ "invalid dim; must be MSAA type");
+ return false;
+ }
if (!validateMIMGDataSize(Inst)) {
Error(IDLoc,
"image data size does not match dmask and tfe");
@@ -3893,6 +4271,26 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateMAIAccWrite(Inst, Operands)) {
return false;
}
+ if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
+ return false;
+ }
+
+ if (!validateAGPRLdSt(Inst)) {
+ Error(IDLoc, getFeatureBits()[AMDGPU::FeatureGFX90AInsts]
+ ? "invalid register class: data and dst should be all VGPR or AGPR"
+ : "invalid register class: agpr loads and stores not supported on this GPU"
+ );
+ return false;
+ }
+ if (!validateVGPRAlign(Inst)) {
+ Error(IDLoc,
+ "invalid register class: vgpr tuples must be 64 bit aligned");
+ return false;
+ }
+ if (!validateGWS(Inst, Operands)) {
+ return false;
+ }
+
if (!validateDivScale(Inst)) {
Error(IDLoc, "ABS not allowed in VOP3B instructions");
return false;
@@ -4062,21 +4460,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
return TokError("directive only supported for amdgcn architecture");
- std::string Target;
-
- SMLoc TargetStart = getLoc();
- if (getParser().parseEscapedString(Target))
+ std::string TargetIDDirective;
+ SMLoc TargetStart = getTok().getLoc();
+ if (getParser().parseEscapedString(TargetIDDirective))
return true;
- SMRange TargetRange = SMRange(TargetStart, getLoc());
- std::string ExpectedTarget;
- raw_string_ostream ExpectedTargetOS(ExpectedTarget);
- IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);
+ SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
+ if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective)
+ return getParser().Error(TargetRange.Start,
+ (Twine(".amdgcn_target directive's target id ") +
+ Twine(TargetIDDirective) +
+ Twine(" does not match the specified target id ") +
+ Twine(getTargetStreamer().getTargetID()->toString())).str());
- if (Target != ExpectedTargetOS.str())
- return Error(TargetRange.Start, "target must match options", TargetRange);
-
- getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
return false;
}
@@ -4143,12 +4539,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
SMRange VGPRRange;
uint64_t NextFreeVGPR = 0;
+ uint64_t AccumOffset = 0;
SMRange SGPRRange;
uint64_t NextFreeSGPR = 0;
unsigned UserSGPRCount = 0;
bool ReserveVCC = true;
bool ReserveFlatScr = true;
- bool ReserveXNACK = hasXNACK();
Optional<bool> EnableWavefrontSize32;
while (true) {
@@ -4191,7 +4587,15 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val))
return OutOfRangeError(ValRange);
KD.private_segment_fixed_size = Val;
+ } else if (ID == ".amdhsa_kernarg_size") {
+ if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val))
+ return OutOfRangeError(ValRange);
+ KD.kernarg_size = Val;
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
@@ -4222,6 +4626,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (Val)
UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
@@ -4241,10 +4649,20 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
- PARSE_BITS_ENTRY(
- KD.compute_pgm_rsrc2,
- COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val,
- ValRange);
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
+ } else if (ID == ".amdhsa_enable_private_segment") {
+ if (!hasArchitectedFlatScratch())
+ return Error(
+ IDRange.Start,
+ "directive is not supported without architected flat scratch",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
@@ -4271,6 +4689,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
} else if (ID == ".amdhsa_next_free_sgpr") {
SGPRRange = ValRange;
NextFreeSGPR = Val;
+ } else if (ID == ".amdhsa_accum_offset") {
+ if (!isGFX90A())
+ return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+ AccumOffset = Val;
} else if (ID == ".amdhsa_reserve_vcc") {
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
@@ -4278,6 +4700,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
} else if (ID == ".amdhsa_reserve_flat_scratch") {
if (IVersion.Major < 7)
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
ReserveFlatScr = Val;
@@ -4286,7 +4712,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return Error(IDRange.Start, "directive requires gfx8+", IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
- ReserveXNACK = Val;
+ if (Val != getTargetStreamer().getTargetID()->isXnackOnOrAny())
+ return getParser().Error(IDRange.Start, ".amdhsa_reserve_xnack_mask does not match target id",
+ IDRange);
} else if (ID == ".amdhsa_float_round_mode_32") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
@@ -4311,6 +4739,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return Error(IDRange.Start, "directive requires gfx9+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
ValRange);
+ } else if (ID == ".amdhsa_tg_split") {
+ if (!isGFX90A())
+ return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val,
+ ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
@@ -4372,7 +4805,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
unsigned VGPRBlocks;
unsigned SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
- ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR,
+ getTargetStreamer().getTargetID()->isXnackOnOrAny(),
+ EnableWavefrontSize32, NextFreeVGPR,
VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks,
SGPRBlocks))
return true;
@@ -4395,9 +4829,21 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
UserSGPRCount);
+ if (isGFX90A()) {
+ if (Seen.find(".amdhsa_accum_offset") == Seen.end())
+ return TokError(".amdhsa_accum_offset directive is required");
+ if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3))
+ return TokError("accum_offset should be in range [4..256] in "
+ "increments of 4");
+ if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4))
+ return TokError("accum_offset exceeds total VGPR allocation");
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ (AccumOffset / 4 - 1));
+ }
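// Worked example of the encoding above (directive values are illustrative):
// ".amdhsa_accum_offset 12" must be a multiple of 4 in [4..256] and must not
// exceed the VGPR allocation; it is stored as 12/4 - 1 = 2 in
// COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET.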
+
getTargetStreamer().EmitAmdhsaKernelDescriptor(
getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
- ReserveFlatScr, ReserveXNACK);
+ ReserveFlatScr);
return false;
}
@@ -4423,9 +4869,9 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// targeted GPU.
if (isToken(AsmToken::EndOfStatement)) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
- ISA.Stepping,
- "AMD", "AMDGPU");
+ getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor,
+ ISA.Stepping,
+ "AMD", "AMDGPU");
return false;
}
@@ -4450,8 +4896,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
if (!parseString(ArchName, "invalid arch name"))
return true;
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping,
- VendorName, ArchName);
+ getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping,
+ VendorName, ArchName);
return false;
}
@@ -4560,19 +5006,11 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
"architectures");
}
- auto ISAVersionStringFromASM = getToken().getStringContents();
+ auto TargetIDDirective = getLexer().getTok().getStringContents();
+ if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective)
+ return Error(getParser().getTok().getLoc(), "target id must match options");
- std::string ISAVersionStringFromSTI;
- raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI);
- IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI);
-
- if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) {
- return Error(getLoc(),
- ".amd_amdgpu_isa directive does not match triple and/or mcpu "
- "arguments specified through the command line");
- }
-
- getTargetStreamer().EmitISAVersion(ISAVersionStreamFromSTI.str());
+ getTargetStreamer().EmitISAVersion();
Lex();
return false;
@@ -4582,7 +5020,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
const char *AssemblerDirectiveBegin;
const char *AssemblerDirectiveEnd;
std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
- isHsaAbiVersion3(&getSTI())
+ isHsaAbiVersion3Or4(&getSTI())
? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
HSAMD::V3::AssemblerDirectiveEnd)
: std::make_tuple(HSAMD::AssemblerDirectiveBegin,
@@ -4599,7 +5037,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
HSAMetadataString))
return true;
- if (isHsaAbiVersion3(&getSTI())) {
+ if (isHsaAbiVersion3Or4(&getSTI())) {
if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
return Error(getLoc(), "invalid HSA metadata");
} else {
@@ -4749,12 +5187,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (isHsaAbiVersion3(&getSTI())) {
- if (IDVal == ".amdgcn_target")
- return ParseDirectiveAMDGCNTarget();
-
+ if (isHsaAbiVersion3Or4(&getSTI())) {
if (IDVal == ".amdhsa_kernel")
- return ParseDirectiveAMDHSAKernel();
+ return ParseDirectiveAMDHSAKernel();
// TODO: Restructure/combine with PAL metadata directive.
if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
@@ -4779,6 +5214,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
return ParseDirectiveHSAMetadata();
}
+ if (IDVal == ".amdgcn_target")
+ return ParseDirectiveAMDGCNTarget();
+
if (IDVal == ".amdgpu_lds")
return ParseDirectiveAMDGPULDS();
@@ -4792,7 +5230,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
}
bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
- unsigned RegNo) const {
+ unsigned RegNo) {
for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
R.isValid(); ++R) {
@@ -4824,7 +5262,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
- return (isVI() || isGFX9()) && hasXNACK();
+ return (isVI() || isGFX9()) && getTargetStreamer().getTargetID()->isXnackSupported();
case AMDGPU::SGPR_NULL:
return isGFX10Plus();
default:
@@ -4881,16 +5319,21 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
unsigned Prefix = Operands.size();
for (;;) {
+ auto Loc = getLoc();
ResTy = parseReg(Operands);
+ if (ResTy == MatchOperand_NoMatch)
+ Error(Loc, "expected a register");
if (ResTy != MatchOperand_Success)
- return ResTy;
+ return MatchOperand_ParseFail;
RBraceLoc = getLoc();
if (trySkipToken(AsmToken::RBrac))
break;
- if (!trySkipToken(AsmToken::Comma))
+ if (!skipToken(AsmToken::Comma,
+ "expected a comma or a closing square bracket")) {
return MatchOperand_ParseFail;
+ }
}
if (Operands.size() - Prefix > 1) {
@@ -4940,11 +5383,9 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
OperandMode Mode = OperandMode_Default;
if (IsMIMG && isGFX10Plus() && Operands.size() == 2)
Mode = OperandMode_NSA;
+ CPolSeen = 0;
OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);
- // Eat the comma or space if there is one.
- trySkipToken(AsmToken::Comma);
-
if (Res != MatchOperand_Success) {
checkUnsupportedInstruction(Name, NameLoc);
if (!Parser.hasPendingError()) {
@@ -4959,6 +5400,9 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
}
return true;
}
+
+ // Eat the comma or space if there is one.
+ trySkipToken(AsmToken::Comma);
}
return false;
@@ -5043,39 +5487,27 @@ AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix,
}
OperandMatchResultTy
-AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
+AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy) {
- int64_t Bit = 0;
+ int64_t Bit;
SMLoc S = getLoc();
- // We are at the end of the statement, and this is a default argument, so
- // use a default value.
- if (!isToken(AsmToken::EndOfStatement)) {
- switch(getTokenKind()) {
- case AsmToken::Identifier: {
- StringRef Tok = getTokenStr();
- if (Tok == Name) {
- if (Tok == "r128" && !hasMIMG_R128())
- Error(S, "r128 modifier is not supported on this GPU");
- if (Tok == "a16" && !isGFX9() && !hasGFX10A16())
- Error(S, "a16 modifier is not supported on this GPU");
- Bit = 1;
- Parser.Lex();
- } else if (Tok.startswith("no") && Tok.endswith(Name)) {
- Bit = 0;
- Parser.Lex();
- } else {
- return MatchOperand_NoMatch;
- }
- break;
- }
- default:
- return MatchOperand_NoMatch;
- }
+ if (trySkipId(Name)) {
+ Bit = 1;
+ } else if (trySkipId("no", Name)) {
+ Bit = 0;
+ } else {
+ return MatchOperand_NoMatch;
}
- if (!isGFX10Plus() && ImmTy == AMDGPUOperand::ImmTyDLC)
+ if (Name == "r128" && !hasMIMG_R128()) {
+ Error(S, "r128 modifier is not supported on this GPU");
+ return MatchOperand_ParseFail;
+ }
+ if (Name == "a16" && !isGFX9() && !hasGFX10A16()) {
+ Error(S, "a16 modifier is not supported on this GPU");
return MatchOperand_ParseFail;
+ }
if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
ImmTy = AMDGPUOperand::ImmTyR128A16;
@@ -5084,6 +5516,62 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
return MatchOperand_Success;
}
+OperandMatchResultTy
+AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
+ unsigned CPolOn = 0;
+ unsigned CPolOff = 0;
+ SMLoc S = getLoc();
+
+ if (trySkipId("glc"))
+ CPolOn = AMDGPU::CPol::GLC;
+ else if (trySkipId("noglc"))
+ CPolOff = AMDGPU::CPol::GLC;
+ else if (trySkipId("slc"))
+ CPolOn = AMDGPU::CPol::SLC;
+ else if (trySkipId("noslc"))
+ CPolOff = AMDGPU::CPol::SLC;
+ else if (trySkipId("dlc"))
+ CPolOn = AMDGPU::CPol::DLC;
+ else if (trySkipId("nodlc"))
+ CPolOff = AMDGPU::CPol::DLC;
+ else if (trySkipId("scc"))
+ CPolOn = AMDGPU::CPol::SCC;
+ else if (trySkipId("noscc"))
+ CPolOff = AMDGPU::CPol::SCC;
+ else
+ return MatchOperand_NoMatch;
+
+ if (!isGFX10Plus() && ((CPolOn | CPolOff) & AMDGPU::CPol::DLC)) {
+ Error(S, "dlc modifier is not supported on this GPU");
+ return MatchOperand_ParseFail;
+ }
+
+ if (!isGFX90A() && ((CPolOn | CPolOff) & AMDGPU::CPol::SCC)) {
+ Error(S, "scc modifier is not supported on this GPU");
+ return MatchOperand_ParseFail;
+ }
+
+ if (CPolSeen & (CPolOn | CPolOff)) {
+ Error(S, "duplicate cache policy modifier");
+ return MatchOperand_ParseFail;
+ }
+
+ CPolSeen |= (CPolOn | CPolOff);
+
+ for (unsigned I = 1; I != Operands.size(); ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isCPol()) {
+ Op.setImm((Op.getImm() | CPolOn) & ~CPolOff);
+ return MatchOperand_Success;
+ }
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, CPolOn, S,
+ AMDGPUOperand::ImmTyCPol));
+
+ return MatchOperand_Success;
+}
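// For example (instruction spelling is illustrative): "glc slc dlc" on one
// gfx10 instruction accumulates into a single ImmTyCPol operand with
// GLC|SLC|DLC set; writing a modifier twice (or both "glc" and "noglc")
// reports "duplicate cache policy modifier", and "dlc"/"scc" on targets
// without support are rejected with the errors above.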
+
static void addOptionalImmOperand(
MCInst& Inst, const OperandVector& Operands,
AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
@@ -5757,7 +6245,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
}
return false;
}
- if (!isValidMsgOp(Msg.Id, Op.Id, Strict)) {
+ if (!isValidMsgOp(Msg.Id, Op.Id, getSTI(), Strict)) {
Error(Op.Loc, "invalid operation id");
return false;
}
@@ -5765,7 +6253,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
Error(Stream.Loc, "message operation does not support streams");
return false;
}
- if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, Strict)) {
+ if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, getSTI(), Strict)) {
Error(Stream.Loc, "invalid message stream id");
return false;
}
@@ -5934,6 +6422,18 @@ AMDGPUAsmParser::trySkipId(const StringRef Id) {
}
bool
+AMDGPUAsmParser::trySkipId(const StringRef Pref, const StringRef Id) {
+ if (isToken(AsmToken::Identifier)) {
+ StringRef Tok = getTokenStr();
+ if (Tok.startswith(Pref) && Tok.drop_front(Pref.size()) == Id) {
+ lex();
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
AMDGPUAsmParser::trySkipId(const StringRef Id, const AsmToken::TokenKind Kind) {
if (isId(Id) && peekToken().is(Kind)) {
lex();
@@ -6489,32 +6989,38 @@ AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) {
// mubuf
//===----------------------------------------------------------------------===//
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDLC() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC_1() const {
- return AMDGPUOperand::CreateImm(this, -1, SMLoc(), AMDGPUOperand::ImmTyGLC);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCPol);
}
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
- const OperandVector &Operands,
- bool IsAtomic,
- bool IsAtomicReturn,
- bool IsLds) {
+ const OperandVector &Operands,
+ bool IsAtomic,
+ bool IsLds) {
bool IsLdsOpcode = IsLds;
bool HasLdsModifier = false;
OptionalImmIndexMap OptionalIdx;
- assert(IsAtomicReturn ? IsAtomic : true);
unsigned FirstOperandIdx = 1;
+ bool IsAtomicReturn = false;
+
+ if (IsAtomic) {
+ for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (!Op.isCPol())
+ continue;
+ IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC;
+ break;
+ }
+
+ if (!IsAtomicReturn) {
+ int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode());
+ if (NewOpc != -1)
+ Inst.setOpcode(NewOpc);
+ }
+
+ IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags &
+ SIInstrFlags::IsAtomicRet;
+ }
for (unsigned i = FirstOperandIdx, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -6565,18 +7071,12 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
- if (!IsAtomic || IsAtomicReturn) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC,
- IsAtomicReturn ? -1 : 0);
- }
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
if (!IsLdsOpcode) { // tfe is not legal with lds opcodes
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
-
- if (isGFX10Plus())
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
}
void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
@@ -6611,12 +7111,9 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOffset);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
-
- if (isGFX10Plus())
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
}
//===----------------------------------------------------------------------===//
@@ -6658,14 +7155,12 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
- if (IsGFX10Plus)
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
if (IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::tfe) != -1)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
if (!IsGFX10Plus)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
@@ -6676,6 +7171,61 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands)
cvtMIMG(Inst, Operands, true);
}
+void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ bool IsAtomicReturn = false;
+
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (!Op.isCPol())
+ continue;
+ IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC;
+ break;
+ }
+
+ if (!IsAtomicReturn) {
+ int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode());
+ if (NewOpc != -1)
+ Inst.setOpcode(NewOpc);
+ }
+
+ IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags &
+ SIInstrFlags::IsAtomicRet;
+
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+ // Add the register arguments
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ if (IsAtomicReturn && i == 1)
+ Op.addRegOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
+ // asm string. There are no MCInst operands for these.
+ if (Op.isToken()) {
+ continue;
+ }
+ assert(Op.isImm());
+
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = i;
+ }
+
+ if ((int)Inst.getNumOperands() <=
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
+}
+
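
cvtSMEMAtomic repeats the same GLC-based selection and then, for the returning form, adds the first parsed register twice so it acts as both destination and data source. A small sketch of that operand expansion over a plain vector (MCInst and the parsed-operand classes are stood in by simple types, so the index differs from the converter above, which skips the mnemonic at index 0):

#include <vector>

struct ParsedReg { unsigned Reg; };

// For a returning atomic the first register doubles as the result, so it is
// emitted twice; every other register is emitted once.
std::vector<unsigned> expandSmemAtomicRegs(const std::vector<ParsedReg> &Regs,
                                           bool IsAtomicReturn) {
  std::vector<unsigned> MCOps;
  for (size_t I = 0; I < Regs.size(); ++I) {
    MCOps.push_back(Regs[I].Reg);
    if (IsAtomicReturn && I == 0)
      MCOps.push_back(Regs[I].Reg);
  }
  return MCOps;
}
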
void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst,
const OperandVector &Operands) {
for (unsigned I = 1; I < Operands.size(); ++I) {
@@ -6747,17 +7297,14 @@ static bool ConvertOmodDiv(int64_t &Div) {
return false;
}
+// Both bound_ctrl:0 and bound_ctrl:1 are encoded as 1.
+// This is intentional and ensures compatibility with sp3.
+// See bug 35397 for details.
static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
- if (BoundCtrl == 0) {
+ if (BoundCtrl == 0 || BoundCtrl == 1) {
BoundCtrl = 1;
return true;
}
-
- if (BoundCtrl == -1) {
- BoundCtrl = 0;
- return true;
- }
-
return false;
}
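
With the -1 branch gone, bound_ctrl:0 and bound_ctrl:1 now encode identically and any other value is rejected. A standalone copy of the conversion that checks exactly that:

#include <cassert>
#include <cstdint>

static bool convertBoundCtrl(int64_t &BoundCtrl) {
  if (BoundCtrl == 0 || BoundCtrl == 1) {
    BoundCtrl = 1; // both spellings encode as 1, matching sp3
    return true;
  }
  return false; // anything else is rejected
}

int main() {
  int64_t A = 0, B = 1, C = -1;
  assert(convertBoundCtrl(A) && A == 1);
  assert(convertBoundCtrl(B) && B == 1);
  assert(!convertBoundCtrl(C)); // -1 is no longer accepted
  return 0;
}
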
@@ -6772,9 +7319,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
- {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr},
- {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
- {"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"", AMDGPUOperand::ImmTyCPol, false, nullptr},
{"swz", AMDGPUOperand::ImmTySWZ, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
@@ -6808,6 +7353,18 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"abid", AMDGPUOperand::ImmTyABID, false, nullptr}
};
+void AMDGPUAsmParser::onBeginOfFile() {
+ if (!getParser().getStreamer().getTargetStreamer() ||
+ getSTI().getTargetTriple().getArch() == Triple::r600)
+ return;
+
+ if (!getTargetStreamer().getTargetID())
+ getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString());
+
+ if (isHsaAbiVersion3Or4(&getSTI()))
+ getTargetStreamer().EmitDirectiveAMDGCNTarget();
+}
+
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
OperandMatchResultTy res = parseOptionalOpr(Operands);
@@ -6857,6 +7414,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.ConvertResult);
} else if (Op.Type == AMDGPUOperand::ImmTyDim) {
res = parseDim(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyCPol) {
+ res = parseCPol(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}
@@ -7010,6 +7569,7 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_MAC_F16_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F32_e64_vi ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
@@ -7028,16 +7588,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3(Inst, Operands, OptionalIdx);
}
-void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
- const OperandVector &Operands) {
- OptionalImmIndexMap OptIdx;
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptIdx) {
const int Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
- cvtVOP3(Inst, Operands, OptIdx);
-
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
@@ -7046,7 +7603,10 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
  // FIXME: This is messy. Parse the modifiers as if it were a normal VOP3
// instruction, and then figure out where to actually put the modifiers
- addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ if (OpSelIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+ }
int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
if (OpSelHiIdx != -1) {
@@ -7057,7 +7617,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
if (NegLoIdx != -1) {
- assert(IsPacked);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
}
@@ -7069,16 +7628,16 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
AMDGPU::OpName::src1_modifiers,
AMDGPU::OpName::src2_modifiers };
- int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
-
- unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSel = 0;
unsigned OpSelHi = 0;
unsigned NegLo = 0;
unsigned NegHi = 0;
- if (OpSelHiIdx != -1) {
+ if (OpSelIdx != -1)
+ OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if (OpSelHiIdx != -1)
OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
- }
if (NegLoIdx != -1) {
int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
@@ -7111,6 +7670,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst,
}
}
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptIdx;
+ cvtVOP3(Inst, Operands, OptIdx);
+ cvtVOP3P(Inst, Operands, OptIdx);
+}
+
//===----------------------------------------------------------------------===//
// dpp
//===----------------------------------------------------------------------===//
@@ -7167,44 +7732,64 @@ bool AMDGPUOperand::isU16Imm() const {
return isImm() && isUInt<16>(getImm());
}
-OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
- if (!isGFX10Plus())
- return MatchOperand_NoMatch;
-
- SMLoc S = getLoc();
-
- if (!trySkipId("dim", AsmToken::Colon))
- return MatchOperand_NoMatch;
+//===----------------------------------------------------------------------===//
+// dim
+//===----------------------------------------------------------------------===//
- // We want to allow "dim:1D" etc., but the initial 1 is tokenized as an
- // integer.
+bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) {
+ // We want to allow "dim:1D" etc.,
+ // but the initial 1 is tokenized as an integer.
std::string Token;
if (isToken(AsmToken::Integer)) {
SMLoc Loc = getToken().getEndLoc();
Token = std::string(getTokenStr());
lex();
if (getLoc() != Loc)
- return MatchOperand_ParseFail;
+ return false;
}
- if (!isToken(AsmToken::Identifier))
- return MatchOperand_ParseFail;
- Token += getTokenStr();
+
+ StringRef Suffix;
+ if (!parseId(Suffix))
+ return false;
+ Token += Suffix;
StringRef DimId = Token;
if (DimId.startswith("SQ_RSRC_IMG_"))
- DimId = DimId.substr(12);
+ DimId = DimId.drop_front(12);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId);
if (!DimInfo)
- return MatchOperand_ParseFail;
+ return false;
+
+ Encoding = DimInfo->Encoding;
+ return true;
+}
- lex();
+OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
+ if (!isGFX10Plus())
+ return MatchOperand_NoMatch;
- Operands.push_back(AMDGPUOperand::CreateImm(this, DimInfo->Encoding, S,
+ SMLoc S = getLoc();
+
+ if (!trySkipId("dim", AsmToken::Colon))
+ return MatchOperand_NoMatch;
+
+ unsigned Encoding;
+ SMLoc Loc = getLoc();
+ if (!parseDimId(Encoding)) {
+ Error(Loc, "invalid dim value");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Encoding, S,
AMDGPUOperand::ImmTyDim));
return MatchOperand_Success;
}
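
parseDim now delegates the string handling to parseDimId, which glues an optional leading integer token back onto the identifier (so "1D" survives tokenization) and drops the long-form "SQ_RSRC_IMG_" prefix before the table lookup. A hedged sketch of just that reassembly, leaving out the MIMG dim-info query:

#include <string>
#include <string_view>

// Rebuild the dim suffix from the (possibly empty) integer token and the
// identifier token, then strip the long-form prefix if present.
static std::string canonicalDimSuffix(std::string_view IntTok,
                                      std::string_view IdTok) {
  std::string Token(IntTok);                 // "" or e.g. "1"
  Token += IdTok;                            // e.g. "D" -> "1D"
  constexpr std::string_view Prefix = "SQ_RSRC_IMG_";
  std::string_view View = Token;
  if (View.substr(0, Prefix.size()) == Prefix)
    View = View.substr(Prefix.size());
  return std::string(View);
}

// canonicalDimSuffix("1", "D")             == "1D"
// canonicalDimSuffix("", "SQ_RSRC_IMG_2D") == "2D"
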
+//===----------------------------------------------------------------------===//
+// dpp
+//===----------------------------------------------------------------------===//
+
OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
SMLoc S = getLoc();
@@ -7245,6 +7830,9 @@ OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
bool
AMDGPUAsmParser::isSupportedDPPCtrl(StringRef Ctrl,
const OperandVector &Operands) {
+ if (Ctrl == "row_newbcast")
+ return isGFX90A();
+
if (Ctrl == "row_share" ||
Ctrl == "row_xmask")
return isGFX10Plus();
@@ -7322,6 +7910,7 @@ AMDGPUAsmParser::parseDPPCtrlSel(StringRef Ctrl) {
.Case("row_ror", {DppCtrl::ROW_ROR0, 1, 15})
.Case("row_share", {DppCtrl::ROW_SHARE_FIRST, 0, 15})
.Case("row_xmask", {DppCtrl::ROW_XMASK_FIRST, 0, 15})
+ .Case("row_newbcast", {DppCtrl::ROW_NEWBCAST_FIRST, 0, 15})
.Default({-1, 0, 0});
bool Valid;
@@ -7400,6 +7989,9 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
+ unsigned Opc = Inst.getOpcode();
+ bool HasModifiers =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1;
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
@@ -7426,7 +8018,8 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I
if (IsDPP8) {
if (Op.isDPP8()) {
Op.addImmOperands(Inst, 1);
- } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ } else if (HasModifiers &&
+ isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithFPInputModsOperands(Inst, 2);
} else if (Op.isFI()) {
Fi = Op.getImm();
@@ -7436,8 +8029,11 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I
llvm_unreachable("Invalid operand type");
}
} else {
- if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ if (HasModifiers &&
+ isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
} else if (Op.isDPPCtrl()) {
Op.addImmOperands(Inst, 1);
} else if (Op.isImm()) {
@@ -7691,8 +8287,6 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
case MCK_lds:
return Operand.isLDS() ? Match_Success : Match_InvalidOperand;
- case MCK_glc:
- return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
case MCK_idxen:
return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
case MCK_offen:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 5dc5481df49e..5f43aa8388ee 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -6,17 +6,12 @@
//
//===----------------------------------------------------------------------===//
-def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">;
-def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
+def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
-def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">;
-def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
-def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-
def BUFAddrKind {
int Offset = 0;
int OffEn = 1;
@@ -105,6 +100,8 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
bits<4> elements = 0;
+ bits<1> has_sccb = 1;
+ bits<1> sccb_value = 0;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -113,6 +110,10 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MTBUF = 1;
+
// copy relevant pseudo op flags
let UseNamedOperandTable = ps.UseNamedOperandTable;
let SubtargetPredicate = ps.SubtargetPredicate;
@@ -120,39 +121,47 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
bits<12> offset;
- bits<1> glc;
- bits<1> dlc;
+ bits<5> cpol;
bits<7> format;
bits<8> vaddr;
- bits<8> vdata;
+ bits<10> vdata;
bits<7> srsrc;
- bits<1> slc;
bits<1> tfe;
bits<8> soffset;
bits<4> dfmt = format{3-0};
bits<3> nfmt = format{6-4};
+
+ // GFX90A+ only: instruction uses AccVGPR for data
+  // Bit supersedes tfe.
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
}
class getMTBUFInsDA<list<RegisterClass> vdataList,
list<RegisterClass> vaddrList=[]> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
+ offset:$offset, FORMAT:$format, CPol:$cpol, TFE:$tfe, SWZ:$swz),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
+ offset:$offset, FORMAT:$format, CPol:$cpol, TFE:$tfe, SWZ:$swz)
);
dag InsData = !if(!empty(vaddrList),
- (ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
- (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
+ (ins vdata_op:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol,
+ TFE:$tfe, SWZ:$swz),
+ (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol,
+ TFE:$tfe, SWZ:$swz)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -202,9 +211,9 @@ class MTBUF_Load_Pseudo <string opName,
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MTBUF_Pseudo<opName,
- (outs vdataClass:$vdata),
+ (outs getLdStRegisterOperand<vdataClass>.ret:$vdata),
getMTBUFIns<addrKindCopy>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -217,17 +226,11 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
int elems, ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
- def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
- [(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
- i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
- MTBUFAddr64Table<0, NAME>;
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
+ MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
- [(set load_vt:$vdata,
- (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
- MTBUFAddr64Table<1, NAME>;
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>,
+ MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
@@ -252,7 +255,7 @@ class MTBUF_Store_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs),
getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -265,16 +268,10 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
int elems, ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
- [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
- [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
@@ -341,6 +338,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
bits<4> elements = 0;
+ bits<1> has_sccb = 1;
+ bits<1> sccb_value = 0;
+ bits<1> IsBufferInv = 0;
}
class MUBUF_Real <MUBUF_Pseudo ps> :
@@ -349,6 +349,10 @@ class MUBUF_Real <MUBUF_Pseudo ps> :
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+ let MUBUF = 1;
+
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
@@ -357,16 +361,23 @@ class MUBUF_Real <MUBUF_Pseudo ps> :
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
bits<12> offset;
- bits<1> glc;
- bits<1> dlc;
+ bits<5> cpol;
bits<8> vaddr;
- bits<8> vdata;
+ bits<10> vdata;
bits<7> srsrc;
- bits<1> slc;
bits<1> tfe;
bits<8> soffset;
+
+ // GFX90A+ only: instruction uses AccVGPR for data
+  // Bit supersedes tfe.
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
}
@@ -380,7 +391,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
let mayLoad = 0;
let mayStore = 0;
- // Set everything to 0.
+ let IsBufferInv = 1;
+ // Set everything else to 0.
let offen = 0;
let idxen = 0;
let addr64 = 0;
@@ -395,6 +407,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
let has_offset = 0;
let has_slc = 0;
let has_tfe = 0;
+ let has_sccb = 0;
+ let sccb_value = 0;
}
class getMUBUFInsDA<list<RegisterClass> vdataList,
@@ -402,33 +416,31 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
bit isLds = 0> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, SLC:$slc),
+ offset:$offset, CPol_0:$cpol),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, GLC:$glc, SLC:$slc)
+ offset:$offset, CPol_0:$cpol)
);
dag InsData = !if(!empty(vaddrList),
- (ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
- (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc)
+ (ins vdata_op:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol)
);
dag ret = !con(
!if(!empty(vdataList), InsNoData, InsData),
- !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz))
+ !if(isLds, (ins SWZ_0:$swz), (ins TFE_0:$tfe, SWZ_0:$swz))
);
}
class getMUBUFElements<ValueType vt> {
- // eq does not support ValueType for some reason.
- string vtAsStr = !cast<string>(vt);
-
int ret =
- !if(!eq(vtAsStr, "f16"), 1,
- !if(!eq(vtAsStr, "v2f16"), 2,
- !if(!eq(vtAsStr, "v3f16"), 3,
- !if(!eq(vtAsStr, "v4f16"), 4,
+ !if(!eq(vt, f16), 1,
+ !if(!eq(vt, v2f16), 2,
+ !if(!eq(vt, v3f16), 3,
+ !if(!eq(vt, v4f16), 4,
!if(!eq(vt.Size, 32), 1,
!if(!eq(vt.Size, 64), 2,
!if(!eq(vt.Size, 96), 3,
@@ -482,13 +494,15 @@ class MUBUF_Load_Pseudo <string opName,
bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
- int addrKindCopy = addrKind>
+ int addrKindCopy = addrKind,
+ RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret,
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdata_rc>.ret>
: MUBUF_Pseudo<opName,
- (outs getVregSrcForVT<vdata_vt>.ret:$vdata),
+ (outs vdata_op:$vdata),
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
- !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))),
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe") # "$dlc$swz",
+ !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" #
+ !if(isLds, " lds", "$tfe") # "$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -506,15 +520,15 @@ class MUBUF_Load_Pseudo <string opName,
}
class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset))
>;
class MUBUF_Addr64_Load_Pat <Instruction inst,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset))
>;
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
@@ -531,7 +545,7 @@ multiclass MUBUF_Pseudo_Loads<string opName,
bit TiedDest = 0,
bit isLds = 0> {
- defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt);
+ defvar legal_load_vt = !if(!eq(load_vt, v3f16), v4f16, load_vt);
def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
@@ -567,7 +581,7 @@ class MUBUF_Store_Pseudo <string opName,
: MUBUF_Pseudo<opName,
(outs),
getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol$tfe$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
@@ -581,16 +595,16 @@ multiclass MUBUF_Pseudo_Stores<string opName,
ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- defvar legal_store_vt = !if(!eq(!cast<string>(store_vt), !cast<string>(v3f16)), v4f16, store_vt);
+ defvar legal_store_vt = !if(!eq(store_vt, v3f16), v4f16, store_vt);
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt,
[(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ i16:$offset))]>,
MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt,
[(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
+ i16:$offset))]>,
MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>;
@@ -608,8 +622,8 @@ multiclass MUBUF_Pseudo_Stores<string opName,
class MUBUF_Pseudo_Store_Lds<string opName>
: MUBUF_Pseudo<opName,
(outs),
- (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz),
- " $srsrc, $soffset$offset lds$glc$slc$swz"> {
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz),
+ " $srsrc, $soffset$offset lds$cpol$swz"> {
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
@@ -626,18 +640,19 @@ class MUBUF_Pseudo_Store_Lds<string opName>
class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
list<RegisterClass> vaddrList=[]> {
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag ret = !if(vdata_in,
!if(!empty(vaddrList),
- (ins vdataClass:$vdata_in,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc),
- (ins vdataClass:$vdata_in, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc)
+ (ins vdata_op:$vdata_in,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_GLC1:$cpol),
+ (ins vdata_op:$vdata_in, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_GLC1:$cpol)
),
!if(!empty(vaddrList),
- (ins vdataClass:$vdata,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc),
- (ins vdataClass:$vdata, vaddrClass:$vaddr,
- SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc)
+ (ins vdata_op:$vdata,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, vaddrClass:$vaddr,
+ SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol)
));
}
@@ -678,7 +693,9 @@ class MUBUF_Atomic_Pseudo<string opName,
let has_glc = 0;
let has_dlc = 0;
let has_tfe = 0;
+ let has_sccb = 1;
let maybeAtomic = 1;
+ let AsmMatchConverter = "cvtMubufAtomic";
}
class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
@@ -690,13 +707,14 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
(outs),
getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol",
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let glc_value = 0;
let dlc_value = 0;
- let AsmMatchConverter = "cvtMubufAtomic";
+ let sccb_value = 0;
+ let IsAtomicNoRet = 1;
}
class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
@@ -704,19 +722,21 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass>
+ RegisterClass vdataClassCopy = vdataClass,
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret>
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
- (outs vdataClassCopy:$vdata),
+ (outs vdata_op:$vdata),
getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc1$slc",
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol",
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
let glc_value = 1;
let dlc_value = 0;
+ let sccb_value = 0;
+ let IsAtomicRet = 1;
let Constraints = "$vdata = $vdata_in";
let DisableEncoding = "$vdata_in";
- let AsmMatchConverter = "cvtMubufAtomicReturn";
}
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
@@ -751,15 +771,15 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
let FPAtomic = isFP in
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
- (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
+ (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <0, NAME # "_RTN">;
let FPAtomic = isFP in
def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set vdataType:$vdata,
- (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
- vdataType:$vdata_in))]>,
+ (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
+ vdataType:$vdata_in))]>,
MUBUFAddr64Table <1, NAME # "_RTN">;
let FPAtomic = isFP in
@@ -1106,6 +1126,15 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
>;
+
+let OtherPredicates = [isGFX90APlus] in {
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN <
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32
+>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32
+>;
+}
} // End SubtargetPredicate = HasAtomicFaddInsts
//===----------------------------------------------------------------------===//
@@ -1154,6 +1183,17 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
} // End let SubtargetPredicate = isGFX7Plus
+let SubtargetPredicate = isGFX90APlus in {
+ def BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> {
+ }
+ def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> {
+ }
+
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+} // End SubtargetPredicate = isGFX90APlus
+
let SubtargetPredicate = isGFX10Plus in {
def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
@@ -1169,30 +1209,27 @@ let SubtargetPredicate = isGFX10Plus in {
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_load<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_load<name, memoryVt>);
def : GCNPat<
(vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1201,8 +1238,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
}
@@ -1255,32 +1291,27 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_store<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>);
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_glc $auxiliary),
- (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (as_i16timm $offset), (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_glc $auxiliary),
- (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (as_i16timm $offset), (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1289,9 +1320,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary),
- (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_cpol $auxiliary),
+ 0, (extract_swz $auxiliary))
>;
}
@@ -1351,7 +1381,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), (set_glc $cachepolicy))
>;
def : GCNPat<
@@ -1359,7 +1389,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), (set_glc $cachepolicy))
>;
def : GCNPat<
@@ -1367,7 +1397,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), (set_glc $cachepolicy))
>;
def : GCNPat<
@@ -1377,7 +1407,7 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_slc $cachepolicy))
+ (set_glc $cachepolicy))
>;
}
@@ -1425,7 +1455,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), $cachepolicy)
>;
def : GCNPat<
@@ -1433,7 +1463,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), $cachepolicy)
>;
def : GCNPat<
@@ -1441,7 +1471,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_slc $cachepolicy))
+ (as_i16timm $offset), $cachepolicy)
>;
def : GCNPat<
@@ -1451,7 +1481,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy))
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), $cachepolicy)
>;
}
@@ -1460,15 +1490,24 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
}
+let SubtargetPredicate = isGFX90APlus in {
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f64, "BUFFER_ATOMIC_ADD_F64">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_MIN_F64">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_MAX_F64">;
+} // End SubtargetPredicate = isGFX90APlus
+
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, 0),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
- (extract_slc $cachepolicy)), sub0)
+ (set_glc $cachepolicy)), VReg_64)), sub0)
>;
def : GCNPat<
@@ -1476,10 +1515,11 @@ def : GCNPat<
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (set_glc $cachepolicy)), VReg_64)),
sub0)
>;
@@ -1488,10 +1528,11 @@ def : GCNPat<
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (set_glc $cachepolicy)), VReg_64)),
sub0)
>;
@@ -1500,32 +1541,32 @@ def : GCNPat<
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
- (EXTRACT_SUBREG
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (set_glc $cachepolicy)), VReg_64)),
sub0)
>;
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
+ i16:$offset))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_ld> {
def : GCNPat <
- (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
+ (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset))
>;
}
@@ -1545,9 +1586,8 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag ld> {
def : GCNPat <
- (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
- (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
+ (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))),
+ (Instr_OFFSET $srsrc, $soffset, $offset)
>;
}
@@ -1570,12 +1610,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
>;
}
@@ -1585,12 +1625,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
(ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in)
>;
def : GCNPat <
(ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
+ (InstrOffset $srsrc, $soffset, $offset, $in)
>;
}
@@ -1635,14 +1675,13 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In
ValueType vt, PatFrag atomic_st> {
// Store follows atomic op convention so address is first
def : GCNPat <
- (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
+ (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset))
>;
}
let SubtargetPredicate = isGFX6GFX7 in {
@@ -1655,9 +1694,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag st> {
def : GCNPat <
- (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
+ (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset)
>;
}
@@ -1671,13 +1709,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
- (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
- (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
+ (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0)
>;
}
@@ -1716,15 +1754,14 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_load<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_load<name, memoryVt>);
def : GCNPat<
(vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1732,8 +1769,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1741,8 +1777,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1752,8 +1787,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
}
@@ -1784,15 +1818,14 @@ let SubtargetPredicate = HasPackedD16VMem in {
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
- defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_store<name, memoryVt>);
+ defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_store<name, memoryVt>);
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1800,8 +1833,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1809,8 +1841,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1820,8 +1851,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
getVregSrcForVT<vt>.ret:$vdata,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
- (extract_swz $auxiliary))
+ (extract_cpol $auxiliary), 0, (extract_swz $auxiliary))
>;
}
@@ -1863,21 +1893,21 @@ class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> :
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{16} = ps.lds;
let Inst{24-18} = op;
let Inst{31-26} = 0x38;
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> :
Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
- let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25} = op{7};
}
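
On the encoding side the separate glc/slc/dlc fields are replaced by bit slices of the single 5-bit cpol operand: GLC lands in Inst{14}, SLC in Inst{54}, DLC in Inst{15} for gfx10, and SCC in Inst{15} for the VI-family encodings. A small C++ sketch of the gfx10 packing, with the bit indices inside cpol assumed for illustration (the real values come from CPolBit):

#include <cstdint>

// Assumed bit positions inside the 5-bit cpol operand (illustrative only).
enum CPolBitIdx { GLC = 0, SLC = 1, DLC = 2, SCC = 4 };

// Place the cache-policy bits at the gfx10 MUBUF encoding positions shown
// in the diff above.
uint64_t encodeCPolGfx10(uint64_t Inst, uint8_t CPol) {
  Inst |= uint64_t((CPol >> GLC) & 1) << 14; // Inst{14}
  Inst |= uint64_t((CPol >> DLC) & 1) << 15; // Inst{15}
  Inst |= uint64_t((CPol >> SLC) & 1) << 54; // Inst{54}
  return Inst;
}
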
@@ -1891,13 +1921,6 @@ class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> :
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
- multiclass MUBUF_Real_gfx10_with_name<bits<8> op, string opName,
- string asmName> {
- def _gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(opName)> {
- MUBUF_Pseudo ps = !cast<MUBUF_Pseudo>(opName);
- let AsmString = asmName # ps.AsmOperands;
- }
- }
multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> {
def _BOTHEN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
@@ -1929,16 +1952,33 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
def _BOTHEN_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx10", 1>;
def _IDXEN_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx10", 1>;
def _OFFEN_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx10", 1>;
def _OFFSET_RTN_gfx10 :
- MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx10", 1>;
}
multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
- MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>;
+ MUBUF_Real_Atomics_RTN_gfx10<op> {
+ def _BOTHEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx10", 0>;
+ def _IDXEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx10", 0>;
+ def _OFFEN_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx10", 0>;
+ def _OFFSET_gfx10 :
+ MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx10", 0>;
+ }
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
@@ -2018,18 +2058,38 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">;
}
- multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> :
- MUBUF_Real_AllAddr_gfx6_gfx7<op> {
+ multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> {
+ def _ADDR64_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>,
+ AtomicNoRet<NAME # "_ADDR64_gfx6_gfx7", 0>;
+ def _BOTHEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx6_gfx7", 0>;
+ def _IDXEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx6_gfx7", 0>;
+ def _OFFEN_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx6_gfx7", 0>;
+ def _OFFSET_gfx6_gfx7 :
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx6_gfx7", 0>;
+
def _ADDR64_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>,
+ AtomicNoRet<NAME # "_ADDR64_gfx6_gfx7", 1>;
def _BOTHEN_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx6_gfx7", 1>;
def _IDXEN_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>,
+ AtomicNoRet<NAME # "_IDXEN_gfx6_gfx7", 1>;
def _OFFEN_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>,
+ AtomicNoRet<NAME # "_OFFEN_gfx6_gfx7", 1>;
def _OFFSET_RTN_gfx6_gfx7 :
- MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>,
+ AtomicNoRet<NAME # "_OFFSET_gfx6_gfx7", 1>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
@@ -2118,13 +2178,13 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{18-16} = op;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
@@ -2135,7 +2195,7 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> :
Base_MTBUF_Real_gfx6_gfx7_gfx10<op{2-0}, ps, SIEncodingFamily.GFX10> {
- let Inst{15} = !if(ps.has_dlc, dlc, ps.dlc_value);
+ let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25-19} = format;
let Inst{53} = op{3};
}
@@ -2204,33 +2264,58 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx6_gfx7_gfx10<0x007>;
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
+class MUBUF_Real_Base_vi <bits<7> op, MUBUF_Pseudo ps, int Enc,
+ bit has_sccb = ps.has_sccb> :
MUBUF_Real<ps>,
Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate = isGFX8GFX9;
- let DecoderNamespace = "GFX8";
+ SIMCInstr<ps.PseudoInstr, Enc>,
+ AtomicNoRet<!subst("_RTN","",NAME), !if(ps.IsAtomicNoRet, 0,
+ !if(ps.IsAtomicRet, 1, ?))> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
+ let Inst{15} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccb_value);
let Inst{16} = ps.lds;
- let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
+class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps, bit has_sccb = ps.has_sccb> :
+ MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.VI, has_sccb> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX90A;
+ let DecoderNamespace = "GFX8";
+
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+}
+
+class MUBUF_Real_gfx90a <bits<7> op, MUBUF_Pseudo ps,
+ bit has_sccb = ps.has_sccb> :
+ MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.GFX90A, has_sccb> {
+ let AssemblerPredicate = isGFX90APlus;
+ let DecoderNamespace = "GFX90A";
+ let AsmString = ps.Mnemonic # !subst("$sccb", !if(has_sccb, "$sccb",""),
+ !subst("$tfe", "", ps.AsmOperands));
+
+ let Inst{55} = acc;
+}
+
+multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps> {
+ def _vi : MUBUF_Real_vi<op, ps>;
+ def _gfx90a : MUBUF_Real_gfx90a<op, ps, !and(ps.has_sccb,!not(ps.FPAtomic))>;
+}
+
multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
- def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
- def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+ defm _OFFSET : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+ defm _OFFEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+ defm _IDXEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+ defm _BOTHEN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
}
multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
@@ -2252,6 +2337,24 @@ multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> {
MUBUFLdsTable<1, NAME # "_IDXEN_vi">;
def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
MUBUFLdsTable<1, NAME # "_BOTHEN_vi">;
+
+ def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
+ MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">;
+ def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>,
+ MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">;
+ def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>,
+ MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">;
+ def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>,
+ MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">;
+
+ def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>,
+ MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">;
+ def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>,
+ MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">;
+ def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>,
+ MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">;
+ def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
+ MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">;
}
class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
@@ -2264,13 +2367,13 @@ class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> :
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{16} = ps.lds;
- let Inst{17} = !if(ps.has_slc, slc, ?);
+ let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
@@ -2285,10 +2388,10 @@ multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> {
multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
MUBUF_Real_AllAddr_vi<op> {
- def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
- def _OFFEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
- def _IDXEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
- def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
+ defm _OFFSET_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
+ defm _OFFEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>;
+ defm _IDXEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>;
+ defm _BOTHEN_RTN : MUBUF_Real_vi_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
}
defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>;
@@ -2374,46 +2477,79 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>;
-def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>;
+defm BUFFER_STORE_LDS_DWORD : MUBUF_Real_vi_gfx90a <0x3d, BUFFER_STORE_LDS_DWORD>;
+let AssemblerPredicate = isGFX8GFX9 in {
def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+} // End AssemblerPredicate = isGFX8GFX9
let SubtargetPredicate = HasAtomicFaddInsts in {
-defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>;
-defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>;
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
} // End SubtargetPredicate = HasAtomicFaddInsts
-class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+let SubtargetPredicate = isGFX90APlus in {
+ defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>;
+ defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>;
+ defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_vi<0x51>;
+} // End SubtargetPredicate = isGFX90APlus
+
+def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2>;
+def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>;
+
+class MTBUF_Real_Base_vi <bits<4> op, MTBUF_Pseudo ps, int Enc> :
MTBUF_Real<ps>,
Enc64,
- SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
- let AssemblerPredicate = isGFX8GFX9;
- let DecoderNamespace = "GFX8";
+ SIMCInstr<ps.PseudoInstr, Enc> {
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{18-15} = op;
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{53} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccb_value);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
+class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.VI> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX90A;
+ let DecoderNamespace = "GFX8";
+
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+}
+
+class MTBUF_Real_gfx90a <bits<4> op, MTBUF_Pseudo ps> :
+ MTBUF_Real_Base_vi <op, ps, SIEncodingFamily.GFX90A> {
+ let AssemblerPredicate = isGFX90APlus;
+ let DecoderNamespace = "GFX90A";
+ let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands);
+
+ let Inst{55} = acc;
+}
+
+multiclass MTBUF_Real_vi_gfx90a<bits<4> op, MTBUF_Pseudo ps> {
+ def _vi : MTBUF_Real_vi<op, ps>;
+ def _gfx90a : MTBUF_Real_gfx90a<op, ps>;
+}
+
multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
- def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
- def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
- def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
- def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+ defm _OFFSET : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ defm _OFFEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ defm _IDXEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ defm _BOTHEN : MTBUF_Real_vi_gfx90a <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
}
class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
@@ -2426,15 +2562,15 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
let Inst{11-0} = !if(ps.has_offset, offset, ?);
let Inst{12} = ps.offen;
let Inst{13} = ps.idxen;
- let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value);
let Inst{18-15} = op;
let Inst{22-19} = dfmt;
let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?);
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
- let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?);
let Inst{55} = !if(ps.has_tfe, tfe, ?);
let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
@@ -2478,7 +2614,10 @@ let SubtargetPredicate = HasPackedD16VMem in {
def MUBUFInfoTable : GenericTable {
let FilterClass = "MUBUF_Pseudo";
let CppTypeName = "MUBUFInfo";
- let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
+ let Fields = [
+ "Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset",
+ "IsBufferInv"
+ ];
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getMUBUFOpcodeHelper";
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 328c81005df4..ad9528ece7d0 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -52,32 +52,41 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]);
}
-class DS_Real <DS_Pseudo ds> :
- InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # ds.AsmOperands, []>,
+class DS_Real <DS_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
Enc64 {
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let LGKM_CNT = 1;
let DS = 1;
let UseNamedOperandTable = 1;
// copy relevant pseudo op flags
- let SubtargetPredicate = ds.SubtargetPredicate;
- let OtherPredicates = ds.OtherPredicates;
- let AsmMatchConverter = ds.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
// encoding fields
- bits<8> vdst;
+ bits<10> vdst;
bits<1> gds;
bits<8> addr;
- bits<8> data0;
- bits<8> data1;
+ bits<10> data0;
+ bits<10> data1;
bits<8> offset0;
bits<8> offset1;
bits<16> offset;
- let offset0 = !if(ds.has_offset, offset{7-0}, ?);
- let offset1 = !if(ds.has_offset, offset{15-8}, ?);
+ let offset0 = !if(ps.has_offset, offset{7-0}, ?);
+ let offset1 = !if(ps.has_offset, offset{15-8}, ?);
+
+ bits<1> acc = !if(ps.has_vdst, vdst{9},
+ !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0));
}
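The bits<10> widening above leaves bits 7-0 as the register number that feeds the legacy encoding fields, while bit 9 becomes the new acc flag selecting an AccVGPR on GFX90A. A rough standalone model of that split (editor's sketch with hypothetical names):

#include <cstdint>
#include <cstdio>

struct EncodedOperand {
  uint8_t RegNo; // bits 7-0: goes into the Inst{...} field as before
  bool Acc;      // bit 9: the new ACC flag (AccVGPR vs. VGPR)
};

// Split a widened 10-bit operand field into its encoding components.
static EncodedOperand splitLdStOperand(uint16_t TenBitField) {
  return {static_cast<uint8_t>(TenBitField & 0xff),
          static_cast<bool>((TenBitField >> 9) & 1)};
}

int main() {
  EncodedOperand E = splitLdStOperand((1u << 9) | 17); // "AGPR17" in this model
  std::printf("reg=%u acc=%d\n", E.RegNo, E.Acc);
  return 0;
}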
@@ -86,7 +95,7 @@ class DS_Real <DS_Pseudo ds> :
class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
- (ins rc:$data0, offset:$offset, gds:$gds),
+ (ins getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds),
" $data0$offset$gds"> {
let has_addr = 0;
@@ -97,11 +106,12 @@ class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, getLdStRegisterOperand<rc>.ret:$data0, offset:$offset, gds:$gds),
" $addr, $data0$offset$gds"> {
let has_data1 = 0;
let has_vdst = 0;
+ let IsAtomicNoRet = 1;
}
multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
@@ -114,13 +124,22 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
}
}
-class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
+multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
+ let has_m0_read = 0 in {
+ def "" : DS_1A1D_NORET<opName, rc>,
+ AtomicNoRet<opName, 0>;
+ }
+}
+
+class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, offset:$offset, gds:$gds),
" $addr, $data0, $data1$offset$gds"> {
let has_vdst = 0;
+ let IsAtomicNoRet = 1;
}
multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
@@ -133,10 +152,11 @@ multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
}
}
-class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
(outs),
- (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+ (ins VGPR_32:$addr, data_op:$data0, data_op:$data1,
offset0:$offset0, offset1:$offset1, gds:$gds),
" $addr, $data0, $data1$offset0$offset1$gds"> {
@@ -153,14 +173,16 @@ multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
}
}
-class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
+class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
- (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+ (outs data_op:$vdst),
+ (ins VGPR_32:$addr, data_op:$data0, offset:$offset, gds:$gds),
" $vdst, $addr, $data0$offset$gds"> {
let hasPostISelHook = 1;
let has_data1 = 0;
+ let IsAtomicRet = 1;
}
multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
@@ -175,15 +197,27 @@ multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32,
}
}
+multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32,
+ string NoRetOp = ""> {
+ let has_m0_read = 0 in {
+ def "" : DS_1A1D_RET<opName, rc>,
+ AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp),
+ !if(!eq(NoRetOp, ""), 0, 1)>;
+ }
+}
+
class DS_1A2D_RET<string opName,
RegisterClass rc = VGPR_32,
- RegisterClass src = rc>
+ RegisterClass src = rc,
+ RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+ RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
- (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
+ (outs dst_op:$vdst),
+ (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset:$offset, gds:$gds),
" $vdst, $addr, $data0, $data1$offset$gds"> {
let hasPostISelHook = 1;
+ let IsAtomicRet = 1;
}
multiclass DS_1A2D_RET_mc<string opName,
@@ -201,10 +235,12 @@ multiclass DS_1A2D_RET_mc<string opName,
class DS_1A2D_Off8_RET<string opName,
RegisterClass rc = VGPR_32,
- RegisterClass src = rc>
+ RegisterClass src = rc,
+ RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret,
+ RegisterOperand src_op = getLdStRegisterOperand<src>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
- (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
+ (outs dst_op:$vdst),
+ (ins VGPR_32:$addr, src_op:$data0, src_op:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
" $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
let has_offset = 0;
@@ -224,11 +260,12 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset,
+ RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
: DS_Pseudo<opName,
- (outs rc:$vdst),
+ (outs data_op:$vdst),
!if(HasTiedOutput,
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in),
(ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
" $vdst, $addr$offset$gds"> {
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
@@ -250,7 +287,7 @@ class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
- (outs rc:$vdst),
+ (outs getLdStRegisterOperand<rc>.ret:$vdst),
(ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
" $vdst, $addr$offset0$offset1$gds"> {
@@ -269,7 +306,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
}
class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
- (outs VGPR_32:$vdst),
+ (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
(ins VGPR_32:$addr, offset:$offset),
" $vdst, $addr$offset gds"> {
@@ -281,7 +318,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
}
class DS_0A_RET <string opName> : DS_Pseudo<opName,
- (outs VGPR_32:$vdst),
+ (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
(ins offset:$offset, gds:$gds),
" $vdst$offset$gds"> {
@@ -336,7 +373,8 @@ class DS_GWS_0D <string opName>
class DS_GWS_1D <string opName>
: DS_GWS<opName,
- (ins VGPR_32:$data0, offset:$offset), " $data0$offset gds"> {
+ (ins getLdStRegisterOperand<VGPR_32>.ret:$data0, offset:$offset),
+ " $data0$offset gds"> {
let has_gws_data0 = 1;
let hasSideEffects = 1;
@@ -360,10 +398,11 @@ class DS_VOID <string opName> : DS_Pseudo<opName,
let has_gds = 0;
}
-class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
+class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
+ RegisterOperand data_op = getLdStRegisterOperand<VGPR_32>.ret>
: DS_Pseudo<opName,
- (outs VGPR_32:$vdst),
- (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset),
+ (outs data_op:$vdst),
+ (ins VGPR_32:$addr, data_op:$data0, offset:$offset),
" $vdst, $addr, $data0$offset",
[(set i32:$vdst,
(node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
@@ -420,6 +459,11 @@ def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
} // End mayLoad = 0
+let SubtargetPredicate = isGFX90APlus in {
+ defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>;
+ defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">;
+} // End SubtargetPredicate = isGFX90APlus
+
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">;
defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">;
@@ -674,38 +718,6 @@ defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
-let AddedComplexity = 100 in {
-
-foreach vt = VReg_64.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
-}
-
-let SubtargetPredicate = isGFX7Plus in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
-}
-
-let SubtargetPredicate = HasUnalignedAccessMode in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
-}
-
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
-}
-
-} // End SubtargetPredicate = HasUnalignedAccessMode
-
-} // End SubtargetPredicate = isGFX7Plus
-
-} // End AddedComplexity = 100
-
let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
@@ -829,31 +841,38 @@ foreach vt = VReg_128.RegTypes in {
defm : DS128Bit8ByteAlignedPat_mc<vt>;
}
+// Prefer ds_read over ds_read2 and ds_write over ds_write2, all other things
+// being equal, because they have a larger immediate offset range.
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
}
let SubtargetPredicate = isGFX7Plus in {
foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
}
foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
}
let SubtargetPredicate = HasUnalignedAccessMode in {
+// FIXME: From a performance point of view, is ds_read_b96/ds_write_b96 a
+// better choice for unaligned accesses?
foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
}
-foreach vt = VReg_128.RegTypes in {
-defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
-}
+// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned
+// accesses.
} // End SubtargetPredicate = HasUnalignedAccessMode
@@ -938,6 +957,10 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">;
+let SubtargetPredicate = isGFX90APlus in {
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+}
+
def : Pat <
(SIds_ordered_count i32:$value, i16:$offset),
(DS_ORDERED_COUNT $value, (as_i16imm $offset))
@@ -959,10 +982,10 @@ class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> :
let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{25-18} = op;
let Inst{31-26} = 0x36;
- let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0));
- let Inst{47-40} = !if(ps.has_data0, data0, 0);
- let Inst{55-48} = !if(ps.has_data1, data1, 0);
- let Inst{63-56} = !if(ps.has_vdst, vdst, 0);
+ let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0));
+ let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
}
//===----------------------------------------------------------------------===//
@@ -1166,22 +1189,23 @@ defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
-class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
- DS_Real <ds>,
- SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> {
+class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
+ DS_Real <ps>,
+ SIMCInstr <ps.Mnemonic, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
// encoding
- let Inst{7-0} = !if(ds.has_offset0, offset0, 0);
- let Inst{15-8} = !if(ds.has_offset1, offset1, 0);
- let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue);
+ let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
+ let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
+ let Inst{16} = !if(ps.has_gds, gds, ps.gdsValue);
let Inst{24-17} = op;
+ let Inst{25} = acc;
let Inst{31-26} = 0x36; // ds prefix
- let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0));
- let Inst{47-40} = !if(ds.has_data0, data0, 0);
- let Inst{55-48} = !if(ds.has_data1, data1, 0);
- let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+ let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0));
+ let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
+ let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
+ let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
}
def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>;
@@ -1344,3 +1368,8 @@ def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>;
def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
+
+let SubtargetPredicate = isGFX90APlus in {
+ def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>;
+ def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>;
+} // End SubtargetPredicate = isGFX90APlus
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 8061c6c509e0..fe62b8590fa0 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -123,6 +123,7 @@ DECODE_OPERAND_REG(VReg_96)
DECODE_OPERAND_REG(VReg_128)
DECODE_OPERAND_REG(VReg_256)
DECODE_OPERAND_REG(VReg_512)
+DECODE_OPERAND_REG(VReg_1024)
DECODE_OPERAND_REG(SReg_32)
DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
@@ -135,7 +136,9 @@ DECODE_OPERAND_REG(SReg_256)
DECODE_OPERAND_REG(SReg_512)
DECODE_OPERAND_REG(AGPR_32)
+DECODE_OPERAND_REG(AReg_64)
DECODE_OPERAND_REG(AReg_128)
+DECODE_OPERAND_REG(AReg_256)
DECODE_OPERAND_REG(AReg_512)
DECODE_OPERAND_REG(AReg_1024)
DECODE_OPERAND_REG(AV_32)
@@ -157,6 +160,14 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
+static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm));
+}
+
static DecodeStatus decodeOperand_VS_16(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -173,6 +184,14 @@ static DecodeStatus decodeOperand_VS_32(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
}
+static DecodeStatus decodeOperand_AReg_64(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512));
+}
+
static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -181,6 +200,14 @@ static DecodeStatus decodeOperand_AReg_128(MCInst &Inst,
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
}
+static DecodeStatus decodeOperand_AReg_256(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512));
+}
+
static DecodeStatus decodeOperand_AReg_512(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -197,6 +224,127 @@ static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst,
return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
}
+static DecodeStatus decodeOperand_VReg_64(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_128(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_256(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_512(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm));
+}
+
+static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm));
+}
+
+static bool IsAGPROperand(const MCInst &Inst, int OpIdx,
+ const MCRegisterInfo *MRI) {
+ if (OpIdx < 0)
+ return false;
+
+ const MCOperand &Op = Inst.getOperand(OpIdx);
+ if (!Op.isReg())
+ return false;
+
+ unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0);
+ auto Reg = Sub ? Sub : Op.getReg();
+ return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255;
+}
+
+static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst,
+ unsigned Imm,
+ AMDGPUDisassembler::OpWidthTy Opw,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ if (!DAsm->isGFX90A()) {
+ Imm &= 511;
+ } else {
+ // If an atomic has both vdata and vdst, their register classes are tied.
+ // The bit is decoded along with the vdst, the first operand, so we need to
+ // change the register class to AGPR if the vdst was an AGPR.
+ // If a DS instruction has both data0 and data1, their register classes
+ // are also tied.
+ unsigned Opc = Inst.getOpcode();
+ uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags;
+ uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata;
+ const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo();
+ int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx);
+ if ((int)Inst.getNumOperands() == DataIdx) {
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (IsAGPROperand(Inst, DstIdx, MRI))
+ Imm |= 512;
+ }
+
+ if (TSFlags & SIInstrFlags::DS) {
+ int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+ if ((int)Inst.getNumOperands() == Data2Idx &&
+ IsAGPROperand(Inst, DataIdx, MRI))
+ Imm |= 512;
+ }
+ }
+ return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
+}
+
+static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW32, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW64, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW96, Decoder);
+}
+
+static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ return decodeOperand_AVLdSt_Any(Inst, Imm,
+ AMDGPUDisassembler::OPW128, Decoder);
+}
+
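A small model of the AGPR/VGPR selection convention used by decodeOperand_AVLdSt_Any above: the | 512 marker is assumed to route a value to an AGPR class, the pre-GFX90A path strips it, and on GFX90A the data operand's class is tied to the already-decoded vdst. Editor's sketch with hypothetical names, not the disassembler's actual code:

#include <cstdio>

// Marker bit assumed to tell the source-operand decoder "AGPR, not VGPR".
constexpr unsigned AGPR_MARKER = 512;

static void decodeDataOperand(unsigned Imm, bool IsGFX90A, bool DstWasAGPR) {
  if (!IsGFX90A)
    Imm &= 511;         // pre-GFX90A: data operands can never be AGPRs
  else if (DstWasAGPR)
    Imm |= AGPR_MARKER; // tie the data class to the already-decoded vdst
  std::printf("%c%u\n", (Imm & AGPR_MARKER) ? 'a' : 'v', Imm & 255);
}

int main() {
  decodeDataOperand(5, /*IsGFX90A=*/true,  /*DstWasAGPR=*/true);  // a5
  decodeDataOperand(5, /*IsGFX90A=*/true,  /*DstWasAGPR=*/false); // v5
  decodeDataOperand(5, /*IsGFX90A=*/false, /*DstWasAGPR=*/true);  // v5
  return 0;
}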
static DecodeStatus decodeOperand_SReg_32(MCInst &Inst,
unsigned Imm,
uint64_t Addr,
@@ -250,6 +398,9 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
return MCDisassembler::Fail;
}
+// The disassembler is greedy, so we must check the FI operand value and avoid
+// parsing a dpp8 instruction when the correct literal is not set. For dpp16
+// the autogenerated decoder checks the dpp literal itself.
static bool isValidDPP8(const MCInst &MI) {
using namespace llvm::AMDGPU::DPP;
int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi);
@@ -341,6 +492,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
if (Res) break;
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address);
+ if (Res)
+ break;
+ }
+
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
if (Res) break;
@@ -351,6 +508,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address);
+ if (Res)
+ break;
+ }
+
Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
if (Res) break;
@@ -369,6 +533,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 ||
MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 ||
MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
@@ -379,9 +544,44 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
if (Res && (MCII->get(MI.getOpcode()).TSFlags &
- (SIInstrFlags::MUBUF | SIInstrFlags::FLAT)) &&
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::glc1) != -1) {
- insertNamedMCOperand(MI, MCOperand::createImm(1), AMDGPU::OpName::glc1);
+ (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
+ int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolPos != -1) {
+ unsigned CPol =
+ (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::IsAtomicRet) ?
+ AMDGPU::CPol::GLC : 0;
+ if (MI.getNumOperands() <= (unsigned)CPolPos) {
+ insertNamedMCOperand(MI, MCOperand::createImm(CPol),
+ AMDGPU::OpName::cpol);
+ } else if (CPol) {
+ MI.getOperand(CPolPos).setImm(MI.getOperand(CPolPos).getImm() | CPol);
+ }
+ }
+ }
+
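The cpol fixup above boils down to one rule: a returning atomic always ends up with GLC set in its cache-policy operand, whether that operand was already decoded or has to be materialized. A minimal sketch of that rule with a hypothetical helper (CPOL_GLC = 1 is an assumed encoding, not taken from the patch):

#include <cstdio>
#include <optional>

constexpr unsigned CPOL_GLC = 1; // assumed value of the GLC bit in the cpol field

static unsigned fixupCPol(std::optional<unsigned> Decoded, bool IsAtomicRet) {
  unsigned Forced = IsAtomicRet ? CPOL_GLC : 0;
  if (!Decoded)
    return Forced;          // cpol operand missing: materialize the default
  return *Decoded | Forced; // cpol present: returning atomics still get GLC
}

int main() {
  std::printf("%u %u %u\n",
              fixupCPol(std::nullopt, true), // 1: GLC inserted
              fixupCPol(0u, true),           // 1: GLC OR-ed in
              fixupCPol(2u, false));         // 2: non-atomic left untouched
  return 0;
}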
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
+ (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) {
+ // GFX90A lost TFE; its place is occupied by ACC.
+ int TFEOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
+ if (TFEOpIdx != -1) {
+ auto TFEIter = MI.begin();
+ std::advance(TFEIter, TFEOpIdx);
+ MI.insert(TFEIter, MCOperand::createImm(0));
+ }
+ }
+
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) {
+ int SWZOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
+ if (SWZOpIdx != -1) {
+ auto SWZIter = MI.begin();
+ std::advance(SWZIter, SWZOpIdx);
+ MI.insert(SWZIter, MCOperand::createImm(0));
+ }
}
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) {
@@ -453,6 +653,8 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+// We must check FI == literal to reject non-genuine dpp8 instructions, and we
+// must first add the optional MI operands so that FI can be checked.
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
@@ -513,21 +715,21 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
unsigned DimIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim);
+ int A16Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
const AMDGPU::MIMGDimInfo *Dim =
AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm());
+ const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm());
+
+ AddrSize =
+ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));
- AddrSize = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
- (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA;
if (!IsNSA) {
if (AddrSize > 8)
AddrSize = 16;
- else if (AddrSize > 4)
- AddrSize = 8;
} else {
if (AddrSize > Info->VAddrDwords) {
// The NSA encoding does not contain enough operands for the combination
@@ -545,7 +747,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
DstSize = (DstSize + 1) / 2;
}
- if (MI.getOperand(TFEIdx).getImm())
+ if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm())
DstSize += 1;
if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords)
@@ -701,6 +903,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const {
return decodeSrcOp(OPWV216, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const {
+ return decodeSrcOp(OPWV232, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
// Some instructions have operand restrictions beyond what the encoding
// allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -718,10 +924,18 @@ MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const {
return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255);
}
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const {
return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255);
}
+MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const {
+ return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
}
@@ -758,6 +972,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const {
+ return createRegOperand(AMDGPU::VReg_1024RegClassID, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
// The tablegen-generated disassembler doesn't care about operand types,
// leaving only the register class, so an SSrc_32 operand turns into SReg_32.
@@ -914,8 +1132,10 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
case OPW128: // splat constants
case OPW512:
case OPW1024:
+ case OPWV232:
return MCOperand::createImm(getInlineImmVal32(Imm));
case OPW64:
+ case OPW256:
return MCOperand::createImm(getInlineImmVal64(Imm));
case OPW16:
case OPWV216:
@@ -935,8 +1155,14 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
case OPW16:
case OPWV216:
return VGPR_32RegClassID;
- case OPW64: return VReg_64RegClassID;
+ case OPW64:
+ case OPWV232: return VReg_64RegClassID;
+ case OPW96: return VReg_96RegClassID;
case OPW128: return VReg_128RegClassID;
+ case OPW160: return VReg_160RegClassID;
+ case OPW256: return VReg_256RegClassID;
+ case OPW512: return VReg_512RegClassID;
+ case OPW1024: return VReg_1024RegClassID;
}
}
@@ -950,8 +1176,11 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
case OPW16:
case OPWV216:
return AGPR_32RegClassID;
- case OPW64: return AReg_64RegClassID;
+ case OPW64:
+ case OPWV232: return AReg_64RegClassID;
+ case OPW96: return AReg_96RegClassID;
case OPW128: return AReg_128RegClassID;
+ case OPW160: return AReg_160RegClassID;
case OPW256: return AReg_256RegClassID;
case OPW512: return AReg_512RegClassID;
case OPW1024: return AReg_1024RegClassID;
@@ -969,8 +1198,11 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
case OPW16:
case OPWV216:
return SGPR_32RegClassID;
- case OPW64: return SGPR_64RegClassID;
+ case OPW64:
+ case OPWV232: return SGPR_64RegClassID;
+ case OPW96: return SGPR_96RegClassID;
case OPW128: return SGPR_128RegClassID;
+ case OPW160: return SGPR_160RegClassID;
case OPW256: return SGPR_256RegClassID;
case OPW512: return SGPR_512RegClassID;
}
@@ -986,7 +1218,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
case OPW16:
case OPWV216:
return TTMP_32RegClassID;
- case OPW64: return TTMP_64RegClassID;
+ case OPW64:
+ case OPWV232: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
case OPW256: return TTMP_256RegClassID;
case OPW512: return TTMP_512RegClassID;
@@ -1040,6 +1273,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
case OPWV216:
return decodeSpecialReg32(Val);
case OPW64:
+ case OPWV232:
return decodeSpecialReg64(Val);
default:
llvm_unreachable("unexpected immediate type");
@@ -1209,6 +1443,10 @@ bool AMDGPUDisassembler::isVI() const {
bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
+bool AMDGPUDisassembler::isGFX90A() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); }
@@ -1217,6 +1455,10 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
return AMDGPU::isGFX10Plus(STI);
}
+bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling
//===----------------------------------------------------------------------===//
@@ -1276,7 +1518,8 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
- KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
+ if (!hasArchitectedFlatScratch())
+ KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
@@ -1327,9 +1570,12 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
using namespace amdhsa;
StringRef Indent = "\t";
- PRINT_DIRECTIVE(
- ".amdhsa_system_sgpr_private_segment_wavefront_offset",
- COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ if (hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ else
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
@@ -1387,7 +1633,6 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
uint16_t TwoByteBuffer = 0;
uint32_t FourByteBuffer = 0;
- uint64_t EightByteBuffer = 0;
StringRef ReservedBytes;
StringRef Indent = "\t";
@@ -1408,11 +1653,19 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
<< FourByteBuffer << '\n';
return MCDisassembler::Success;
+ case amdhsa::KERNARG_SIZE_OFFSET:
+ FourByteBuffer = DE.getU32(Cursor);
+ KdStream << Indent << ".amdhsa_kernarg_size "
+ << FourByteBuffer << '\n';
+ return MCDisassembler::Success;
+
case amdhsa::RESERVED0_OFFSET:
- // 8 reserved bytes, must be 0.
- EightByteBuffer = DE.getU64(Cursor);
- if (EightByteBuffer) {
- return MCDisassembler::Fail;
+ // 4 reserved bytes, must be 0.
+ ReservedBytes = DE.getBytes(Cursor, 4);
+ for (int I = 0; I < 4; ++I) {
+ if (ReservedBytes[I] != 0) {
+ return MCDisassembler::Fail;
+ }
}
return MCDisassembler::Success;
@@ -1463,8 +1716,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
using namespace amdhsa;
TwoByteBuffer = DE.getU16(Cursor);
- PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
- KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ if (!hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
@@ -1473,8 +1727,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
- KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (!hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
@@ -1589,6 +1844,8 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
Inst.addOperand(MCOperand::createExpr(Add));
return true;
}
+ // Add to list of referenced addresses, so caller can synthesize a label.
+ ReferencedAddresses.push_back(static_cast<uint64_t>(Value));
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 714dabbc5184..dc879ec5ad88 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -99,12 +99,14 @@ public:
MCOperand decodeOperand_VS_128(unsigned Val) const;
MCOperand decodeOperand_VSrc16(unsigned Val) const;
MCOperand decodeOperand_VSrcV216(unsigned Val) const;
+ MCOperand decodeOperand_VSrcV232(unsigned Val) const;
MCOperand decodeOperand_VReg_64(unsigned Val) const;
MCOperand decodeOperand_VReg_96(unsigned Val) const;
MCOperand decodeOperand_VReg_128(unsigned Val) const;
MCOperand decodeOperand_VReg_256(unsigned Val) const;
MCOperand decodeOperand_VReg_512(unsigned Val) const;
+ MCOperand decodeOperand_VReg_1024(unsigned Val) const;
MCOperand decodeOperand_SReg_32(unsigned Val) const;
MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
@@ -117,7 +119,9 @@ public:
MCOperand decodeOperand_SReg_512(unsigned Val) const;
MCOperand decodeOperand_AGPR_32(unsigned Val) const;
+ MCOperand decodeOperand_AReg_64(unsigned Val) const;
MCOperand decodeOperand_AReg_128(unsigned Val) const;
+ MCOperand decodeOperand_AReg_256(unsigned Val) const;
MCOperand decodeOperand_AReg_512(unsigned Val) const;
MCOperand decodeOperand_AReg_1024(unsigned Val) const;
MCOperand decodeOperand_AV_32(unsigned Val) const;
@@ -126,12 +130,15 @@ public:
enum OpWidthTy {
OPW32,
OPW64,
+ OPW96,
OPW128,
+ OPW160,
OPW256,
OPW512,
OPW1024,
OPW16,
OPWV216,
+ OPWV232,
OPW_LAST_,
OPW_FIRST_ = OPW32
};
@@ -159,11 +166,16 @@ public:
int getTTmpIdx(unsigned Val) const;
+ const MCInstrInfo *getMCII() const { return MCII.get(); }
+
bool isVI() const;
bool isGFX9() const;
+ bool isGFX90A() const;
bool isGFX9Plus() const;
bool isGFX10() const;
bool isGFX10Plus() const;
+
+ bool hasArchitectedFlatScratch() const;
};
//===----------------------------------------------------------------------===//
@@ -173,6 +185,7 @@ public:
class AMDGPUSymbolizer : public MCSymbolizer {
private:
void *DisInfo;
+ std::vector<uint64_t> ReferencedAddresses;
public:
AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo,
@@ -187,6 +200,10 @@ public:
void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
int64_t Value,
uint64_t Address) override;
+
+ ArrayRef<uint64_t> getReferencedAddresses() const override {
+ return ReferencedAddresses;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 8d3e138ba56a..596c3d7baea0 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -532,7 +532,10 @@ def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
def : UMad24Pat<MULADD_UINT24_eg>;
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
-def : FSHRPattern <BIT_ALIGN_INT_eg>;
+def : AMDGPUPat <
+ (fshr i32:$src0, i32:$src1, i32:$src2),
+ (BIT_ALIGN_INT_eg $src0, $src1, $src2)
+>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 57a355a55a02..90f26e514f54 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -6,9 +6,9 @@
//
//===----------------------------------------------------------------------===//
-def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
-def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
-def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>;
+def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>;
def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
@@ -54,6 +54,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
bits<1> glcValue = 0;
bits<1> has_dlc = 1;
bits<1> dlcValue = 0;
+ bits<1> has_sccb = 1;
+ bits<1> sccbValue = 0;
let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
!if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -67,9 +69,9 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
let VM_CNT = 1;
let LGKM_CNT = !not(!or(is_flat_global, is_flat_scratch));
- let IsFlatGlobal = is_flat_global;
+ let FlatGlobal = is_flat_global;
- let IsFlatScratch = is_flat_scratch;
+ let FlatScratch = is_flat_scratch;
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -79,22 +81,29 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let FLAT = 1;
+
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
- let OtherPredicates = ps.OtherPredicates;
- let TSFlags = ps.TSFlags;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let OtherPredicates = ps.OtherPredicates;
+ let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
+ let VM_CNT = ps.VM_CNT;
+ let LGKM_CNT = ps.LGKM_CNT;
// encoding fields
bits<8> vaddr;
- bits<8> vdata;
+ bits<10> vdata;
bits<7> saddr;
- bits<8> vdst;
+ bits<10> vdst;
- bits<1> slc;
- bits<1> glc;
- bits<1> dlc;
+ bits<5> cpol;
// Only valid on gfx9
bits<1> lds = 0; // XXX - What does this actually do?
@@ -106,7 +115,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// Signed offset. Highest bit ignored for flat and treated as 12-bit
// unsigned for flat accesses.
bits<13> offset;
- bits<1> nv = 0; // XXX - What does this actually do?
+ // GFX90A+ only: instruction uses AccVGPR for data
+ bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0));
// We don't use tfe right now, and it was removed in gfx9.
bits<1> tfe = 0;
@@ -116,17 +126,17 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let Inst{13} = lds;
let Inst{15-14} = seg;
- let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
- let Inst{17} = slc;
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
+ let Inst{17} = cpol{CPolBit.SLC};
let Inst{24-18} = op;
let Inst{31-26} = 0x37; // Encoding.
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
- let Inst{47-40} = !if(ps.has_data, vdata, ?);
+ let Inst{47-40} = !if(ps.has_data, vdata{7-0}, ?);
let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
// 54-48 is reserved.
- let Inst{55} = nv; // nv on GFX9+, TFE before.
- let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
+ let Inst{55} = acc; // nv on GFX9+, TFE before. AccVGPR for data on GFX90A.
+ let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?);
}
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
@@ -139,9 +149,10 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> {
// saddr is 32-bit (which isn't handled here yet).
class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
bit HasTiedOutput = 0,
- bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
+ bit HasSaddr = 0, bit EnableSaddr = 0,
+ RegisterOperand vdata_op = getLdStRegisterOperand<regClass>.ret> : FLAT_Pseudo<
opName,
- (outs regClass:$vdst),
+ (outs vdata_op:$vdst),
!con(
!con(
!if(EnableSaddr,
@@ -149,9 +160,9 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
(ins VReg_64:$vaddr)),
(ins flat_offset:$offset)),
// FIXME: Operands with default values do not work with following non-optional operands.
- !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
- (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
- " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
+ !if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in),
+ (ins CPol_0:$cpol))),
+ " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = HasSaddr;
@@ -169,10 +180,10 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
(outs),
!con(
!if(EnableSaddr,
- (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64:$saddr),
- (ins VReg_64:$vaddr, vdataClass:$vdata)),
- (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc)),
- " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> {
+ (ins VGPR_32:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata, SReg_64:$saddr),
+ (ins VReg_64:$vaddr, getLdStRegisterOperand<vdataClass>.ret:$vdata)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -196,9 +207,9 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
opName,
(outs regClass:$vdst),
!con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
- (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+ (ins flat_offset:$offset, CPol_0:$cpol),
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
- " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let is_flat_global = 1;
let has_data = 0;
let mayLoad = 1;
@@ -234,8 +245,8 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
opName,
(outs),
!con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)),
- (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
- " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ (ins flat_offset:$offset, CPol:$cpol)),
+ " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let is_flat_global = 1;
let mayLoad = 0;
let mayStore = 1;
@@ -266,16 +277,16 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
bit EnableVaddr = !not(EnableSaddr)>
: FLAT_Pseudo<
opName,
- (outs regClass:$vdst),
+ (outs getLdStRegisterOperand<regClass>.ret:$vdst),
!con(
!if(EnableSaddr,
(ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
!if(EnableVaddr,
(ins VGPR_32:$vaddr, flat_offset:$offset),
(ins flat_offset:$offset))),
- !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in),
- (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
- " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
+ (ins CPol_0:$cpol))),
+ " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let has_data = 0;
let mayLoad = 1;
let has_saddr = 1;
@@ -289,15 +300,16 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
}
class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
- bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo<
+ bit EnableVaddr = !not(EnableSaddr),
+ RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
opName,
(outs),
!if(EnableSaddr,
- (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+ (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
!if(EnableVaddr,
- (ins vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
- (ins vdataClass:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))),
- " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
+ (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
+ (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol))),
+ " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -344,7 +356,10 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
let has_dlc = 0;
let dlcValue = 0;
let has_vdst = 0;
+ let has_sccb = 1;
+ let sccbValue = 0;
let maybeAtomic = 1;
+ let IsAtomicNoRet = 1;
}
class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
@@ -354,6 +369,9 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
let has_vdst = 1;
let glcValue = 1;
let dlcValue = 0;
+ let sccbValue = 0;
+ let IsAtomicNoRet = 0;
+ let IsAtomicRet = 1;
let PseudoInstr = NAME # "_RTN";
}
@@ -364,11 +382,12 @@ multiclass FLAT_Atomic_Pseudo<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
- " $vaddr, $vdata$offset$slc">,
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
@@ -377,11 +396,11 @@ multiclass FLAT_Atomic_Pseudo<
}
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
- " $vdst, $vaddr, $vdata$offset$glc1$slc",
+ (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata$offset$cpol",
[(set vt:$vdst,
- (atomic (FLATOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
+ (atomic (FlatOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>{
let FPAtomic = isFP;
@@ -396,12 +415,13 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC_0:$slc),
- " $vaddr, $vdata, off$offset$slc">,
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, off$offset$cpol">,
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
@@ -411,8 +431,8 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
- (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc),
- " $vaddr, $vdata, $saddr$offset$slc">,
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+ " $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
@@ -429,14 +449,16 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
+ RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
- " $vdst, $vaddr, $vdata, off$offset$glc1$slc",
+ (outs vdst_op:$vdst),
+ (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, off$offset$cpol",
[(set vt:$vdst,
- (atomic (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
+ (atomic (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
@@ -444,9 +466,9 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VGPR_32:$vaddr, data_rc:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc),
- " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc">,
+ (outs vdst_op:$vdst),
+ (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+ " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
@@ -605,6 +627,15 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isGFX7GFX10
+let SubtargetPredicate = isGFX90APlus in {
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+} // End SubtargetPredicate = isGFX90APlus
+
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -777,6 +808,15 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
} // End OtherPredicates = [HasAtomicFaddInsts]
+
+let OtherPredicates = [isGFX90APlus] in {
+ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
+ "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+ >;
+ defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+ >;
+} // End OtherPredicates = [isGFX90APlus]
} // End is_flat_global = 1
//===----------------------------------------------------------------------===//
@@ -785,33 +825,33 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffset i64:$vaddr, i16:$offset))),
+ (vt (node (FlatOffset i64:$vaddr, i16:$offset))),
(inst $vaddr, $offset)
>;
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
- (inst $vaddr, $offset, 0, 0, 0, $in)
+ (node (FlatOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, $in)
>;
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
- (inst $vaddr, $offset, 0, 0, 0, $in)
+ (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (inst $vaddr, $offset, 0, $in)
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)),
- (inst $saddr, $voffset, $offset, 0, 0, 0, $in)
+ (inst $saddr, $voffset, $offset, 0, $in)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset))),
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset))),
(inst $vaddr, $offset)
>;
class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))),
- (inst $saddr, $voffset, $offset, 0, 0, 0)
+ (inst $saddr, $voffset, $offset, 0)
>;
class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
@@ -839,19 +879,19 @@ class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset)),
+ (node vt:$data, (FlatOffset i64:$vaddr, i16:$offset)),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset)),
+ (node vt:$data, (GlobalOffset i64:$vaddr, i16:$offset)),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data),
+ (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
@@ -859,29 +899,29 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data),
+ (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data),
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
- (vt (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (vt (node (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst $vaddr, $data, $offset)
>;
class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data),
+ (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FLATOffsetSigned i64:$vaddr, i16:$offset), vt:$data),
+ (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
- (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$data)),
+ (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(inst $vaddr, $data, $offset)
>;
@@ -892,7 +932,7 @@ class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
- (inst $vaddr, $offset, 0, 0, 0, $in)
+ (inst $vaddr, $offset, 0, $in)
>;
class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -907,7 +947,7 @@ class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
- (inst $saddr, $offset, 0, 0, 0, $in)
+ (inst $saddr, $offset, 0, $in)
>;
class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
@@ -1202,6 +1242,17 @@ defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_glo
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
}
+let OtherPredicates = [isGFX90APlus] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>;
+def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_64, f64>;
+def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_64, f64>;
+def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_64, f64>;
+}
+
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -1337,16 +1388,21 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2
// VI
//===----------------------------------------------------------------------===//
-class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
+class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
+
+ let Inst{25} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
+ let AsmString = ps.Mnemonic #
+ !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
}
-multiclass FLAT_Real_AllAddr_vi<bits<7> op> {
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)>;
- def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+multiclass FLAT_Real_AllAddr_vi<bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
@@ -1374,15 +1430,17 @@ def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
-multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> {
- def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
- def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
}
-multiclass FLAT_Global_Real_Atomics_vi<bits<7> op> :
- FLAT_Real_AllAddr_vi<op> {
- def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>;
- def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>;
+multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
+ bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
+ FLAT_Real_AllAddr_vi<op, has_sccb> {
+ def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+ def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
}
@@ -1489,6 +1547,19 @@ defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>;
defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>;
defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>;
+let SubtargetPredicate = HasAtomicFaddInsts in {
+defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>;
+}
+
+let SubtargetPredicate = isGFX90AOnly in {
+ defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64, 0>;
+ defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64, 0>;
+ defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64, 0>;
+ defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f, 0>;
+ defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50, 0>;
+ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
+} // End SubtargetPredicate = isGFX90AOnly
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1500,7 +1571,7 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
let DecoderNamespace = "GFX10";
let Inst{11-0} = offset{11-0};
- let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
+ let Inst{12} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
let Inst{55} = 0;
}
@@ -1695,10 +1766,3 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>;
defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>;
defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>;
defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>;
-
-let SubtargetPredicate = HasAtomicFaddInsts in {
-
-defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>;
-defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>;
-
-} // End SubtargetPredicate = HasAtomicFaddInsts
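For readers skimming the FLATInstructions.td changes above: the separate glc/slc/dlc operands (plus the new gfx90a scc bit) are folded into a single 5-bit cpol cache-policy operand, and the encoding classes now extract individual bits via CPolBit.*. The standalone C++ sketch below is not part of the patch; it only illustrates that packing, and the bit positions used (GLC=0, SLC=1, DLC=2, SCC=4) are assumptions made for this illustration rather than values taken from the patch.

// Illustration only: packing the former glc/slc/dlc flags (and the gfx90a scc
// bit) into one 5-bit cache-policy value, mirroring the cpol operand above.
// The bit positions used here are assumed for the sketch.
#include <cstdio>

namespace cpol_sketch {
enum : unsigned { GLC = 1u << 0, SLC = 1u << 1, DLC = 1u << 2, SCC = 1u << 4 };

unsigned pack(bool glc, bool slc, bool dlc, bool scc) {
  unsigned CPol = 0;
  if (glc) CPol |= GLC;
  if (slc) CPol |= SLC;
  if (dlc) CPol |= DLC;
  if (scc) CPol |= SCC;
  return CPol; // fits in a bits<5> encoding field
}
} // namespace cpol_sketch

int main() {
  unsigned CPol = cpol_sketch::pack(/*glc=*/true, /*slc=*/false,
                                    /*dlc=*/true, /*scc=*/false);
  std::printf("cpol = 0x%x glc=%u slc=%u dlc=%u scc=%u\n", CPol,
              unsigned((CPol & cpol_sketch::GLC) != 0),
              unsigned((CPol & cpol_sketch::SLC) != 0),
              unsigned((CPol & cpol_sketch::DLC) != 0),
              unsigned((CPol & cpol_sketch::SCC) != 0));
  return 0;
}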
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e4eacd101ce8..2bf365168048 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -54,21 +54,20 @@ namespace {
class GCNDPPCombine : public MachineFunctionPass {
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
+ const GCNSubtarget *ST;
using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
- MachineInstr *createDPPInst(MachineInstr &OrigMI,
- MachineInstr &MovMI,
+ MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
- MachineOperand *OldOpnd,
- bool CombBCZ) const;
+ MachineOperand *OldOpnd, bool CombBCZ,
+ bool IsShrinkable) const;
- MachineInstr *createDPPInst(MachineInstr &OrigMI,
- MachineInstr &MovMI,
- RegSubRegPair CombOldVGPR,
- bool CombBCZ) const;
+ MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
+ RegSubRegPair CombOldVGPR, bool CombBCZ,
+ bool IsShrinkable) const;
bool hasNoImmOrEqual(MachineInstr &MI,
unsigned OpndName,
@@ -99,7 +98,8 @@ public:
}
private:
- int getDPPOp(unsigned Op) const;
+ int getDPPOp(unsigned Op, bool IsShrinkable) const;
+ bool isShrinkable(MachineInstr &MI) const;
};
} // end anonymous namespace
@@ -114,11 +114,40 @@ FunctionPass *llvm::createGCNDPPCombinePass() {
return new GCNDPPCombine();
}
-int GCNDPPCombine::getDPPOp(unsigned Op) const {
+bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ if (!TII->isVOP3(Op)) {
+ return false;
+ }
+ if (!TII->hasVALU32BitEncoding(Op)) {
+    LLVM_DEBUG(dbgs() << "  Inst has no e32 equivalent\n");
+ return false;
+ }
+ if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ // Give up if there are any uses of the carry-out from instructions like
+ // V_ADD_CO_U32. The shrunken form of the instruction would write it to vcc
+ // instead of to a virtual register.
+ if (!MRI->use_nodbg_empty(SDst->getReg()))
+ return false;
+ }
+  // Check whether any modifiers other than abs|neg are set (opsel, for example)
+ const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
+ if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
+ !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
+ LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
+ return false;
+ }
+ return true;
+}
+
+int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
auto DPP32 = AMDGPU::getDPPOp32(Op);
- if (DPP32 == -1) {
+ if (IsShrinkable) {
+ assert(DPP32 == -1);
auto E32 = AMDGPU::getVOPe32(Op);
- DPP32 = (E32 == -1)? -1 : AMDGPU::getDPPOp32(E32);
+ DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
}
return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
}
@@ -137,7 +166,8 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
case AMDGPU::IMPLICIT_DEF:
return nullptr;
case AMDGPU::COPY:
- case AMDGPU::V_MOV_B32_e32: {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B64_PSEUDO: {
auto &Op1 = Def->getOperand(1);
if (Op1.isImm())
return &Op1;
@@ -150,11 +180,13 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
- bool CombBCZ) const {
- assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ bool CombBCZ,
+ bool IsShrinkable) const {
+ assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
auto OrigOp = OrigMI.getOpcode();
- auto DPPOp = getDPPOp(OrigOp);
+ auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
return nullptr;
@@ -174,7 +206,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
- assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+ assert(isOfRegClass(
+ CombOldVGPR,
+ *MRI->getRegClass(
+ TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
+ *MRI));
auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
CombOldVGPR.SubReg);
@@ -308,11 +344,9 @@ static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
return false;
}
-MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
- MachineInstr &MovMI,
- RegSubRegPair CombOldVGPR,
- MachineOperand *OldOpndValue,
- bool CombBCZ) const {
+MachineInstr *GCNDPPCombine::createDPPInst(
+ MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
+ MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
assert(CombOldVGPR.Reg);
if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
@@ -325,12 +359,14 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
return nullptr;
}
CombOldVGPR = getRegSubRegPair(*Src1);
- if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
- LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
+ auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
+ const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
+ if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
+ LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n");
return nullptr;
}
}
- return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
+ return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}
// returns true if MI doesn't have OpndName immediate operand or the
@@ -346,7 +382,8 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
}
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
- assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
+ MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
@@ -362,6 +399,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
return false;
}
+ if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+ auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
+ assert(DppCtrl && DppCtrl->isImm());
+ if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) {
+ LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
+ " control value\n");
+      // Let it be split; the control value may become legal for the 32-bit halves.
+ return false;
+ }
+ }
+
auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
assert(RowMaskOpnd && RowMaskOpnd->isImm());
auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
@@ -430,8 +478,9 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
// try to reuse previous old reg if its undefined (IMPLICIT_DEF)
if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
+ const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
CombOldVGPR = RegSubRegPair(
- MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+ MRI->createVirtualRegister(RC));
auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
DPPMIs.push_back(UndefInst.getInstr());
@@ -482,21 +531,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
continue;
}
- if (TII->isVOP3(OrigOp)) {
- if (!TII->hasVALU32BitEncoding(OrigOp)) {
- LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
- break;
- }
- // check if other than abs|neg modifiers are set (opsel for example)
- const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
- if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
- !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
- !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
- !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
- LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
- break;
- }
- } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
+ bool IsShrinkable = isShrinkable(OrigMI);
+ if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
break;
}
@@ -521,7 +557,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
- OldOpndValue, CombBCZ)) {
+ OldOpndValue, CombBCZ, IsShrinkable)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
@@ -532,8 +568,9 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
BB->insert(OrigMI, NewMI);
if (TII->commuteInstruction(*NewMI)) {
LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
- if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
- OldOpndValue, CombBCZ)) {
+ if (auto *DPPInst =
+ createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
+ IsShrinkable)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
@@ -566,12 +603,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
- auto &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasDPP() || skipFunction(MF.getFunction()))
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasDPP() || skipFunction(MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
- TII = ST.getInstrInfo();
+ TII = ST->getInstrInfo();
bool Changed = false;
for (auto &MBB : MF) {
@@ -581,12 +618,17 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
++NumDPPMovsCombined;
} else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
- auto Split = TII->expandMovDPP64(MI);
- for (auto M : { Split.first, Split.second }) {
- if (combineDPPMov(*M))
- ++NumDPPMovsCombined;
+ if (ST->has64BitDPP() && combineDPPMov(MI)) {
+ Changed = true;
+ ++NumDPPMovsCombined;
+ } else {
+ auto Split = TII->expandMovDPP64(MI);
+ for (auto M : { Split.first, Split.second }) {
+ if (M && combineDPPMov(*M))
+ ++NumDPPMovsCombined;
+ }
+ Changed = true;
}
- Changed = true;
}
}
}
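A rough sketch of the control flow added in GCNDPPCombine::runOnMachineFunction for V_MOV_B64_DPP_PSEUDO, shown as a self-contained stand-in rather than the pass's actual API: the names has64BitDPP, isLegal64BitDPPControl, combine and split below are placeholders for the real GCNSubtarget/SIInstrInfo hooks (ST->has64BitDPP(), AMDGPU::isLegal64BitDPPControl(), combineDPPMov(), TII->expandMovDPP64()). The idea is to combine the 64-bit DPP mov whole when the subtarget supports 64-bit DPP and the control value is legal, and otherwise split it into two 32-bit halves and try each half.

// Illustrative sketch only; stub predicates stand in for the real hooks.
#include <utility>

struct DPPMov64 { unsigned DppCtrl; };                          // placeholder MI

static bool has64BitDPP() { return true; }                      // assumed stub
static bool isLegal64BitDPPControl(unsigned Ctrl) { return Ctrl < 0x100; } // assumed stub
static bool combine(const DPPMov64 &) { return true; }          // assumed stub
static std::pair<DPPMov64, DPPMov64> split(const DPPMov64 &M) { // assumed stub
  return {M, M};
}

// Mirrors the new decision: prefer combining the 64-bit mov as a whole,
// fall back to splitting into two 32-bit halves and combining each one.
static unsigned handleMovDPP64(const DPPMov64 &MI) {
  unsigned Combined = 0;
  if (has64BitDPP() && isLegal64BitDPPControl(MI.DppCtrl) && combine(MI)) {
    ++Combined;
  } else {
    auto Halves = split(MI);
    if (combine(Halves.first))  ++Combined;
    if (combine(Halves.second)) ++Combined;
  }
  return Combined;
}

int main() { return handleMovDPP64({0x1}) ? 0 : 1; }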
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ed1dc77bd545..bc2fb1e9770c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -23,6 +23,9 @@ using namespace llvm;
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
+static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
+ const GCNSubtarget &ST);
+
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
IsHazardRecognizerMode(false),
CurrCycleInstr(nullptr),
@@ -32,8 +35,9 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
TRI(TII.getRegisterInfo()),
ClauseUses(TRI.getNumRegUnits()),
ClauseDefs(TRI.getNumRegUnits()) {
- MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+ MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
TSchedModel.init(&ST);
+ RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}
void GCNHazardRecognizer::Reset() {
@@ -87,6 +91,25 @@ static bool isSMovRel(unsigned Opcode) {
}
}
+static bool isDGEMM(unsigned Opcode) {
+ return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
+}
+
+static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+
+ if (!SIInstrInfo::isMAI(MI) ||
+ isDGEMM(Opcode) ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
+ return false;
+
+ return true;
+}
+
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
const MachineInstr &MI) {
if (TII.isAlwaysGDS(MI.getOpcode()))
@@ -138,12 +161,6 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
return HazardType;
- // FIXME: Should flat be considered vmem?
- if ((SIInstrInfo::isVMEM(*MI) ||
- SIInstrInfo::isFLAT(*MI))
- && checkVMEMHazards(MI) > 0)
- return HazardType;
-
if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
return HazardType;
@@ -153,6 +170,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (ST.hasNoDataDepHazard())
return NoHazard;
+ // FIXME: Should flat be considered vmem?
+ if ((SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI))
+ && checkVMEMHazards(MI) > 0)
+ return HazardType;
+
if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
return HazardType;
@@ -165,6 +188,11 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
return HazardType;
+ if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
+ SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+ return HazardType;
+
if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
return HazardType;
@@ -251,9 +279,6 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (SIInstrInfo::isSMRD(*MI))
return std::max(WaitStates, checkSMRDHazards(MI));
- if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
- WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
-
if (ST.hasNSAtoVMEMBug())
WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
@@ -262,6 +287,9 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (ST.hasNoDataDepHazard())
return WaitStates;
+ if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+ WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+
if (SIInstrInfo::isVALU(*MI))
WaitStates = std::max(WaitStates, checkVALUHazards(MI));
@@ -274,6 +302,11 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (isRWLane(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+ if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
+ SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+ WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
+
if (MI->isInlineAsm())
return std::max(WaitStates, checkInlineAsmHazards(MI));
@@ -319,8 +352,7 @@ void GCNHazardRecognizer::AdvanceCycle() {
// Do not track non-instructions which do not affect the wait states.
// If included, these instructions can lead to buffer overflow such that
// detectable hazards are missed.
- if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
- CurrCycleInstr->isKill()) {
+ if (CurrCycleInstr->isMetaInstruction()) {
CurrCycleInstr = nullptr;
return;
}
@@ -359,23 +391,22 @@ void GCNHazardRecognizer::RecedeCycle() {
// Helper Functions
//===----------------------------------------------------------------------===//
-typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
// Returns the minimum number of wait states since \p I, walking all predecessors.
// Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- MachineBasicBlock *MBB,
- MachineBasicBlock::reverse_instr_iterator I,
- int WaitStates,
- IsExpiredFn IsExpired,
+ const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I,
+ int WaitStates, IsExpiredFn IsExpired,
DenseSet<const MachineBasicBlock *> &Visited) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
continue;
- if (IsHazard(&*I))
+ if (IsHazard(*I))
return WaitStates;
if (I->isInlineAsm() || I->isMetaInstruction())
@@ -383,12 +414,11 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
WaitStates += SIInstrInfo::getNumWaitStates(*I);
- if (IsExpired(&*I, WaitStates))
+ if (IsExpired(*I, WaitStates))
return std::numeric_limits<int>::max();
}
- int MinWaitStates = WaitStates;
- bool Found = false;
+ int MinWaitStates = std::numeric_limits<int>::max();
for (MachineBasicBlock *Pred : MBB->predecessors()) {
if (!Visited.insert(Pred).second)
continue;
@@ -396,25 +426,14 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
WaitStates, IsExpired, Visited);
- if (W == std::numeric_limits<int>::max())
- continue;
-
- MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
- if (IsExpired(nullptr, MinWaitStates))
- return MinWaitStates;
-
- Found = true;
+ MinWaitStates = std::min(MinWaitStates, W);
}
- if (Found)
- return MinWaitStates;
-
- return std::numeric_limits<int>::max();
+ return MinWaitStates;
}
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- MachineInstr *MI,
- IsExpiredFn IsExpired) {
+ const MachineInstr *MI, IsExpiredFn IsExpired) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
std::next(MI->getReverseIterator()),
@@ -423,7 +442,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
if (IsHazardRecognizerMode) {
- auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+ auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
return WaitStates >= Limit;
};
return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
@@ -432,7 +451,7 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
int WaitStates = 0;
for (MachineInstr *MI : EmittedInstrs) {
if (MI) {
- if (IsHazard(MI))
+ if (IsHazard(*MI))
return WaitStates;
if (MI->isInlineAsm())
@@ -451,8 +470,8 @@ int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
int Limit) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
- return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+ auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
+ return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
};
return getWaitStatesSince(IsHazardFn, Limit);
@@ -460,8 +479,8 @@ int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
int Limit) {
- auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
- return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+ auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
+ return isSSetReg(MI.getOpcode()) && IsHazard(MI);
};
return getWaitStatesSince(IsHazardFn, Limit);
@@ -560,8 +579,12 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
// A read of an SGPR by an SMRD instruction requires 4 wait states when the
// SGPR was written by a VALU instruction.
int SmrdSgprWaitStates = 4;
- auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
- auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
+ auto IsHazardDefFn = [this](const MachineInstr &MI) {
+ return TII.isVALU(MI);
+ };
+ auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
+ return TII.isSALU(MI);
+ };
bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
@@ -601,9 +624,11 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
// A read of an SGPR by a VMEM instruction requires 5 wait states when the
// SGPR was written by a VALU Instruction.
const int VmemSgprWaitStates = 5;
- auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
+ auto IsHazardDefFn = [this](const MachineInstr &MI) {
+ return TII.isVALU(MI);
+ };
for (const MachineOperand &Use : VMEM->uses()) {
- if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+ if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
@@ -622,15 +647,18 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
int DppVgprWaitStates = 2;
int DppExecWaitStates = 5;
int WaitStatesNeeded = 0;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ auto IsHazardDefFn = [TII](const MachineInstr &MI) {
+ return TII->isVALU(MI);
+ };
for (const MachineOperand &Use : DPP->uses()) {
if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
- [](MachineInstr *) { return true; },
- DppVgprWaitStates);
+ DppVgprWaitStates - getWaitStatesSinceDef(
+ Use.getReg(),
+ [](const MachineInstr &) { return true; },
+ DppVgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
@@ -648,7 +676,9 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
// v_div_fmas requires 4 wait states after a write to vcc from a VALU
// instruction.
const int DivFMasWaitStates = 4;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ auto IsHazardDefFn = [TII](const MachineInstr &MI) {
+ return TII->isVALU(MI);
+ };
int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
DivFMasWaitStates);
@@ -660,8 +690,8 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
const int GetRegWaitStates = 2;
- auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
- return GetRegHWReg == getHWReg(TII, *MI);
+ auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
+ return GetRegHWReg == getHWReg(TII, MI);
};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
@@ -673,8 +703,8 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
unsigned HWReg = getHWReg(TII, *SetRegInstr);
const int SetRegWaitStates = ST.getSetRegWaitStates();
- auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
- return HWReg == getHWReg(TII, *MI);
+ auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
+ return HWReg == getHWReg(TII, MI);
};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
return SetRegWaitStates - WaitStatesNeeded;
@@ -739,13 +769,13 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
const int VALUWaitStates = 1;
int WaitStatesNeeded = 0;
- if (!TRI->isVGPR(MRI, Def.getReg()))
+ if (!TRI->isVectorRegister(MRI, Def.getReg()))
return WaitStatesNeeded;
Register Reg = Def.getReg();
- auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
- int DataIdx = createsVALUHazard(*MI);
+ auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
+ int DataIdx = createsVALUHazard(MI);
return DataIdx >= 0 &&
- TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+ TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
};
int WaitStatesNeededForDef =
VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
@@ -808,9 +838,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
return 0;
Register LaneSelectReg = LaneSelectOp->getReg();
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return TII->isVALU(*MI);
- };
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
const int RWLaneWaitStates = 4;
int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
@@ -826,8 +854,8 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
const int RFEWaitStates = 1;
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
+ auto IsHazardFn = [TII](const MachineInstr &MI) {
+ return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
return RFEWaitStates - WaitStatesNeeded;
@@ -836,9 +864,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
const int SMovRelWaitStates = 1;
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return TII->isSALU(*MI);
- };
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
SMovRelWaitStates);
}
@@ -856,18 +882,12 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
- auto IsHazardFn = [TII] (MachineInstr *MI) {
- return TII->isVOPC(*MI);
- };
+ auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };
- auto IsExpiredFn = [] (MachineInstr *MI, int) {
- if (!MI)
- return false;
- unsigned Opc = MI->getOpcode();
- return SIInstrInfo::isVALU(*MI) &&
- Opc != AMDGPU::V_NOP_e32 &&
- Opc != AMDGPU::V_NOP_e64 &&
- Opc != AMDGPU::V_NOP_sdwa;
+ auto IsExpiredFn = [](const MachineInstr &MI, int) {
+ unsigned Opc = MI.getOpcode();
+ return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
+ Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -900,13 +920,14 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
- if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
- !SIInstrInfo::isFLAT(*I))
+ auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
+ if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
+ !SIInstrInfo::isFLAT(I))
return false;
for (const MachineOperand &Def : MI->defs()) {
- MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+ const MachineOperand *Op =
+ I.findRegisterUseOperand(Def.getReg(), false, TRI);
if (!Op)
continue;
return true;
@@ -914,12 +935,12 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
return false;
};
- auto IsExpiredFn = [](MachineInstr *MI, int) {
- return MI && (SIInstrInfo::isVALU(*MI) ||
- (MI->getOpcode() == AMDGPU::S_WAITCNT &&
- !MI->getOperand(0).getImm()) ||
- (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- MI->getOperand(0).getImm() == 0xffe3));
+ auto IsExpiredFn = [](const MachineInstr &MI, int) {
+ return SIInstrInfo::isVALU(MI) ||
+ (MI.getOpcode() == AMDGPU::S_WAITCNT &&
+ !MI.getOperand(0).getImm()) ||
+ (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ MI.getOperand(0).getImm() == 0xffe3);
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -968,43 +989,41 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
return false;
const Register SDSTReg = SDST->getReg();
- auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
- return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+ auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
+ return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
};
- auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
- if (MI) {
- if (TII->isSALU(*MI)) {
- switch (MI->getOpcode()) {
- case AMDGPU::S_SETVSKIP:
- case AMDGPU::S_VERSION:
- case AMDGPU::S_WAITCNT_VSCNT:
- case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_EXPCNT:
- // These instructions cannot not mitigate the hazard.
+ auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
+ if (TII->isSALU(MI)) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_SETVSKIP:
+ case AMDGPU::S_VERSION:
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VMCNT:
+ case AMDGPU::S_WAITCNT_EXPCNT:
+        // These instructions cannot mitigate the hazard.
+ return false;
+ case AMDGPU::S_WAITCNT_LGKMCNT:
+ // Reducing lgkmcnt count to 0 always mitigates the hazard.
+ return (MI.getOperand(1).getImm() == 0) &&
+ (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ case AMDGPU::S_WAITCNT: {
+ const int64_t Imm = MI.getOperand(0).getImm();
+ AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+ return (Decoded.LgkmCnt == 0);
+ }
+ default:
+ // SOPP instructions cannot mitigate the hazard.
+ if (TII->isSOPP(MI))
return false;
- case AMDGPU::S_WAITCNT_LGKMCNT:
- // Reducing lgkmcnt count to 0 always mitigates the hazard.
- return (MI->getOperand(1).getImm() == 0) &&
- (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- case AMDGPU::S_WAITCNT: {
- const int64_t Imm = MI->getOperand(0).getImm();
- AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
- return (Decoded.LgkmCnt == 0);
- }
- default:
- // SOPP instructions cannot mitigate the hazard.
- if (TII->isSOPP(*MI))
- return false;
- // At this point the SALU can be assumed to mitigate the hazard
- // because either:
- // (a) it is independent of the at risk SMEM (breaking chain),
- // or
- // (b) it is dependent on the SMEM, in which case an appropriate
- // s_waitcnt lgkmcnt _must_ exist between it and the at risk
- // SMEM instruction.
- return true;
- }
+ // At this point the SALU can be assumed to mitigate the hazard
+ // because either:
+ // (a) it is independent of the at risk SMEM (breaking chain),
+ // or
+ // (b) it is dependent on the SMEM, in which case an appropriate
+ // s_waitcnt lgkmcnt _must_ exist between it and the at risk
+ // SMEM instruction.
+ return true;
}
}
return false;
@@ -1028,25 +1047,23 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
return false;
- auto IsHazardFn = [TRI] (MachineInstr *I) {
- if (SIInstrInfo::isVALU(*I))
+ auto IsHazardFn = [TRI](const MachineInstr &I) {
+ if (SIInstrInfo::isVALU(I))
return false;
- return I->readsRegister(AMDGPU::EXEC, TRI);
+ return I.readsRegister(AMDGPU::EXEC, TRI);
};
const SIInstrInfo *TII = ST.getInstrInfo();
- auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
- if (!MI)
- return false;
- if (SIInstrInfo::isVALU(*MI)) {
- if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+ auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
+ if (SIInstrInfo::isVALU(MI)) {
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
return true;
- for (auto MO : MI->implicit_operands())
+ for (auto MO : MI.implicit_operands())
if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
return true;
}
- if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
return true;
return false;
};
@@ -1061,52 +1078,71 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
return true;
}
-bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
+ const GCNSubtarget &ST) {
if (!ST.hasLdsBranchVmemWARHazard())
return false;
- auto IsHazardInst = [] (const MachineInstr *MI) {
- if (SIInstrInfo::isDS(*MI))
+ // Check if the necessary condition for the hazard is met: both LDS and VMEM
+ // instructions need to appear in the same function.
+ bool HasLds = false;
+ bool HasVmem = false;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ HasLds |= SIInstrInfo::isDS(MI);
+ HasVmem |=
+ SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
+ if (HasLds && HasVmem)
+ return true;
+ }
+ }
+ return false;
+}
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+ if (!RunLdsBranchVmemWARHazardFixup)
+ return false;
+
+ assert(ST.hasLdsBranchVmemWARHazard());
+
+ auto IsHazardInst = [](const MachineInstr &MI) {
+ if (SIInstrInfo::isDS(MI))
return 1;
- if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
return 2;
return 0;
};
- auto InstType = IsHazardInst(MI);
+ auto InstType = IsHazardInst(*MI);
if (!InstType)
return false;
- auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
- return I && (IsHazardInst(I) ||
- (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
- !I->getOperand(1).getImm()));
+ auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
+ return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I.getOperand(1).getImm());
};
- auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
- if (!I->isBranch())
+ auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
+ if (!I.isBranch())
return false;
- auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+ auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
auto InstType2 = IsHazardInst(I);
return InstType2 && InstType != InstType2;
};
- auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
- if (!I)
- return false;
-
+ auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
auto InstType2 = IsHazardInst(I);
if (InstType == InstType2)
return true;
- return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
- !I->getOperand(1).getImm();
+ return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+ I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+ !I.getOperand(1).getImm();
};
- return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
+ return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
std::numeric_limits<int>::max();
};
@@ -1137,12 +1173,12 @@ int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
if (!Offset || (Offset->getImm() & 6) == 0)
return 0;
- auto IsHazardFn = [TII] (MachineInstr *I) {
- if (!SIInstrInfo::isMIMG(*I))
+ auto IsHazardFn = [TII](const MachineInstr &I) {
+ if (!SIInstrInfo::isMIMG(I))
return false;
- const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
- TII->getInstSizeInBytes(*I) >= 16;
+ TII->getInstSizeInBytes(I) >= 16;
};
return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
@@ -1154,17 +1190,17 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
return 0;
- auto IsHazardFn = [] (MachineInstr *I) {
- if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+ auto IsHazardFn = [](const MachineInstr &I) {
+ if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
return false;
- return SIInstrInfo::isFPAtomic(*I);
+ return SIInstrInfo::isFPAtomic(I);
};
- auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
- if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
+ auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
+ if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
return true;
- switch (MI->getOpcode()) {
+ switch (MI.getOpcode()) {
case AMDGPU::S_WAITCNT:
case AMDGPU::S_WAITCNT_VSCNT:
case AMDGPU::S_WAITCNT_VMCNT:
@@ -1179,7 +1215,6 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
return false;
};
-
return FPAtomicToDenormModeWaitStates -
::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
@@ -1187,11 +1222,15 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
assert(SIInstrInfo::isMAI(*MI));
+ return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
+}
+
+int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
int WaitStatesNeeded = 0;
unsigned Opc = MI->getOpcode();
- auto IsVALUFn = [] (MachineInstr *MI) {
- return SIInstrInfo::isVALU(*MI);
+ auto IsVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI);
};
if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
@@ -1220,10 +1259,10 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
}
}
- auto IsMFMAFn = [] (MachineInstr *MI) {
- return SIInstrInfo::isMAI(*MI) &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ auto IsMFMAFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
};
for (const MachineOperand &Op : MI->explicit_operands()) {
@@ -1245,15 +1284,15 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
Register Reg = Op.getReg();
unsigned HazardDefLatency = 0;
- auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
- (MachineInstr *MI) {
+ auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
+ this](const MachineInstr &MI) {
if (!IsMFMAFn(MI))
return false;
- Register DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (DstReg == Reg)
return false;
- HazardDefLatency = std::max(HazardDefLatency,
- TSchedModel.computeInstrLatency(MI));
+ HazardDefLatency =
+ std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
return TRI.regsOverlap(DstReg, Reg);
};
@@ -1292,10 +1331,10 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
if (WaitStatesNeeded == MaxWaitStates)
return WaitStatesNeeded; // Early exit.
- auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
+ auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
+ if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
return false;
- Register DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
return TRI.regsOverlap(Reg, DstReg);
};
@@ -1324,13 +1363,13 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
Register DstReg = MI->getOperand(0).getReg();
unsigned HazardDefLatency = 0;
- auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
- (MachineInstr *MI) {
+ auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
+ this](const MachineInstr &MI) {
if (!IsMFMAFn(MI))
return false;
- Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
- HazardDefLatency = std::max(HazardDefLatency,
- TSchedModel.computeInstrLatency(MI));
+ Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ HazardDefLatency =
+ std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
return TRI.regsOverlap(Reg, DstReg);
};
@@ -1353,14 +1392,171 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
return WaitStatesNeeded;
}
+int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
+ int WaitStatesNeeded = 0;
+ unsigned Opc = MI->getOpcode();
+
+ auto IsMFMAFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ };
+
+ auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
+ };
+
+ auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
+ };
+
+ if (!IsMFMAFn(*MI))
+ return WaitStatesNeeded;
+
+ const int VALUWritesExecWaitStates = 4;
+ int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+ getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
+ VALUWritesExecWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+
+ // Loop for both DGEMM and S/HGEMM 2nd instruction.
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
+ const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
+ const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
+ const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
+ const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
+ const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
+ const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
+ const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
+ const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
+ const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
+ const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
+ const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
+ const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
+ const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
+ const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
+ const int MaxWaitStates = 19;
+
+ if (!Use.isReg())
+ continue;
+ unsigned Reg = Use.getReg();
+ bool FullReg;
+ const MachineInstr *MI1;
+
+ auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
+ this](const MachineInstr &MI) {
+ if (!IsMFMAFn(MI))
+ return false;
+ if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
+ return false;
+ Register DstReg = MI.getOperand(0).getReg();
+ FullReg = (DstReg == Reg);
+ MI1 = &MI;
+ return TRI.regsOverlap(DstReg, Reg);
+ };
+
+ WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
+ getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
+ MaxWaitStates);
+ if (NumWaitStates == std::numeric_limits<int>::max())
+ continue;
+
+ int OpNo = MI->getOperandNo(&Use);
+ unsigned Opc1 = MI1->getOpcode();
+ int NeedWaitStates = 0;
+ if (OpNo == SrcCIdx) {
+ if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
+ NeedWaitStates = 0;
+ } else if (FullReg) {
+ if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+ Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
+ (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
+ Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
+ NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
+ } else {
+ switch (Opc1) {
+ case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ if (!isXDL(ST, *MI))
+ NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
+ break;
+ case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
+ if (!isXDL(ST, *MI))
+ NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
+ break;
+ default:
+ switch (TSchedModel.computeInstrLatency(MI1)) {
+ case 2:
+ NeedWaitStates = isDGEMM(Opc)
+ ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = isDGEMM(Opc)
+ ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates = isDGEMM(Opc)
+ ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
+ : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
+ }
+ }
+ }
+ } else {
+ switch (Opc1) {
+ case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
+ break;
+ case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
+ case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
+ NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
+ break;
+ default:
+ switch (TSchedModel.computeInstrLatency(MI1)) {
+ case 2:
+ NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
+ }
+ }
+ }
+ if (WaitStatesNeeded >= NeedWaitStates)
+ continue;
+
+ WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+
+ return WaitStatesNeeded;
+}
+
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
- if (!ST.hasMAIInsts())
+ // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
+ if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
return 0;
int WaitStatesNeeded = 0;
- auto IsAccVgprReadFn = [] (MachineInstr *MI) {
- return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
+ auto IsAccVgprReadFn = [](const MachineInstr &MI) {
+ return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
};
for (const MachineOperand &Op : MI->explicit_uses()) {
@@ -1380,12 +1576,12 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
if (WaitStatesNeeded == MaxWaitStates)
return WaitStatesNeeded; // Early exit.
- auto IsVALUAccVgprRdWrCheckFn = [Reg, this](MachineInstr *MI) {
- if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
+ auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
+ if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
return false;
- auto IsVALUFn = [] (MachineInstr *MI) {
- return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+ auto IsVALUFn = [](const MachineInstr &MI) {
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
};
return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
std::numeric_limits<int>::max();
@@ -1399,22 +1595,252 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
return WaitStatesNeeded;
}
+int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
+ if (!ST.hasGFX90AInsts())
+ return 0;
+
+ auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
+ return SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ };
+
+ auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
+ return isDGEMM(MI.getOpcode());
+ };
+
+ // This is checked in checkMAIHazards90A()
+ if (IsMFMAFn(*MI))
+ return 0;
+
+ int WaitStatesNeeded = 0;
+
+ bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
+ SIInstrInfo::isFLAT(*MI) ||
+ SIInstrInfo::isDS(*MI) ||
+ SIInstrInfo::isEXP(*MI);
+ bool IsVALU = SIInstrInfo::isVALU(*MI);
+
+ const MachineInstr *MFMA = nullptr;
+ unsigned Reg;
+ auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
+ this](const MachineInstr &MI) {
+ if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
+ return false;
+ if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
+ return false;
+ MFMA = &MI;
+ return true;
+ };
+
+ const MachineInstr *DOT = nullptr;
+ auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
+ if (!SIInstrInfo::isDOT(MI) ||
+ !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
+ return false;
+ DOT = &MI;
+ return true;
+ };
+
+ int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src2);
+
+ if (IsMemOrExport || IsVALU) {
+ const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
+ const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
+ const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
+ const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
+ const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
+ const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
+ const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
+ const int DotWriteSameDotReadSrcAB = 3;
+ const int DotWriteDifferentVALURead = 3;
+ const int MaxWaitStates = 19;
+
+ for (const MachineOperand &Use : MI->explicit_uses()) {
+ if (!Use.isReg())
+ continue;
+ Reg = Use.getReg();
+
+ DOT = nullptr;
+ int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
+ MaxWaitStates);
+ if (DOT) {
+ int NeedWaitStates = 0;
+ if (DOT->getOpcode() == MI->getOpcode()) {
+ if (&Use - &MI->getOperand(0) != SrcCIdx)
+ NeedWaitStates = DotWriteSameDotReadSrcAB;
+ } else {
+ NeedWaitStates = DotWriteDifferentVALURead;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ MFMA = nullptr;
+ WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
+ MaxWaitStates);
+ if (!MFMA)
+ continue;
+
+ unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+ int NeedWaitStates = MaxWaitStates;
+ switch (HazardDefLatency) {
+ case 2:
+ NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
+ break;
+ case 4:
+ assert(isDGEMM(MFMA->getOpcode()));
+ NeedWaitStates =
+ IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
+ : DMFMA4x4WriteVgprVALUReadWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates =
+ isDGEMM(MFMA->getOpcode())
+ ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
+ : DMFMA16x16WriteVgprVALUReadWaitStates
+ : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+ }
+
+ unsigned Opc = MI->getOpcode();
+ const int DMFMAToFMA64WaitStates = 2;
+ if ((Opc == AMDGPU::V_FMA_F64_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_dpp) &&
+ WaitStatesNeeded < DMFMAToFMA64WaitStates) {
+ int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
+ getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ if (!IsVALU && !IsMemOrExport)
+ return WaitStatesNeeded;
+
+ for (const MachineOperand &Def : MI->defs()) {
+ const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
+ const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
+ const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
+ const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
+ const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
+ const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
+ const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
+ const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
+ const int DotWriteDifferentVALUWrite = 3;
+ const int MaxWaitStates = 19;
+ const int MaxWarWaitStates = 15;
+
+ Reg = Def.getReg();
+
+ DOT = nullptr;
+ int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
+ MaxWaitStates);
+ if (DOT && DOT->getOpcode() != MI->getOpcode())
+ WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
+ WaitStatesSinceDef);
+
+ MFMA = nullptr;
+ WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
+ MaxWaitStates);
+ if (MFMA) {
+ int NeedWaitStates = MaxWaitStates;
+ switch (TSchedModel.computeInstrLatency(MFMA)) {
+ case 2:
+ NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
+ break;
+ case 4:
+ assert(isDGEMM(MFMA->getOpcode()));
+ NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
+ break;
+ case 8:
+ NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default:
+ NeedWaitStates = isDGEMM(MFMA->getOpcode())
+ ? DMFMA16x16WriteVgprVALUWriteWaitStates
+ : SMFMA32x32WriteVgprVALUWawWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+ if (WaitStatesNeeded == MaxWaitStates)
+ break;
+ }
+
+ auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
+ this](const MachineInstr &MI) {
+ if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
+ !MI.readsRegister(Reg, &TRI))
+ return false;
+
+ const MachineOperand *SrcC =
+ TII.getNamedOperand(MI, AMDGPU::OpName::src2);
+ assert(SrcC);
+ if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
+ return false;
+
+ MFMA = &MI;
+ return true;
+ };
+
+ MFMA = nullptr;
+ int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
+ MaxWarWaitStates);
+ if (!MFMA)
+ continue;
+
+ unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+ int NeedWaitStates = MaxWaitStates;
+ switch (HazardDefLatency) {
+ case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
+ break;
+ case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
+ break;
+ case 16: LLVM_FALLTHROUGH;
+ default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
+ break;
+ }
+
+ int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
+
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
if (!SU->isInstr())
return false;
- MachineInstr *MAI = nullptr;
- auto IsMFMAFn = [&MAI] (MachineInstr *MI) {
+ const MachineInstr *MAI = nullptr;
+ auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
MAI = nullptr;
- if (SIInstrInfo::isMAI(*MI) &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
- MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
- MAI = MI;
+ if (SIInstrInfo::isMAI(MI) &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
+ MAI = &MI;
return MAI != nullptr;
};
MachineInstr *MI = SU->getInstr();
- if (IsMFMAFn(MI)) {
+ if (IsMFMAFn(*MI)) {
int W = getWaitStatesSince(IsMFMAFn, 16);
if (MAI)
return W < (int)TSchedModel.computeInstrLatency(MAI);
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 447ca828ae64..162121c2c525 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -32,7 +32,7 @@ class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
public:
- typedef function_ref<bool(MachineInstr *)> IsHazardFn;
+ typedef function_ref<bool(const MachineInstr &)> IsHazardFn;
private:
// Distinguish if we are called from scheduler or hazard recognizer
@@ -48,6 +48,7 @@ private:
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
TargetSchedModel TSchedModel;
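+ // Whether the fixLdsBranchVmemWARHazard() workaround needs to run.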
+ bool RunLdsBranchVmemWARHazardFixup;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
@@ -94,6 +95,9 @@ private:
bool fixLdsBranchVmemWARHazard(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
+ int checkMAIHazards908(MachineInstr *MI);
+ int checkMAIHazards90A(MachineInstr *MI);
+ int checkMAIVALUHazards(MachineInstr *MI);
int checkMAILdStHazards(MachineInstr *MI);
public:
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index fc7105bc15a7..9f98f9ada802 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -190,6 +190,14 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
return NSA_Status::FIXED;
+ // InlineSpiller does not call LRM::assign() after an LI split leaving
+ // it in an inconsistent state, so we cannot call LRM::unassign().
+ // See llvm bug #48911.
+ // Skip reassign if a register has originated from such split.
+ // FIXME: Remove the workaround when bug #48911 is fixed.
+ if (VRM->getPreSplitReg(Reg))
+ return NSA_Status::FIXED;
+
const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
new file mode 100644
index 000000000000..a51399d7da5f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -0,0 +1,162 @@
+//===-- GCNPreRAOptimizations.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass combines split register tuple initialization into a single pseudo:
+///
+/// undef %0.sub1:sreg_64 = S_MOV_B32 1
+/// %0.sub0:sreg_64 = S_MOV_B32 2
+/// =>
+/// %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 0x100000002
+///
+/// This is to allow rematerialization of a value instead of spilling. It is
+/// supposed to be done after register coalescer to allow it to do its job and
+/// before actual register allocation to allow rematerialization.
+///
+/// Right now the pass only handles 64-bit SGPRs with immediate initializers,
+/// although the same should be possible for other register classes and
+/// instructions if necessary.
+///
+//===----------------------------------------------------------------------===//
+
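
Editor's aside (illustration only, not part of the patch): a minimal standalone sketch of the immediate packing that processReg() below performs, assuming sub0 holds the low 32 bits and sub1 the high 32 bits of the 64-bit register; the helper name packSplitInit is made up for this sketch.

#include <cassert>
#include <cstdint>

// Illustrative helper: pack the sub0 (low) and sub1 (high) S_MOV_B32
// immediates into the single 64-bit S_MOV_B64_IMM_PSEUDO immediate.
static uint64_t packSplitInit(uint32_t Sub0Imm, uint32_t Sub1Imm) {
  return uint64_t(Sub0Imm) | (uint64_t(Sub1Imm) << 32);
}

int main() {
  // The example from the header comment: sub1 = 1, sub0 = 2.
  assert(packSplitInit(/*Sub0Imm=*/2, /*Sub1Imm=*/1) == 0x100000002ULL);
  return 0;
}

Running this simply checks that the example in the header comment packs to 0x100000002.
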
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
+
+namespace {
+
+class GCNPreRAOptimizations : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+
+ bool processReg(Register Reg);
+
+public:
+ static char ID;
+
+ GCNPreRAOptimizations() : MachineFunctionPass(ID) {
+ initializeGCNPreRAOptimizationsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Pre-RA optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNPreRAOptimizations, DEBUG_TYPE,
+ "AMDGPU Pre-RA optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(GCNPreRAOptimizations, DEBUG_TYPE, "Pre-RA optimizations",
+ false, false)
+
+char GCNPreRAOptimizations::ID = 0;
+
+char &llvm::GCNPreRAOptimizationsID = GCNPreRAOptimizations::ID;
+
+FunctionPass *llvm::createGCNPreRAOptimizationsPass() {
+ return new GCNPreRAOptimizations();
+}
+
+bool GCNPreRAOptimizations::processReg(Register Reg) {
+ MachineInstr *Def0 = nullptr;
+ MachineInstr *Def1 = nullptr;
+ uint64_t Init = 0;
+
+ for (MachineInstr &I : MRI->def_instructions(Reg)) {
+ if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
+ !I.getOperand(1).isImm() || I.getNumOperands() != 2)
+ return false;
+
+ switch (I.getOperand(0).getSubReg()) {
+ default:
+ return false;
+ case AMDGPU::sub0:
+ if (Def0)
+ return false;
+ Def0 = &I;
+ Init |= I.getOperand(1).getImm() & 0xffffffff;
+ break;
+ case AMDGPU::sub1:
+ if (Def1)
+ return false;
+ Def1 = &I;
+ Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
+ break;
+ }
+ }
+
+ if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
+ << " =>\n");
+
+ if (SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*Def1),
+ LIS->getInstructionIndex(*Def0)))
+ std::swap(Def0, Def1);
+
+ LIS->RemoveMachineInstrFromMaps(*Def0);
+ LIS->RemoveMachineInstrFromMaps(*Def1);
+ auto NewI = BuildMI(*Def0->getParent(), *Def0, Def0->getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), Reg)
+ .addImm(Init);
+
+ Def0->eraseFromParent();
+ Def1->eraseFromParent();
+ LIS->InsertMachineInstrInMaps(*NewI);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+
+ LLVM_DEBUG(dbgs() << " " << *NewI);
+
+ return true;
+}
+
+bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
+ continue;
+ Changed |= processReg(Reg);
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 7447ec2db188..3a68ed1934e1 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -184,6 +184,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
FeatureISAVersion9_0_9.Features
>;
+def : ProcessorModel<"gfx90a", SIDPFullSpeedModel,
+ FeatureISAVersion9_0_A.Features
+>;
+
def : ProcessorModel<"gfx90c", SIQuarterSpeedModel,
FeatureISAVersion9_0_C.Features
>;
@@ -204,6 +208,10 @@ def : ProcessorModel<"gfx1012", GFX10SpeedModel,
FeatureISAVersion10_1_2.Features
>;
+def : ProcessorModel<"gfx1013", GFX10SpeedModel,
+ FeatureISAVersion10_1_3.Features
+>;
+
def : ProcessorModel<"gfx1030", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
@@ -219,3 +227,11 @@ def : ProcessorModel<"gfx1032", GFX10SpeedModel,
def : ProcessorModel<"gfx1033", GFX10SpeedModel,
FeatureISAVersion10_3_0.Features
>;
+
+def : ProcessorModel<"gfx1034", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
+
+def : ProcessorModel<"gfx1035", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
deleted file mode 100644
index a12e9ab03e1d..000000000000
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ /dev/null
@@ -1,862 +0,0 @@
-//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Try to reassign registers on GFX10+ to reduce register bank
-/// conflicts.
-///
-/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
-/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
-/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
-/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
-///
-/// The shader can read one dword from each of these banks once per cycle.
-/// If an instruction has to read more register operands from the same bank
-/// an additional cycle is needed. HW attempts to pre-load registers through
-/// input operand gathering, but a stall cycle may occur if that fails. For
-/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
-/// potentially incuring 2 stall cycles.
-///
-/// The pass tries to reassign registers to reduce bank conflicts.
-///
-/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
-/// that 4 has to be subtracted from an SGPR bank number to get the real value.
-/// This also corresponds to bit numbers in bank masks used in the pass.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
- cl::desc("Verify stall cycles in the regbanks reassign pass"),
- cl::value_desc("0|1|2"),
- cl::init(0), cl::Hidden);
-
-#define DEBUG_TYPE "amdgpu-regbanks-reassign"
-
-#define NUM_VGPR_BANKS 4
-#define NUM_SGPR_BANKS 8
-#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
-#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
-#define VGPR_BANK_MASK 0xf
-#define SGPR_BANK_MASK 0xff0
-#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
-
-STATISTIC(NumStallsDetected,
- "Number of operand read stalls detected");
-STATISTIC(NumStallsRecovered,
- "Number of operand read stalls recovered");
-
-namespace {
-
-class GCNRegBankReassign : public MachineFunctionPass {
-
- class OperandMask {
- public:
- OperandMask(unsigned r, unsigned s, unsigned m)
- : Reg(r), SubReg(s), Mask(m) {}
- Register Reg;
- unsigned SubReg;
- unsigned Mask;
- };
-
- class Candidate {
- public:
- Candidate(MachineInstr *mi, Register reg, unsigned subreg,
- unsigned freebanks)
- : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(const GCNRegBankReassign *P) const {
- MI->dump();
- dbgs() << P->printReg(Reg) << " to banks ";
- dumpFreeBanks(FreeBanks);
- dbgs() << '\n';
- }
-#endif
-
- MachineInstr *MI;
- Register Reg;
- unsigned SubReg;
- unsigned FreeBanks;
- };
-
- class CandidateList : public std::map<unsigned, std::list<Candidate>> {
- public:
- void push(unsigned Weight, const Candidate&& C) {
- operator[](Weight).push_front(C);
- }
-
- Candidate &back() {
- return rbegin()->second.back();
- }
-
- void pop_back() {
- rbegin()->second.pop_back();
- if (rbegin()->second.empty())
- erase(rbegin()->first);
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(const GCNRegBankReassign *P) const {
- dbgs() << "\nCandidates:\n\n";
- for (auto &B : *this) {
- dbgs() << " Weight " << B.first << ":\n";
- for (auto &C : B.second)
- C.dump(P);
- }
- dbgs() << "\n\n";
- }
-#endif
- };
-
-public:
- static char ID;
-
-public:
- GCNRegBankReassign() : MachineFunctionPass(ID) {
- initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "GCN RegBank Reassign"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineLoopInfo>();
- AU.addRequired<LiveIntervals>();
- AU.addRequired<VirtRegMap>();
- AU.addRequired<LiveRegMatrix>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
-private:
- const GCNSubtarget *ST;
-
- const MachineRegisterInfo *MRI;
-
- const SIRegisterInfo *TRI;
-
- MachineLoopInfo *MLI;
-
- VirtRegMap *VRM;
-
- LiveRegMatrix *LRM;
-
- LiveIntervals *LIS;
-
- unsigned MaxNumVGPRs;
-
- unsigned MaxNumSGPRs;
-
- BitVector RegsUsed;
-
- SmallVector<OperandMask, 8> OperandMasks;
-
- CandidateList Candidates;
-
- const MCPhysReg *CSRegs;
-
- // Returns bank for a phys reg.
- unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;
-
- // Return a bit set for each register bank used. 4 banks for VGPRs and
- // 8 banks for SGPRs.
- // Registers already processed and recorded in RegsUsed are excluded.
- // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
- uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);
-
- // Analyze one instruction returning the number of stalls and a mask of the
- // banks used by all operands.
- // If Reg and Bank are provided, assume all uses of Reg will be replaced with
- // a register chosen from Bank.
- std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
- Register Reg = Register(),
- unsigned SubReg = 0, int Bank = -1);
-
- // Return true if register is regular VGPR or SGPR or their tuples.
- // Returns false for special registers like m0, vcc etc.
- bool isReassignable(Register Reg) const;
-
- // Check if registers' defs are old and may be pre-loaded.
- // Returns 0 if both registers are old enough, 1 or 2 if one or both
- // registers will not likely be pre-loaded.
- unsigned getOperandGatherWeight(const MachineInstr& MI,
- Register Reg1,
- Register Reg2,
- unsigned StallCycles) const;
-
-
- // Find all bank bits in UsedBanks where Mask can be relocated to.
- unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
-
- // Find all bank bits in UsedBanks where Mask can be relocated to.
- // Bank is relative to the register and not its subregister component.
- // Returns 0 is a register is not reassignable.
- unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
- unsigned UsedBanks) const;
-
- // Add cadidate instruction to the work list.
- void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
- unsigned StallCycles);
-
- // Collect cadidate instructions across function. Returns a number stall
- // cycles detected. Only counts stalls if Collect is false.
- unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
-
- // Remove all candidates that read specified register.
- void removeCandidates(Register Reg);
-
- // Compute stalls within the uses of SrcReg replaced by a register from
- // Bank. If Bank is -1 does not perform substitution. If Collect is set
- // candidates are collected and added to work list.
- unsigned computeStallCycles(Register SrcReg,
- Register Reg = Register(),
- unsigned SubReg = 0, int Bank = -1,
- bool Collect = false);
-
- // Search for a register in Bank unused within LI.
- // Returns phys reg or NoRegister.
- MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
- unsigned SubReg) const;
-
- // Try to reassign candidate. Returns number or stall cycles saved.
- unsigned tryReassign(Candidate &C);
-
- bool verifyCycles(MachineFunction &MF,
- unsigned OriginalCycles, unsigned CyclesSaved);
-
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-public:
- Printable printReg(Register Reg, unsigned SubReg = 0) const {
- return Printable([Reg, SubReg, this](raw_ostream &OS) {
- if (Reg.isPhysical()) {
- OS << llvm::printReg(Reg, TRI);
- return;
- }
- if (!VRM->isAssignedReg(Reg))
- OS << "<unassigned> " << llvm::printReg(Reg, TRI);
- else
- OS << llvm::printReg(Reg, TRI) << '('
- << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
- if (SubReg)
- OS << ':' << TRI->getSubRegIndexName(SubReg);
- });
- }
-
- static Printable printBank(unsigned Bank) {
- return Printable([Bank](raw_ostream &OS) {
- OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
- });
- }
-
- static void dumpFreeBanks(unsigned FreeBanks) {
- for (unsigned L = 0; L < NUM_BANKS; ++L)
- if (FreeBanks & (1 << L))
- dbgs() << printBank(L) << ' ';
- }
-#endif
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
-INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
- false, false)
-
-
-char GCNRegBankReassign::ID = 0;
-
-char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
-
-unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
- unsigned SubReg) const {
- assert(Reg.isPhysical());
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- unsigned Size = TRI->getRegSizeInBits(*RC);
- if (Size == 16)
- Reg = TRI->get32BitRegister(Reg);
- else if (Size > 32) {
- if (SubReg) {
- const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
- Reg = TRI->getSubReg(Reg, SubReg);
- if (TRI->getRegSizeInBits(*SubRC) > 32)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
- } else {
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
- }
- }
-
- if (TRI->hasVGPRs(RC)) {
- unsigned RegNo = Reg - AMDGPU::VGPR0;
- return RegNo % NUM_VGPR_BANKS;
- }
-
- unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
- return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
-}
-
-uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
- int Bank) {
- if (Reg.isVirtual()) {
- if (!VRM->isAssignedReg(Reg))
- return 0;
-
- Reg = VRM->getPhys(Reg);
- if (!Reg)
- return 0;
- if (SubReg)
- Reg = TRI->getSubReg(Reg, SubReg);
- }
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- unsigned Size = TRI->getRegSizeInBits(*RC);
-
- if (Size == 16) {
- Reg = TRI->get32BitRegister(Reg);
- Size = 1;
- } else {
- Size /= 32;
- if (Size > 1)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
- }
-
- if (TRI->hasVGPRs(RC)) {
- // VGPRs have 4 banks assigned in a round-robin fashion.
- unsigned RegNo = Reg - AMDGPU::VGPR0;
- uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
- unsigned Used = 0;
- // Bitmask lacks an extract method
- for (unsigned I = 0; I < Size; ++I)
- if (RegsUsed.test(RegNo + I))
- Used |= 1 << I;
- RegsUsed.set(RegNo, RegNo + Size);
- Mask &= ~Used;
- Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
- return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
- }
-
- // SGPRs have 8 banks holding 2 consequitive registers each.
- unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
- unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
- if (RegNo + StartBit >= RegsUsed.size())
- return 0;
-
- if (Size > 1)
- Size /= 2;
- unsigned Mask = (1 << Size) - 1;
- unsigned Used = 0;
- for (unsigned I = 0; I < Size; ++I)
- if (RegsUsed.test(StartBit + RegNo + I))
- Used |= 1 << I;
- RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
- Mask &= ~Used;
- Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
- : unsigned(Bank - SGPR_BANK_OFFSET);
- Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
- // Reserve 4 bank ids for VGPRs.
- return Mask << SGPR_BANK_OFFSET;
-}
-
-std::pair<unsigned, unsigned>
-GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
- unsigned SubReg, int Bank) {
- unsigned StallCycles = 0;
- unsigned UsedBanks = 0;
-
- if (MI.isDebugValue())
- return std::make_pair(StallCycles, UsedBanks);
-
- RegsUsed.reset();
- OperandMasks.clear();
- for (const auto& Op : MI.explicit_uses()) {
- // Undef can be assigned to any register, so two vregs can be assigned
- // the same phys reg within the same instruction.
- if (!Op.isReg() || Op.isUndef())
- continue;
-
- const Register R = Op.getReg();
- const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
-
- // Do not compute stalls for AGPRs
- if (TRI->hasAGPRs(RC))
- continue;
-
- // Do not compute stalls if sub-register covers all banks
- if (Op.getSubReg()) {
- LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
- if (TRI->hasVGPRs(RC)) {
- if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
- continue;
- } else {
- if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
- continue;
- }
- }
-
- unsigned ShiftedBank = Bank;
-
- if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
- unsigned RegOffset =
- TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
- unsigned Offset = TRI->getChannelFromSubReg(
- Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
- if (Bank < NUM_VGPR_BANKS) {
- unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
- ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
- } else if (Bank >= SGPR_BANK_OFFSET) {
- unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
- ShiftedBank = SGPR_BANK_OFFSET +
- (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
- }
- }
-
- uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
- (Reg == R) ? ShiftedBank : -1);
- StallCycles += countPopulation(UsedBanks & Mask);
- UsedBanks |= Mask;
- OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
- }
-
- return std::make_pair(StallCycles, UsedBanks);
-}
-
-unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
- Register Reg1,
- Register Reg2,
- unsigned StallCycles) const
-{
- unsigned Defs = 0;
- MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
- MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
- for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
- if (MI.isDebugInstr())
- continue;
- --Def;
- if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
- continue;
- if (Def->modifiesRegister(Reg1, TRI))
- Defs |= 1;
- if (Def->modifiesRegister(Reg2, TRI))
- Defs |= 2;
- }
- return countPopulation(Defs);
-}
-
-bool GCNRegBankReassign::isReassignable(Register Reg) const {
- if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
- return false;
-
- const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
-
- Register PhysReg = VRM->getPhys(Reg);
-
- if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
- return false;
-
- for (auto U : MRI->use_nodbg_operands(Reg)) {
- if (U.isImplicit())
- return false;
- const MachineInstr *UseInst = U.getParent();
- if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
- return false;
- }
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
- unsigned Size = TRI->getRegSizeInBits(*RC);
-
- // TODO: Support 16 bit registers. Those needs to be moved with their
- // parent VGPR_32 and potentially a sibling 16 bit sub-register.
- if (Size < 32)
- return false;
-
- if (TRI->hasVGPRs(RC))
- return true;
-
- if (Size == 16)
- return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
-
- if (Size > 32)
- PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
-
- return AMDGPU::SGPR_32RegClass.contains(PhysReg);
-}
-
-unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
- unsigned UsedBanks) const {
- unsigned Size = countPopulation(Mask);
- unsigned FreeBanks = 0;
- unsigned Bank = findFirstSet(Mask);
-
- UsedBanks &= ~Mask;
-
- // Find free VGPR banks
- if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
- for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
- if (Bank == I)
- continue;
- unsigned NewMask = ((1 << Size) - 1) << I;
- NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
- if (!(UsedBanks & NewMask))
- FreeBanks |= 1 << I;
- }
- return FreeBanks;
- }
-
- // Find free SGPR banks
- // SGPR tuples must be aligned, so step is size in banks it
- // crosses.
- Bank -= SGPR_BANK_OFFSET;
- for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
- if (Bank == I)
- continue;
- unsigned NewMask = ((1 << Size) - 1) << I;
- NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
- if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
- FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
- }
-
- return FreeBanks;
-}
-
-unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
- unsigned SubReg,
- unsigned Mask,
- unsigned UsedBanks) const {
- if (!isReassignable(Reg))
- return 0;
-
- unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
-
- unsigned Offset = TRI->getChannelFromSubReg(SubReg);
- if (Offset && (Mask & VGPR_BANK_MASK)) {
- unsigned Shift = Offset;
- if (Shift >= NUM_VGPR_BANKS)
- return 0;
- unsigned VB = FreeBanks & VGPR_BANK_MASK;
- FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
- VGPR_BANK_MASK;
- } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
- unsigned Shift = Offset >> 1;
- if (Shift >= NUM_SGPR_BANKS)
- return 0;
- unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
- FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
- SGPR_BANK_SHIFTED_MASK;
- FreeBanks <<= SGPR_BANK_OFFSET;
- }
-
- LLVM_DEBUG(if (FreeBanks) {
- dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
- << " to banks: "; dumpFreeBanks(FreeBanks);
- dbgs() << '\n'; });
-
- return FreeBanks;
-}
-
-void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
- unsigned UsedBanks,
- unsigned StallCycles) {
- LLVM_DEBUG(MI.dump());
-
- if (!StallCycles)
- return;
-
- LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
-
- for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
- for (unsigned J = I + 1; J != E; ++J) {
- if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
- continue;
-
- Register Reg1 = OperandMasks[I].Reg;
- Register Reg2 = OperandMasks[J].Reg;
- unsigned SubReg1 = OperandMasks[I].SubReg;
- unsigned SubReg2 = OperandMasks[J].SubReg;
- unsigned Mask1 = OperandMasks[I].Mask;
- unsigned Mask2 = OperandMasks[J].Mask;
- unsigned Size1 = countPopulation(Mask1);
- unsigned Size2 = countPopulation(Mask2);
-
- LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
- " and " << printReg(Reg2, SubReg2) << '\n');
-
- unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
- Weight += MLI->getLoopDepth(MI.getParent()) * 10;
-
- LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
-
- unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
- unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
- if (FreeBanks1)
- Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
- Candidate(&MI, Reg1, SubReg1, FreeBanks1));
- if (FreeBanks2)
- Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
- Candidate(&MI, Reg2, SubReg2, FreeBanks2));
- }
- }
-}
-
-unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
- unsigned SubReg, int Bank,
- bool Collect) {
- unsigned TotalStallCycles = 0;
- SmallSet<const MachineInstr *, 16> Visited;
-
- for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
- if (MI.isBundle())
- continue;
- if (!Visited.insert(&MI).second)
- continue;
- unsigned StallCycles;
- unsigned UsedBanks;
- std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
- TotalStallCycles += StallCycles;
- if (Collect)
- collectCandidates(MI, UsedBanks, StallCycles);
- }
-
- return TotalStallCycles;
-}
-
-MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
- unsigned SubReg) const {
- const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
- unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
- : MaxNumSGPRs;
- unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
- : AMDGPU::SGPR0);
-
- for (MCRegister Reg : RC->getRegisters()) {
- // Check occupancy limit.
- if (TRI->isSubRegisterEq(Reg, MaxReg))
- break;
-
- if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
- continue;
-
- for (unsigned I = 0; CSRegs[I]; ++I)
- if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
- !LRM->isPhysRegUsed(CSRegs[I]))
- return MCRegister::from(AMDGPU::NoRegister);
-
- LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
-
- if (!LRM->checkInterference(LI, Reg))
- return Reg;
- }
-
- return MCRegister::from(AMDGPU::NoRegister);
-}
-
-unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
- if (!LIS->hasInterval(C.Reg))
- return 0;
-
- LiveInterval &LI = LIS->getInterval(C.Reg);
- LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
- LI.dump());
-
- // For each candidate bank walk all instructions in the range of live
- // interval and check if replacing the register with one belonging to
- // the candidate bank reduces conflicts.
-
- unsigned OrigStalls = computeStallCycles(C.Reg);
- LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
- if (!OrigStalls)
- return 0;
-
- struct BankStall {
- BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
- bool operator<(const BankStall &RHS) const {
- if (Stalls == RHS.Stalls)
- return Bank < RHS.Bank;
- return Stalls > RHS.Stalls;
- }
- unsigned Bank;
- unsigned Stalls;
- };
- SmallVector<BankStall, 8> BankStalls;
-
- for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
- if (C.FreeBanks & (1 << Bank)) {
- LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
- unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
- if (Stalls < OrigStalls) {
- LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
- << Stalls << '\n');
- BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
- }
- }
- }
- llvm::sort(BankStalls);
-
- MCRegister OrigReg = VRM->getPhys(C.Reg);
- LRM->unassign(LI);
- while (!BankStalls.empty()) {
- BankStall BS = BankStalls.pop_back_val();
- MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
- if (Reg == AMDGPU::NoRegister) {
- LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
- << '\n');
- continue;
- }
- LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
- << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
- << " in bank " << printBank(BS.Bank) << '\n');
-
- LRM->assign(LI, Reg);
-
- LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
-
- return OrigStalls - BS.Stalls;
- }
- LRM->assign(LI, OrigReg);
-
- return 0;
-}
-
-unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
- bool Collect) {
- unsigned TotalStallCycles = 0;
-
- for (MachineBasicBlock &MBB : MF) {
-
- LLVM_DEBUG(if (Collect) {
- if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
- else dbgs() << MBB.getName(); dbgs() << ":\n";
- });
-
- for (MachineInstr &MI : MBB.instrs()) {
- if (MI.isBundle())
- continue; // we analyze the instructions inside the bundle individually
-
- unsigned StallCycles;
- unsigned UsedBanks;
- std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
-
- if (Collect)
- collectCandidates(MI, UsedBanks, StallCycles);
-
- TotalStallCycles += StallCycles;
- }
-
- LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
- }
-
- return TotalStallCycles;
-}
-
-void GCNRegBankReassign::removeCandidates(Register Reg) {
- typename CandidateList::iterator Next;
- for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
- Next = std::next(I);
- I->second.remove_if([Reg, this](const Candidate& C) {
- return C.MI->readsRegister(Reg, TRI);
- });
- if (I->second.empty())
- Candidates.erase(I);
- }
-}
-
-bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
- unsigned OriginalCycles,
- unsigned CyclesSaved) {
- unsigned StallCycles = collectCandidates(MF, false);
- LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
- << " stall cycles left\n");
- return StallCycles + CyclesSaved == OriginalCycles;
-}
-
-bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
- ST = &MF.getSubtarget<GCNSubtarget>();
- if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
- return false;
-
- MRI = &MF.getRegInfo();
- TRI = ST->getRegisterInfo();
- MLI = &getAnalysis<MachineLoopInfo>();
- VRM = &getAnalysis<VirtRegMap>();
- LRM = &getAnalysis<LiveRegMatrix>();
- LIS = &getAnalysis<LiveIntervals>();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned Occupancy = MFI->getOccupancy();
- MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
- MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
- MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
- MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
-
- CSRegs = MRI->getCalleeSavedRegs();
- unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
- // Not a tight bound
- AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
- RegsUsed.resize(NumRegBanks);
-
- LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
- << '\n');
-
- unsigned StallCycles = collectCandidates(MF);
- NumStallsDetected += StallCycles;
-
- LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
- "function " << MF.getName() << '\n');
-
- LLVM_DEBUG(Candidates.dump(this));
-
- unsigned CyclesSaved = 0;
- while (!Candidates.empty()) {
- Candidate C = Candidates.back();
- unsigned LocalCyclesSaved = tryReassign(C);
- CyclesSaved += LocalCyclesSaved;
-
- if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
- report_fatal_error("RegBank reassign stall cycles verification failed.");
-
- Candidates.pop_back();
- if (LocalCyclesSaved) {
- removeCandidates(C.Reg);
- computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
-
- LLVM_DEBUG(Candidates.dump(this));
- }
- }
- NumStallsRecovered += CyclesSaved;
-
- LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
- << " cycles saved in function " << MF.getName() << '\n');
-
- Candidates.clear();
-
- if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
- report_fatal_error("RegBank reassign stall cycles verification failed.");
-
- RegsUsed.clear();
-
- return CyclesSaved > 0;
-}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index aeec3e886327..3456f9a6156c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -125,12 +125,14 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
- const auto VGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ const auto VGPROcc =
+ std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
- const auto OtherVGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
+ const auto OtherVGPROcc =
+ std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -161,7 +163,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
}
}
return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
- (getVGPRNum() < O.getVGPRNum());
+ (getVGPRNum(ST.hasGFX90AInsts()) <
+ O.getVGPRNum(ST.hasGFX90AInsts()));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -169,7 +172,9 @@ LLVM_DUMP_METHOD
void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
OS << "VGPRs: " << Value[VGPR32] << ' ';
OS << "AGPRs: " << Value[AGPR32];
- if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+ if (ST) OS << "(O"
+ << ST->getOccupancyWithNumVGPRs(getVGPRNum(ST->hasGFX90AInsts()))
+ << ')';
OS << ", SGPRs: " << getSGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
OS << ", LVGPR WT: " << getVGPRTuplesWeight()
@@ -384,6 +389,7 @@ bool GCNDownwardRPTracker::advanceBeforeNext() {
void GCNDownwardRPTracker::advanceToNext() {
LastTrackedMI = &*NextMI++;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
// Add new registers or mask bits.
for (const auto &MO : LastTrackedMI->operands()) {
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ba8c85aa502b..257561cb8430 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -42,12 +42,19 @@ struct GCNRegPressure {
clear();
}
- bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum(false) == 0; }
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
unsigned getSGPRNum() const { return Value[SGPR32]; }
- unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); }
+ unsigned getVGPRNum(bool UnifiedVGPRFile) const {
+ if (UnifiedVGPRFile) {
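+ // With a unified VGPR/AGPR file both register kinds share one budget; when
+ // AGPRs are in use, the VGPR count is rounded up to a multiple of four
+ // before the AGPR count is added on top.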
+ return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
+ : Value[VGPR32] + Value[AGPR32];
+ }
+ return std::max(Value[VGPR32], Value[AGPR32]);
+ }
+ unsigned getAGPRNum() const { return Value[AGPR32]; }
unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE],
Value[AGPR_TUPLE]); }
@@ -55,7 +62,7 @@ struct GCNRegPressure {
unsigned getOccupancy(const GCNSubtarget &ST) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
- ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
}
void inc(unsigned Reg,
@@ -160,7 +167,7 @@ class GCNDownwardRPTracker : public GCNRPTracker {
public:
GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
- const MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+ MachineBasicBlock::const_iterator getNext() const { return NextMI; }
// Reset tracker to the point before the MI
// filling live regs upon this point using LIS.
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 6e2550298dc6..0212b8e17641 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -20,7 +20,8 @@ using namespace llvm;
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
- GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
+ GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false),
+ HasExcessPressure(false), MF(nullptr) { }
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
@@ -103,11 +104,13 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+ HasExcessPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+ HasExcessPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
@@ -121,6 +124,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+ HasExcessPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
@@ -279,6 +283,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
if (SU->isBottomReady())
Bot.removeReady(SU);
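+ // Record whether this region schedules any clustered memory operations, so
+ // that regions without clusters can later skip the unclustered reschedule
+ // stage.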
+ if (!HasClusteredNodes && SU->getInstr()->mayLoadOrStore()) {
+ for (SDep &Dep : SU->Preds) {
+ if (Dep.isCluster()) {
+ HasClusteredNodes = true;
+ break;
+ }
+ }
+ }
+
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
return SU;
@@ -320,22 +333,30 @@ void GCNScheduleDAGMILive::schedule() {
PressureBefore.print(dbgs()));
}
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ // Set HasClusteredNodes to true for late stages where we have already
+ // collected it. That way pickNode() will not scan SDep's when not needed.
+ S.HasClusteredNodes = Stage > InitialSchedule;
+ S.HasExcessPressure = false;
ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
RescheduleRegions[RegionIdx] = false;
+ if (Stage == InitialSchedule && S.HasClusteredNodes)
+ RegionsWithClusters[RegionIdx] = true;
+ if (S.HasExcessPressure)
+ RegionsWithHighRP[RegionIdx] = true;
if (!LIS)
return;
// Check the results of scheduling.
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
auto PressureAfter = getRealRegPressure();
LLVM_DEBUG(dbgs() << "Pressure after scheduling: ";
PressureAfter.print(dbgs()));
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
- PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+ PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
Pressure[RegionIdx] = PressureAfter;
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
@@ -366,9 +387,12 @@ void GCNScheduleDAGMILive::schedule() {
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
- if (PressureAfter.getVGPRNum() > MaxVGPRs ||
- PressureAfter.getSGPRNum() > MaxSGPRs)
+ if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
+ PressureAfter.getAGPRNum() > MaxVGPRs ||
+ PressureAfter.getSGPRNum() > MaxSGPRs) {
RescheduleRegions[RegionIdx] = true;
+ RegionsWithHighRP[RegionIdx] = true;
+ }
if (WavesAfter >= MinOccupancy) {
if (Stage == UnclusteredReschedule &&
@@ -378,6 +402,9 @@ void GCNScheduleDAGMILive::schedule() {
PressureAfter.less(ST, PressureBefore) ||
!RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
+ if (!RegionsWithClusters[RegionIdx] &&
+ (Stage + 1) == UnclusteredReschedule)
+ RescheduleRegions[RegionIdx] = false;
return;
} else {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
@@ -385,7 +412,8 @@ void GCNScheduleDAGMILive::schedule() {
}
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
- RescheduleRegions[RegionIdx] = true;
+ RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
+ (Stage + 1) != UnclusteredReschedule;
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
@@ -460,7 +488,9 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
I = Rgn.first;
auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
auto LRS = BBLiveInMap.lookup(NonDbgMI);
+#ifdef EXPENSIVE_CHECKS
assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
+#endif
RPTracker.reset(*I, &LRS);
}
@@ -516,7 +546,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
+ RegionsWithClusters.resize(Regions.size());
+ RegionsWithHighRP.resize(Regions.size());
RescheduleRegions.set();
+ RegionsWithClusters.reset();
+ RegionsWithHighRP.reset();
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
@@ -561,7 +595,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
SavedMutations.swap(Mutations);
for (auto Region : Regions) {
- if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) {
+ if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) ||
+ (Stage == ClusteredLowOccupancyReschedule &&
+ !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
+
++RegionIdx;
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 2d81d9977c31..15eba3f5eac0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -50,6 +50,14 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
unsigned TargetOccupancy;
+ // schedule() has seen a clustered memory operation. Set it to false
+ // before scheduling a region to know if the region had such clusters.
+ bool HasClusteredNodes;
+
+ // schedule() has seen excess register pressure and had to track
+ // register pressure for actual scheduling heuristics.
+ bool HasExcessPressure;
+
MachineFunction *MF;
public:
@@ -96,6 +104,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// or we generally desire to reschedule it.
BitVector RescheduleRegions;
+ // Record regions which use clustered loads/stores.
+ BitVector RegionsWithClusters;
+
+ // Record regions with high register pressure.
+ BitVector RegionsWithHighRP;
+
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
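
To summarize the bookkeeping added above: pickNode() sets HasClusteredNodes once a picked memory operation has a cluster predecessor edge, and schedule() copies that flag (together with HasExcessPressure) into per-region bit vectors so later stages can skip regions that have neither clusters nor high register pressure. A hedged sketch of the cluster check with stand-in types, not the scheduler's real SUnit/SDep classes:

  #include <vector>

  // Stand-ins for the scheduler's node and edge types (illustrative only).
  struct Edge { bool IsCluster; };
  struct Node { bool MayLoadOrStore; std::vector<Edge> Preds; };

  // True once any picked memory operation carries a cluster predecessor edge,
  // i.e. the region contains clustered loads/stores worth re-examining in the
  // unclustered rescheduling stage.
  static bool regionHasClusters(const std::vector<Node> &PickedNodes) {
    for (const Node &N : PickedNodes) {
      if (!N.MayLoadOrStore)
        continue;
      for (const Edge &E : N.Preds)
        if (E.IsCluster)
          return true;
    }
    return false;
  }

  int main() {
    std::vector<Node> Picked = {{true, {{false}, {true}}}, {false, {}}};
    return regionHasClusters(Picked) ? 0 : 1; // first node has a cluster edge
  }
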
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7a7178126444..bd0c40081c01 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -41,24 +41,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
using AMDGPUSubtarget::getMaxWavesPerEU;
public:
- enum TrapHandlerAbi {
- TrapHandlerAbiNone = 0,
- TrapHandlerAbiHsa = 1
+ // Following 2 enums are documented at:
+ // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+ enum class TrapHandlerAbi {
+ NONE = 0x00,
+ AMDHSA = 0x01,
};
- enum TrapID {
- TrapIDHardwareReserved = 0,
- TrapIDHSADebugTrap = 1,
- TrapIDLLVMTrap = 2,
- TrapIDLLVMDebugTrap = 3,
- TrapIDDebugBreakpoint = 7,
- TrapIDDebugReserved8 = 8,
- TrapIDDebugReservedFE = 0xfe,
- TrapIDDebugReservedFF = 0xff
- };
-
- enum TrapRegValues {
- LLVMTrapHandlerRegValue = 1
+ enum class TrapID {
+ LLVMAMDHSATrap = 0x02,
+ LLVMAMDHSADebugTrap = 0x03,
};
private:
@@ -82,6 +74,7 @@ protected:
bool FastFMAF32;
bool FastDenormalF32;
bool HalfRate64Ops;
+ bool FullRate64Ops;
// Dynamically set bits that enable features.
bool FlatForGlobal;
@@ -95,6 +88,7 @@ protected:
// for XNACK.
bool EnableXNACK;
+ bool EnableTgSplit;
bool EnableCuMode;
bool TrapHandler;
@@ -110,14 +104,17 @@ protected:
bool FP64;
bool FMA;
bool MIMG_R128;
- bool GCN3Encoding;
+ bool IsGCN;
bool CIInsts;
bool GFX8Insts;
bool GFX9Insts;
+ bool GFX90AInsts;
bool GFX10Insts;
bool GFX10_3Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
+ bool NegativeScratchOffsetBug;
+ bool NegativeUnalignedScratchOffsetBug;
bool HasSMemRealTime;
bool HasIntClamp;
bool HasFmaMixInsts;
@@ -132,10 +129,15 @@ protected:
bool HasSDWAOutModsVOPC;
bool HasDPP;
bool HasDPP8;
+ bool Has64BitDPP;
+ bool HasPackedFP32Ops;
+ bool HasExtendedImageInsts;
bool HasR128A16;
bool HasGFX10A16;
bool HasG16;
bool HasNSAEncoding;
+ unsigned NSAMaxSize;
+ bool GFX10_AEncoding;
bool GFX10_BEncoding;
bool HasDLInsts;
bool HasDot1Insts;
@@ -144,6 +146,7 @@ protected:
bool HasDot4Insts;
bool HasDot5Insts;
bool HasDot6Insts;
+ bool HasDot7Insts;
bool HasMAIInsts;
bool HasPkFmacF16Inst;
bool HasAtomicFaddInsts;
@@ -157,6 +160,7 @@ protected:
bool HasVscnt;
bool HasGetWaveIdInst;
bool HasSMemTimeInst;
+ bool HasShaderCyclesRegister;
bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
@@ -165,12 +169,19 @@ protected:
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool ScalarFlatScratchInsts;
+ bool HasArchitectedFlatScratch;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
+ bool R600ALUInst;
+ bool CaymanISA;
+ bool CFALUBug;
bool LDSMisalignedBug;
bool HasMFMAInlineLiteralBug;
+ bool HasVertexCache;
+ short TexVTXClauseSize;
bool UnalignedBufferAccess;
bool UnalignedDSAccess;
+ bool HasPackedTID;
bool ScalarizeGlobal;
bool HasVcmpxPermlaneHazard;
@@ -180,6 +191,7 @@ protected:
bool HasVcmpxExecWARHazard;
bool HasLdsBranchVmemWARHazard;
bool HasNSAtoVMEMBug;
+ bool HasNSAClauseBug;
bool HasOffset3fBug;
bool HasFlatSegmentOffsetBug;
bool HasImageStoreD16Bug;
@@ -241,6 +253,10 @@ public:
return RegBankInfo.get();
}
+ const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
+ return TargetID;
+ }
+
// Nothing implemented, just prevent crashes on use.
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
@@ -271,6 +287,11 @@ public:
unsigned getConstantBusLimit(unsigned Opcode) const;
+ /// Returns true if an instruction whose 16-bit result is returned in a 32-bit
+ /// register implicitly zeroes the high 16 bits, rather than preserving the
+ /// original value.
+ bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
+
bool hasIntClamp() const {
return HasIntClamp;
}
@@ -295,6 +316,10 @@ public:
return HalfRate64Ops;
}
+ bool hasFullRate64Ops() const {
+ return FullRate64Ops;
+ }
+
bool hasAddr64() const {
return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
}
@@ -370,7 +395,12 @@ public:
}
TrapHandlerAbi getTrapHandlerAbi() const {
- return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
+ return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
+ }
+
+ bool supportsGetDoorbellID() const {
+ // The S_GETREG DOORBELL_ID is supported by all GFX9 and later targets.
+ return getGeneration() >= GFX9;
}
/// True if the offset field of DS instructions works as expected. On SI, the
@@ -510,6 +540,10 @@ public:
return TargetID.isXnackOnOrAny();
}
+ bool isTgSplitEnabled() const {
+ return EnableTgSplit;
+ }
+
bool isCuModeEnabled() const {
return EnableCuMode;
}
@@ -666,6 +700,10 @@ public:
return HasDot6Insts;
}
+ bool hasDot7Insts() const {
+ return HasDot7Insts;
+ }
+
bool hasMAIInsts() const {
return HasMAIInsts;
}
@@ -694,6 +732,10 @@ public:
return HasSMemTimeInst;
}
+ bool hasShaderCyclesRegister() const {
+ return HasShaderCyclesRegister;
+ }
+
bool hasRegisterBanking() const {
return HasRegisterBanking;
}
@@ -780,6 +822,9 @@ public:
return GFX8Insts;
}
+ /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+ bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+
bool hasDPP() const {
return HasDPP;
}
@@ -796,6 +841,22 @@ public:
return HasDPP8;
}
+ bool has64BitDPP() const {
+ return Has64BitDPP;
+ }
+
+ bool hasPackedFP32Ops() const {
+ return HasPackedFP32Ops;
+ }
+
+ bool hasFmaakFmamkF32Insts() const {
+ return getGeneration() >= GFX10;
+ }
+
+ bool hasExtendedImageInsts() const {
+ return HasExtendedImageInsts;
+ }
+
bool hasR128A16() const {
return HasR128A16;
}
@@ -818,6 +879,12 @@ public:
bool hasNSAEncoding() const { return HasNSAEncoding; }
+ unsigned getNSAMaxSize() const { return NSAMaxSize; }
+
+ bool hasGFX10_AEncoding() const {
+ return GFX10_AEncoding;
+ }
+
bool hasGFX10_BEncoding() const {
return GFX10_BEncoding;
}
@@ -840,6 +907,12 @@ public:
return SGPRInitBug;
}
+ bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
+
+ bool hasNegativeUnalignedScratchOffsetBug() const {
+ return NegativeUnalignedScratchOffsetBug;
+ }
+
bool hasMFMAInlineLiteralBug() const {
return HasMFMAInlineLiteralBug;
}
@@ -894,8 +967,17 @@ public:
return HasNSAtoVMEMBug;
}
+ bool hasNSAClauseBug() const { return HasNSAClauseBug; }
+
bool hasHardClauses() const { return getGeneration() >= GFX10; }
+ bool hasGFX90AInsts() const { return GFX90AInsts; }
+
+ /// Return if operations acting on VGPR tuples require even alignment.
+ bool needsAlignedVGPRs() const { return GFX90AInsts; }
+
+ bool hasPackedTID() const { return HasPackedTID; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -917,6 +999,10 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
+ /// \returns true if the flat_scratch register is initialized by the HW.
+ /// In this case it is readonly.
+ bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
+
/// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8
bool hasMergedShaders() const {
@@ -955,9 +1041,24 @@ public:
return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
}
- /// \returns Reserved number of SGPRs for given function \p MF.
+ /// \returns Reserved number of SGPRs. This is a common
+ /// utility function called by MachineFunction and
+ /// Function variants of getReservedNumSGPRs.
+ unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
+ /// \returns Reserved number of SGPRs for given machine function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+ /// \returns Reserved number of SGPRs for given function \p F.
+ unsigned getReservedNumSGPRs(const Function &F) const;
+
+ /// \returns max num SGPRs. This is the common utility
+ /// function called by MachineFunction and Function
+ /// variants of getMaxNumSGPRs.
+ unsigned getBaseMaxNumSGPRs(const Function &F,
+ std::pair<unsigned, unsigned> WavesPerEU,
+ unsigned PreloadedSGPRs,
+ unsigned ReservedNumSGPRs) const;
+
/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
@@ -968,6 +1069,16 @@ public:
/// unit requirement.
unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+ /// \returns Maximum number of SGPRs that meets number of waves per execution
+ /// unit requirement for function \p F, or number of SGPRs explicitly
+ /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumSGPRs(const Function &F) const;
+
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
@@ -1000,6 +1111,20 @@ public:
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}
+ /// \returns max num VGPRs. This is the common utility function
+ /// called by MachineFunction and Function variants of getMaxNumVGPRs.
+ unsigned getBaseMaxNumVGPRs(const Function &F,
+ std::pair<unsigned, unsigned> WavesPerEU) const;
+ /// \returns Maximum number of VGPRs that meets number of waves per execution
+ /// unit requirement for function \p F, or number of VGPRs explicitly
+ /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumVGPRs(const Function &F) const;
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
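
One practical consequence of converting TrapHandlerAbi and TrapID to scoped enums above is that call sites can no longer rely on implicit conversion to an integer; the trap immediate has to be extracted explicitly. A small illustrative sketch (the helper name is made up, only the enumerator values come from the hunk):

  #include <cstdint>

  enum class TrapHandlerAbi : uint8_t { NONE = 0x00, AMDHSA = 0x01 };
  enum class TrapID : uint8_t { LLVMAMDHSATrap = 0x02, LLVMAMDHSADebugTrap = 0x03 };

  // Scoped enums do not convert implicitly, so building a trap immediate now
  // requires an explicit cast; 0 here stands in for "no trap encoded" when no
  // trap handler ABI is available.
  static unsigned trapImmediate(TrapHandlerAbi Abi, TrapID Id) {
    return Abi == TrapHandlerAbi::AMDHSA ? static_cast<unsigned>(Id) : 0u;
  }
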
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 426648d19d55..bb2c298c2850 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -80,9 +80,12 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const auto *SymA = Target.getSymA();
assert(SymA);
- Ctx.reportError(Fixup.getLoc(),
- Twine("undefined label '") + SymA->getSymbol().getName() + "'");
- return ELF::R_AMDGPU_NONE;
+ if (SymA->getSymbol().isUndefined()) {
+ Ctx.reportError(Fixup.getLoc(), Twine("undefined label '") +
+ SymA->getSymbol().getName() + "'");
+ return ELF::R_AMDGPU_NONE;
+ }
+ return ELF::R_AMDGPU_REL16;
}
llvm_unreachable("unhandled relocation type");
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index fbf7dc2a72db..9ba0ffbced3d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -10,6 +10,7 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCExpr.h"
@@ -146,7 +147,7 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
bool IsFlatSeg = !(Desc.TSFlags &
- (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch));
+ (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch));
if (IsFlatSeg) { // Unsigned offset
printU16ImmDecOperand(MI, OpNo, O);
@@ -201,20 +202,19 @@ void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "gds");
}
-void AMDGPUInstPrinter::printDLC(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- if (AMDGPU::isGFX10Plus(STI))
- printNamedBit(MI, OpNo, O, "dlc");
-}
-
-void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "glc");
-}
-
-void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "slc");
+void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ auto Imm = MI->getOperand(OpNo).getImm();
+ if (Imm & CPol::GLC)
+ O << " glc";
+ if (Imm & CPol::SLC)
+ O << " slc";
+ if ((Imm & CPol::DLC) && AMDGPU::isGFX10Plus(STI))
+ O << " dlc";
+ if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
+ O << " scc";
+ if (Imm & ~CPol::ALL)
+ O << " /* unexpected cache policy bit */";
}
void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo,
@@ -362,22 +362,30 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ auto Opcode = MI->getOpcode();
+ auto Flags = MII.get(Opcode).TSFlags;
+
if (OpNo == 0) {
- if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
- O << "_e64 ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
- O << "_dpp ";
- else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
- O << "_sdwa ";
- else
- O << "_e32 ";
+ if (Flags & SIInstrFlags::VOP3) {
+ if (!getVOP3IsSingle(Opcode))
+ O << "_e64";
+ } else if (Flags & SIInstrFlags::DPP) {
+ O << "_dpp";
+ } else if (Flags & SIInstrFlags::SDWA) {
+ O << "_sdwa";
+ } else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) ||
+ ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) {
+ O << "_e32";
+ }
+ O << " ";
}
printOperand(MI, OpNo, STI, O);
// Print default vcc/vcc_lo operand.
- switch (MI->getOpcode()) {
+ switch (Opcode) {
default: break;
case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
@@ -601,6 +609,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case MCOI::OPERAND_IMMEDIATE:
printImmediate32(Op.getImm(), STI, O);
break;
@@ -608,6 +620,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
printImmediate64(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
@@ -656,18 +669,19 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
// custom printer.
llvm_unreachable("unexpected immediate operand type");
}
- } else if (Op.isFPImm()) {
+ } else if (Op.isDFPImm()) {
+ double Value = bit_cast<double>(Op.getDFPImm());
// We special case 0.0 because otherwise it will be printed as an integer.
- if (Op.getFPImm() == 0.0)
+ if (Value == 0.0)
O << "0.0";
else {
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
int RCID = Desc.OpInfo[OpNo].RegClass;
unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
if (RCBits == 32)
- printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
+ printImmediate32(FloatToBits(Value), STI, O);
else if (RCBits == 64)
- printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
+ printImmediate64(DoubleToBits(Value), STI, O);
else
llvm_unreachable("Invalid register class size");
}
@@ -727,7 +741,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
if (OpNo + 1 < MI->getNumOperands() &&
(InputModifiers & SISrcMods::ABS) == 0) {
const MCOperand &Op = MI->getOperand(OpNo + 1);
- NegMnemo = Op.isImm() || Op.isFPImm();
+ NegMnemo = Op.isImm() || Op.isDFPImm();
}
if (NegMnemo) {
O << "neg(";
@@ -793,7 +807,16 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
using namespace AMDGPU::DPP;
unsigned Imm = MI->getOperand(OpNo).getImm();
- if (Imm <= DppCtrl::QUAD_PERM_LAST) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src0);
+
+ if (Src0Idx >= 0 &&
+ Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID &&
+ !AMDGPU::isLegal64BitDPPControl(Imm)) {
+ O << " /* 64 bit dpp only supports row_newbcast */";
+ return;
+ } else if (Imm <= DppCtrl::QUAD_PERM_LAST) {
O << "quad_perm:[";
O << formatDec(Imm & 0x3) << ',';
O << formatDec((Imm & 0xc) >> 2) << ',';
@@ -853,11 +876,15 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
O << "row_bcast:31";
} else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) &&
(Imm <= DppCtrl::ROW_SHARE_LAST)) {
- if (!AMDGPU::isGFX10Plus(STI)) {
- O << "/* row_share is not supported on ASICs earlier than GFX10 */";
+ if (AMDGPU::isGFX90A(STI)) {
+ O << "row_newbcast:";
+ } else if (AMDGPU::isGFX10Plus(STI)) {
+ O << "row_share:";
+ } else {
+ O << " /* row_newbcast/row_share is not supported on ASICs earlier "
+ "than GFX90A/GFX10 */";
return;
}
- O << "row_share:";
printU4ImmDecOperand(MI, OpNo, O);
} else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) &&
(Imm <= DppCtrl::ROW_XMASK_LAST)) {
@@ -891,7 +918,7 @@ void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
if (Imm) {
- O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+ O << " bound_ctrl:1";
}
}
@@ -1236,8 +1263,8 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
decodeMsg(Imm16, MsgId, OpId, StreamId);
if (isValidMsgId(MsgId, STI) &&
- isValidMsgOp(MsgId, OpId) &&
- isValidMsgStream(MsgId, OpId, StreamId)) {
+ isValidMsgOp(MsgId, OpId, STI) &&
+ isValidMsgStream(MsgId, OpId, StreamId, STI)) {
O << "sendmsg(" << getMsgName(MsgId);
if (msgRequiresOp(MsgId)) {
O << ", " << getMsgOpName(MsgId, OpId);
@@ -1560,12 +1587,12 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
} else if (Op.isImm()) {
O << Op.getImm();
- } else if (Op.isFPImm()) {
+ } else if (Op.isDFPImm()) {
// We special case 0.0 because otherwise it will be printed as an integer.
- if (Op.getFPImm() == 0.0)
+ if (Op.getDFPImm() == 0.0)
O << "0.0";
else {
- O << Op.getFPImm();
+ O << bit_cast<double>(Op.getDFPImm());
}
} else if (Op.isExpr()) {
const MCExpr *Exp = Op.getExpr();
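
The printCPol() hunk above folds the old glc/slc/dlc printers into a single cache-policy operand and gates each suffix on the subtarget: dlc is only meaningful on GFX10 and later, scc only on GFX90A. The following sketch mirrors that gating with plain booleans; extracting the bits from the operand (the CPol masks in SIDefines.h) is deliberately left out:

  #include <string>

  // Builds the textual cache-policy suffix the printer emits, given already
  // decoded policy bits and the subtarget generation checks.
  static std::string cachePolicySuffix(bool GLC, bool SLC, bool DLC, bool SCC,
                                       bool IsGFX10Plus, bool IsGFX90A) {
    std::string Suffix;
    if (GLC)
      Suffix += " glc";
    if (SLC)
      Suffix += " slc";
    if (DLC && IsGFX10Plus)
      Suffix += " dlc";
    if (SCC && IsGFX90A)
      Suffix += " scc";
    return Suffix;
  }
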
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 8d13aa682211..3cb4fcb28cb0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -68,12 +68,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printCPol(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 1836237c8df5..5c728bd86817 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -42,6 +42,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
HasNoDeadStrip = true;
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
+ UsesCFIForDebug = true;
DwarfRegNumForCFI = true;
UseIntegratedAssembler = false;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index f0eb11b70c97..9a9a2c973f44 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -31,6 +31,20 @@ using namespace llvm::AMDGPU;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
+static void convertIsaVersionV2(uint32_t &Major, uint32_t &Minor,
+ uint32_t &Stepping, bool Sramecc, bool Xnack) {
+ if (Major == 9 && Minor == 0) {
+ switch (Stepping) {
+ case 0:
+ case 2:
+ case 4:
+ case 6:
+ if (Xnack)
+ Stepping++;
+ }
+ }
+}
+
bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
@@ -86,14 +100,18 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013: AK = GK_GFX1013; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031: AK = GK_GFX1031; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032: AK = GK_GFX1032; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: AK = GK_GFX1034; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: AK = GK_GFX1035; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -145,14 +163,18 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908;
case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+ case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
+ case GK_GFX1013: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1013;
case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030;
case GK_GFX1031: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031;
case GK_GFX1032: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032;
case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033;
+ case GK_GFX1034: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034;
+ case GK_GFX1035: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -180,8 +202,8 @@ void AMDGPUTargetAsmStreamer::finish() {
getPALMetadata()->reset();
}
-void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
- OS << "\t.amdgcn_target \"" << Target << "\"\n";
+void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() {
+ OS << "\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n";
}
void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
@@ -191,15 +213,14 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
}
void
-AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
- uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) {
- OS << "\t.hsa_code_object_isa " <<
- Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) <<
- ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
-
+AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
+ uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) {
+ convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
+ OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << ","
+ << Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
}
void
@@ -225,8 +246,8 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
<< Alignment.value() << '\n';
}
-bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
- OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n";
+bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
+ OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
return true;
}
@@ -258,17 +279,32 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
return true;
}
-bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
+bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
- OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
- OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n';
+ const uint32_t Encoded_s_nop = 0xbf800000;
+ uint32_t Encoded_pad = Encoded_s_code_end;
+
+ // Instruction cache line size in bytes.
+ const unsigned Log2CacheLineSize = 6;
+ const unsigned CacheLineSize = 1u << Log2CacheLineSize;
+
+ // Extra padding amount in bytes to support prefetch mode 3.
+ unsigned FillSize = 3 * CacheLineSize;
+
+ if (AMDGPU::isGFX90A(STI)) {
+ Encoded_pad = Encoded_s_nop;
+ FillSize = 16 * CacheLineSize;
+ }
+
+ OS << "\t.p2alignl " << Log2CacheLineSize << ", " << Encoded_pad << '\n';
+ OS << "\t.fill " << (FillSize / 4) << ", 4, " << Encoded_pad << '\n';
return true;
}
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
+ bool ReserveVCC, bool ReserveFlatScr) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());
OS << "\t.amdhsa_kernel " << KernelName << '\n';
@@ -281,10 +317,13 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
<< '\n';
OS << "\t\t.amdhsa_private_segment_fixed_size "
<< KD.private_segment_fixed_size << '\n';
+ OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
- PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ if (!hasArchitectedFlatScratch(STI))
+ PRINT_FIELD(
+ OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
@@ -297,9 +336,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (!hasArchitectedFlatScratch(STI))
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
@@ -307,10 +347,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
- PRINT_FIELD(
- OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ PRINT_FIELD(OS,
+ (hasArchitectedFlatScratch(STI)
+ ? ".amdhsa_enable_private_segment"
+ : ".amdhsa_system_sgpr_private_segment_wavefront_offset"),
+ KD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
@@ -331,12 +373,30 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
+ if (AMDGPU::isGFX90A(STI))
+ OS << "\t\t.amdhsa_accum_offset " <<
+ (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
+ << '\n';
+
if (!ReserveVCC)
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
- if (IVersion.Major >= 7 && !ReserveFlatScr)
+ if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
- if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
- OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
+
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVer) {
+ default:
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ if (getTargetID()->isXnackSupported())
+ OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
+ break;
+ }
+ }
PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
compute_pgm_rsrc1,
@@ -360,6 +420,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ if (AMDGPU::isGFX90A(STI))
+ PRINT_FIELD(OS, ".amdhsa_tg_split", KD,
+ compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
if (IVersion.Major >= 10) {
PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
compute_pgm_rsrc1,
@@ -405,23 +469,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
- : AMDGPUTargetStreamer(S), Streamer(S), Os(STI.getTargetTriple().getOS()) {
- MCAssembler &MCA = getStreamer().getAssembler();
- unsigned EFlags = MCA.getELFHeaderEFlags();
-
- EFlags &= ~ELF::EF_AMDGPU_MACH;
- EFlags |= getElfMach(STI.getCPU());
-
- EFlags &= ~ELF::EF_AMDGPU_XNACK;
- if (AMDGPU::hasXNACK(STI))
- EFlags |= ELF::EF_AMDGPU_XNACK;
-
- EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC;
- if (AMDGPU::hasSRAMECC(STI))
- EFlags |= ELF::EF_AMDGPU_SRAM_ECC;
-
- MCA.setELFHeaderEFlags(EFlags);
-}
+ : AMDGPUTargetStreamer(S), STI(STI), Streamer(S) {}
MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
@@ -431,6 +479,9 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
// We use it for emitting the accumulated PAL metadata as a .note record.
// The PAL metadata is reset after it is emitted.
void AMDGPUTargetELFStreamer::finish() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCA.setELFHeaderEFlags(getEFlags());
+
std::string Blob;
const char *Vendor = getPALMetadata()->getVendor();
unsigned Type = getPALMetadata()->getType();
@@ -456,7 +507,7 @@ void AMDGPUTargetELFStreamer::EmitNote(
unsigned NoteFlags = 0;
// TODO Apparently, this is currently needed for OpenCL as mentioned in
// https://reviews.llvm.org/D74995
- if (Os == Triple::AMDHSA)
+ if (STI.getTargetTriple().getOS() == Triple::AMDHSA)
NoteFlags = ELF::SHF_ALLOC;
S.PushSection();
@@ -472,24 +523,150 @@ void AMDGPUTargetELFStreamer::EmitNote(
S.PopSection();
}
-void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
+unsigned AMDGPUTargetELFStreamer::getEFlags() {
+ switch (STI.getTargetTriple().getArch()) {
+ default:
+ llvm_unreachable("Unsupported Arch");
+ case Triple::r600:
+ return getEFlagsR600();
+ case Triple::amdgcn:
+ return getEFlagsAMDGCN();
+ }
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsR600() {
+ assert(STI.getTargetTriple().getArch() == Triple::r600);
+
+ return getElfMach(STI.getCPU());
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsAMDGCN() {
+ assert(STI.getTargetTriple().getArch() == Triple::amdgcn);
+
+ switch (STI.getTargetTriple().getOS()) {
+ default:
+ // TODO: Why do some tests have "mingw" listed as the OS?
+ // llvm_unreachable("Unsupported OS");
+ case Triple::UnknownOS:
+ return getEFlagsUnknownOS();
+ case Triple::AMDHSA:
+ return getEFlagsAMDHSA();
+ case Triple::AMDPAL:
+ return getEFlagsAMDPAL();
+ case Triple::Mesa3D:
+ return getEFlagsMesa3D();
+ }
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() {
+ // TODO: Why do some tests have "mingw" listed as the OS?
+ // assert(STI.getTargetTriple().getOS() == Triple::UnknownOS);
+
+ return getEFlagsV3();
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
+ assert(STI.getTargetTriple().getOS() == Triple::AMDHSA);
+
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return getEFlagsV3();
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return getEFlagsV4();
+ }
+ }
+
+ llvm_unreachable("HSA OS ABI Version identification must be defined");
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() {
+ assert(STI.getTargetTriple().getOS() == Triple::AMDPAL);
+
+ return getEFlagsV3();
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsMesa3D() {
+ assert(STI.getTargetTriple().getOS() == Triple::Mesa3D);
+
+ return getEFlagsV3();
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsV3() {
+ unsigned EFlagsV3 = 0;
+
+ // mach.
+ EFlagsV3 |= getElfMach(STI.getCPU());
+
+ // xnack.
+ if (getTargetID()->isXnackOnOrAny())
+ EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_XNACK_V3;
+ // sramecc.
+ if (getTargetID()->isSramEccOnOrAny())
+ EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_V3;
+
+ return EFlagsV3;
+}
+
+unsigned AMDGPUTargetELFStreamer::getEFlagsV4() {
+ unsigned EFlagsV4 = 0;
+
+ // mach.
+ EFlagsV4 |= getElfMach(STI.getCPU());
+
+ // xnack.
+ switch (getTargetID()->getXnackSetting()) {
+ case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Any:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Off:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::On:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4;
+ break;
+ }
+ // sramecc.
+ switch (getTargetID()->getSramEccSetting()) {
+ case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Any:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ANY_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::Off:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4;
+ break;
+ case AMDGPU::IsaInfo::TargetIDSetting::On:
+ EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ON_V4;
+ break;
+ }
+
+ return EFlagsV4;
+}
+
+void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {}
void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
uint32_t Major, uint32_t Minor) {
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
- ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
+ ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
OS.emitInt32(Major);
OS.emitInt32(Minor);
});
}
void
-AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
- uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) {
+AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
+ uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) {
uint16_t VendorNameSize = VendorName.size() + 1;
uint16_t ArchNameSize = ArchName.size() + 1;
@@ -497,8 +674,9 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
VendorNameSize + ArchNameSize;
+ convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
- ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
+ ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) {
OS.emitInt16(VendorNameSize);
OS.emitInt16(ArchNameSize);
OS.emitInt32(Major);
@@ -546,7 +724,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
SymbolELF->setSize(MCConstantExpr::create(Size, getContext()));
}
-bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
+bool AMDGPUTargetELFStreamer::EmitISAVersion() {
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
auto &Context = getContext();
@@ -556,10 +734,10 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);
- EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
+ EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_ISA_NAME,
[&](MCELFStreamer &OS) {
OS.emitLabel(DescBegin);
- OS.emitBytes(IsaVersionString);
+ OS.emitBytes(getTargetID()->toString());
OS.emitLabel(DescEnd);
});
return true;
@@ -607,7 +785,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);
- EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
+ EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA,
[&](MCELFStreamer &OS) {
OS.emitLabel(DescBegin);
OS.emitBytes(HSAMetadataString);
@@ -616,14 +794,28 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
return true;
}
-bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
+bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
+ const uint32_t Encoded_s_nop = 0xbf800000;
+ uint32_t Encoded_pad = Encoded_s_code_end;
+
+ // Instruction cache line size in bytes.
+ const unsigned Log2CacheLineSize = 6;
+ const unsigned CacheLineSize = 1u << Log2CacheLineSize;
+
+ // Extra padding amount in bytes to support prefetch mode 3.
+ unsigned FillSize = 3 * CacheLineSize;
+
+ if (AMDGPU::isGFX90A(STI)) {
+ Encoded_pad = Encoded_s_nop;
+ FillSize = 16 * CacheLineSize;
+ }
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.emitValueToAlignment(64, Encoded_s_code_end, 4);
- for (unsigned I = 0; I < 48; ++I)
- OS.emitInt32(Encoded_s_code_end);
+ OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4);
+ for (unsigned I = 0; I < FillSize; I += 4)
+ OS.emitInt32(Encoded_pad);
OS.PopSection();
return true;
}
@@ -631,8 +823,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) {
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
@@ -659,8 +850,11 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
Streamer.emitLabel(KernelDescriptorSymbol);
Streamer.emitInt32(KernelDescriptor.group_segment_fixed_size);
Streamer.emitInt32(KernelDescriptor.private_segment_fixed_size);
+ Streamer.emitInt32(KernelDescriptor.kernarg_size);
+
for (uint8_t Res : KernelDescriptor.reserved0)
Streamer.emitInt8(Res);
+
// FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
// expression being created is:
// (start of kernel code) - (start of kernel descriptor)
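
The two EmitCodeEnd() implementations above share the same arithmetic: align to a 64-byte instruction cache line, then emit three cache lines of s_code_end padding, or sixteen cache lines of s_nop on GFX90A. A standalone sketch of the fill computation (illustrative, not the streamer API):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Returns the dwords EmitCodeEnd() would append after alignment: three
  // 64-byte cache lines of s_code_end normally, sixteen lines of s_nop on
  // GFX90A.
  static std::vector<uint32_t> codeEndFill(bool IsGFX90A) {
    const uint32_t EncodedSCodeEnd = 0xbf9f0000;
    const uint32_t EncodedSNop = 0xbf800000;
    const unsigned CacheLineSize = 64;
    const unsigned FillBytes = (IsGFX90A ? 16 : 3) * CacheLineSize;
    return std::vector<uint32_t>(FillBytes / 4,
                                 IsGFX90A ? EncodedSNop : EncodedSCodeEnd);
  }

  int main() {
    // 48 dwords, matching the asm streamer's ".fill 48, 4, ..." directive.
    std::printf("%zu dwords\n", codeEndFill(false).size());
  }
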
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 1ad64532931c..cef34a5e5a59 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUPALMetadata.h"
#include "llvm/MC/MCStreamer.h"
@@ -23,6 +24,7 @@ class MCSymbol;
class MDNode;
class Module;
class Type;
+class formatted_raw_ostream;
namespace AMDGPU {
namespace HSAMD {
@@ -38,6 +40,9 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
AMDGPUPALMetadata PALMetadata;
protected:
+ // TODO: Move HSAMetadataStream to AMDGPUTargetStreamer.
+ Optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID;
+
MCContext &getContext() const { return Streamer.getContext(); }
public:
@@ -45,15 +50,15 @@ public:
AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; }
- virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
+ virtual void EmitDirectiveAMDGCNTarget() = 0;
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;
- virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) = 0;
+ virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
+ uint32_t Stepping,
+ StringRef VendorName,
+ StringRef ArchName) = 0;
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;
@@ -63,7 +68,7 @@ public:
Align Alignment) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
+ virtual bool EmitISAVersion() = 0;
/// \returns True on success, false on failure.
virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
@@ -84,16 +89,32 @@ public:
virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitCodeEnd() = 0;
+ virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) = 0;
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) = 0;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) = 0;
static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);
+
+ const Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() const {
+ return TargetID;
+ }
+ Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() {
+ return TargetID;
+ }
+ void initializeTargetID(const MCSubtargetInfo &STI) {
+ assert(TargetID == None && "TargetID can only be initialized once");
+ TargetID.emplace(STI);
+ }
+ void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) {
+ initializeTargetID(STI);
+
+ assert(getTargetID() != None && "TargetID is None");
+ getTargetID()->setTargetIDFromFeaturesString(FeatureString);
+ }
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
@@ -103,14 +124,14 @@ public:
void finish() override;
- void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+ void EmitDirectiveAMDGCNTarget() override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
- void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
- uint32_t Stepping, StringRef VendorName,
- StringRef ArchName) override;
+ void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
+ uint32_t Stepping, StringRef VendorName,
+ StringRef ArchName) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
@@ -119,7 +140,7 @@ public:
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
- bool EmitISAVersion(StringRef IsaVersionString) override;
+ bool EmitISAVersion() override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
@@ -128,22 +149,34 @@ public:
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitCodeEnd() override;
+ bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
+ const MCSubtargetInfo &STI;
MCStreamer &Streamer;
- Triple::OSType Os;
void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc);
+ unsigned getEFlags();
+
+ unsigned getEFlagsR600();
+ unsigned getEFlagsAMDGCN();
+
+ unsigned getEFlagsUnknownOS();
+ unsigned getEFlagsAMDHSA();
+ unsigned getEFlagsAMDPAL();
+ unsigned getEFlagsMesa3D();
+
+ unsigned getEFlagsV3();
+ unsigned getEFlagsV4();
+
public:
AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
@@ -151,14 +184,14 @@ public:
void finish() override;
- void EmitDirectiveAMDGCNTarget(StringRef Target) override;
+ void EmitDirectiveAMDGCNTarget() override;
void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;
- void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
- uint32_t Stepping, StringRef VendorName,
- StringRef ArchName) override;
+ void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
+ uint32_t Stepping, StringRef VendorName,
+ StringRef ArchName) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
@@ -167,7 +200,7 @@ public:
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
- bool EmitISAVersion(StringRef IsaVersionString) override;
+ bool EmitISAVersion() override;
/// \returns True on success, false on failure.
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
@@ -176,13 +209,12 @@ public:
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
- bool EmitCodeEnd() override;
+ bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- bool ReserveXNACK) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};
}
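
The TargetID plumbing added above follows a one-shot initialization pattern: the Optional starts empty, initializeTargetID() populates it exactly once, and the two-argument overload then refines it from a feature string. A stripped-down illustration of that pattern using std::optional and a plain string in place of AMDGPUTargetID:

  #include <cassert>
  #include <optional>
  #include <string>

  struct TargetIDHolder {
    std::optional<std::string> TargetID;

    void initialize(const std::string &Processor) {
      assert(!TargetID && "TargetID can only be initialized once");
      TargetID = Processor;
    }
    // Stand-in for initializeTargetID(STI, FeatureString): initialize first,
    // then fold the feature string into the already-constructed value.
    void initialize(const std::string &Processor, const std::string &Features) {
      initialize(Processor);
      *TargetID += Features;
    }
  };
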
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 1a1ffcda3b4e..dbce4b2e872c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -71,6 +71,9 @@ public:
unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+private:
+ uint64_t getImplicitOpSelHiEncoding(int Opcode) const;
};
} // end anonymous namespace
@@ -219,7 +222,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
Imm = C->getValue();
} else {
- assert(!MO.isFPImm());
+ assert(!MO.isDFPImm());
if (!MO.isImm())
return ~0;
@@ -234,12 +237,17 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -274,16 +282,40 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
}
}
+uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
+ using namespace AMDGPU::VOP3PEncoding;
+ using namespace AMDGPU::OpName;
+
+ if (AMDGPU::getNamedOperandIdx(Opcode, op_sel_hi) != -1) {
+ if (AMDGPU::getNamedOperandIdx(Opcode, src2) != -1)
+ return 0;
+ if (AMDGPU::getNamedOperandIdx(Opcode, src1) != -1)
+ return OP_SEL_HI_2;
+ if (AMDGPU::getNamedOperandIdx(Opcode, src0) != -1)
+ return OP_SEL_HI_1 | OP_SEL_HI_2;
+ }
+ return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
+}
+
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
verifyInstructionPredicates(MI,
computeAvailableFeatures(STI.getFeatureBits()));
+ int Opcode = MI.getOpcode();
uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ const MCInstrDesc &Desc = MCII.get(Opcode);
unsigned bytes = Desc.getSize();
+ // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
+ // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel.
+ if ((Desc.TSFlags & SIInstrFlags::VOP3P) ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) {
+ Encoding |= getImplicitOpSelHiEncoding(Opcode);
+ }
+
for (unsigned i = 0; i < bytes; i++) {
OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
}
@@ -431,6 +463,7 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
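
The getImplicitOpSelHiEncoding() hunk above defaults the op_sel_hi bits of the source operands an opcode does not have to 1, so only operands that actually exist encode their own bits. A sketch of that selection with abstract flags; the real OP_SEL_HI_* bit positions are defined in AMDGPU::VOP3PEncoding and are not reproduced here:

  // Abstract stand-ins for OP_SEL_HI_0/1/2; real bit positions live in
  // SIDefines.h and differ from these values.
  enum : unsigned { HI0 = 1u, HI1 = 2u, HI2 = 4u };

  // Mirrors the selection logic: with an op_sel_hi operand present, only the
  // bits of missing sources are forced to 1; without one, all three are.
  static unsigned implicitOpSelHi(bool HasOpSelHi, bool HasSrc0, bool HasSrc1,
                                  bool HasSrc2) {
    if (HasOpSelHi) {
      if (HasSrc2)
        return 0;
      if (HasSrc1)
        return HI2;
      if (HasSrc0)
        return HI1 | HI2;
    }
    return HI0 | HI1 | HI2;
  }
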
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 54c8cdf196ac..bacb790aac62 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -11,12 +11,14 @@
//
// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8)
// - MIMGEncGfx8: encoding introduced with gfx8 for atomics
-// - MIMGEncGfx10Default: gfx default (non-NSA) encoding
+// - MIMGEncGfx90a: encoding for gfx90a for atomics
+// - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding
// - MIMGEncGfx10NSA: gfx10 NSA encoding
class MIMGEncoding;
def MIMGEncGfx6 : MIMGEncoding;
def MIMGEncGfx8 : MIMGEncoding;
+def MIMGEncGfx90a : MIMGEncoding;
def MIMGEncGfx10Default : MIMGEncoding;
def MIMGEncGfx10NSA : MIMGEncoding;
@@ -39,6 +41,8 @@ class MIMGBaseOpcode : PredicateControl {
bit Coordinates = 1;
bit LodOrClampOrMip = 0;
bit HasD16 = 0;
+ bit IsAtomicRet = 0;
+ bit MSAA = 0;
}
def MIMGBaseOpcode : GenericEnum {
@@ -50,7 +54,7 @@ def MIMGBaseOpcodesTable : GenericTable {
let CppTypeName = "MIMGBaseOpcodeInfo";
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
- "LodOrClampOrMip", "HasD16"];
+ "LodOrClampOrMip", "HasD16", "MSAA"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
let PrimaryKey = ["BaseOpcode"];
@@ -64,7 +68,7 @@ def MIMGDim : GenericEnum {
def MIMGDimInfoTable : GenericTable {
let FilterClass = "AMDGPUDimProps";
let CppTypeName = "MIMGDimInfo";
- let Fields = ["Dim", "NumCoords", "NumGradients", "DA", "Encoding", "AsmSuffix"];
+ let Fields = ["Dim", "NumCoords", "NumGradients", "MSAA", "DA", "Encoding", "AsmSuffix"];
string TypeOf_Dim = "MIMGDim";
let PrimaryKey = ["Dim"];
@@ -81,9 +85,17 @@ def getMIMGDimInfoByAsmSuffix : SearchIndex {
let Key = ["AsmSuffix"];
}
-class mimg <bits<8> si_gfx10, bits<8> vi = si_gfx10> {
- field bits<8> SI_GFX10 = si_gfx10;
- field bits<8> VI = vi;
+def MIMG {
+ int NOP = -1;
+}
+
+class mimgopc <int base, int vi = base, int si = base> {
+ field bits<8> BASE = base; // Opcode for all but atomics
+ field bits<8> VI = vi; // VI is only used for atomic instructions
+ field bits<8> SI = si; // SI is only used for atomic instructions
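+  // HAS_* is set when the corresponding opcode is valid; it gates which
+  // encoding variants are defined for this instruction.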
+ bit HAS_BASE = !ne(base, MIMG.NOP);
+ bit HAS_VI = !ne(vi, MIMG.NOP);
+ bit HAS_SI = !ne(si, MIMG.NOP);
}
class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> {
@@ -198,14 +210,24 @@ class MIMGNSAHelper<int num_addrs> {
// Base class of all pre-gfx10 MIMG instructions.
class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx6789<op> {
- let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
- let AssemblerPredicate = isGFX6GFX7GFX8GFX9;
+ let SubtargetPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
+ let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx6;
let d16 = !if(BaseOpcode.HasD16, ?, 0);
}
+class MIMG_gfx90a<bits<8> op, dag outs, string dns = "">
+ : MIMG<outs, dns>, MIMGe_gfx90a<op> {
+ let SubtargetPredicate = isGFX90APlus;
+ let AssemblerPredicate = isGFX90APlus;
+
+ let MIMGEncoding = MIMGEncGfx90a;
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+}
+
// Base class of all non-NSA gfx10 MIMG instructions.
class MIMG_gfx10<int op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx10<op> {
@@ -218,8 +240,8 @@ class MIMG_gfx10<int op, dag outs, string dns = "">
let nsa = 0;
}
-// Base class for all NSA MIMG instructions. Note that 1-dword addresses always
-// use non-NSA variants.
+// Base class for all NSA MIMG instructions.
+// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
: MIMG<outs, dns>, MIMGe_gfx10<op> {
let SubtargetPredicate = isGFX10Plus;
@@ -235,169 +257,229 @@ class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
let nsa = nsah.NSA;
}
-class MIMG_NoSampler_Helper <bits<8> op, string asm,
+class MIMG_NoSampler_Helper <mimgopc op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
string dns="">
- : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
+ : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_NoSampler_gfx10<int op, string opcode,
+class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass addr_rc,
+ string dns="">
+ : MIMG_gfx90a <op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_NoSampler_nsa_gfx10<int op, string opcode,
+class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
+multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
RegisterClass dst_rc,
- bit enableDisasm> {
+ bit enableDisasm,
+ bit ExtendedImageInst = 1> {
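+  // Extended image instructions have no gfx90a encodings, so the _gfx90a
+  // variants below are only created when ExtendedImageInst is 0.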
let ssamp = 0 in {
let VAddrDwords = 1 in {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ if !not(ExtendedImageInst) then
+ def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "GFX90A", "")>;
+ def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
let VAddrDwords = 2 in {
- def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
- def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
- def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
+ if op.HAS_BASE then {
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
+ if !not(ExtendedImageInst) then
+ def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>;
+ }
}
let VAddrDwords = 3 in {
- def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
- def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
- def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
+ if op.HAS_BASE then {
+ def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
+ if !not(ExtendedImageInst) then
+ def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>;
+ }
}
let VAddrDwords = 4 in {
- def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
- def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
- def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
+ if !not(ExtendedImageInst) then
+ def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
}
}
-multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
- bit isResInfo = 0> {
+multiclass MIMG_NoSampler <mimgopc op, string asm, bit has_d16, bit mip = 0,
+ bit isResInfo = 0,
+ bit msaa = 0> {
def "" : MIMGBaseOpcode {
let Coordinates = !not(isResInfo);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
+ let MSAA = msaa;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
mayLoad = !not(isResInfo) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1, msaa>;
let VDataDwords = 2 in
- defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>;
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0, msaa>;
let VDataDwords = 3 in
- defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0, msaa>;
let VDataDwords = 4 in
- defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0, msaa>;
let VDataDwords = 5 in
- defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
+ defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0, msaa>;
}
}
-class MIMG_Store_Helper <bits<8> op, string asm,
+class MIMG_Store_Helper <mimgopc op, string asm,
RegisterClass data_rc,
RegisterClass addr_rc,
string dns = "">
- : MIMG_gfx6789<op, (outs), dns> {
+ : MIMG_gfx6789<op.BASE, (outs), dns> {
let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Store_gfx10<int op, string opcode,
+class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
+ RegisterClass data_rc,
+ RegisterClass addr_rc,
+ string dns = "">
+ : MIMG_gfx90a<op.BASE, (outs), dns> {
+ let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+ addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Store_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op, (outs), dns> {
+ : MIMG_gfx10<op.BASE, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
- DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Store_nsa_gfx10<int op, string opcode,
+class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op, (outs), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.BASE, (outs), num_addrs, dns> {
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-multiclass MIMG_Store_Addr_Helper <int op, string asm,
+multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
RegisterClass data_rc,
bit enableDisasm> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
DisableWQM = 1, ssamp = 0 in {
let VAddrDwords = 1 in {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "GFX90A", "")>;
+ def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
let VAddrDwords = 2 in {
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
- def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
- def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ if op.HAS_BASE then {
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ }
}
let VAddrDwords = 3 in {
- def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
- def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
- def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ if op.HAS_BASE then {
+ def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ }
}
let VAddrDwords = 4 in {
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
- def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
- def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+ def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
}
}
-multiclass MIMG_Store <bits<8> op, string asm, bit has_d16, bit mip = 0> {
+multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> {
def "" : MIMGBaseOpcode {
let Store = 1;
let LodOrClampOrMip = mip;
@@ -425,43 +507,63 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
- let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
+}
+
+class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, string dns="">
+ : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> {
+ let Constraints = "$vdst = $vdata";
+ let AsmMatchConverter = "cvtMIMGAtomic";
+
+ let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
+ addr_rc:$vaddr, SReg_256:$srsrc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da);
+ let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
}
-class MIMG_Atomic_si<mimg op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_si<mimgopc op, string asm, RegisterClass data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
- : MIMG_Atomic_gfx6789_base<op.SI_GFX10, asm, data_rc, addr_rc,
+ : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc,
!if(enableDasm, "GFX6GFX7", "")> {
let AssemblerPredicate = isGFX6GFX7;
}
-class MIMG_Atomic_vi<mimg op, string asm, RegisterClass data_rc,
+class MIMG_Atomic_vi<mimgopc op, string asm, RegisterClass data_rc,
RegisterClass addr_rc, bit enableDasm = 0>
: MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> {
- let AssemblerPredicate = isGFX8GFX9;
+ let AssemblerPredicate = isGFX8GFX9NotGFX90A;
let MIMGEncoding = MIMGEncGfx8;
}
-class MIMG_Atomic_gfx10<mimg op, string opcode,
+class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc,
+ RegisterClass addr_rc, bit enableDasm = 0>
+ : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> {
+ let AssemblerPredicate = isGFX90APlus;
+ let MIMGEncoding = MIMGEncGfx90a;
+}
+
+class MIMG_Atomic_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
bit enableDisasm = 0>
- : MIMG_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst),
+ : MIMG_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst),
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
- DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
-class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
+class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
bit enableDisasm = 0>
- : MIMG_nsa_gfx10<!cast<int>(op.SI_GFX10), (outs DataRC:$vdst), num_addrs,
+ : MIMG_nsa_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
@@ -469,95 +571,137 @@ class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
-multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
+multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
RegisterClass data_rc,
- bit enableDasm = 0> {
+ bit enableDasm = 0,
+ bit isFP = 0> {
let hasSideEffects = 1, // FIXME: remove this
mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
- ssamp = 0 in {
+ ssamp = 0, FPAtomic = isFP in {
let VAddrDwords = 1 in {
- def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
- def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
- def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ if op.HAS_SI then {
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_VI then {
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_BASE then {
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
}
let VAddrDwords = 2 in {
- def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
- def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
- def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ if op.HAS_SI then {
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ }
+ if op.HAS_VI then {
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
+ def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
+ }
+ if op.HAS_BASE then {
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ }
}
let VAddrDwords = 3 in {
- def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
- def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
- def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ if op.HAS_SI then {
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ }
+ if op.HAS_VI then {
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
+ def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
+ }
+ if op.HAS_BASE then {
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ }
}
let VAddrDwords = 4 in {
- def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
- def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
- def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ if op.HAS_SI then {
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ }
+ if op.HAS_VI then {
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
+ def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+ }
+ if op.HAS_BASE then {
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ }
}
}
}
-multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics
- def "" : MIMGBaseOpcode {
- let Atomic = 1;
- let AtomicX2 = isCmpSwap;
- }
+multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> { // 64-bit atomics
+ let IsAtomicRet = 1 in {
+ def "" : MIMGBaseOpcode {
+ let Atomic = 1;
+ let AtomicX2 = isCmpSwap;
+ }
- let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
- // _V* variants have different dst size, but the size is encoded implicitly,
- // using dmask and tfe. Only 32-bit variant is registered with disassembler.
- // Other variants are reconstructed by disassembler using dmask and tfe.
- let VDataDwords = !if(isCmpSwap, 2, 1) in
- defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>;
- let VDataDwords = !if(isCmpSwap, 4, 2) in
- defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>;
- }
+ let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in {
+ // _V* variants have different dst size, but the size is encoded implicitly,
+ // using dmask and tfe. Only 32-bit variant is registered with disassembler.
+ // Other variants are reconstructed by disassembler using dmask and tfe.
+ let VDataDwords = !if(isCmpSwap, 2, 1) in
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>;
+ let VDataDwords = !if(isCmpSwap, 4, 2) in
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>;
+ }
+ } // End IsAtomicRet = 1
}
-class MIMG_Sampler_Helper <bits<8> op, string asm, RegisterClass dst_rc,
+class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
- : MIMG_gfx6789 <op, (outs dst_rc:$vdata), dns> {
+ : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> {
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
- DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$cpol$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Sampler_gfx10<int op, string opcode,
+class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
+ RegisterClass src_rc, string dns="">
+ : MIMG_gfx90a<op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
+ let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+ DMask:$dmask, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, LWE:$lwe, DA:$da),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$cpol$r128$lwe$da"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class MIMG_Sampler_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, RegisterClass AddrRC,
string dns="">
- : MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
+ : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
- DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$a16$tfe$lwe"
+ #"$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMG_Sampler_nsa_gfx10<int op, string opcode,
+class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
string dns="">
- : MIMG_nsa_gfx10<op, (outs DataRC:$vdata), num_addrs, dns> {
+ : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
- Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
+ Dim:$dim, UNorm:$unorm, CPol:$cpol,
+ R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$a16$tfe$lwe"
+ #"$cpol$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -569,8 +713,11 @@ class MIMGAddrSize<int dw, bit enable_disasm> {
!if(!eq(NumWords, 2), VReg_64,
!if(!eq(NumWords, 3), VReg_96,
!if(!eq(NumWords, 4), VReg_128,
+ !if(!eq(NumWords, 5), VReg_160,
+ !if(!eq(NumWords, 6), VReg_192,
+ !if(!eq(NumWords, 7), VReg_224,
!if(!le(NumWords, 8), VReg_256,
- !if(!le(NumWords, 16), VReg_512, ?)))))));
+ !if(!le(NumWords, 16), VReg_512, ?))))))))));
// Whether the instruction variant with this vaddr size should be enabled for
// the auto-generated disassembler.
@@ -588,9 +735,9 @@ class isRangeInList<int min, int max, list<int> lst> {
bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max))));
}
-class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> {
- list<MIMGAddrSize> List = lst;
- int Min = min;
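+// An inclusive range of address-dword counts; a single-element list means
+// Min == Max.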
+class MIMGAddrSizes_dw_range<list<int> range> {
+ int Min = !head(range);
+ int Max = !if(!empty(!tail(range)), Min, !head(!tail(range)));
}
class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
@@ -600,8 +747,8 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
list<int> AllNumAddrWords =
!foreach(dw, !if(sample.Gradients,
!if(!eq(sample.LodOrClamp, ""),
- [2, 3, 4, 5, 6, 7, 9],
- [2, 3, 4, 5, 7, 8, 10]),
+ [2, 3, 4, 5, 6, 7, 8, 9],
+ [2, 3, 4, 5, 6, 7, 8, 9, 10]),
!if(!eq(sample.LodOrClamp, ""),
[1, 2, 3],
[1, 2, 3, 4])),
@@ -611,12 +758,17 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
// required numbers of address words. The disassembler defaults to the
// smallest register class.
list<MIMGAddrSize> MachineInstrs =
- !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw,
- !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret,
- MIMGAddrSizes_tmp<
- !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]),
- !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords
- lhs)).List;
+ !foldl([]<MIMGAddrSize>,
+ !foreach(range,
+ // V4 is generated for V3 and V4
+ // V8 is generated for V5 through V8
+ // V16 is generated for V9 through V16
+ [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]],
+ MIMGAddrSizes_dw_range<range>),
+ lhs, dw,
+ !if(isRangeInList<dw.Min, dw.Max, AllNumAddrWords>.ret,
+ !listconcat(lhs, [MIMGAddrSize<dw.Max, !empty(lhs)>]),
+ lhs));
// For NSA, generate machine instructions for all possible numbers of words
// except 1 (which is already covered by the non-NSA case).
@@ -632,25 +784,34 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> {
lhs))));
}
-multiclass MIMG_Sampler_Src_Helper <bits<8> op, string asm,
+multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
AMDGPUSampleVariant sample, RegisterClass dst_rc,
- bit enableDisasm = 0> {
+ bit enableDisasm = 0,
+ bit ExtendedImageInst = 1> {
foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in {
let VAddrDwords = addr.NumWords in {
- def _V # addr.NumWords
- : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
- def _V # addr.NumWords # _gfx10
- : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V # addr.NumWords
+ : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ if !not(ExtendedImageInst) then
+ def _V # addr.NumWords # _gfx90a
+ : MIMG_Sampler_gfx90a <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "GFX90A", "")>;
+ def _V # addr.NumWords # _gfx10
+ : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in {
let VAddrDwords = addr.NumWords in {
- def _V # addr.NumWords # _nsa_gfx10
- : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
- !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ if op.HAS_BASE then {
+ def _V # addr.NumWords # _nsa_gfx10
+ : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
+ !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
+ }
}
}
}
@@ -663,9 +824,10 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
let LodOrClampOrMip = !ne(sample.LodOrClamp, "");
}
-multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
bit isG16 = 0, bit isGetLod = 0,
- string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> {
+ string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
+ bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = !not(isGetLod);
let G16 = isG16;
@@ -674,22 +836,22 @@ multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
mayLoad = !not(isGetLod) in {
let VDataDwords = 1 in
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>;
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1, ExtendedImageInst>;
let VDataDwords = 2 in
- defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 0, ExtendedImageInst>;
let VDataDwords = 3 in
- defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96, 0, ExtendedImageInst>;
let VDataDwords = 4 in
- defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 0, ExtendedImageInst>;
let VDataDwords = 5 in
- defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
+ defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160, 0, ExtendedImageInst>;
}
}
-multiclass MIMG_Sampler_WQM <bits<8> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample>
: MIMG_Sampler<op, sample, 1>;
-multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
+multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
@@ -697,7 +859,7 @@ multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
- Gather4 = 1, hasPostISelHook = 0 in {
+ Gather4 = 1 in {
let VDataDwords = 2 in
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
@@ -707,11 +869,11 @@ multiclass MIMG_Gather <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
}
}
-multiclass MIMG_Gather_WQM <bits<8> op, AMDGPUSampleVariant sample>
+multiclass MIMG_Gather_WQM <mimgopc op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
-class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A16>
- : MIMG_gfx10<op, (outs VReg_128:$vdata), "AMDGPU"> {
+class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit A16>
+ : MIMG_gfx10<op.BASE, (outs VReg_128:$vdata), "AMDGPU"> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
!if(A16, (ins GFX10A16:$a16), (ins)));
@@ -720,25 +882,23 @@ class MIMG_IntersectRay_gfx10<int op, string opcode, RegisterClass AddrRC, bit A
let nsa = 0;
}
-class MIMG_IntersectRay_nsa_gfx10<int op, string opcode, int num_addrs, bit A16>
- : MIMG_nsa_gfx10<op, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
+class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit A16>
+ : MIMG_nsa_gfx10<op.BASE, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
let InOperandList = !con(nsah.AddrIns,
(ins SReg_128:$srsrc),
!if(A16, (ins GFX10A16:$a16), (ins)));
let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", "");
}
-multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> {
+multiclass MIMG_IntersectRay<mimgopc op, string opcode, int num_addrs, bit A16> {
def "" : MIMGBaseOpcode;
- let SubtargetPredicate = HasGFX10_BEncoding,
- AssemblerPredicate = HasGFX10_BEncoding,
+ let SubtargetPredicate = HasGFX10_AEncoding,
+ AssemblerPredicate = HasGFX10_AEncoding,
AsmMatchConverter = !if(A16, "cvtIntersectRay", ""),
dmask = 0xf,
unorm = 1,
d16 = 0,
- glc = 0,
- slc = 0,
- dlc = 0,
+ cpol = 0,
tfe = 0,
lwe = 0,
r128 = 1,
@@ -762,131 +922,133 @@ multiclass MIMG_IntersectRay<int op, string opcode, int num_addrs, bit A16> {
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>;
-defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>;
-defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>;
-defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>;
-defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>;
-defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>;
-defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>;
-defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>;
-defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>;
-
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>;
-
-defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>;
-defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
-defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
-defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
-defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
-defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
-defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
-defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
-defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
-defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
-defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
-defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
-//let FPAtomic = 1 in {
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
-//} // End let FPAtomic = 1
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
-defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
-defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
-defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
-
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">;
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
-defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>;
+defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00>, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01>, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02>, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03>, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04>, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05>, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <mimgopc<0x08>, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x09>, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x0a>, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x0b>, "image_store_mip_pck", 0, 1>;
+
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x0e>, "image_get_resinfo", 0, 1, 1>;
+
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0f, 0x10, 0x0f>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x11, 0x12, 0x11>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x12, 0x13, 0x12>, "image_atomic_sub">;
+defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">;
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec">;
+defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 0, 1>;
+defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
+defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x20>, AMDGPUSample>;
+let OtherPredicates = [HasExtendedImageInsts] in {
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x21>, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x22>, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x23>, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0xa2>, AMDGPUSample_d, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0xa3>, AMDGPUSample_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x24>, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x25>, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x26>, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x27>, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x28>, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x29>, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x2a>, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x2b>, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0xaa>, AMDGPUSample_c_d, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0xab>, AMDGPUSample_c_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x2c>, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x2d>, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x2e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x2f>, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x30>, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x31>, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x32>, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x33>, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0xb2>, AMDGPUSample_d_o, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x34>, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x35>, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x36>, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x37>, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x38>, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x39>, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x3a>, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x3b>, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0xba>, AMDGPUSample_c_d_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x3c>, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x3e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x3d>, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x3f>, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x40>, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x41>, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x44>, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x45>, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x46>, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x47>, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x48>, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x49>, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x4c>, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x4d>, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x4e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x4f>, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x50>, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<0x51>, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<0x54>, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<0x55>, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<0x56>, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x57>, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<0x58>, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<0x59>, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<0x5c>, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<0x5d>, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<0x5e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x5f>, AMDGPUSample_c_lz_o>;
+//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x61>, ?>;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<0x68>, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<0x69>, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<0x6a>, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<0x6b>, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<0x6c>, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<0x6d>, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<0x6e>, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<0x6f>, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<0xe8>, AMDGPUSample_cd, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<0xe9>, AMDGPUSample_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<0xea>, AMDGPUSample_c_cd, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<0xec>, AMDGPUSample_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<0xee>, AMDGPUSample_c_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
+} // End OtherPredicates = [HasExtendedImageInsts]
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
-let SubtargetPredicate = HasGFX10_BEncoding in
-defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
+let SubtargetPredicate = HasGFX10_AEncoding in
+defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>;
-defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 11, 0>;
-defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe6, "image_bvh_intersect_ray", 8, 1>;
-defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 12, 0>;
-defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<0xe7, "image_bvh64_intersect_ray", 9, 1>;
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 11, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 8, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 12, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 9, 1>;
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index c0120903396c..002ef1801448 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -451,9 +451,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+ case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
- case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
case ISD::FCOS:
@@ -765,78 +765,11 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstantFP(numbers::pif, DL, MVT::f32));
}
-SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- SDValue Lo = Op.getOperand(0);
- SDValue Hi = Op.getOperand(1);
- SDValue Shift = Op.getOperand(2);
- SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue One = DAG.getConstant(1, DL, VT);
-
- SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
- SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
- SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
- SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
-
- // The dance around Width1 is necessary for 0 special case.
- // Without it the CompShift might be 32, producing incorrect results in
- // Overflow. So we do the shift in two steps, the alternative is to
- // add a conditional to filter the special case.
-
- SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
- Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
-
- SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
- HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
- SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
-
- SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
- SDValue LoBig = Zero;
-
- Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
- Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
-
- return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
-}
-
-SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- SDValue Lo = Op.getOperand(0);
- SDValue Hi = Op.getOperand(1);
- SDValue Shift = Op.getOperand(2);
- SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue One = DAG.getConstant(1, DL, VT);
-
- const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
-
- SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
- SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
- SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
- SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
-
- // The dance around Width1 is necessary for 0 special case.
- // Without it the CompShift might be 32, producing incorrect results in
- // Overflow. So we do the shift in two steps, the alternative is to
- // add a conditional to filter the special case.
-
- SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
- Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
-
- SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
- SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
- LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
-
- SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
- SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
-
- Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
- Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
-
- return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+SDValue R600TargetLowering::LowerShiftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Lo, Hi;
+ expandShiftParts(Op.getNode(), Lo, Hi, DAG);
+ return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
@@ -1239,7 +1172,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
Align Alignment = StoreNode->getAlign();
if (Alignment < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
@@ -1640,7 +1573,7 @@ bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
}
bool R600TargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1655,7 +1588,7 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses(
if (IsFast)
*IsFast = true;
- return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+ return VT.bitsGT(MVT::i32) && Alignment >= Align(4);
}
static SDValue CompactSwizzlableVector(
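The two lowerings deleted above implement the standard 64-bit shift-parts expansion that the shared expandShiftParts helper called by the new LowerShiftParts now emits. As a rough scalar model of the SRL_PARTS/SRA_PARTS case (illustrative only: shiftRightParts is a hypothetical name, and the real code builds SelectionDAG nodes and selects on Shift < 32 rather than branching):

#include <cstdint>
#include <utility>

// Minimal scalar model of the SRL_PARTS/SRA_PARTS expansion performed by the
// deleted LowerSRXParts (and now by the shared expandShiftParts helper).
std::pair<uint32_t, uint32_t> shiftRightParts(uint32_t Lo, uint32_t Hi,
                                              uint32_t Shift, bool Arith) {
  if (Shift < 32) {
    // Shift in two steps so that Shift == 0 never turns into a shift by 32;
    // this is the "dance around Width1" described in the deleted comment.
    uint32_t Overflow = (Hi << (31 - Shift)) << 1; // bits moving from Hi to Lo
    uint32_t NewHi = Arith ? uint32_t(int32_t(Hi) >> Shift) : Hi >> Shift;
    uint32_t NewLo = (Lo >> Shift) | Overflow;
    return {NewLo, NewHi};
  }
  // Shift in [32, 63]: the low half comes entirely from Hi.
  uint32_t BigShift = Shift - 32;
  uint32_t NewLo = Arith ? uint32_t(int32_t(Hi) >> BigShift) : Hi >> BigShift;
  uint32_t NewHi = Arith ? uint32_t(int32_t(Hi) >> 31) : 0;
  return {NewLo, NewHi};
}

The SHL_PARTS case is symmetric, with the overflow bits flowing from Lo into Hi instead.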
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index b560da8e91d9..920cf3cd97ef 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -50,10 +50,19 @@ public:
const SelectionDAG &DAG) const override;
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS, unsigned Align,
+ EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
+ virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+ bool LegalOperations) const override {
+ // R600 has "custom" lowering for truncating stores despite not supporting
+ // those instructions. If we allow that custom lowering in the DAG combiner
+ // then all truncates are merged into truncating stores, giving worse code
+    // generation. This hook prevents the combiner from performing that combine.
+ return isTruncStoreLegal(ValVT, MemVT);
+ }
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
@@ -85,8 +94,7 @@ private:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
unsigned mainop, unsigned ovf) const;
diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 5fd912e0fb39..8f1a069c232d 100644
--- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -301,7 +301,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass {
}
}
SmallVector<ReturnInst*, 8> Returns;
- CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+ CloneFunctionInto(NewF, F, VMap, CloneFunctionChangeType::LocalChangesOnly,
+ Returns);
// Build new MDNode.
SmallVector<Metadata *, 6> KernelMDArgs;
diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
deleted file mode 100644
index 3b753cb66ead..000000000000
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Any MIMG instructions that use tfe or lwe require an initialization of the
-/// result register that will be written in the case of a memory access failure
-/// The required code is also added to tie this init code to the result of the
-/// img instruction
-///
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-
-#define DEBUG_TYPE "si-img-init"
-
-using namespace llvm;
-
-namespace {
-
-class SIAddIMGInit : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIAddIMGInit() : MachineFunctionPass(ID) {
- initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
-
-char SIAddIMGInit::ID = 0;
-
-char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
-
-FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
-
-bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *RI = ST.getRegisterInfo();
- bool Changed = false;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
- ++BI) {
- MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
-
- auto Opcode = MI.getOpcode();
- if (TII->isMIMG(Opcode) && !MI.mayStore()) {
- MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
- MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
- MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
-
- if (!TFE && !LWE) // intersect_ray
- continue;
-
- unsigned TFEVal = TFE->getImm();
- unsigned LWEVal = LWE->getImm();
- unsigned D16Val = D16 ? D16->getImm() : 0;
-
- if (TFEVal || LWEVal) {
- // At least one of TFE or LWE are non-zero
- // We have to insert a suitable initialization of the result value and
- // tie this to the dest of the image instruction.
-
- const DebugLoc &DL = MI.getDebugLoc();
-
- int DstIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
-
- // Calculate which dword we have to initialize to 0.
- MachineOperand *MO_Dmask =
- TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
-
- // check that dmask operand is found.
- assert(MO_Dmask && "Expected dmask operand in instruction");
-
- unsigned dmask = MO_Dmask->getImm();
- // Determine the number of active lanes taking into account the
- // Gather4 special case
- unsigned ActiveLanes =
- TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
-
- bool Packed = !ST.hasUnpackedD16VMem();
-
- unsigned InitIdx =
- D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
-
- // Abandon attempt if the dst size isn't large enough
- // - this is in fact an error but this is picked up elsewhere and
- // reported correctly.
- uint32_t DstSize =
- RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
- if (DstSize < InitIdx)
- continue;
-
- // Create a register for the intialization value.
- Register PrevDst =
- MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
- unsigned NewDst = 0; // Final initialized value will be in here
-
- // If PRTStrictNull feature is enabled (the default) then initialize
- // all the result registers to 0, otherwise just the error indication
- // register (VGPRn+1)
- unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
- unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1);
-
- if (DstSize == 1) {
- // In this case we can just initialize the result directly
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
- .addImm(0);
- NewDst = PrevDst;
- } else {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
- for (; SizeLeft; SizeLeft--, CurrIdx++) {
- NewDst =
- MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
- // Initialize dword
- Register SubReg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
- .addImm(0);
- // Insert into the super-reg
- BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
- .addReg(PrevDst)
- .addReg(SubReg)
- .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
-
- PrevDst = NewDst;
- }
- }
-
- // Add as an implicit operand
- MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
-
- // Tie the just added implicit operand to the dst
- MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
-
- Changed = true;
- }
- }
- }
- }
-
- return Changed;
-}
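The InitIdx arithmetic in the pass deleted above (how many result dwords must be zero-initialized when TFE or LWE is set) is easy to misread, so here is the same computation as a small self-checking sketch. initDwords is a hypothetical name, and "Packed" stands for !hasUnpackedD16VMem():

#include <bit> // std::popcount (C++20)

// Mirrors the InitIdx computation from the deleted SIAddIMGInit pass: data
// dwords implied by the dmask (or 4 for gather4), halved for packed D16
// results, plus one dword for the TFE/LWE error indication.
constexpr unsigned initDwords(unsigned DMask, bool Gather4, bool D16,
                              bool Packed) {
  unsigned ActiveLanes = Gather4 ? 4u : unsigned(std::popcount(DMask));
  return (D16 && Packed) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}

// dmask = 0b1011: three enabled channels plus the error dword.
static_assert(initDwords(0b1011, false, false, false) == 4, "");
// Packed D16: the three half-sized results fit in two dwords, plus the error
// dword.
static_assert(initDwords(0b1011, false, true, true) == 3, "");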
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 625749deb3a8..397b2f873515 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -71,6 +71,8 @@ class SIAnnotateControlFlow : public FunctionPass {
bool isElse(PHINode *Phi);
+ bool hasKill(const BasicBlock *BB);
+
void eraseIfUnused(PHINode *Phi);
void openIf(BranchInst *Term);
@@ -98,6 +100,7 @@ public:
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
FunctionPass::getAnalysisUsage(AU);
@@ -181,6 +184,15 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
return true;
}
+bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) {
+ for (const Instruction &I : *BB) {
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (CI->getIntrinsicID() == Intrinsic::amdgcn_kill)
+ return true;
+ }
+ return false;
+}
+
// Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
if (RecursivelyDeleteDeadPHINode(Phi)) {
@@ -339,7 +351,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
if (isTopOfStack(BB)) {
PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
- if (Phi && Phi->getParent() == BB && isElse(Phi)) {
+ if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) {
insertElse(Term);
eraseIfUnused(Phi);
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index c83802b323c3..d3c0d792804d 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -91,7 +91,7 @@ enum : uint64_t {
D16Buf = UINT64_C(1) << 50,
// FLAT instruction accesses FLAT_GLBL segment.
- IsFlatGlobal = UINT64_C(1) << 51,
+ FlatGlobal = UINT64_C(1) << 51,
// Uses floating point double precision rounding mode
FPDPRounding = UINT64_C(1) << 52,
@@ -106,7 +106,13 @@ enum : uint64_t {
IsDOT = UINT64_C(1) << 55,
// FLAT instruction accesses FLAT_SCRATCH segment.
- IsFlatScratch = UINT64_C(1) << 56
+ FlatScratch = UINT64_C(1) << 56,
+
+ // Atomic without return.
+ IsAtomicNoRet = UINT64_C(1) << 57,
+
+ // Atomic with return.
+ IsAtomicRet = UINT64_C(1) << 58
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -136,6 +142,8 @@ namespace AMDGPU {
OPERAND_REG_IMM_FP16,
OPERAND_REG_IMM_V2FP16,
OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_V2INT32,
+ OPERAND_REG_IMM_V2FP32,
/// Operands with register or inline constant
OPERAND_REG_INLINE_C_INT16,
@@ -144,25 +152,30 @@ namespace AMDGPU {
OPERAND_REG_INLINE_C_FP16,
OPERAND_REG_INLINE_C_FP32,
OPERAND_REG_INLINE_C_FP64,
- OPERAND_REG_INLINE_C_V2FP16,
OPERAND_REG_INLINE_C_V2INT16,
+ OPERAND_REG_INLINE_C_V2FP16,
+ OPERAND_REG_INLINE_C_V2INT32,
+ OPERAND_REG_INLINE_C_V2FP32,
/// Operands with an AccVGPR register or inline constant
OPERAND_REG_INLINE_AC_INT16,
OPERAND_REG_INLINE_AC_INT32,
OPERAND_REG_INLINE_AC_FP16,
OPERAND_REG_INLINE_AC_FP32,
- OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_FP64,
OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_AC_V2FP16,
+ OPERAND_REG_INLINE_AC_V2INT32,
+ OPERAND_REG_INLINE_AC_V2FP32,
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
- OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16,
+ OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32,
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32,
OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16,
- OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16,
+ OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -263,15 +276,33 @@ enum : unsigned {
} // namespace AMDGPU
namespace AMDGPU {
+namespace CPol {
+
+enum CPol {
+ GLC = 1,
+ SLC = 2,
+ DLC = 4,
+ SCC = 16,
+ ALL = GLC | SLC | DLC | SCC
+};
+
+} // namespace CPol
+
namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
enum Id { // Message ID, width(4) [3:0].
ID_UNKNOWN_ = -1,
ID_INTERRUPT = 1,
- ID_GS,
- ID_GS_DONE,
- ID_GS_ALLOC_REQ = 9,
- ID_GET_DOORBELL = 10,
+ ID_GS = 2,
+ ID_GS_DONE = 3,
+ ID_SAVEWAVE = 4, // added in GFX8
+ ID_STALL_WAVE_GEN = 5, // added in GFX9
+ ID_HALT_WAVES = 6, // added in GFX9
+ ID_ORDERED_PS_DONE = 7, // added in GFX9
+ ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
+ ID_GS_ALLOC_REQ = 9, // added in GFX9
+ ID_GET_DOORBELL = 10, // added in GFX9
+ ID_GET_DDID = 11, // added in GFX10
ID_SYSMSG = 15,
ID_GAPS_LAST_, // Indicate that sequence has gaps.
ID_GAPS_FIRST_ = ID_INTERRUPT,
@@ -289,16 +320,16 @@ enum Op { // Both GS and SYS operation IDs.
OP_MASK_ = (((1 << OP_WIDTH_) - 1) << OP_SHIFT_),
// GS operations are encoded in bits 5:4
OP_GS_NOP = 0,
- OP_GS_CUT,
- OP_GS_EMIT,
- OP_GS_EMIT_CUT,
+ OP_GS_CUT = 1,
+ OP_GS_EMIT = 2,
+ OP_GS_EMIT_CUT = 3,
OP_GS_LAST_,
OP_GS_FIRST_ = OP_GS_NOP,
// SYS operations are encoded in bits 6:4
OP_SYS_ECC_ERR_INTERRUPT = 1,
- OP_SYS_REG_RD,
- OP_SYS_HOST_TRAP_ACK,
- OP_SYS_TTRACE_PC,
+ OP_SYS_REG_RD = 2,
+ OP_SYS_HOST_TRAP_ACK = 3,
+ OP_SYS_TTRACE_PC = 4,
OP_SYS_LAST_,
OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
};
@@ -640,6 +671,7 @@ enum SDWA9EncValues : unsigned {
namespace DPP {
+// clang-format off
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
QUAD_PERM_ID = 0xE4, // identity permutation
@@ -674,12 +706,17 @@ enum DppCtrl : unsigned {
BCAST31 = 0x143,
DPP_UNUSED8_FIRST = 0x144,
DPP_UNUSED8_LAST = 0x14F,
+ ROW_NEWBCAST_FIRST= 0x150,
+ ROW_NEWBCAST_LAST = 0x15F,
+ ROW_SHARE0 = 0x150,
ROW_SHARE_FIRST = 0x150,
ROW_SHARE_LAST = 0x15F,
+ ROW_XMASK0 = 0x160,
ROW_XMASK_FIRST = 0x160,
ROW_XMASK_LAST = 0x16F,
DPP_LAST = ROW_XMASK_LAST
};
+// clang-format on
enum DppFiMode {
DPP_FI_0 = 0,
@@ -716,6 +753,17 @@ enum Target : unsigned {
};
} // namespace Exp
+
+namespace VOP3PEncoding {
+
+enum OpSel : uint64_t {
+ OP_SEL_HI_0 = UINT64_C(1) << 59,
+ OP_SEL_HI_1 = UINT64_C(1) << 60,
+ OP_SEL_HI_2 = UINT64_C(1) << 14,
+};
+
+} // namespace VOP3PEncoding
+
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
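As a quick illustration of the SendMsg layout documented above (message ID in bits [3:0], GS operations in bits [5:4], SYS operations in bits [6:4]), the following sketch assembles an s_sendmsg immediate by hand; encodeSendMsg is a hypothetical helper, not something the patch adds:

#include <cstdint>

// Message ID occupies bits [3:0]; the operation field starts at bit 4, per the
// comments in SIDefines.h above. Sketch only; the backend has its own
// encoder/decoder helpers.
constexpr uint16_t encodeSendMsg(unsigned MsgId, unsigned OpId) {
  return uint16_t((MsgId & 0xf) | (OpId << 4));
}

// ID_GS (2) with OP_GS_EMIT (2) -> immediate 0x22.
static_assert(encodeSendMsg(2, 2) == 0x22, "");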
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 34f59bf34dd5..d5c56bf2a321 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -581,8 +581,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
case AMDGPU::COPY:
case AMDGPU::WQM:
+ case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
- case AMDGPU::WWM: {
+ case AMDGPU::STRICT_WWM: {
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d5fa9afded27..ad910522ba90 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -90,6 +90,8 @@ public:
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+ bool tryFoldCndMask(MachineInstr &MI) const;
+ bool tryFoldZeroHighBits(MachineInstr &MI) const;
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
const MachineOperand *isClamp(const MachineInstr &MI) const;
@@ -97,6 +99,9 @@ public:
std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
bool tryFoldOMod(MachineInstr &MI);
+ bool tryFoldRegSequence(MachineInstr &MI);
+ bool tryFoldLCSSAPhi(MachineInstr &MI);
+ bool tryFoldLoad(MachineInstr &MI);
public:
SIFoldOperands() : MachineFunctionPass(ID) {
@@ -135,6 +140,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
+ case AMDGPU::V_FMAC_F64_e64:
+ return AMDGPU::V_FMA_F64_e64;
}
return AMDGPU::INSTRUCTION_LIST_END;
}
@@ -332,8 +339,8 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
return;
LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
- << " operand " << OpNo << "\n " << *MI << '\n');
- FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+ << " operand " << OpNo << "\n " << *MI);
+ FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
@@ -484,37 +491,37 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
-// Find a def of the UseReg, check if it is a reg_seqence and find initializers
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
static bool getRegSeqInit(
SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
Register UseReg, uint8_t OpTy,
const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
- MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+ MachineInstr *Def = MRI.getVRegDef(UseReg);
if (!Def || !Def->isRegSequence())
return false;
for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
MachineOperand *Sub = &Def->getOperand(I);
- assert (Sub->isReg());
+ assert(Sub->isReg());
- for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
- SubDef && Sub->isReg() && !Sub->getSubReg() &&
- TII->isFoldableCopy(*SubDef);
- SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+ for (MachineInstr *SubDef = MRI.getVRegDef(Sub->getReg());
+ SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
+ !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
+ SubDef = MRI.getVRegDef(Sub->getReg())) {
MachineOperand *Op = &SubDef->getOperand(1);
if (Op->isImm()) {
if (TII->isInlineConstant(*Op, OpTy))
Sub = Op;
break;
}
- if (!Op->isReg())
+ if (!Op->isReg() || Op->getReg().isPhysical())
break;
Sub = Op;
}
- Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+ Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
}
return true;
@@ -531,8 +538,10 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
return false;
uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
- if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
- OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
+ (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
return false;
if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
@@ -548,12 +557,23 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
if (!UseReg.isVirtual())
return false;
- if (llvm::any_of(FoldList, [UseMI](const FoldCandidate &FC) {
- return FC.UseMI == UseMI;
- }))
+ if (isUseMIInFoldList(FoldList, UseMI))
return false;
MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+
+ // Maybe it is just a COPY of an immediate itself.
+ MachineInstr *Def = MRI.getVRegDef(UseReg);
+ MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
+ MachineOperand &DefOp = Def->getOperand(1);
+ if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
+ TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
+ UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
+ return true;
+ }
+ }
+
SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
return false;
@@ -605,22 +625,17 @@ void SIFoldOperands::foldOperand(
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
- MachineRegisterInfo::use_nodbg_iterator Next;
- for (MachineRegisterInfo::use_nodbg_iterator
- RSUse = MRI->use_nodbg_begin(RegSeqDstReg), RSE = MRI->use_nodbg_end();
- RSUse != RSE; RSUse = Next) {
- Next = std::next(RSUse);
-
- MachineInstr *RSUseMI = RSUse->getParent();
+ for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
+ MachineInstr *RSUseMI = RSUse.getParent();
if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
- RSUse.getOperandNo(), FoldList))
+ RSUseMI->getOperandNo(&RSUse), FoldList))
continue;
- if (RSUse->getSubReg() != RegSeqDstSubReg)
+ if (RSUse.getSubReg() != RegSeqDstSubReg)
continue;
- foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+ foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
CopiesToReplace);
}
@@ -680,19 +695,15 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
if (!DestReg.isPhysical()) {
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
- MachineRegisterInfo::use_nodbg_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
- for (MachineRegisterInfo::use_nodbg_iterator Use = MRI->use_nodbg_begin(DestReg),
- E = MRI->use_nodbg_end();
- Use != E; Use = NextUse) {
- NextUse = std::next(Use);
+ for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
// There's no point trying to fold into an implicit operand.
- if (Use->isImplicit())
+ if (Use.isImplicit())
continue;
- FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
- &UseMI->getOperand(1));
- CopyUses.push_back(FC);
+ CopyUses.emplace_back(Use.getParent(),
+ Use.getParent()->getOperandNo(&Use),
+ &UseMI->getOperand(1));
}
for (auto &F : CopyUses) {
foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
@@ -728,8 +739,7 @@ void SIFoldOperands::foldOperand(
if (UseMI->isCopy() && OpToFold.isReg() &&
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
- LLVM_DEBUG(dbgs() << "Folding " << OpToFold
- << "\n into " << *UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
@@ -813,7 +823,7 @@ void SIFoldOperands::foldOperand(
B.addImm(Defs[I].second);
}
- LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
+ LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
return;
}
@@ -825,6 +835,10 @@ void SIFoldOperands::foldOperand(
else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
+ else if (ST->hasGFX90AInsts() &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
return;
}
@@ -1033,14 +1047,19 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
-static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
- const SIInstrInfo *TII,
- MachineInstr *MI,
- MachineOperand *ImmOp) {
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
+ MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
- if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
- Opc == AMDGPU::S_NOT_B32) {
- MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx == -1)
+ return false;
+ MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
+
+ if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+ Opc == AMDGPU::S_NOT_B32) &&
+ Src0->isImm()) {
+ MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
}
@@ -1048,9 +1067,6 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
-
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
if (!Src0->isImm() && !Src1->isImm())
@@ -1134,35 +1150,61 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
// Try to fold an instruction into a simpler one
-static bool tryFoldInst(const SIInstrInfo *TII,
- MachineInstr *MI) {
- unsigned Opc = MI->getOpcode();
+bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
+ Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
+ return false;
- if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
- Opc == AMDGPU::V_CNDMASK_B32_e64 ||
- Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
- const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
- const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
- int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
- if (Src1->isIdenticalTo(*Src0) &&
- (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
- (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
- LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
- auto &NewDesc =
- TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
- int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- if (Src2Idx != -1)
- MI->RemoveOperand(Src2Idx);
- MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
- if (Src1ModIdx != -1)
- MI->RemoveOperand(Src1ModIdx);
- if (Src0ModIdx != -1)
- MI->RemoveOperand(Src0ModIdx);
- mutateCopyOp(*MI, NewDesc);
- LLVM_DEBUG(dbgs() << *MI << '\n');
- return true;
- }
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src1->isIdenticalTo(*Src0)) {
+ auto *Src0Imm = getImmOrMaterializedImm(*MRI, *Src0);
+ auto *Src1Imm = getImmOrMaterializedImm(*MRI, *Src1);
+ if (!Src1Imm->isIdenticalTo(*Src0Imm))
+ return false;
+ }
+
+ int Src1ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ int Src0ModIdx =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
+ (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
+ auto &NewDesc =
+ TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1)
+ MI.RemoveOperand(Src2Idx);
+ MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+ if (Src1ModIdx != -1)
+ MI.RemoveOperand(Src1ModIdx);
+ if (Src0ModIdx != -1)
+ MI.RemoveOperand(Src0ModIdx);
+ mutateCopyOp(MI, NewDesc);
+ LLVM_DEBUG(dbgs() << MI);
+ return true;
+}
+
+bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
+ if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
+ MI.getOpcode() != AMDGPU::V_AND_B32_e32)
+ return false;
+
+ MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1));
+ if (!Src0->isImm() || Src0->getImm() != 0xffff)
+ return false;
+
+ Register Src1 = MI.getOperand(2).getReg();
+ MachineInstr *SrcDef = MRI->getVRegDef(Src1);
+ if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) {
+ Register Dst = MI.getOperand(0).getReg();
+ MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
}
return false;
@@ -1177,20 +1219,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
- if (FoldingImm) {
- unsigned NumLiteralUses = 0;
- MachineOperand *NonInlineUse = nullptr;
- int NonInlineUseOpNo = -1;
-
- MachineRegisterInfo::use_nodbg_iterator NextUse;
- for (MachineRegisterInfo::use_nodbg_iterator
- Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end();
- Use != E; Use = NextUse) {
- NextUse = std::next(Use);
- MachineInstr *UseMI = Use->getParent();
- unsigned OpNo = Use.getOperandNo();
-
+ if (OpToFold.isImm()) {
+ for (auto &UseMI :
+ make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
// Folding the immediate may reveal operations that can be constant
// folded or replaced with a copy. This can happen for example after
// frame indices are lowered to constants or from splitting 64-bit
@@ -1199,18 +1230,21 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// We may also encounter cases where one or both operands are
// immediates materialized into a register, which would ordinarily not
// be folded due to multiple uses or operand constraints.
+ if (tryConstantFoldOp(*MRI, TII, &UseMI))
+ LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
+ }
+ }
- if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
- LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+ if (FoldingImm) {
+ unsigned NumLiteralUses = 0;
+ MachineOperand *NonInlineUse = nullptr;
+ int NonInlineUseOpNo = -1;
- // Some constant folding cases change the same immediate's use to a new
- // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
- // again. The same constant folded instruction could also have a second
- // use operand.
- NextUse = MRI->use_nodbg_begin(Dst.getReg());
- FoldList.clear();
- continue;
- }
+ for (auto &Use :
+ make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
+ MachineInstr *UseMI = Use.getParent();
+ unsigned OpNo = UseMI->getOperandNo(&Use);
// Try to fold any inline immediate uses, and then only fold other
// constants if they have one use.
@@ -1230,11 +1264,10 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
- foldOperand(OpToFold, UseMI, OpNo, FoldList,
- CopiesToReplace);
+ foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
- NonInlineUse = &*Use;
+ NonInlineUse = &Use;
NonInlineUseOpNo = OpNo;
}
}
@@ -1246,16 +1279,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
} else {
// Folding register.
- SmallVector <MachineRegisterInfo::use_nodbg_iterator, 4> UsesToProcess;
- for (MachineRegisterInfo::use_nodbg_iterator
- Use = MRI->use_nodbg_begin(Dst.getReg()), E = MRI->use_nodbg_end();
- Use != E; ++Use) {
- UsesToProcess.push_back(Use);
- }
+ SmallVector <MachineOperand *, 4> UsesToProcess;
+ for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
+ UsesToProcess.push_back(&Use);
for (auto U : UsesToProcess) {
MachineInstr *UseMI = U->getParent();
- foldOperand(OpToFold, UseMI, U.getOperandNo(),
+ foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
FoldList, CopiesToReplace);
}
}
@@ -1265,11 +1295,8 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
- SmallPtrSet<MachineInstr *, 16> Folded;
for (FoldCandidate &Fold : FoldList) {
assert(!Fold.isReg() || Fold.OpToFold);
- if (Folded.count(Fold.UseMI))
- continue;
if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
Register Reg = Fold.OpToFold->getReg();
MachineInstr *DefMI = Fold.OpToFold->getParent();
@@ -1288,9 +1315,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
<< static_cast<int>(Fold.UseOpNo) << " of "
- << *Fold.UseMI << '\n');
- if (tryFoldInst(TII, Fold.UseMI))
- Folded.insert(Fold.UseMI);
+ << *Fold.UseMI);
} else if (Fold.isCommuted()) {
// Restoring instruction's original operand order if fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
@@ -1341,23 +1366,10 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
}
}
-// We obviously have multiple uses in a clamp since the register is used twice
-// in the same instruction.
-static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
- int Count = 0;
- for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
- I != E; ++I) {
- if (++Count > 1)
- return false;
- }
-
- return true;
-}
-
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
const MachineOperand *ClampSrc = isClamp(MI);
- if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+ if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
return false;
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
@@ -1370,8 +1382,7 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
if (!DefClamp)
return false;
- LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
- << '\n');
+ LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
@@ -1382,6 +1393,18 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
static int getOModValue(unsigned Opc, int64_t Val) {
switch (Opc) {
+ case AMDGPU::V_MUL_F64_e64: {
+ switch (Val) {
+ case 0x3fe0000000000000: // 0.5
+ return SIOutMods::DIV2;
+ case 0x4000000000000000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x4010000000000000: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
case AMDGPU::V_MUL_F32_e64: {
switch (static_cast<uint32_t>(Val)) {
case 0x3f000000: // 0.5
@@ -1418,11 +1441,13 @@ std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();
switch (Op) {
+ case AMDGPU::V_MUL_F64_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
- (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
+ ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) &&
+ MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1448,11 +1473,13 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
return std::make_pair(RegOp, OMod);
}
+ case AMDGPU::V_ADD_F64_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
- (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
+ ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) &&
+ MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
@@ -1481,7 +1508,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
std::tie(RegOp, OMod) = isOMod(MI);
if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
RegOp->getSubReg() != AMDGPU::NoSubRegister ||
- !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+ !MRI->hasOneNonDBGUser(RegOp->getReg()))
return false;
MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
@@ -1494,7 +1521,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
return false;
- LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+ LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
DefOMod->setImm(OMod);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
@@ -1502,6 +1529,198 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
return true;
}
+// Try to fold a reg_sequence with vgpr output and agpr inputs into an
+// instruction which can take an agpr. So far that means a store.
+bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
+ assert(MI.isRegSequence());
+ auto Reg = MI.getOperand(0).getReg();
+
+ if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
+ !MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+ if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
+ return false;
+
+ for (auto &Def : Defs) {
+ const auto *Op = Def.first;
+ if (!Op->isReg())
+ return false;
+ if (TRI->isAGPR(*MRI, Op->getReg()))
+ continue;
+ // Maybe this is a COPY from AREG
+ const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
+ if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
+ return false;
+ if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
+ return false;
+ }
+
+ MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
+ MachineInstr *UseMI = Op->getParent();
+ while (UseMI->isCopy() && !Op->getSubReg()) {
+ Reg = UseMI->getOperand(0).getReg();
+ if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
+ return false;
+ Op = &*MRI->use_nodbg_begin(Reg);
+ UseMI = Op->getParent();
+ }
+
+ if (Op->getSubReg())
+ return false;
+
+ unsigned OpIdx = Op - &UseMI->getOperand(0);
+ const MCInstrDesc &InstDesc = UseMI->getDesc();
+ const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+ switch (OpInfo.RegClass) {
+ case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH;
+ case AMDGPU::AV_160RegClassID:
+ break;
+ default:
+ return false;
+ }
+
+ const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
+ auto Dst = MRI->createVirtualRegister(NewDstRC);
+ auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::REG_SEQUENCE), Dst);
+
+ for (unsigned I = 0; I < Defs.size(); ++I) {
+ MachineOperand *Def = Defs[I].first;
+ Def->setIsKill(false);
+ if (TRI->isAGPR(*MRI, Def->getReg())) {
+ RS.add(*Def);
+ } else { // This is a copy
+ MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
+ SubDef->getOperand(1).setIsKill(false);
+ RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
+ }
+ RS.addImm(Defs[I].second);
+ }
+
+ Op->setReg(Dst);
+ if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
+ Op->setReg(Reg);
+ RS->eraseFromParent();
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
+
+ // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
+ // in which case we can erase them all later in runOnMachineFunction.
+ if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ return true;
+}
+
+// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// This should allow folding of an AGPR into a consumer which may support it.
+// I.e.:
+//
+// loop: // loop:
+// %1:vreg = COPY %0:areg // exit:
+// exit: => // %1:areg = PHI %0:areg, %loop
+// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg
+bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+
+ if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
+ return false;
+
+ Register PhiIn = PHI.getOperand(1).getReg();
+ Register PhiOut = PHI.getOperand(0).getReg();
+ if (PHI.getOperand(1).getSubReg() ||
+ !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+ return false;
+
+ // A single use should not matter for correctness, but if it has another use
+  // inside the loop we may perform the copy twice in the worst case.
+ if (!MRI->hasOneNonDBGUse(PhiIn))
+ return false;
+
+ MachineInstr *Copy = MRI->getVRegDef(PhiIn);
+ if (!Copy || !Copy->isCopy())
+ return false;
+
+ Register CopyIn = Copy->getOperand(1).getReg();
+ if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+ return false;
+
+ const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+ Register NewReg = MRI->createVirtualRegister(ARC);
+ PHI.getOperand(1).setReg(CopyIn);
+ PHI.getOperand(0).setReg(NewReg);
+
+ MachineBasicBlock *MBB = PHI.getParent();
+ BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+ TII->get(AMDGPU::COPY), PhiOut)
+ .addReg(NewReg, RegState::Kill);
+ Copy->eraseFromParent(); // We know this copy had a single use.
+
+ LLVM_DEBUG(dbgs() << "Folded " << PHI);
+
+ return true;
+}
+
+// Attempt to convert a VGPR load to an AGPR load.
+bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
+ assert(MI.mayLoad());
+ if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
+ return false;
+
+ MachineOperand &Def = MI.getOperand(0);
+ if (!Def.isDef())
+ return false;
+
+ Register DefReg = Def.getReg();
+
+ if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
+ return false;
+
+ SmallVector<const MachineInstr*, 8> Users;
+ SmallVector<Register, 8> MoveRegs;
+ for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
+ Users.push_back(&I);
+ }
+ if (Users.empty())
+ return false;
+
+  // All uses must be a copy to an agpr or a reg_sequence producing an agpr.
+ while (!Users.empty()) {
+ const MachineInstr *I = Users.pop_back_val();
+ if (!I->isCopy() && !I->isRegSequence())
+ return false;
+ Register DstReg = I->getOperand(0).getReg();
+ if (TRI->isAGPR(*MRI, DstReg))
+ continue;
+ MoveRegs.push_back(DstReg);
+ for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
+ Users.push_back(&U);
+ }
+ }
+
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
+ if (!TII->isOperandLegal(MI, 0, &Def)) {
+ MRI->setRegClass(DefReg, RC);
+ return false;
+ }
+
+ while (!MoveRegs.empty()) {
+ Register Reg = MoveRegs.pop_back_val();
+ MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
+ }
+
+ LLVM_DEBUG(dbgs() << "Folded " << MI);
+
+ return true;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -1520,14 +1739,21 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
- MachineBasicBlock::iterator I, Next;
-
MachineOperand *CurrentKnownM0Val = nullptr;
- for (I = MBB->begin(); I != MBB->end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
+ for (auto &MI : make_early_inc_range(*MBB)) {
+ tryFoldCndMask(MI);
+
+ if (tryFoldZeroHighBits(MI))
+ continue;
- tryFoldInst(TII, &MI);
+ if (MI.isRegSequence() && tryFoldRegSequence(MI))
+ continue;
+
+ if (MI.isPHI() && tryFoldLCSSAPhi(MI))
+ continue;
+
+ if (MI.mayLoad() && tryFoldLoad(MI))
+ continue;
if (!TII->isFoldableCopy(MI)) {
// Saw an unknown clobber of m0, so we no longer know what it is.
@@ -1575,11 +1801,31 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// %3 = COPY %vgpr0; VGPR_32:%3
// ...
// %vgpr0 = V_MOV_B32_e32 1, implicit %exec
- MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && !Dst.getReg().isVirtual())
+ if (!MI.getOperand(0).getReg().isVirtual())
continue;
foldInstOperand(MI, OpToFold);
+
+ // If we managed to fold all uses of this copy then we might as well
+ // delete it now.
+ // The only reason we need to follow chains of copies here is that
+ // tryFoldRegSequence looks forward through copies before folding a
+ // REG_SEQUENCE into its eventual users.
+ auto *InstToErase = &MI;
+ while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
+ auto &SrcOp = InstToErase->getOperand(1);
+ auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
+ InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
+ InstToErase = nullptr;
+ if (!SrcReg || SrcReg.isPhysical())
+ break;
+ InstToErase = MRI->getVRegDef(SrcReg);
+ if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
+ break;
+ }
+ if (InstToErase && InstToErase->isRegSequence() &&
+ MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg()))
+ InstToErase->eraseFromParentAndMarkDBGValuesForRemoval();
}
}
return true;
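The new V_MUL_F64_e64 case in getOModValue above matches output-modifier constants by their raw IEEE-754 bit patterns. A quick self-check of those literals (C++20, illustrative only, not part of the patch):

#include <bit>
#include <cstdint>

// The literals matched for DIV2 / MUL2 / MUL4 are the bit patterns of the
// doubles 0.5, 2.0 and 4.0.
static_assert(std::bit_cast<double>(uint64_t(0x3fe0000000000000)) == 0.5, "");
static_assert(std::bit_cast<double>(uint64_t(0x4000000000000000)) == 2.0, "");
static_assert(std::bit_cast<double>(uint64_t(0x4010000000000000)) == 4.0, "");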
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index a12e013b4fe6..80ee7a00252a 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -6,10 +6,11 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file
-/// This pass creates bundles of SMEM and VMEM instructions forming memory
-/// clauses if XNACK is enabled. Def operands of clauses are marked as early
-/// clobber to make sure we will not override any source within a clause.
+/// \file This pass extends the live ranges of registers used as pointers in
+/// sequences of adjacent SMEM and VMEM instructions if XNACK is enabled. A
+/// load that would overwrite a pointer would require breaking the soft clause.
+/// Artificially extend the live ranges of the pointer operands by adding
+/// implicit-def early-clobber operands throughout the soft clause.
///
//===----------------------------------------------------------------------===//
@@ -59,10 +60,8 @@ public:
}
private:
- template <typename Callable>
- void forAllLanes(Register Reg, LaneBitmask LaneMask, Callable Func) const;
-
- bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+ bool canBundle(const MachineInstr &MI, const RegUse &Defs,
+ const RegUse &Uses) const;
bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
@@ -106,12 +105,12 @@ static bool isSMEMClauseInst(const MachineInstr &MI) {
// There is no sense in creating store clauses: they do not define anything,
// thus there is nothing to set early-clobber.
static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
- if (MI.isDebugValue() || MI.isBundled())
+ assert(!MI.isDebugInstr() && "debug instructions should not reach here");
+ if (MI.isBundled())
return false;
if (!MI.mayLoad() || MI.mayStore())
return false;
- if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 ||
- AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1)
+ if (SIInstrInfo::isAtomic(MI))
return false;
if (IsVMEMClause && !isVMEMClauseInst(MI))
return false;
@@ -148,63 +147,10 @@ static unsigned getMopState(const MachineOperand &MO) {
return S;
}
-template <typename Callable>
-void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask,
- Callable Func) const {
- if (LaneMask.all() || Reg.isPhysical() ||
- LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
- Func(0);
- return;
- }
-
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- unsigned E = TRI->getNumSubRegIndices();
- SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs;
- for (unsigned Idx = 1; Idx < E; ++Idx) {
- // Is this index even compatible with the given class?
- if (TRI->getSubClassWithSubReg(RC, Idx) != RC)
- continue;
- LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
- // Early exit if we found a perfect match.
- if (SubRegMask == LaneMask) {
- Func(Idx);
- return;
- }
-
- if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
- continue;
-
- CoveringSubregs.push_back(Idx);
- }
-
- llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) {
- LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
- LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
- unsigned NA = MaskA.getNumLanes();
- unsigned NB = MaskB.getNumLanes();
- if (NA != NB)
- return NA > NB;
- return MaskA.getHighestLane() > MaskB.getHighestLane();
- });
-
- for (unsigned Idx : CoveringSubregs) {
- LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
- if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none())
- continue;
-
- Func(Idx);
- LaneMask &= ~SubRegMask;
- if (LaneMask.none())
- return;
- }
-
- llvm_unreachable("Failed to find all subregs to cover lane mask");
-}
-
// Returns false if there is a use of a def already in the map.
// In this case we must break the clause.
-bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
- RegUse &Defs, RegUse &Uses) const {
+bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs,
+ const RegUse &Uses) const {
// Check interference with defs.
for (const MachineOperand &MO : MI.operands()) {
// TODO: Prologue/Epilogue Insertion pass does not process bundled
@@ -221,7 +167,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
if (MO.isTied())
return false;
- RegUse &Map = MO.isDef() ? Uses : Defs;
+ const RegUse &Map = MO.isDef() ? Uses : Defs;
auto Conflict = Map.find(Reg);
if (Conflict == Map.end())
continue;
@@ -249,9 +195,19 @@ bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+
+ // Don't push over half the register budget. We don't want to introduce
+ // spilling just to form a soft clause.
+ //
+ // FIXME: This pressure check is fundamentally broken. First, this is checking
+ // the global pressure, not the pressure at this specific point in the
+ // program. Second, it's not accounting for the increased liveness of the use
+ // operands due to the early clobber we will introduce. Third, the pressure
+ // tracking does not account for the alignment requirements for SGPRs, or the
+ // fragmentation of registers the allocator will need to satisfy.
if (Occupancy >= MFI->getMinAllowedOccupancy() &&
- MaxPressure.getVGPRNum() <= MaxVGPRs &&
- MaxPressure.getSGPRNum() <= MaxSGPRs) {
+ MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
+ MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
LastRecordedOccupancy = Occupancy;
return true;
}
@@ -328,6 +284,9 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
Next = std::next(I);
+ if (MI.isMetaInstruction())
+ continue;
+
bool IsVMEM = isVMEMClauseInst(MI);
if (!isValidClauseInst(MI, IsVMEM))
@@ -347,8 +306,13 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ MachineBasicBlock::iterator LastClauseInst = Next;
unsigned Length = 1;
for ( ; Next != E && Length < FuncMaxClause; ++Next) {
+ // Debug instructions should not change the kill insertion.
+ if (Next->isMetaInstruction())
+ continue;
+
if (!isValidClauseInst(*Next, IsVMEM))
break;
@@ -358,6 +322,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
if (!processRegUses(*Next, Defs, Uses, RPT))
break;
+ LastClauseInst = Next;
++Length;
}
if (Length < 2) {
@@ -368,36 +333,74 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
MFI->limitOccupancy(LastRecordedOccupancy);
- auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE));
- Ind->insertMachineInstrInMaps(*B);
+ assert(!LastClauseInst->isMetaInstruction());
- // Restore the state after processing the bundle.
- RPT.reset(*B, &LiveRegsCopy);
+ SlotIndex ClauseLiveInIdx = LIS->getInstructionIndex(MI);
+ SlotIndex ClauseLiveOutIdx =
+ LIS->getInstructionIndex(*LastClauseInst).getNextIndex();
- for (auto BI = I; BI != Next; ++BI) {
- BI->bundleWithPred();
- Ind->removeSingleMachineInstrFromMaps(*BI);
+ // Track the last inserted kill.
+ MachineInstrBuilder Kill;
- for (MachineOperand &MO : BI->defs())
- if (MO.readsReg())
- MO.setIsInternalRead(true);
- }
+ // Insert one kill per register, with operands covering all necessary
+ // subregisters.
+ for (auto &&R : Uses) {
+ Register Reg = R.first;
+ if (Reg.isPhysical())
+ continue;
- for (auto &&R : Defs) {
- forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
- unsigned S = R.second.first | RegState::EarlyClobber;
- if (!SubReg)
- S &= ~(RegState::Undef | RegState::Dead);
- B.addDef(R.first, S, SubReg);
- });
+ // Collect the register operands we should extend the live ranges of.
+ SmallVector<std::tuple<unsigned, unsigned>> KillOps;
+ const LiveInterval &LI = LIS->getInterval(R.first);
+
+ if (!LI.hasSubRanges()) {
+ if (!LI.liveAt(ClauseLiveOutIdx)) {
+ KillOps.emplace_back(R.second.first | RegState::Kill,
+ AMDGPU::NoSubRegister);
+ }
+ } else {
+ LaneBitmask KilledMask;
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (SR.liveAt(ClauseLiveInIdx) && !SR.liveAt(ClauseLiveOutIdx))
+ KilledMask |= SR.LaneMask;
+ }
+
+ if (KilledMask.none())
+ continue;
+
+ SmallVector<unsigned> KilledIndexes;
+ bool Success = TRI->getCoveringSubRegIndexes(
+ *MRI, MRI->getRegClass(Reg), KilledMask, KilledIndexes);
+ (void)Success;
+ assert(Success && "Failed to find subregister mask to cover lanes");
+ for (unsigned SubReg : KilledIndexes) {
+ KillOps.emplace_back(R.second.first | RegState::Kill, SubReg);
+ }
+ }
+
+ if (KillOps.empty())
+ continue;
+
+ // We only want to extend the live ranges of used registers. If they
+ // already have existing uses beyond the bundle, we don't need the kill.
+ //
+ // It's possible all of the use registers were already live past the
+ // bundle.
+ Kill = BuildMI(*MI.getParent(), std::next(LastClauseInst),
+ DebugLoc(), TII->get(AMDGPU::KILL));
+ for (auto &Op : KillOps)
+ Kill.addUse(Reg, std::get<0>(Op), std::get<1>(Op));
+ Ind->insertMachineInstrInMaps(*Kill);
}
- for (auto &&R : Uses) {
- forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) {
- B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg);
- });
+ if (!Kill) {
+ RPT.reset(MI, &LiveRegsCopy);
+ continue;
}
+ // Restore the state after processing the end of the bundle.
+ RPT.reset(*Kill, &LiveRegsCopy);
+
for (auto &&R : Defs) {
Register Reg = R.first;
Uses.erase(Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0398d27756db..c9883d38e08c 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -20,18 +20,16 @@ using namespace llvm;
#define DEBUG_TYPE "frame-info"
-
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+ "amdgpu-spill-vgpr-to-agpr",
+ cl::desc("Enable spilling VGPRs to AGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+// Find a scratch register that we can use in the prologue. We avoid using
+// callee-save registers since they may appear to be free when this is called
+// from canUseAsPrologue (during shrink wrapping), but then no longer be free
+// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
LivePhysRegs &LiveRegs,
const TargetRegisterClass &RC,
@@ -55,12 +53,6 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
}
}
- // If we require an unused register, this is used in contexts where failure is
- // an option and has an alternative plan. In other contexts, this must
- // succeed0.
- if (!Unused)
- report_fatal_error("failed to find free scratch register");
-
return MCRegister();
}
@@ -72,10 +64,8 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-#ifndef NDEBUG
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
-#endif
// We need to save and restore the current FP/BP.
@@ -105,7 +95,7 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
TargetStackID::SGPRSpill);
- if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+ if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
// 3: There's no free lane to spill, and no free register to save FP/BP,
// so we're forced to spill another VGPR to use for the spill.
FrameIndex = NewFI;
@@ -131,166 +121,45 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
-static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
+static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo &FuncInfo,
+ LivePhysRegs &LiveRegs, MachineFunction &MF,
MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, Register SpillReg,
- Register ScratchRsrcReg, Register SPReg, int FI) {
- MachineFunction *MF = MBB.getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineBasicBlock::iterator I, Register SpillReg,
+ int FI) {
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- int64_t Offset = MFI.getObjectOffset(FI);
-
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
- MFI.getObjectAlign(FI));
-
- if (ST.enableFlatScratch()) {
- if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
- .addReg(SpillReg, RegState::Kill)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- return;
- }
- } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
- .addReg(SpillReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
- return;
- }
-
- // Don't clobber the TmpVGPR if we also need a scratch reg for the stack
- // offset in the spill.
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
+ FrameInfo.getObjectAlign(FI));
LiveRegs.addReg(SpillReg);
-
- if (ST.enableFlatScratch()) {
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(SPReg)
- .addImm(Offset);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
- .addReg(SpillReg, RegState::Kill)
- .addReg(OffsetReg, RegState::Kill)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- } else {
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
- .addImm(Offset);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
- .addReg(SpillReg, RegState::Kill)
- .addReg(OffsetReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
- }
-
+ TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, true,
+ FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
+ &LiveRegs);
LiveRegs.removeReg(SpillReg);
}
-static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, Register SpillReg,
- Register ScratchRsrcReg, Register SPReg, int FI) {
- MachineFunction *MF = MBB.getParent();
- MachineFrameInfo &MFI = MF->getFrameInfo();
- int64_t Offset = MFI.getObjectOffset(FI);
-
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
- MFI.getObjectAlign(FI));
-
- if (ST.enableFlatScratch()) {
- if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- BuildMI(MBB, I, DebugLoc(),
- TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- return;
- }
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(SPReg)
- .addImm(Offset);
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR),
- SpillReg)
- .addReg(OffsetReg, RegState::Kill)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // dlc
- .addMemOperand(MMO);
- return;
- }
-
- if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
- BuildMI(MBB, I, DebugLoc(),
- TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
- return;
- }
+static void buildEpilogRestore(const GCNSubtarget &ST,
+ const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo &FuncInfo,
+ LivePhysRegs &LiveRegs, MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, Register SpillReg,
+ int FI) {
+ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
- MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
-
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
- .addImm(Offset);
-
- BuildMI(MBB, I, DebugLoc(),
- TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
- .addReg(OffsetReg, RegState::Kill)
- .addReg(ScratchRsrcReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(MMO);
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
+ FrameInfo.getObjectAlign(FI));
+ TRI.buildSpillLoadStore(MBB, I, Opc, FI, SpillReg, false,
+ FuncInfo.getStackPtrOffsetReg(), 0, MMO, nullptr,
+ &LiveRegs);
}
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
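
As a rough illustration of what the rewritten buildPrologSpill/buildEpilogRestore still decide locally, only the opcode choice depends on whether flat scratch is enabled; everything else is delegated to the shared spill builder. The sketch below models just that choice with plain strings and is an assumption-laden simplification, not the real helper.

#include <cstdio>

// Hypothetical helper mirroring the opcode choice: flat scratch uses the
// scratch_* addressing form, otherwise the buffer_* form is used.
static const char *spillOpcode(bool HasFlatScratch, bool IsStore) {
  if (HasFlatScratch)
    return IsStore ? "scratch_store_dword_saddr" : "scratch_load_dword_saddr";
  return IsStore ? "buffer_store_dword_offset" : "buffer_load_dword_offset";
}

int main() {
  std::printf("%s\n%s\n", spillOpcode(true, true), spillOpcode(false, false));
}
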
@@ -384,8 +253,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
.addReg(FlatScrInit)
.addImm(EncodedOffset) // offset
- .addImm(0) // glc
- .addImm(0) // dlc
+ .addImm(0) // cpol
.addMemOperand(MMO);
// Mask the offset in [47:0] of the descriptor
@@ -445,9 +313,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
- .addReg(FlatScrInitLo)
- .addReg(ScratchWaveOffsetReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
// Convert offset to 256-byte units.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
@@ -545,6 +413,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
@@ -622,7 +491,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
- .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
+ .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
}
if (hasFP(MF)) {
@@ -631,12 +500,18 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+ bool NeedsFlatScratchInit =
+ MFI->hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
+ if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
+ !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
- if (MFI->hasFlatScratchInit()) {
+ if (NeedsFlatScratchInit) {
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
@@ -663,6 +538,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
buildGitPtr(MBB, I, DL, TII, Rsrc01);
@@ -681,10 +557,23 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
.addImm(EncodedOffset) // offset
- .addImm(0) // glc
- .addImm(0) // dlc
+ .addImm(0) // cpol
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
+
+  // The driver will always set the SRD for wave 64 (bits 118:117 of the
+  // descriptor / bits 22:21 of the third sub-reg will be 0b11).
+  // If the shader is actually wave32, we have to modify the const_index_stride
+  // field of the descriptor's 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
+  // reason the driver does this is that there can be cases where it presents
+  // two shaders with different wave sizes (e.g. VsFs).
+ // TODO: convert to using SCRATCH instructions or multiple SRD buffers
+ if (ST.isWave32()) {
+ const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
+ BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
+ .addImm(21)
+ .addReg(Rsrc03);
+ }
} else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
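
A self-contained sketch of the descriptor tweak introduced above, under the stated assumption that bits 22:21 of the SRD's third dword hold const_index_stride: the driver programs 0b11 (stride 64), and clearing bit 21, as s_bitset0_b32 does, leaves 0b10 (stride 32) for wave32 shaders.

#include <cstdint>
#include <cstdio>

// Assumed layout: const_index_stride lives in bits 22:21 of the SRD's third
// dword. Clearing bit 21 turns the driver's wave64 value 0b11 into 0b10.
static uint32_t makeWave32Stride(uint32_t Rsrc3) {
  return Rsrc3 & ~(1u << 21); // same effect as s_bitset0_b32 rsrc3, 21
}

int main() {
  uint32_t Rsrc3 = 3u << 21; // driver default: stride field = 0b11 (wave64)
  uint32_t Fixed = makeWave32Stride(Rsrc3);
  std::printf("stride field: 0b%u%u\n", (Fixed >> 22) & 1u, (Fixed >> 21) & 1u);
}
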
@@ -716,8 +605,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
- .addImm(0) // glc
- .addImm(0) // dlc
+ .addImm(0) // cpol
.addMemOperand(MMO)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
@@ -785,11 +673,28 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::SGPRSpill:
return true;
case TargetStackID::ScalableVector:
+ case TargetStackID::WasmLocal:
return false;
}
llvm_unreachable("Invalid TargetStackID::Value");
}
+static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo *FuncInfo,
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, bool IsProlog) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ if (IsProlog) {
+ LiveRegs.addLiveIns(MBB);
+ } else {
+ // In epilog.
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
+ }
+}
+
// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
MachineFunction &MF,
@@ -804,28 +709,14 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
DebugLoc DL;
- if (LiveRegs.empty()) {
- if (IsProlog) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
- if (FuncInfo->SGPRForFPSaveRestoreCopy)
- LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
-
- if (FuncInfo->SGPRForBPSaveRestoreCopy)
- LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
- } else {
- // In epilog.
- LiveRegs.init(*ST.getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
- }
- }
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
ScratchExecCopy = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ if (!ScratchExecCopy)
+ report_fatal_error("failed to find free scratch register");
- if (!IsProlog)
- LiveRegs.removeReg(ScratchExecCopy);
+ LiveRegs.addReg(ScratchExecCopy);
const unsigned OrSaveExec =
ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
@@ -834,6 +725,13 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
return ScratchExecCopy;
}
+// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+// Otherwise we are spilling to memory.
+static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -865,126 +763,93 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// turn on all lanes before doing the spill to memory.
Register ScratchExecCopy;
- bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
- bool SpillFPToMemory = false;
- // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
- // Otherwise we are spilling the FP to memory.
- if (HasFPSaveIndex) {
- SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
- }
-
- bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
- bool SpillBPToMemory = false;
- // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
- // Otherwise we are spilling the BP to memory.
- if (HasBPSaveIndex) {
- SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
- }
-
- // Emit the copy if we need an FP, and are using a free SGPR to save it.
- if (FuncInfo->SGPRForFPSaveRestoreCopy) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
- .addReg(FramePtrReg)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // Emit the copy if we need a BP, and are using a free SGPR to save it.
- if (FuncInfo->SGPRForBPSaveRestoreCopy) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
- FuncInfo->SGPRForBPSaveRestoreCopy)
- .addReg(BasePtrReg)
- .setMIFlag(MachineInstr::FrameSetup);
- }
+ Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
+ Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
- // If a copy has been emitted for FP and/or BP, Make the SGPRs
- // used in the copy instructions live throughout the function.
- SmallVector<MCPhysReg, 2> TempSGPRs;
- if (FuncInfo->SGPRForFPSaveRestoreCopy)
- TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
-
- if (FuncInfo->SGPRForBPSaveRestoreCopy)
- TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+ // VGPRs used for SGPR->VGPR spills
+ for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
+ FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI)
+ continue;
- if (!TempSGPRs.empty()) {
- for (MachineBasicBlock &MBB : MF) {
- for (MCPhysReg Reg : TempSGPRs)
- MBB.addLiveIn(Reg);
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI,
+ /*IsProlog*/ true);
- MBB.sortUniqueLiveIns();
- }
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR,
+ *Reg.FI);
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
+ // VGPRs used for Whole Wave Mode
+ for (const auto &Reg : FuncInfo->WWMReservedRegs) {
+ auto VGPR = Reg.first;
+ auto FI = Reg.second;
+ if (!FI)
continue;
if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true);
- buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
- FuncInfo->getScratchRSrcReg(),
- StackPtrReg,
- Reg.FI.getValue());
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI);
}
- if (HasFPSaveIndex && SpillFPToMemory) {
- assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
+ if (ScratchExecCopy) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
+ LiveRegs.addReg(ScratchExecCopy);
+ }
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+ if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) {
+ const int FramePtrFI = *FPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(FramePtrFI));
+
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(FramePtrReg);
- buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg,
- FuncInfo->FramePointerSaveIndex.getValue());
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ FramePtrFI);
}
- if (HasBPSaveIndex && SpillBPToMemory) {
- assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+ if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) {
+ const int BasePtrFI = *BPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(BasePtrReg);
- buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg,
- *FuncInfo->BasePointerSaveIndex);
- }
-
- if (ScratchExecCopy) {
- // FIXME: Split block and make terminator.
- unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy, RegState::Kill);
- LiveRegs.addReg(ScratchExecCopy);
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ BasePtrFI);
}
// In this case, spill the FP to a reserved VGPR.
- if (HasFPSaveIndex && !SpillFPToMemory) {
- const int FI = FuncInfo->FramePointerSaveIndex.getValue();
- assert(!MFI.isDeadObjectIndex(FI));
+ if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) {
+ const int FramePtrFI = *FPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(FramePtrFI));
- assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FI);
+ FuncInfo->getSGPRToVGPRSpills(FramePtrFI);
assert(Spill.size() == 1);
// Save FP before setting it up.
- // FIXME: This should respect spillSGPRToVGPR;
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
.addReg(FramePtrReg)
.addImm(Spill[0].Lane)
@@ -992,8 +857,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
// In this case, spill the BP to a reserved VGPR.
- if (HasBPSaveIndex && !SpillBPToMemory) {
- const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) {
+ const int BasePtrFI = *BPSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
@@ -1002,14 +867,51 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(Spill.size() == 1);
// Save BP before setting it up.
- // FIXME: This should respect spillSGPRToVGPR;
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR)
.addReg(BasePtrReg)
.addImm(Spill[0].Lane)
.addReg(Spill[0].VGPR, RegState::Undef);
}
- if (TRI.needsStackRealignment(MF)) {
+ // Emit the copy if we need an FP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForFPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+ FuncInfo->SGPRForFPSaveRestoreCopy)
+ .addReg(FramePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Emit the copy if we need a BP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+ FuncInfo->SGPRForBPSaveRestoreCopy)
+ .addReg(BasePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+  // If a copy has been emitted for FP and/or BP, make the SGPRs
+  // used in the copy instructions live throughout the function.
+ SmallVector<MCPhysReg, 2> TempSGPRs;
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+ if (!TempSGPRs.empty()) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MCPhysReg Reg : TempSGPRs)
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+ if (!LiveRegs.empty()) {
+ LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ }
+ }
+
+ if (TRI.hasStackRealignment(MF)) {
HasFP = true;
const unsigned Alignment = MFI.getMaxAlign().value();
@@ -1017,23 +919,16 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (LiveRegs.empty()) {
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
- LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
}
- Register ScratchSPReg = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
- assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
- ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
-
- // s_add_u32 tmp_reg, s32, NumBytes
- // s_and_b32 s32, tmp_reg, 0b111...0000
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+ // s_add_i32 s33, s32, NumBytes
+ // s_and_b32 s33, s33, 0b111...0000
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
.addReg(StackPtrReg)
.addImm((Alignment - 1) * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
- .addReg(ScratchSPReg, RegState::Kill)
+ .addReg(FramePtrReg, RegState::Kill)
.addImm(-Alignment * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
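
For clarity, a standalone sketch of the realignment arithmetic performed by the s_add_i32/s_and_b32 pair above: add (alignment - 1) in scaled units, then mask with the negated scaled alignment to round up. The Scale parameter stands in for getScratchScaleFactor and is an assumption for illustration.

#include <cstdint>
#include <cstdio>

// Round SP up to a multiple of Align * Scale (both powers of two): add the
// scaled (Align - 1), then mask with the negated scaled alignment.
static uint32_t alignScaledUp(uint32_t SP, uint32_t Align, uint32_t Scale) {
  uint32_t Tmp = SP + (Align - 1) * Scale; // the s_add_i32
  return Tmp & ~(Align * Scale - 1);       // the s_and_b32 with -(Align*Scale)
}

int main() {
  // With a 16-unit alignment and a scale of 4, 100 rounds up to 128.
  std::printf("%u\n", alignScaledUp(100, 16, 4));
}
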
@@ -1054,7 +949,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
if (HasFP && RoundedSize != 0) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * getScratchScaleFactor(ST))
.setMIFlag(MachineInstr::FrameSetup);
@@ -1101,58 +996,47 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const Register BasePtrReg =
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
- bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
- bool SpillFPToMemory = false;
- if (HasFPSaveIndex) {
- SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
- }
-
- bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
- bool SpillBPToMemory = false;
- if (HasBPSaveIndex) {
- SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
- TargetStackID::SGPRSpill;
- }
+ Optional<int> FPSaveIndex = FuncInfo->FramePointerSaveIndex;
+ Optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;
if (RoundedSize != 0 && hasFP(MF)) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * getScratchScaleFactor(ST))
- .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
+ .setMIFlag(MachineInstr::FrameDestroy);
}
if (FuncInfo->SGPRForFPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
.addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
- .setMIFlag(MachineInstr::FrameSetup);
+ .setMIFlag(MachineInstr::FrameDestroy);
}
if (FuncInfo->SGPRForBPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
.addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
- .setMIFlag(MachineInstr::FrameSetup);
+ .setMIFlag(MachineInstr::FrameDestroy);
}
- Register ScratchExecCopy;
- if (HasFPSaveIndex) {
- const int FI = FuncInfo->FramePointerSaveIndex.getValue();
- assert(!MFI.isDeadObjectIndex(FI));
- if (SpillFPToMemory) {
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
-
- MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ if (FPSaveIndex) {
+ const int FramePtrFI = *FPSaveIndex;
+ assert(!MFI.isDeadObjectIndex(FramePtrFI));
+ if (spilledToMemory(MF, FramePtrFI)) {
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ FramePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
- .addReg(TempVGPR, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
} else {
// Reload from VGPR spill.
- assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill);
ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
- FuncInfo->getSGPRToVGPRSpills(FI);
+ FuncInfo->getSGPRToVGPRSpills(FramePtrFI);
assert(Spill.size() == 1);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg)
.addReg(Spill[0].VGPR)
@@ -1160,19 +1044,20 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
- if (HasBPSaveIndex) {
- const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ if (BPSaveIndex) {
+ const int BasePtrFI = *BPSaveIndex;
assert(!MFI.isDeadObjectIndex(BasePtrFI));
- if (SpillBPToMemory) {
- if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+ if (spilledToMemory(MF, BasePtrFI)) {
+ initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
- MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
- buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+ if (!TmpVGPR)
+ report_fatal_error("failed to find free scratch register");
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, TmpVGPR,
+ BasePtrFI);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
- .addReg(TempVGPR, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
} else {
// Reload from VGPR spill.
assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
@@ -1185,17 +1070,31 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
+ Register ScratchExecCopy;
+ for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
+ if (!Reg.FI)
+ continue;
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
+
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, Reg.VGPR,
+ *Reg.FI);
+ }
+
+ for (const auto &Reg : FuncInfo->WWMReservedRegs) {
+ auto VGPR = Reg.first;
+ auto FI = Reg.second;
+ if (!FI)
continue;
if (!ScratchExecCopy)
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false);
- buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR,
- FuncInfo->getScratchRSrcReg(), StackPtrReg,
- Reg.FI.getValue());
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, VGPR, *FI);
}
if (ScratchExecCopy) {
@@ -1240,9 +1139,73 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFrameInfo &MFI = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
+ && EnableSpillVGPRToAGPR;
+
+ if (SpillVGPRToAGPR) {
+ // To track the spill frame indices handled in this pass.
+ BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
+
+ bool SeenDbgInstr = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator Next;
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ if (MI.isDebugInstr())
+ SeenDbgInstr = true;
+
+ if (TII->isVGPRSpill(MI)) {
+ // Try to eliminate stack used by VGPR spills before frame
+ // finalization.
+ unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr);
+ int FI = MI.getOperand(FIOp).getIndex();
+ Register VReg =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
+ if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
+ TRI->isAGPR(MRI, VReg))) {
+ // FIXME: change to enterBasicBlockEnd()
+ RS->enterBasicBlock(MBB);
+ TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
+ SpillFIs.set(FI);
+ continue;
+ }
+ }
+ }
+ }
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
+ MBB.addLiveIn(Reg);
+
+ for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+
+ if (!SpillFIs.empty() && SeenDbgInstr) {
+        // FIXME: The dead frame indices are replaced with a null register from
+        // the debug value instructions. We should instead update it with the
+        // correct register value. But it is not clear that the register value
+        // alone is enough.
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
+ SpillFIs[MI.getOperand(0).getIndex()]) {
+ MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
+ MI.getOperand(0).setIsDebug();
+ }
+ }
+ }
+ }
+ }
+
FuncInfo->removeDeadFrameIndices(MFI);
assert(allSGPRSpillsAreDead(MF) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
@@ -1253,16 +1216,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!allStackObjectsAreDead(MFI)) {
assert(RS && "RegScavenger required if spilling");
- if (FuncInfo->isEntryFunction()) {
- int ScavengeFI = MFI.CreateFixedObject(
- TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
- RS->addScavengingFrameIndex(ScavengeFI);
- } else {
- int ScavengeFI = MFI.CreateStackObject(
- TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
- TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
- RS->addScavengingFrameIndex(ScavengeFI);
- }
+ // Add an emergency spill slot
+ RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
}
}
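
A simplified, hypothetical model of the debug-value cleanup added to processFunctionBeforeFrameFinalized above: frame indices whose spills were folded into AGPRs are recorded, and debug values that still reference such an index are dropped to an empty location. The struct and function names below are invented for illustration only.

#include <cstdio>
#include <vector>

// Invented stand-ins: a debug value pointing at a frame index, and a bit per
// frame index recording whether its spill was rewritten to an AGPR.
struct DbgValueModel {
  int FrameIndex;
  bool HasLocation;
};

static void dropDeadFrameIndexLocations(std::vector<DbgValueModel> &DbgValues,
                                        const std::vector<bool> &SpillFIs) {
  for (DbgValueModel &DV : DbgValues)
    if (DV.FrameIndex >= 0 &&
        static_cast<size_t>(DV.FrameIndex) < SpillFIs.size() &&
        SpillFIs[DV.FrameIndex])
      DV.HasLocation = false; // no longer backed by a real stack slot
}

int main() {
  std::vector<DbgValueModel> DVs = {{0, true}, {2, true}};
  std::vector<bool> SpillFIs = {false, false, true};
  dropDeadFrameIndexLocations(DVs, SpillFIs);
  std::printf("%d %d\n", DVs[0].HasLocation, DVs[1].HasLocation); // 1 0
}
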
@@ -1280,7 +1235,13 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Ignore the SGPRs the default implementation found.
- SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+ SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
+
+  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
+  // In gfx908 there is no way to do AGPR loads and stores directly, so
+  // spilling an AGPR also requires a temporary VGPR.
+ if (!ST.hasGFX90AInsts())
+ SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
// hasFP only knows about stack objects that already exist. We're now
// determining the stack slots that will be created, so we have to predict
@@ -1335,7 +1296,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
SavedRegs.reset(MFI->getStackPtrOffsetReg());
const BitVector AllSavedRegs = SavedRegs;
- SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
+ SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
// If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
@@ -1409,10 +1370,12 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Register SPReg = MFI->getStackPtrOffsetReg();
- unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
- BuildMI(MBB, I, DL, TII->get(Op), SPReg)
- .addReg(SPReg)
- .addImm(Amount * getScratchScaleFactor(ST));
+ Amount *= getScratchScaleFactor(ST);
+ if (IsDestroy)
+ Amount = -Amount;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
+ .addReg(SPReg)
+ .addImm(Amount);
} else if (CalleePopAmount != 0) {
llvm_unreachable("is this used?");
}
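
A small sketch of the immediate computed for the single s_add_i32 above: the call-frame amount is scaled by the subtarget's scratch scale factor and, for the destroy pseudo, negated, so one signed add replaces the old add/sub pair. This is an illustrative model, not the in-tree code.

#include <cstdint>
#include <cstdio>

// Scale the call-frame amount, and negate it when tearing the frame down.
static int64_t spAdjustment(int64_t Amount, bool IsDestroy, int64_t Scale) {
  Amount *= Scale;
  return IsDestroy ? -Amount : Amount;
}

int main() {
  std::printf("setup %+lld, destroy %+lld\n",
              static_cast<long long>(spAdjustment(32, false, 4)),
              static_cast<long long>(spAdjustment(32, true, 4)));
}
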
@@ -1450,8 +1413,9 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
}
return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
- MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
- MF.getTarget().Options.DisableFramePointerElim(MF);
+ MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
+ MF) ||
+ MF.getTarget().Options.DisableFramePointerElim(MF);
}
// This is essentially a reduced version of hasFP for entry functions. Since the
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 839437b5e3f8..d98acfc6c532 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,11 +19,13 @@
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
@@ -80,36 +82,49 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
- addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
+
+ addRegisterClass(MVT::f64, V64RegClass);
+ addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
+
+ addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
@@ -123,7 +138,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -139,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -148,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v5i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v6i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v7i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -170,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
@@ -197,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
@@ -239,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+ MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
@@ -249,10 +279,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
- case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
case ISD::SCALAR_TO_VECTOR:
break;
+ case ISD::INSERT_SUBVECTOR:
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
break;
@@ -284,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
+ for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
+ }
+
for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
@@ -336,17 +380,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Avoid stack access for these.
// TODO: Generalize to more vector types.
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
-
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
@@ -362,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
- // Deal with vec5 vector operations when widened to vec8.
+ // Deal with vec5/6/7 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
@@ -384,6 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// FIXME: This should be narrowed to i32, but that only happens if i64 is
// illegal.
@@ -525,8 +571,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -718,6 +764,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+
+ if (Subtarget->hasPackedFP32Ops()) {
+ setOperationAction(ISD::FADD, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f32, Legal);
+
+ for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ }
+ }
}
setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
@@ -1128,17 +1187,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
- return true;
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1150,6 +1198,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MODereferenceable;
return true;
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1191,6 +1255,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
@@ -1210,9 +1277,9 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
}
return AM.Scale == 0 &&
- (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS,
- /*Signed=*/false));
+ (AM.BaseOffs == 0 ||
+ Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
@@ -1220,7 +1287,7 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
return AM.Scale == 0 &&
(AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
- /*Signed=*/true));
+ SIInstrFlags::FlatGlobal));
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
     // Assume we will use FLAT for all global memory accesses
@@ -1385,10 +1452,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return true;
}
+  // Either the alignment requirements are "enabled", or there is an
+  // unaligned-LDS-access hardware bug even though the alignment requirements
+  // are "disabled". In either case, we need to check the alignment
+  // requirements explicitly.
+ //
if (Size == 64) {
- // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
- // aligned, 8 byte access in a single operation using ds_read2/write2_b32
- // with adjacent offsets.
+    // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
+    // can do a 4-byte-aligned, 8-byte access in a single operation using
+    // ds_read2/write2_b32 with adjacent offsets.
bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
@@ -1396,22 +1468,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
if (Size == 96) {
- // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
- bool Aligned = Alignment >= Align(16);
+    // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
+    // gfx8 and older.
+ bool AlignedBy16 = Alignment >= Align(16);
if (IsFast)
- *IsFast = Aligned;
+ *IsFast = AlignedBy16;
- return Aligned;
+ return AlignedBy16;
}
if (Size == 128) {
- // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
- // can do a 8 byte aligned, 16 byte access in a single operation using
- // ds_read2/write2_b64.
- bool Aligned = Alignment >= Align(8);
+    // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
+    // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
+    // single operation using ds_read2/write2_b64.
+ bool AlignedBy8 = Alignment >= Align(8);
if (IsFast)
- *IsFast = Aligned;
+ *IsFast = AlignedBy8;
- return Aligned;
+ return AlignedBy8;
}
}
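
As a summary of the LDS rules restated in the comments above (gfx8 and older, alignment checks in effect), the sketch below returns the minimum alignment that still permits a single-instruction access for each size. It is an illustrative table, not an exhaustive statement of the hardware rules.

#include <cstdio>

// Minimum alignment, in bytes, that still allows a single LDS access of the
// given size on gfx8 and older when alignment checks are in effect.
static unsigned minLDSAlign(unsigned SizeInBits) {
  switch (SizeInBits) {
  case 64:
    return 4; // ds_read2/write2_b32 with adjacent offsets
  case 96:
    return 16; // ds_read/write_b96 needs full alignment pre-gfx9
  case 128:
    return 8; // ds_read2/write2_b64 handles the 8-byte-aligned case
  default:
    return SizeInBits / 8; // assume natural alignment otherwise
  }
}

int main() {
  std::printf("%u %u %u\n", minLDSAlign(64), minLDSAlign(96), minLDSAlign(128));
}
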
@@ -1467,8 +1540,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Alignment,
- MachineMemOperand::Flags Flags, bool *IsFast) const {
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1482,7 +1555,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Align(Alignment), Flags, IsFast);
+ Alignment, Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1535,8 +1608,8 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
- int NumElts = VT.getVectorNumElements();
- if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ VT.getScalarType().bitsLE(MVT::i16))
return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -1799,23 +1872,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+ unsigned Mask = (Subtarget->hasPackedTID() &&
+ Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
}
if (Info.hasWorkItemIDY()) {
- Register Reg = AMDGPU::VGPR1;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+ assert(Info.hasWorkItemIDX());
+ if (Subtarget->hasPackedTID()) {
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+ 0x3ff << 10));
+ } else {
+ unsigned Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ }
}
if (Info.hasWorkItemIDZ()) {
- Register Reg = AMDGPU::VGPR2;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+ assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
+ if (Subtarget->hasPackedTID()) {
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+ 0x3ff << 20));
+ } else {
+ unsigned Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ }
}
}
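
A standalone sketch of the packed work-item ID layout assumed by the masks above: on subtargets with packed TIDs, X, Y and Z share one 32-bit VGPR in 10-bit fields selected by 0x3ff, 0x3ff << 10 and 0x3ff << 20. The struct below is hypothetical and only illustrates the unpacking.

#include <cstdint>
#include <cstdio>

struct WorkItemId {
  uint32_t X, Y, Z;
};

// Unpack the three 10-bit work-item IDs from one packed VGPR value, using the
// same masks the lowering installs: 0x3ff, 0x3ff << 10, 0x3ff << 20.
static WorkItemId unpackTID(uint32_t Packed) {
  return {Packed & 0x3ffu, (Packed >> 10) & 0x3ffu, (Packed >> 20) & 0x3ffu};
}

int main() {
  WorkItemId Id = unpackTID((5u << 20) | (7u << 10) | 9u);
  std::printf("x=%u y=%u z=%u\n", Id.X, Id.Y, Id.Z); // x=9 y=7 z=5
}
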
@@ -1865,12 +1952,32 @@ static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
return ArgDescriptor::createRegister(Reg);
}
-static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
- return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+// If this has a fixed position, we still should allocate the register in the
+// CCInfo state. Technically we could get away without this for values passed
+// outside of the normal argument range.
+static void allocateFixedSGPRInputImpl(CCState &CCInfo,
+ const TargetRegisterClass *RC,
+ MCRegister Reg) {
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, RC);
+}
+
+static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
+ if (Arg) {
+ allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
+ Arg.getRegister());
+ } else
+ Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}
-static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
- return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
+static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
+ if (Arg) {
+ allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
+ Arg.getRegister());
+ } else
+ Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
/// Allocate implicit function VGPR arguments at the end of allocated user
@@ -1919,29 +2026,29 @@ void SITargetLowering::allocateSpecialInputSGPRs(
// TODO: Unify handling with private memory pointers.
if (Info.hasDispatchPtr())
- ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
if (Info.hasQueuePtr())
- ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
if (Info.hasImplicitArgPtr())
- ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
if (Info.hasDispatchID())
- ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
if (Info.hasWorkGroupIDX())
- ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
if (Info.hasWorkGroupIDY())
- ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
if (Info.hasWorkGroupIDZ())
- ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
}
// Allocate special inputs passed in user SGPRs.
@@ -2203,6 +2310,8 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
+ Info->allocateModuleLDSGlobal(Fn.getParent());
+
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
@@ -2767,6 +2876,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) {
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
+ case CallingConv::AMDGPU_Gfx:
return true;
default:
return canGuaranteeTCO(CC);
@@ -2781,6 +2891,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
if (!mayTailCallThisCC(CalleeCC))
return false;
+ // For a divergent call target, we need to do a waterfall loop over the
+ // possible callees which precludes us from using a simple jump.
+ if (Callee->isDivergent())
+ return false;
+
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -2888,12 +3003,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!CLI.CB)
report_fatal_error("unsupported libcall legalization");
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
- return lowerUnhandledCall(CLI, InVals,
- "unsupported indirect call to function ");
- }
-
if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
return lowerUnhandledCall(CLI, InVals,
"unsupported required tail call to function ");
@@ -3054,7 +3163,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// locations, which are supposed to be immutable?
Chain = addTokenForArgument(Chain, DAG, MFI, FI);
} else {
- DstAddr = PtrOff;
+ // Stores to the argument stack area are relative to the stack pointer.
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
+ MVT::i32);
+ DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Alignment =
commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
@@ -4150,11 +4262,35 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return BB;
}
case AMDGPU::DS_GWS_INIT:
- case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_BR:
+ case AMDGPU::DS_GWS_BARRIER:
+ if (Subtarget->needsAlignedVGPRs()) {
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ Register DataReg = Op->getReg();
+ bool IsAGPR = TRI->isAGPR(MRI, DataReg);
+ Register Undef = MRI.createVirtualRegister(
+ IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
+ : &AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(DataReg, 0, Op->getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ Op->setReg(NewVR);
+ Op->setSubReg(AMDGPU::sub0);
+ MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
+ }
+ LLVM_FALLTHROUGH;
+ case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
- case AMDGPU::DS_GWS_BARRIER:
// An s_waitcnt 0 is required to be the instruction immediately following.
if (getSubtarget()->hasGWSAutoReplay()) {
bundleInstWithWaitcnt(MI);
@@ -4360,7 +4496,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4381,7 +4518,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4456,6 +4594,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerFMINNUM_FMAXNUM(Op, DAG);
case ISD::FMA:
return splitTernaryVectorOp(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -5092,12 +5233,35 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
+ if (!Subtarget->isTrapHandlerEnabled() ||
+ Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
+ return lowerTrapEndpgm(Op, DAG);
+
+ if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return lowerTrapHsaQueuePtr(Op, DAG);
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return Subtarget->supportsGetDoorbellID() ?
+ lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
+ }
+ }
+
+ llvm_unreachable("Unknown trap handler");
+}
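
The dispatch above can be read as a small decision table. A rough standalone sketch of that logic follows, assuming the same three outcomes (endpgm, queue-pointer trap, plain HSA trap); the enum and parameter names here are illustrative stand-ins for the subtarget and ABI queries, not the LLVM API.

    enum class TrapLowering { Endpgm, HsaQueuePtr, Hsa };

    // Illustrative analog of the dispatch in lowerTRAP: with no enabled AMDHSA
    // trap handler the trap degenerates to s_endpgm; HSA ABI v2/v3 pass the
    // queue pointer in SGPR0_SGPR1; HSA ABI v4 can use the bare trap when the
    // doorbell ID is available, and otherwise falls back to the queue-pointer
    // form.
    TrapLowering pickTrapLowering(bool TrapHandlerEnabled, bool IsAMDHSAAbi,
                                  bool IsHsaAbiV4, bool SupportsGetDoorbellID) {
      if (!TrapHandlerEnabled || !IsAMDHSAAbi)
        return TrapLowering::Endpgm;
      if (!IsHsaAbiV4)
        return TrapLowering::HsaQueuePtr;
      return SupportsGetDoorbellID ? TrapLowering::Hsa
                                   : TrapLowering::HsaQueuePtr;
    }
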
+
+SDValue SITargetLowering::lowerTrapEndpgm(
+ SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+}
- if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !Subtarget->isTrapHandlerEnabled())
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+SDValue SITargetLowering::lowerTrapHsaQueuePtr(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -5108,22 +5272,37 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
QueuePtr, SDValue());
+
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
SDValue Ops[] = {
ToReg,
- DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ DAG.getTargetConstant(TrapID, SL, MVT::i16),
SGPR01,
ToReg.getValue(1)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
+SDValue SITargetLowering::lowerTrapHsa(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(TrapID, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
+
SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
- if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !Subtarget->isTrapHandlerEnabled()) {
+ if (!Subtarget->isTrapHandlerEnabled() ||
+ Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -5133,9 +5312,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
return Chain;
}
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
SDValue Ops[] = {
Chain,
- DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ DAG.getTargetConstant(TrapID, SL, MVT::i16)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
@@ -5666,23 +5846,10 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
ArrayRef<SDValue> Elts) {
assert(!Elts.empty());
MVT Type;
- unsigned NumElts;
-
- if (Elts.size() == 1) {
- Type = MVT::f32;
- NumElts = 1;
- } else if (Elts.size() == 2) {
- Type = MVT::v2f32;
- NumElts = 2;
- } else if (Elts.size() == 3) {
- Type = MVT::v3f32;
- NumElts = 3;
- } else if (Elts.size() <= 4) {
- Type = MVT::v4f32;
- NumElts = 4;
- } else if (Elts.size() <= 8) {
- Type = MVT::v8f32;
- NumElts = 8;
+ unsigned NumElts = Elts.size();
+
+ if (NumElts <= 8) {
+ Type = MVT::getVectorVT(MVT::f32, NumElts);
} else {
assert(Elts.size() <= 16);
Type = MVT::v16f32;
@@ -5704,28 +5871,6 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
return DAG.getBuildVector(Type, DL, VecElts);
}
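
With the special cases folded away, the padded dword vector type is derived directly from the element count. A minimal standalone sketch of that mapping, assuming v1f32 through v8f32 are all valid MVTs here and using a hypothetical helper name:

    #include <cassert>

    // Illustrative analog of the type selection in getBuildDwordsVector: up to
    // eight dwords use an exact vNf32 type, anything larger (up to 16) is
    // padded out to v16f32 with undef elements.
    unsigned pickDwordVectorWidth(unsigned NumElts) {
      assert(NumElts != 0 && NumElts <= 16);
      return NumElts <= 8 ? NumElts : 16;
    }
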
-static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
- SDValue *GLC, SDValue *SLC, SDValue *DLC) {
- auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
-
- uint64_t Value = CachePolicyConst->getZExtValue();
- SDLoc DL(CachePolicy);
- if (GLC) {
- *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x1;
- }
- if (SLC) {
- *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x2;
- }
- if (DLC) {
- *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x4;
- }
-
- return Value == 0;
-}
-
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
SDValue Src, int ExtraElts) {
EVT SrcVT = Src.getValueType();
@@ -5752,7 +5897,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
ArrayRef<EVT> ResultTypes,
bool IsTexFail, bool Unpacked, bool IsD16,
int DMaskPop, int NumVDataDwords,
- const SDLoc &DL, LLVMContext &Context) {
+ const SDLoc &DL) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
@@ -5835,11 +5980,11 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
return Value == 0;
}
-static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
- MVT PackVectorVT,
- SmallVectorImpl<SDValue> &PackedAddrs,
- unsigned DimIdx, unsigned EndIdx,
- unsigned NumGradients) {
+static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
+ MVT PackVectorVT,
+ SmallVectorImpl<SDValue> &PackedAddrs,
+ unsigned DimIdx, unsigned EndIdx,
+ unsigned NumGradients) {
SDLoc DL(Op);
for (unsigned I = DimIdx; I < EndIdx; I++) {
SDValue Addr = Op.getOperand(I);
@@ -5994,56 +6139,64 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
MVT VAddrVT =
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT VAddrScalarVT = VAddrVT.getScalarType();
- MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+ MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
VAddrScalarVT = VAddrVT.getScalarType();
+ MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
- if (IsA16 || IsG16) {
- if (IsA16) {
- if (!ST->hasA16()) {
- LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit addresses\n");
- return Op;
- }
- if (!IsG16) {
- LLVM_DEBUG(
- dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
- "need 16 bit derivatives but got 32 bit derivatives\n");
- return Op;
- }
- } else if (!ST->hasG16()) {
+
+ if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
+ // 16 bit gradients are supported, but are tied to the A16 control
+ // so both gradients and addresses must be 16 bit
+ LLVM_DEBUG(
+ dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
+ "require 16 bit args for both gradients and addresses");
+ return Op;
+ }
+
+ if (IsA16) {
+ if (!ST->hasA16()) {
LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit derivatives\n");
+ "support 16 bit addresses\n");
return Op;
}
+ }
- if (BaseOpcode->Gradients && !IsA16) {
- if (!ST->hasG16()) {
- LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit derivatives\n");
- return Op;
- }
- // Activate g16
- const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
- AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
- IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
- }
+ // Invalid input has already been rejected, so if IsA16 or IsG16 is set we
+ // know the corresponding operands (addresses, gradients, or both) must be
+ // packed. In the case where A16 and gradients are tied (no G16 support),
+ // we have already verified that both IsA16 and IsG16 are true.
+ if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
+ // Activate g16
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
+ }
- // Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
- packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs,
- ArgOffset + Intr->GradientStart, PackEndIdx,
- Intr->NumGradients);
+ // Add gradients (packed or unpacked)
+ if (IsG16) {
+ // Pack the gradients
+ packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
+ ArgOffset + Intr->GradientStart,
+ ArgOffset + Intr->CoordStart, Intr->NumGradients);
+ } else {
+ for (unsigned I = ArgOffset + Intr->GradientStart;
+ I < ArgOffset + Intr->CoordStart; I++)
+ VAddrs.push_back(Op.getOperand(I));
+ }
- if (!IsA16) {
- // Add uncompressed address
- for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
- VAddrs.push_back(Op.getOperand(I));
- }
+ // Add addresses (packed or unpacked)
+ if (IsA16) {
+ packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
+ ArgOffset + Intr->CoordStart, VAddrEnd,
+ 0 /* No gradients */);
} else {
- for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++)
+ // Add uncompressed address
+ for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
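
The packing above pairs adjacent 16-bit operands into dwords, with gradients and coordinates handled as separate groups. A rough standalone sketch of the pairing, assuming plain uint16_t values in place of SDValues; the gradient-axis boundary handling done via NumGradients in the real helper is omitted.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustrative analog of packImage16bitOpsToDwords: pack adjacent 16-bit
    // operands into 32-bit dwords, low half first.
    std::vector<uint32_t> packHalvesToDwords(const std::vector<uint16_t> &Ops) {
      std::vector<uint32_t> Packed;
      for (std::size_t I = 0; I < Ops.size(); I += 2) {
        uint32_t Lo = Ops[I];
        uint32_t Hi = (I + 1 < Ops.size()) ? Ops[I + 1] : 0; // pad an odd tail
        Packed.push_back(Lo | (Hi << 16));
      }
      return Packed;
    }
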
@@ -6058,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- bool UseNSA =
- ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+ bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
+ VAddrs.size() >= 3 &&
+ VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
SDValue VAddr;
if (!UseNSA)
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
@@ -6120,19 +6274,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
- SDValue GLC;
- SDValue SLC;
- SDValue DLC;
- if (BaseOpcode->Atomic) {
- GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
- DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr))
- return Op;
- } else {
- if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
- DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr))
- return Op;
- }
+ unsigned CPol = cast<ConstantSDNode>(
+ Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
+ if (BaseOpcode->Atomic)
+ CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ if (CPol & ~AMDGPU::CPol::ALL)
+ return Op;
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
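
The separate GLC/SLC/DLC operands are collapsed into one cache-policy immediate here. A minimal sketch of how such a bitmask check works, assuming GLC, SLC and DLC occupy bits 0, 1 and 2; the authoritative values are the backend's AMDGPU::CPol definitions, and the constants below are illustrative only.

    #include <cstdint>

    // Assumed cache-policy bit layout for illustration only.
    constexpr uint32_t CPOL_GLC = 1u << 0;
    constexpr uint32_t CPOL_SLC = 1u << 1;
    constexpr uint32_t CPOL_DLC = 1u << 2;
    constexpr uint32_t CPOL_ALL = CPOL_GLC | CPOL_SLC | CPOL_DLC;

    // Mirrors the validation above: atomics force GLC so the old value is
    // returned, and any bit outside the known set rejects the lowering.
    bool validateCachePolicy(uint32_t &CPol, bool IsAtomic) {
      if (IsAtomic)
        CPol |= CPOL_GLC;
      return (CPol & ~CPOL_ALL) == 0;
    }
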
@@ -6148,16 +6295,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
- if (IsGFX10Plus)
- Ops.push_back(DLC);
- Ops.push_back(GLC);
- Ops.push_back(SLC);
+ Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
if (IsGFX10Plus)
Ops.push_back(IsA16 ? True : False);
- Ops.push_back(TFE);
- Ops.push_back(LWE);
+ if (!Subtarget->hasGFX90AInsts()) {
+ Ops.push_back(TFE); // tfe
+ } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
+ report_fatal_error("TFE is not supported on this GPU");
+ }
+ Ops.push_back(LWE); // lwe
if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -6175,7 +6323,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->hasGFX90AInsts()) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ report_fatal_error(
+ "requested image instruction is not supported on this GPU");
+ }
+ if (Opcode == -1 &&
+ Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
@@ -6194,15 +6350,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
- } else if (!BaseOpcode->Store) {
- return constructRetValue(DAG, NewNode,
- OrigResultTypes, IsTexFail,
- Subtarget->hasUnpackedD16VMem(), IsD16,
- DMaskLanes, NumVDataDwords, DL,
- *DAG.getContext());
}
-
- return SDValue(NewNode, 0);
+ if (BaseOpcode->Store)
+ return SDValue(NewNode, 0);
+ return constructRetValue(DAG, NewNode,
+ OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16,
+ DMaskLanes, NumVDataDwords, DL);
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
@@ -6448,11 +6602,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
- SDValue GLC;
- SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
- if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
- IsGFX10Plus ? &DLC : nullptr))
+ unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ if (CPol & ~AMDGPU::CPol::ALL)
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -6607,6 +6758,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_alignbit:
return DAG.getNode(ISD::FSHR, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_perm:
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
Module *M = const_cast<Module *>(MF.getFunction().getParent());
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
@@ -6626,28 +6780,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
-// This function computes an appropriate offset to pass to
-// MachineMemOperand::setOffset() based on the offset inputs to
-// an intrinsic. If any of the offsets are non-contstant or
-// if VIndex is non-zero then this function returns 0. Otherwise,
-// it returns the sum of VOffset, SOffset, and Offset.
-static unsigned getBufferOffsetForMMO(SDValue VOffset,
- SDValue SOffset,
- SDValue Offset,
- SDValue VIndex = SDValue()) {
-
+/// Update \p MMO based on the offset inputs to an intrinsic.
+static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
+ SDValue SOffset, SDValue Offset,
+ SDValue VIndex = SDValue()) {
if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
- !isa<ConstantSDNode>(Offset))
- return 0;
+ !isa<ConstantSDNode>(Offset)) {
+ // The combined offset is not known to be constant, so we cannot represent
+ // it in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ return;
+ }
- if (VIndex) {
- if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
- return 0;
+ if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
+ !cast<ConstantSDNode>(VIndex)->isNullValue())) {
+ // The strided index component of the address is not known to be zero, so we
+ // cannot represent it in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ return;
}
- return cast<ConstantSDNode>(VOffset)->getSExtValue() +
- cast<ConstantSDNode>(SOffset)->getSExtValue() +
- cast<ConstantSDNode>(Offset)->getSExtValue();
+ MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() +
+ cast<ConstantSDNode>(SOffset)->getSExtValue() +
+ cast<ConstantSDNode>(Offset)->getSExtValue());
}
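
updateBufferMMO folds the three offset operands into one known byte offset only when everything is provably constant. A rough standalone analog, returning an optional combined offset instead of mutating a MachineMemOperand; the names and signature are illustrative.

    #include <cstdint>
    #include <optional>

    // Illustrative analog of updateBufferMMO: the MMO byte offset is only
    // meaningful when voffset, soffset and the immediate offset are all known
    // constants and the (optional) vindex is known to be zero. VIndexKnownZero
    // covers both "no vindex operand" and "vindex is constant zero".
    std::optional<int64_t> combinedBufferOffset(std::optional<int64_t> VOffset,
                                                std::optional<int64_t> SOffset,
                                                std::optional<int64_t> ImmOffset,
                                                bool VIndexKnownZero) {
      if (!VOffset || !SOffset || !ImmOffset || !VIndexKnownZero)
        return std::nullopt; // cannot represent the offset; drop it from the MMO
      return *VOffset + *SOffset + *ImmOffset;
    }
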
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
@@ -6670,13 +6825,21 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
M->getMemOperand());
}
+// Return a value to use for the idxen operand by examining the vindex operand.
+static unsigned getIdxEn(SDValue VIndex) {
+ if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex))
+ // No need to set idxen if vindex is known to be zero.
+ return VIndexC->getZExtValue() != 0;
+ return 1;
+}
+
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -6697,8 +6860,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
@@ -6811,9 +6973,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_load_format: {
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -6824,11 +6984,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
-
- unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
@@ -6836,7 +6992,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
EVT LoadVT = Op.getValueType();
if (LoadVT.getScalarType() == MVT::f16)
@@ -6868,7 +7024,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]);
return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
@@ -6888,8 +7044,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
- Ops[2]));
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
@@ -6900,9 +7055,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -6983,9 +7136,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // vdata
@@ -6997,14 +7148,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
unsigned Opcode = 0;
switch (IntrID) {
@@ -7042,7 +7191,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
- if (!Op.getValue(0).use_empty()) {
+ if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -7063,6 +7212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
@@ -7119,9 +7276,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(5));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -7134,13 +7289,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7161,7 +7314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7182,33 +7335,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
- Ops[4]));
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- if (!Op.getValue(0).use_empty()) {
- DiagnosticInfoUnsupported
- NoFpRet(DAG.getMachineFunction().getFunction(),
- "return versions of fp atomics not supported",
- DL.getDebugLoc(), DS_Error);
- DAG.getContext()->diagnose(NoFpRet);
- return SDValue();
- }
- MemSDNode *M = cast<MemSDNode>(Op);
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- EVT VT = Op.getOperand(3).getValueType();
- return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
- DAG.getVTList(VT, MVT::Other), Ops,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
@@ -7224,6 +7355,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
assert(RayDir.getValueType() == MVT::v4f16 ||
RayDir.getValueType() == MVT::v4f32);
+ if (!Subtarget->hasGFX10_AEncoding()) {
+ emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
+ return SDValue();
+ }
+
bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
bool Is64 = NodePtr.getValueType() == MVT::i64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
@@ -7279,7 +7415,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.setNodeMemRefs(NewNode, {MemRef});
return SDValue(NewNode, 0);
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
+ LLVM_FALLTHROUGH;
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+ unsigned Opcode = 0;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fadd: {
+ EVT VT = Op.getOperand(3).getValueType();
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmin: {
+ Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ }
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+ return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
default:
+
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
return lowerImage(Op, ImageDimIntr, DAG, true);
@@ -7448,9 +7632,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
VData, // vdata
@@ -7461,7 +7643,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(7), // offset
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7486,7 +7668,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7511,7 +7693,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7528,9 +7710,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
VData,
@@ -7542,15 +7722,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -7597,7 +7775,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
@@ -7644,8 +7822,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
-unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG, SDValue *Offsets,
- Align Alignment) const {
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+ SelectionDAG &DAG, SDValue *Offsets,
+ Align Alignment) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
@@ -7737,7 +7914,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
- return SOffset + ImmOffset;
+ return;
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7750,13 +7927,12 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
- return 0;
+ return;
}
}
Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
- return 0;
}
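
setBufferOffsets splits one combined offset into voffset/soffset/immediate pieces that fit the MUBUF encoding. A simplified standalone sketch of the constant case, assuming a 12-bit unsigned immediate field; the real split (SIInstrInfo::splitMUBUFOffset) is subtarget-dependent and may choose a different partition.

    #include <cstdint>
    #include <utility>

    // One valid split of a constant combined offset into (soffset, imm): keep
    // the low bits in the immediate and the remainder in soffset.
    std::pair<uint32_t, uint32_t> splitConstantBufferOffset(uint32_t Combined) {
      const uint32_t ImmMask = (1u << 12) - 1;
      uint32_t ImmOffset = Combined & ImmMask;
      uint32_t SOffset = Combined & ~ImmMask;
      return {SOffset, ImmOffset}; // SOffset + ImmOffset == Combined
    }
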
// Handle 8 bit and 16 bit buffer loads
@@ -8263,8 +8439,8 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
-static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
- const SDLoc &SL, const GCNSubtarget *ST) {
+static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+ const SDLoc &SL, const GCNSubtarget *ST) {
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
? FP_DENORM_FLUSH_NONE
@@ -8794,18 +8970,20 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
}
// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cmdmask_b32 to be deserialized.
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
- default: break;
+ default:
+ break;
case ISD::SETCC:
+ case AMDGPUISD::FP_CLASS:
+ return true;
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- case AMDGPUISD::FP_CLASS:
- return true;
+ return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
}
return false;
}
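
The change above makes AND/OR/XOR count as boolean SGPR values only when both operands already do, instead of unconditionally. A tiny recursive analog over a toy expression type; the node kinds and names are purely illustrative.

    #include <memory>

    // Toy expression tree: compare-like leaves produce booleans, and a logical
    // op is boolean only when both of its operands are.
    struct BoolExpr {
      enum Kind { SetCC, FPClass, And, Or, Xor, Other } K = Other;
      std::shared_ptr<BoolExpr> LHS, RHS;
    };

    bool isBoolLike(const BoolExpr &E) {
      switch (E.K) {
      case BoolExpr::SetCC:
      case BoolExpr::FPClass:
        return true;
      case BoolExpr::And:
      case BoolExpr::Or:
      case BoolExpr::Xor:
        return E.LHS && E.RHS && isBoolLike(*E.LHS) && isBoolLike(*E.RHS);
      default:
        return false;
      }
    }
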
@@ -9206,63 +9384,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - probably only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
- switch (Opc) {
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::FMA:
- case ISD::FMAD:
- case ISD::FCANONICALIZE:
- case ISD::FP_ROUND:
- case ISD::UINT_TO_FP:
- case ISD::SINT_TO_FP:
- case ISD::FABS:
- // Fabs is lowered to a bit operation, but it's an and which will clear the
- // high bits anyway.
- case ISD::FSQRT:
- case ISD::FSIN:
- case ISD::FCOS:
- case ISD::FPOWI:
- case ISD::FPOW:
- case ISD::FLOG:
- case ISD::FLOG2:
- case ISD::FLOG10:
- case ISD::FEXP:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FTRUNC:
- case ISD::FRINT:
- case ISD::FNEARBYINT:
- case ISD::FROUND:
- case ISD::FFLOOR:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case AMDGPUISD::FRACT:
- case AMDGPUISD::CLAMP:
- case AMDGPUISD::COS_HW:
- case AMDGPUISD::SIN_HW:
- case AMDGPUISD::FMIN3:
- case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMED3:
- case AMDGPUISD::FMAD_FTZ:
- case AMDGPUISD::RCP:
- case AMDGPUISD::RSQ:
- case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
- return true;
- default:
- // fcopysign, select and others may be lowered to 32-bit bit operations
- // which don't zero the high bits.
- return false;
- }
-}
-
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() ||
@@ -9277,15 +9398,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
if (Src.getValueType() != MVT::i16)
return SDValue();
- // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
- // FIXME: It is not universally true that the high bits are zeroed on gfx9.
- if (Src.getOpcode() == ISD::BITCAST) {
- SDValue BCSrc = Src.getOperand(0);
- if (BCSrc.getValueType() == MVT::f16 &&
- fp16SrcZerosHighBits(BCSrc.getOpcode()))
- return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
- }
-
return SDValue();
}
@@ -9482,19 +9594,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// Could be anything.
return false;
- case ISD::BITCAST: {
+ case ISD::BITCAST:
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() == MVT::i16 &&
- Src.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncSrc = Src.getOperand(0);
+ if (Op.getValueType() == MVT::i16) {
+ SDValue TruncSrc = Op.getOperand(0);
if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
}
}
-
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -9527,6 +9638,45 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
llvm_unreachable("invalid operation");
}
+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+ unsigned MaxDepth) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ unsigned Opcode = MI->getOpcode();
+
+ if (Opcode == AMDGPU::G_FCANONICALIZE)
+ return true;
+
+ if (Opcode == AMDGPU::G_FCONSTANT) {
+ auto F = MI->getOperand(1).getFPImm()->getValueAPF();
+ if (F.isNaN() && F.isSignaling())
+ return false;
+ return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+ }
+
+ if (MaxDepth == 0)
+ return false;
+
+ switch (Opcode) {
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE: {
+ if (Subtarget->supportsMinMaxDenormModes() ||
+ denormalsEnabledForType(MRI.getType(Reg), MF))
+ return true;
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) {
+ if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1))
+ return false;
+ }
+ return true;
+ }
+ default:
+ return denormalsEnabledForType(MRI.getType(Reg), MF) &&
+ isKnownNeverSNaN(Reg, MRI);
+ }
+
+ llvm_unreachable("invalid operation");
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
@@ -9694,15 +9844,19 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
}
// If there isn't a 16-bit med3 operation, convert to 32-bit.
- MVT NVT = MVT::i32;
- unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ if (VT == MVT::i16) {
+ MVT NVT = MVT::i32;
+ unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
- SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+ }
- SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+ return SDValue();
}
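
For reference, med3 is simply the median of its three operands; the combine above now widens only i16 operands to i32 when no 16-bit med3 instruction exists and gives up for other small types. A minimal reference definition of the operation being formed, shown for signed 32-bit integers:

    #include <algorithm>
    #include <cstdint>

    // Reference semantics of med3: the median of three values.
    int32_t med3(int32_t A, int32_t B, int32_t C) {
      return std::max(std::min(A, B), std::min(std::max(A, B), C));
    }
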
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
@@ -10408,7 +10562,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
EVT VT = N->getValueType(0);
SDLoc SL(N);
- if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
+ if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
return SDValue();
// FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -10791,7 +10945,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned NewDmask = 0;
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
- bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+ bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -11067,6 +11221,95 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
+// Any MIMG instruction that uses tfe or lwe requires an initialization of the
+// result register that will be written in the case of a memory access failure.
+// The required code is also added to tie this init code to the result of the
+// img instruction.
+void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+ MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+ MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+ if (!TFE && !LWE) // intersect_ray
+ return;
+
+ unsigned TFEVal = TFE ? TFE->getImm() : 0;
+ unsigned LWEVal = LWE->getImm();
+ unsigned D16Val = D16 ? D16->getImm() : 0;
+
+ if (!TFEVal && !LWEVal)
+ return;
+
+ // At least one of TFE or LWE is non-zero.
+ // We have to insert a suitable initialization of the result value and
+ // tie this to the dest of the image instruction.
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ int DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+ // Calculate which dword we have to initialize to 0.
+ MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+ // Check that the dmask operand is found.
+ assert(MO_Dmask && "Expected dmask operand in instruction");
+
+ unsigned dmask = MO_Dmask->getImm();
+ // Determine the number of active lanes, taking into account the
+ // Gather4 special case.
+ unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask);
+
+ bool Packed = !Subtarget->hasUnpackedD16VMem();
+
+ unsigned InitIdx =
+ D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+ // Abandon the attempt if the dst size isn't large enough. This is in fact
+ // an error, but it is picked up elsewhere and reported correctly.
+ uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ if (DstSize < InitIdx)
+ return;
+
+ // Create a register for the initialization value.
+ Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ unsigned NewDst = 0; // Final initialized value will be in here
+
+ // If PRTStrictNull feature is enabled (the default) then initialize
+ // all the result registers to 0, otherwise just the error indication
+ // register (VGPRn+1)
+ unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
+ unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+ for (; SizeLeft; SizeLeft--, CurrIdx++) {
+ NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ // Initialize dword
+ Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+ .addImm(0);
+ // Insert into the super-reg
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+ .addReg(PrevDst)
+ .addReg(SubReg)
+ .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
+
+ PrevDst = NewDst;
+ }
+
+ // Add as an implicit operand
+ MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
+
+ // Tie the just added implicit operand to the dst
+ MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+}
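
The number of dwords AddIMGInit zero-initializes falls out of the dmask, the gather4 special case, and whether D16 data is packed. A small standalone sketch of that count, using an illustrative helper name:

    #include <bitset>

    // Illustrative analog of the InitIdx computation in AddIMGInit: gather4
    // always returns four lanes, otherwise the popcount of dmask; packed D16
    // data halves the dword count (rounded up); the extra +1 is the TFE/LWE
    // status dword.
    unsigned numInitDwords(unsigned DMask, bool IsGather4, bool IsD16,
                           bool Packed) {
      unsigned ActiveLanes =
          IsGather4 ? 4u : static_cast<unsigned>(std::bitset<32>(DMask).count());
      return (IsD16 && Packed) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
    }
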
+
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -11114,10 +11357,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::glc1);
- if (Glc1Idx != -1)
- MI.RemoveOperand(Glc1Idx);
+ int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolIdx != -1) {
+ MachineOperand &CPol = MI.getOperand(CPolIdx);
+ CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+ }
MI.RemoveOperand(0);
MI.setDesc(TII->get(NoRetAtomicOp));
return;
@@ -11148,6 +11393,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
return;
}
+
+ if (TII->isMIMG(MI) && !MI.mayStore())
+ AddIMGInit(MI);
}
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
@@ -11226,9 +11474,11 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
//===----------------------------------------------------------------------===//
std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
StringRef Constraint,
MVT VT) const {
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
+
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
const unsigned BitWidth = VT.getSizeInBits();
@@ -11257,7 +11507,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::VGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+ RC = TRI->getVGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
@@ -11271,7 +11521,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::AGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+ RC = TRI->getAGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
@@ -11444,6 +11694,47 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
return false;
}
+static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
+ switch (UnalignedClassID) {
+ case AMDGPU::VReg_64RegClassID:
+ return AMDGPU::VReg_64_Align2RegClassID;
+ case AMDGPU::VReg_96RegClassID:
+ return AMDGPU::VReg_96_Align2RegClassID;
+ case AMDGPU::VReg_128RegClassID:
+ return AMDGPU::VReg_128_Align2RegClassID;
+ case AMDGPU::VReg_160RegClassID:
+ return AMDGPU::VReg_160_Align2RegClassID;
+ case AMDGPU::VReg_192RegClassID:
+ return AMDGPU::VReg_192_Align2RegClassID;
+ case AMDGPU::VReg_224RegClassID:
+ return AMDGPU::VReg_224_Align2RegClassID;
+ case AMDGPU::VReg_256RegClassID:
+ return AMDGPU::VReg_256_Align2RegClassID;
+ case AMDGPU::VReg_512RegClassID:
+ return AMDGPU::VReg_512_Align2RegClassID;
+ case AMDGPU::VReg_1024RegClassID:
+ return AMDGPU::VReg_1024_Align2RegClassID;
+ case AMDGPU::AReg_64RegClassID:
+ return AMDGPU::AReg_64_Align2RegClassID;
+ case AMDGPU::AReg_96RegClassID:
+ return AMDGPU::AReg_96_Align2RegClassID;
+ case AMDGPU::AReg_128RegClassID:
+ return AMDGPU::AReg_128_Align2RegClassID;
+ case AMDGPU::AReg_160RegClassID:
+ return AMDGPU::AReg_160_Align2RegClassID;
+ case AMDGPU::AReg_192RegClassID:
+ return AMDGPU::AReg_192_Align2RegClassID;
+ case AMDGPU::AReg_256RegClassID:
+ return AMDGPU::AReg_256_Align2RegClassID;
+ case AMDGPU::AReg_512RegClassID:
+ return AMDGPU::AReg_512_Align2RegClassID;
+ case AMDGPU::AReg_1024RegClassID:
+ return AMDGPU::AReg_1024_Align2RegClassID;
+ default:
+ return -1;
+ }
+}
+
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -11452,6 +11743,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -11474,7 +11766,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
- const SIInstrInfo *TII = ST.getInstrInfo();
for (auto &MBB : MF) {
for (auto &MI : MBB) {
TII->fixImplicitOperands(MI);
@@ -11482,13 +11773,30 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
}
+ // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
+ // classes if required. Ideally the register class constraints would differ
+ // per-subtarget, but there's no easy way to achieve that right now. This is
+ // not a problem for VGPRs because the correctly aligned VGPR class is implied
+ // from using them as the register class for legal types.
+ if (ST.needsAlignedVGPRs()) {
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const Register Reg = Register::index2VirtReg(I);
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (!RC)
+ continue;
+ int NewClassID = getAlignedAGPRClassID(RC->getID());
+ if (NewClassID != -1)
+ MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
+ }
+ }
+
TargetLoweringBase::finalizeLowering(MF);
// Allocate a VGPR for future SGPR Spill if
// "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
// FIXME: We won't need this hack if we split SGPR allocation from VGPR
- if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
- !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+ if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
+ !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
Info->reserveVGPRforSGPRSpills(MF);
}
@@ -11690,8 +11998,37 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ case AMDGPUISD::ATOMIC_CMP_SWAP:
+ case AMDGPUISD::ATOMIC_INC:
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_SWAP:
+ case AMDGPUISD::BUFFER_ATOMIC_ADD:
+ case AMDGPUISD::BUFFER_ATOMIC_SUB:
+ case AMDGPUISD::BUFFER_ATOMIC_SMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_UMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_SMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_UMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_AND:
+ case AMDGPUISD::BUFFER_ATOMIC_OR:
+ case AMDGPUISD::BUFFER_ATOMIC_XOR:
+ case AMDGPUISD::BUFFER_ATOMIC_INC:
+ case AMDGPUISD::BUFFER_ATOMIC_DEC:
+ case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
+ case AMDGPUISD::BUFFER_ATOMIC_CSUB:
+ case AMDGPUISD::BUFFER_ATOMIC_FADD:
+ case AMDGPUISD::BUFFER_ATOMIC_FMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_FMAX:
+ // Target-specific read-modify-write atomics are sources of divergence.
+ return true;
+ default:
+ if (auto *A = dyn_cast<AtomicSDNode>(N)) {
+ // Generic read-modify-write atomics are sources of divergence.
+ return A->readMem() && A->writeMem();
+ }
+ return false;
}
- return false;
}
bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
@@ -11707,6 +12044,19 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
}
}
+bool SITargetLowering::denormalsEnabledForType(LLT Ty,
+ MachineFunction &MF) const {
+ switch (Ty.getScalarSizeInBits()) {
+ case 32:
+ return hasFP32Denormals(MF);
+ case 64:
+ case 16:
+ return hasFP64FP16Denormals(MF);
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
@@ -11745,24 +12095,57 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (Ty->isHalfTy())
return AtomicExpansionKind::None;
- if (!Ty->isFloatTy())
+ if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
- // TODO: Do have these for flat. Older targets also had them for buffers.
unsigned AS = RMW->getPointerAddressSpace();
- if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
- if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+ if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+ Subtarget->hasAtomicFaddInsts()) {
+ // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+ // floating point atomic instructions. May generate more efficient code,
+ // but may not respect rounding and denormal modes, and may give incorrect
+ // results for certain memory destinations.
+ if (RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() != "true")
+ return AtomicExpansionKind::CmpXChg;
+
+ if (Subtarget->hasGFX90AInsts()) {
+ if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
+ return AtomicExpansionKind::CmpXChg;
+
+ auto SSID = RMW->getSyncScopeID();
+ if (SSID == SyncScope::System ||
+ SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+ return AtomicExpansionKind::CmpXChg;
+
+ return AtomicExpansionKind::None;
+ }
+
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
return AtomicExpansionKind::CmpXChg;
- return RMW->use_empty() ? AtomicExpansionKind::None :
- AtomicExpansionKind::CmpXChg;
+ return RMW->use_empty() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
}
// DS FP atomics do respect the denormal mode, but the rounding mode is fixed
// to round-to-nearest-even.
- return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
- AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+ // The only exception is DS_ADD_F64 which never flushes regardless of mode.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+ if (!Ty->isDoubleTy())
+ return AtomicExpansionKind::None;
+
+ return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
+ RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() == "true")
+ ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
+
+ return AtomicExpansionKind::CmpXChg;
}
default:
break;
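
The expansion decision above only uses the native FP atomic instructions when the function opts in through the "amdgpu-unsafe-fp-atomics" attribute; everything else is expanded to a cmpxchg loop. A self-contained sketch of the gfx90a branch of that decision (the enum and field names are illustrative stand-ins, not the LLVM API):

// Models the checks added for gfx90a-class targets: bail out to cmpxchg
// unless unsafe FP atomics are allowed, the type/address space has a real
// instruction, and the sync scope is narrower than system/one-as.
enum class ExpansionKind { None, CmpXChg };

struct FpAtomicCtx {
  bool UnsafeFpAtomicsAttr; // "amdgpu-unsafe-fp-atomics"="true"
  bool IsFloatTy;           // f32 (as opposed to f64)
  bool IsFlatAddrSpace;     // flat rather than global address space
  bool SystemScope;         // "system" or "one-as" sync scope
};

ExpansionKind classifyGfx90aFpAtomic(const FpAtomicCtx &C) {
  if (!C.UnsafeFpAtomicsAttr)
    return ExpansionKind::CmpXChg; // conservative default
  if (C.IsFloatTy && C.IsFlatAddrSpace)
    return ExpansionKind::CmpXChg; // no flat f32 fadd on this target
  if (C.SystemScope)
    return ExpansionKind::CmpXChg; // atomic may not be coherent that widely
  return ExpansionKind::None;      // emit the native instruction
}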
@@ -11872,10 +12255,11 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}
-std::pair<int, MVT>
+std::pair<InstructionCost, MVT>
SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
Type *Ty) const {
- auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> Cost =
+ TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
auto Size = DL.getTypeSizeInBits(Ty);
// Maximum load or store can handle 8 dwords for scalar and 4 for
// vector ALU. Let's assume anything above 8 dwords is expensive
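
The 8-dword threshold in the comment above is a size in 32-bit words, so the cut-off is 256 bits. A trivial worked check of that arithmetic (illustrative only):

// 8 dwords == 256 bits; e.g. a <16 x i32> value is 512 bits == 16 dwords
// and therefore lands in the "expensive" bucket of this heuristic.
constexpr unsigned dwords(unsigned Bits) { return Bits / 32; }
static_assert(dwords(256) == 8, "scalar limit");
static_assert(dwords(512) == 16, "<16 x i32> exceeds the limit");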
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 823d6eca9bf8..f3d34267a81d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -144,7 +144,11 @@ private:
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTrapHsaQueuePtr(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -227,10 +231,8 @@ private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
- /// \returns 0 If there is a non-constant offset or if the offset is 0.
- /// Otherwise returns the constant offset.
- unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, Align Alignment = Align(4)) const;
+ void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+ SDValue *Offsets, Align Alignment = Align(4)) const;
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -283,7 +285,7 @@ public:
}
bool allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS, unsigned Alignment,
+ EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
@@ -393,6 +395,7 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+ void AddIMGInit(MachineInstr &MI) const;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
@@ -439,7 +442,10 @@ public:
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
+ bool isCanonicalized(Register Reg, MachineFunction &MF,
+ unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
+ bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
@@ -483,8 +489,8 @@ public:
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
- Type *Ty) const;
+ std::pair<InstructionCost, MVT> getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 5611c9c5d57e..7ba20eb6027b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -63,30 +63,10 @@ enum HardClauseType {
HARDCLAUSE_ILLEGAL,
};
-HardClauseType getHardClauseType(const MachineInstr &MI) {
- // On current architectures we only get a benefit from clausing loads.
- if (MI.mayLoad()) {
- if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
- return HARDCLAUSE_VMEM;
- if (SIInstrInfo::isFLAT(MI))
- return HARDCLAUSE_FLAT;
- // TODO: LDS
- if (SIInstrInfo::isSMRD(MI))
- return HARDCLAUSE_SMEM;
- }
-
- // Don't form VALU clauses. It's not clear what benefit they give, if any.
-
- // In practice s_nop is the only internal instruction we're likely to see.
- // It's safe to treat the rest as illegal.
- if (MI.getOpcode() == AMDGPU::S_NOP)
- return HARDCLAUSE_INTERNAL;
- return HARDCLAUSE_ILLEGAL;
-}
-
class SIInsertHardClauses : public MachineFunctionPass {
public:
static char ID;
+ const GCNSubtarget *ST = nullptr;
SIInsertHardClauses() : MachineFunctionPass(ID) {}
@@ -95,6 +75,34 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ HardClauseType getHardClauseType(const MachineInstr &MI) {
+
+ // On current architectures we only get a benefit from clausing loads.
+ if (MI.mayLoad()) {
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+ if (ST->hasNSAClauseBug()) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
+ return HARDCLAUSE_ILLEGAL;
+ }
+ return HARDCLAUSE_VMEM;
+ }
+ if (SIInstrInfo::isFLAT(MI))
+ return HARDCLAUSE_FLAT;
+ // TODO: LDS
+ if (SIInstrInfo::isSMRD(MI))
+ return HARDCLAUSE_SMEM;
+ }
+
+ // Don't form VALU clauses. It's not clear what benefit they give, if any.
+
+ // In practice s_nop is the only internal instruction we're likely to see.
+ // It's safe to treat the rest as illegal.
+ if (MI.getOpcode() == AMDGPU::S_NOP)
+ return HARDCLAUSE_INTERNAL;
+ return HARDCLAUSE_ILLEGAL;
+ }
+
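
Moving getHardClauseType into the pass gives it access to the subtarget, so NSA-encoded image loads can be excluded from clauses on parts with the NSA clause bug. A self-contained model of the decision (field names mirror the queries used above, but the types are illustrative):

enum HardClause { VMEM, FLAT, SMEM, INTERNAL, ILLEGAL };

struct InstModel {
  bool MayLoad, IsVMEM, IsSegmentSpecificFlat, IsFlat, IsSMRD;
  bool IsGfx10NSAImage, IsSNop;
};

HardClause classify(const InstModel &MI, bool HasNSAClauseBug) {
  if (MI.MayLoad) {
    if (MI.IsVMEM || MI.IsSegmentSpecificFlat) {
      // gfx10 NSA image encodings may not be clause members on affected parts.
      if (HasNSAClauseBug && MI.IsGfx10NSAImage)
        return ILLEGAL;
      return VMEM;
    }
    if (MI.IsFlat)
      return FLAT;
    if (MI.IsSMRD)
      return SMEM;
  }
  // s_nop is the only internal instruction expected here; everything else
  // is treated as illegal for clausing.
  return MI.IsSNop ? INTERNAL : ILLEGAL;
}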
// Track information about a clause as we discover it.
struct ClauseInfo {
// The type of all (non-internal) instructions in the clause.
@@ -132,12 +140,12 @@ public:
if (skipFunction(MF.getFunction()))
return false;
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasHardClauses())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasHardClauses())
return false;
- const SIInstrInfo *SII = ST.getInstrInfo();
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *SII = ST->getInstrInfo();
+ const TargetRegisterInfo *TRI = ST->getRegisterInfo();
bool Changed = false;
for (auto &MBB : MF) {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
deleted file mode 100644
index 9d31cd5cedc3..000000000000
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ /dev/null
@@ -1,504 +0,0 @@
-//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass inserts branches on the 0 exec mask over divergent branches
-/// branches when it's expected that jumping over the untaken control flow will
-/// be cheaper than having every workitem no-op through it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-insert-skips"
-
-static cl::opt<unsigned> SkipThresholdFlag(
- "amdgpu-skip-threshold-legacy",
- cl::desc("Number of instructions before jumping over divergent control flow"),
- cl::init(12), cl::Hidden);
-
-namespace {
-
-class SIInsertSkips : public MachineFunctionPass {
-private:
- const SIRegisterInfo *TRI = nullptr;
- const SIInstrInfo *TII = nullptr;
- unsigned SkipThreshold = 0;
- MachineDominatorTree *MDT = nullptr;
-
- MachineBasicBlock *EarlyExitBlock = nullptr;
- bool EarlyExitClearsExec = false;
-
- bool shouldSkip(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
-
- bool dominatesAllReachable(MachineBasicBlock &MBB);
- void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
- void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL);
-
- bool kill(MachineInstr &MI);
- void earlyTerm(MachineInstr &MI);
-
- bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
-
-public:
- static char ID;
-
- SIInsertSkips() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override {
- return "SI insert s_cbranch_execz instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
- AU.addPreserved<MachineDominatorTree>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char SIInsertSkips::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
-
-char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
-
-static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
- if (MI.isMetaInstruction())
- return true;
-
- // Handle target specific opcodes.
- switch (MI.getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH:
- return true;
- default:
- return false;
- }
-}
-
-bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
- NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(*I))
- continue;
-
- // FIXME: Since this is required for correctness, this should be inserted
- // during SILowerControlFlow.
-
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
- return true;
-
- if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
- return true;
-
- // These instructions are potentially expensive even if EXEC = 0.
- if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
- I->getOpcode() == AMDGPU::S_WAITCNT)
- return true;
-
- ++NumInstr;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-/// Check whether \p MBB dominates all blocks that are reachable from it.
-bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
- for (MachineBasicBlock *Other : depth_first(&MBB)) {
- if (!MDT->dominates(&MBB, Other))
- return false;
- }
- return true;
-}
-
-static void generateEndPgm(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL,
- const SIInstrInfo *TII, bool IsPS) {
- // "null export"
- if (IsPS) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(AMDGPU::Exp::ET_NULL)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addImm(1) // vm
- .addImm(0) // compr
- .addImm(0); // en
- }
- // s_endpgm
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
-}
-
-void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
- bool ClearExec) {
- MachineFunction *MF = MBB.getParent();
- DebugLoc DL;
-
- if (!EarlyExitBlock) {
- EarlyExitBlock = MF->CreateMachineBasicBlock();
- MF->insert(MF->end(), EarlyExitBlock);
- generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
- MF->getFunction().getCallingConv() ==
- CallingConv::AMDGPU_PS);
- EarlyExitClearsExec = false;
- }
-
- if (ClearExec && !EarlyExitClearsExec) {
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto ExitI = EarlyExitBlock->getFirstNonPHI();
- BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
- EarlyExitClearsExec = true;
- }
-}
-
-static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
- MachineDominatorTree *MDT) {
- MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
-
- // Update dominator tree
- using DomTreeT = DomTreeBase<MachineBasicBlock>;
- SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
- for (MachineBasicBlock *Succ : SplitBB->successors()) {
- DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
- DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
- }
- DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
- MDT->getBase().applyUpdates(DTUpdates);
-}
-
-/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
-/// iterator. Only applies to pixel shaders.
-void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL) {
- MachineFunction *MF = MBB.getParent();
- (void)MF;
- assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
-
- // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
- // basic block that has no further successors (e.g., there was an
- // `unreachable` there in IR). This can happen with original source of the
- // form:
- //
- // if (uniform_condition) {
- // write_to_memory();
- // discard;
- // }
- //
- // In this case, we write the "null_export; s_endpgm" skip code in the
- // already-existing basic block.
- auto NextBBI = std::next(MBB.getIterator());
- bool NoSuccessor =
- I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
-
- if (NoSuccessor) {
- generateEndPgm(MBB, I, DL, TII, true);
- } else {
- ensureEarlyExitBlock(MBB, false);
-
- MachineInstr *BranchMI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(EarlyExitBlock);
-
- // Split the block if the branch will not come at the end.
- auto Next = std::next(BranchMI->getIterator());
- if (Next != MBB.end() && !Next->isTerminator())
- splitBlock(MBB, *BranchMI, MDT);
-
- MBB.addSuccessor(EarlyExitBlock);
- MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
- }
-}
-
-/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
-/// Return true unless the terminator is a no-op.
-bool SIInsertSkips::kill(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- switch (MI.getOpcode()) {
- case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
- unsigned Opcode = 0;
-
- // The opcodes are inverted because the inline immediate has to be
- // the first operand, e.g. from "x < imm" to "imm > x"
- switch (MI.getOperand(2).getImm()) {
- case ISD::SETOEQ:
- case ISD::SETEQ:
- Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
- break;
- case ISD::SETOGT:
- case ISD::SETGT:
- Opcode = AMDGPU::V_CMPX_LT_F32_e64;
- break;
- case ISD::SETOGE:
- case ISD::SETGE:
- Opcode = AMDGPU::V_CMPX_LE_F32_e64;
- break;
- case ISD::SETOLT:
- case ISD::SETLT:
- Opcode = AMDGPU::V_CMPX_GT_F32_e64;
- break;
- case ISD::SETOLE:
- case ISD::SETLE:
- Opcode = AMDGPU::V_CMPX_GE_F32_e64;
- break;
- case ISD::SETONE:
- case ISD::SETNE:
- Opcode = AMDGPU::V_CMPX_LG_F32_e64;
- break;
- case ISD::SETO:
- Opcode = AMDGPU::V_CMPX_O_F32_e64;
- break;
- case ISD::SETUO:
- Opcode = AMDGPU::V_CMPX_U_F32_e64;
- break;
- case ISD::SETUEQ:
- Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
- break;
- case ISD::SETUGT:
- Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
- break;
- case ISD::SETUGE:
- Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
- break;
- case ISD::SETULT:
- Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
- break;
- case ISD::SETULE:
- Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
- break;
- case ISD::SETUNE:
- Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
- break;
- default:
- llvm_unreachable("invalid ISD:SET cond code");
- }
-
- const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
- if (ST.hasNoSdstCMPX())
- Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
-
- assert(MI.getOperand(0).isReg());
-
- if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
- MI.getOperand(0).getReg())) {
- Opcode = AMDGPU::getVOPe32(Opcode);
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .add(MI.getOperand(1))
- .add(MI.getOperand(0));
- } else {
- auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
- if (!ST.hasNoSdstCMPX())
- I.addReg(AMDGPU::VCC, RegState::Define);
-
- I.addImm(0) // src0 modifiers
- .add(MI.getOperand(1))
- .addImm(0) // src1 modifiers
- .add(MI.getOperand(0));
-
- I.addImm(0); // omod
- }
- return true;
- }
- case AMDGPU::SI_KILL_I1_TERMINATOR: {
- const MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const MachineOperand &Op = MI.getOperand(0);
- int64_t KillVal = MI.getOperand(1).getImm();
- assert(KillVal == 0 || KillVal == -1);
-
- // Kill all threads if Op0 is an immediate and equal to the Kill value.
- if (Op.isImm()) {
- int64_t Imm = Op.getImm();
- assert(Imm == 0 || Imm == -1);
-
- if (Imm == KillVal) {
- BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
- : AMDGPU::S_MOV_B64), Exec)
- .addImm(0);
- return true;
- }
- return false;
- }
-
- unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
- if (ST.isWave32())
- Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
- BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
- .addReg(Exec)
- .add(Op);
- return true;
- }
- default:
- llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
- }
-}
-
-void SIInsertSkips::earlyTerm(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc DL = MI.getDebugLoc();
-
- ensureEarlyExitBlock(MBB, true);
-
- auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
- .addMBB(EarlyExitBlock);
- auto Next = std::next(MI.getIterator());
-
- if (Next != MBB.end() && !Next->isTerminator())
- splitBlock(MBB, *BranchMI, MDT);
-
- MBB.addSuccessor(EarlyExitBlock);
- MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
-}
-
-// Returns true if a branch over the block was inserted.
-bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
- MachineBasicBlock &SrcMBB) {
- MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
-
- if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
- return false;
-
- const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
-
- BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(DestBB);
-
- return true;
-}
-
-bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MDT = &getAnalysis<MachineDominatorTree>();
- SkipThreshold = SkipThresholdFlag;
-
- SmallVector<MachineInstr *, 4> KillInstrs;
- SmallVector<MachineInstr *, 4> EarlyTermInstrs;
- bool MadeChange = false;
-
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
-
- switch (MI.getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH:
- MadeChange |= skipMaskBranch(MI, MBB);
- break;
-
- case AMDGPU::S_BRANCH:
- // Optimize out branches to the next block.
- // FIXME: Shouldn't this be handled by BranchFolding?
- if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
- assert(&MI == &MBB.back());
- MI.eraseFromParent();
- MadeChange = true;
- }
- break;
-
- case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- case AMDGPU::SI_KILL_I1_TERMINATOR: {
- MadeChange = true;
- bool CanKill = kill(MI);
-
- // Check if we can add an early "if exec=0 { end shader }".
- //
- // Note that we _always_ do this if it is correct, even if the kill
- // happens fairly late in the shader, because the null export should
- // generally still be cheaper than normal export(s).
- //
- // TODO: The dominatesAllReachable check is conservative: if the
- // dominance is only missing due to _uniform_ branches, we could
- // in fact insert the early-exit as well.
- if (CanKill &&
- MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
- dominatesAllReachable(MBB)) {
- // Mark the instruction for kill-if-dead insertion. We delay this
- // change because it modifies the CFG.
- KillInstrs.push_back(&MI);
- } else {
- MI.eraseFromParent();
- }
- break;
- }
-
- case AMDGPU::SI_KILL_CLEANUP:
- if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
- dominatesAllReachable(MBB)) {
- KillInstrs.push_back(&MI);
- } else {
- MI.eraseFromParent();
- }
- break;
-
- case AMDGPU::SI_EARLY_TERMINATE_SCC0:
- EarlyTermInstrs.push_back(&MI);
- break;
-
- default:
- break;
- }
- }
- }
-
- for (MachineInstr *Instr : EarlyTermInstrs) {
- // Early termination in GS does nothing
- if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
- earlyTerm(*Instr);
- Instr->eraseFromParent();
- }
- for (MachineInstr *Kill : KillInstrs) {
- skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
- Kill->getDebugLoc());
- Kill->eraseFromParent();
- }
- KillInstrs.clear();
- EarlyTermInstrs.clear();
- EarlyExitBlock = nullptr;
-
- return MadeChange;
-}
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c12745586da1..7d6f79922d2e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -27,6 +27,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachinePostDominators.h"
@@ -131,7 +132,8 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
- SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+ SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
+ AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
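
Doubling SQ_MAX_PGM_VGPRS to 512 makes room to score AGPRs in the same table as architectural VGPRs: ArchVGPRs keep slots [0, 256) and AGPRs are shifted up by AGPR_OFFSET into [256, 512), as getRegInterval() below does. A quick check of that mapping (constants copied from the enum; the helper is illustrative):

constexpr unsigned SQ_MAX_PGM_VGPRS = 512;
constexpr unsigned AGPR_OFFSET = 256;

// Scoring-table slot for a vector register index, mirroring the
// `Result.first += AGPR_OFFSET` adjustment made for AGPRs.
constexpr unsigned slot(unsigned RegIdx, bool IsAGPR) {
  return RegIdx + (IsAGPR ? AGPR_OFFSET : 0);
}
static_assert(slot(255, /*IsAGPR=*/false) == 255, "last ArchVGPR slot");
static_assert(slot(255, /*IsAGPR=*/true) == 511, "last AGPR slot");
static_assert(slot(255, true) < SQ_MAX_PGM_VGPRS, "fits in the table");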
@@ -244,8 +246,8 @@ public:
const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
- bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
@@ -417,7 +419,7 @@ public:
}
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
- DebugCounter::shouldExecute(ForceLgkmCounter)) {
+ DebugCounter::shouldExecute(ForceLgkmCounter)) {
ForceEmitWaitcnt[LGKM_CNT] = true;
} else {
ForceEmitWaitcnt[LGKM_CNT] = false;
@@ -441,6 +443,9 @@ public:
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
+ bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
};
} // end anonymous namespace
@@ -451,8 +456,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIRegisterInfo *TRI,
unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
- assert(Op.isReg());
- if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
+ if (!TRI->isInAllocatableClass(Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -463,9 +467,11 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
- if (TRI->isVGPR(*MRI, Op.getReg())) {
+ if (TRI->isVectorRegister(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
Result.first = Reg - RegisterEncoding.VGPR0;
+ if (TRI->isAGPR(*MRI, Op.getReg()))
+ Result.first += AGPR_OFFSET;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
} else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
@@ -491,7 +497,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
const MachineRegisterInfo *MRI, unsigned OpNo,
unsigned Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
- assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
+ assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
@@ -538,7 +544,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
AMDGPU::OpName::data1),
CurrScore);
}
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+ } else if (SIInstrInfo::isAtomicRet(Inst) &&
Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
@@ -549,7 +555,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
- if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
+ if (Op.isReg() && !Op.isDef() &&
+ TRI->isVectorRegister(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -560,7 +567,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -569,7 +576,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
} else if (TII->isMIMG(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -582,7 +589,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
} else if (TII->isMUBUF(Inst)) {
if (Inst.mayStore()) {
setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
- } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ } else if (SIInstrInfo::isAtomicRet(Inst)) {
setExpScore(
&Inst, TII, TRI, MRI,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
@@ -606,7 +613,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
- if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
+ if (MO.isReg() && !MO.isDef() &&
+ TRI->isVectorRegister(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -704,22 +712,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
-bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
- return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
- simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
- simplifyWaitcnt(VS_CNT, Wait.VsCnt);
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+ simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+ simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+ simplifyWaitcnt(VS_CNT, Wait.VsCnt);
}
-bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
const unsigned LB = getScoreLB(T);
const unsigned UB = getScoreUB(T);
- if (Count < UB && UB - Count > LB)
- return true;
- Count = ~0u;
- return false;
+ // The number of outstanding events for this type, T, can be calculated
+ // as (UB - LB). If the current Count is greater than or equal to the number
+ // of outstanding events, then the wait for this counter is redundant.
+ if (Count >= UB - LB)
+ Count = ~0u;
}
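
As the new comment explains, (UB - LB) is the number of events of type T still outstanding, so a requested count at least that large can never be the limiting wait and is dropped by setting it to ~0u. A minimal model of that rule with a worked case:

// ~0u encodes "no wait needed" for a counter, as in AMDGPU::Waitcnt.
constexpr unsigned NO_WAIT = ~0u;

constexpr unsigned simplify(unsigned LB, unsigned UB, unsigned Count) {
  return Count >= UB - LB ? NO_WAIT : Count;
}
// Three events outstanding (UB - LB == 3): waiting until at most three
// remain is already satisfied, waiting down to one is not.
static_assert(simplify(10, 13, 3) == NO_WAIT, "redundant wait dropped");
static_assert(simplify(10, 13, 1) == 1, "still required");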
void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
@@ -794,6 +803,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
return new SIInsertWaitcnts();
}
+/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
+/// by previous passes. Currently this pass conservatively assumes that these
+/// preexisting waitcnt are required for correctness.
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait,
+ const MachineInstr *MI) {
+ bool Modified = false;
+ MachineInstr *WaitcntInstr = nullptr;
+ MachineInstr *WaitcntVsCntInstr = nullptr;
+ for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
+ &*II != MI; II = NextI, ++NextI) {
+ if (II->isMetaInstruction())
+ continue;
+
+ if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+ // Conservatively update required wait if this waitcnt was added in an
+ // earlier pass. In this case it will not exist in the tracked waitcnt
+ // set.
+ if (!TrackedWaitcntSet.count(&*II)) {
+ unsigned IEnc = II->getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ Wait = Wait.combined(OldWait);
+ }
+
+ // Merge consecutive waitcnt of the same type by erasing multiples.
+ if (!WaitcntInstr) {
+ WaitcntInstr = &*II;
+ } else {
+ II->eraseFromParent();
+ Modified = true;
+ }
+
+ } else {
+ assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ if (!TrackedWaitcntSet.count(&*II)) {
+ unsigned OldVSCnt =
+ TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+ Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+ }
+
+ if (!WaitcntVsCntInstr) {
+ WaitcntVsCntInstr = &*II;
+ } else {
+ II->eraseFromParent();
+ Modified = true;
+ }
+ }
+ }
+
+ // Updated encoding of merged waitcnt with the required wait.
+ if (WaitcntInstr) {
+ if (Wait.hasWaitExceptVsCnt()) {
+ unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
+ unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
+ if (OldEnc != NewEnc) {
+ WaitcntInstr->getOperand(0).setImm(NewEnc);
+ Modified = true;
+ }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
+ << '\n');
+ } else {
+ WaitcntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (WaitcntVsCntInstr) {
+ if (Wait.hasWaitVsCnt()) {
+ assert(ST->hasVscnt());
+ unsigned OldVSCnt =
+ TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+ ->getImm();
+ if (Wait.VsCnt != OldVSCnt) {
+ TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
+ ->setImm(Wait.VsCnt);
+ Modified = true;
+ }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VsCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Old Instr: " << MI
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
+ } else {
+ WaitcntVsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
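
applyPreexistingWaitcnt keeps at most one S_WAITCNT and one S_WAITCNT_VSCNT ahead of MI, folding counts from waits it did not generate into the required wait before erasing the duplicates. The folding is a per-counter minimum, in the spirit of AMDGPU::Waitcnt::combined(); a small model of that merge (struct and names are illustrative):

#include <algorithm>

struct Wait {
  unsigned Vm, Exp, Lgkm, Vs; // ~0u means "no wait required"
};

// Merging two waits keeps the stricter (smaller) count for each counter.
Wait combined(const Wait &A, const Wait &B) {
  return {std::min(A.Vm, B.Vm), std::min(A.Exp, B.Exp),
          std::min(A.Lgkm, B.Lgkm), std::min(A.Vs, B.Vs)};
}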
static bool readsVCCZ(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -829,15 +939,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
setForceEmitWaitcnt();
- bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
if (MI.isMetaInstruction())
return false;
AMDGPU::Waitcnt Wait;
+ bool Modified = false;
- // See if this instruction has a forced S_WAITCNT VM.
- // TODO: Handle other cases of NeedsWaitcntVmBefore()
+ // FIXME: This should have already been handled by the memory legalizer.
+ // Removing this currently doesn't affect any lit tests, but we need to
+ // verify that nothing was relying on this. The number of buffer invalidates
+ // being handled here should not be expanded.
if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
@@ -1003,7 +1115,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
RegInterval Interval =
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
- const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg());
+ const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
@@ -1049,32 +1161,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
}
- // Early-out if no wait is indicated.
- if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
- bool Modified = false;
- if (OldWaitcntInstr) {
- for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
- &*II != &MI; II = NextI, ++NextI) {
- if (II->isDebugInstr())
- continue;
-
- if (TrackedWaitcntSet.count(&*II)) {
- TrackedWaitcntSet.erase(&*II);
- II->eraseFromParent();
- Modified = true;
- } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
- int64_t Imm = II->getOperand(0).getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
- } else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
- ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W));
- }
- }
- }
- return Modified;
- }
+ // Verify that the wait is actually needed.
+ ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
@@ -1088,57 +1176,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (ForceEmitWaitcnt[VS_CNT])
Wait.VsCnt = 0;
- ScoreBrackets.applyWaitcnt(Wait);
-
- AMDGPU::Waitcnt OldWait;
- bool Modified = false;
-
if (OldWaitcntInstr) {
- for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
- &*II != &MI; II = NextI, NextI++) {
- if (II->isDebugInstr())
- continue;
-
- if (II->getOpcode() == AMDGPU::S_WAITCNT) {
- unsigned IEnc = II->getOperand(0).getImm();
- AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- OldWait = OldWait.combined(IWait);
- if (!TrackedWaitcntSet.count(&*II))
- Wait = Wait.combined(IWait);
- unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
- if (IEnc != NewEnc) {
- II->getOperand(0).setImm(NewEnc);
- Modified = true;
- }
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
- Wait.ExpCnt = ~0u;
- } else {
- assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
- assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-
- unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)
- ->getImm();
- OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
- if (!TrackedWaitcntSet.count(&*II))
- Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
- if (Wait.VsCnt != ICnt) {
- TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt);
- Modified = true;
- }
- Wait.VsCnt = ~0u;
- }
-
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
- << "New Instr: " << *II << '\n');
-
- if (!Wait.hasWait())
- return Modified;
- }
+ // Try to merge the required wait with preexisting waitcnt instructions.
+ // Also erase redundant waitcnt.
+ Modified =
+ applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
+ } else {
+ // Update waitcnt brackets after determining the required wait.
+ ScoreBrackets.applyWaitcnt(Wait);
}
- if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
+ // Build new waitcnt instructions unless no wait is needed or the old waitcnt
+ // instruction was modified to handle the required wait.
+ if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
@@ -1151,7 +1201,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
<< "New Instr: " << *SWaitInst << '\n');
}
- if (Wait.VsCnt != ~0u) {
+ if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
auto SWaitInst =
@@ -1208,6 +1258,10 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
if (!TII->usesLGKM_CNT(MI))
return false;
+ // If in tgsplit mode then there can be no use of LDS.
+ if (ST->isTgSplitEnabled())
+ return false;
+
// If there are no memory operands then conservatively assume the flat
// operation may access LDS.
if (MI.memoperands_empty())
@@ -1246,8 +1300,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
++FlatASCount;
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- else if (Inst.mayLoad() &&
- AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
+ else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
else
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
@@ -1267,16 +1320,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (FlatASCount > 1)
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
- // TODO: get a better carve out.
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
- Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
- Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
- Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
+ !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
if (!ST->hasVscnt())
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- else if ((Inst.mayLoad() &&
- AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
+ else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
/* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
(TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
@@ -1284,7 +1331,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
if (ST->vmemWriteNeedsExpWaitcnt() &&
- (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
+ (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
@@ -1424,7 +1471,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Iter != E;) {
MachineInstr &Inst = *Iter;
- // Track pre-existing waitcnts from earlier iterations.
+ // Track pre-existing waitcnts that were added in earlier iterations or by
+ // the memory legalizer.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
(Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
Inst.getOperand(0).isReg() &&
@@ -1473,8 +1521,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (TII->isSMRD(Inst)) {
for (const MachineMemOperand *Memop : Inst.memoperands()) {
- const Value *Ptr = Memop->getValue();
- SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+ // No need to handle invariant loads when avoiding WAR conflicts, as
+ // there cannot be a vector store to the same memory location.
+ if (!Memop->isInvariant()) {
+ const Value *Ptr = Memop->getValue();
+ SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+ }
}
if (ST->hasReadVCCZBug()) {
// This smem read could complete and clobber vccz at any time.
@@ -1550,6 +1602,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TrackedWaitcntSet.clear();
BlockInfos.clear();
+ bool Modified = false;
+
+ if (!MFI->isEntryFunction()) {
+ // Wait for any outstanding memory operations that the input registers may
+ // depend on. We can't track them and it's better to do the wait after the
+ // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ if (ST->hasVscnt())
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+
+ Modified = true;
+ }
// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.
@@ -1557,7 +1631,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockInfos.insert({MBB, BlockInfo(MBB)});
std::unique_ptr<WaitcntBrackets> Brackets;
- bool Modified = false;
bool Repeat;
do {
Repeat = false;
@@ -1657,26 +1730,5 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (!MFI->isEntryFunction()) {
- // Wait for any outstanding memory operations that the input registers may
- // depend on. We can't track them and it's better to the wait after the
- // costly call sequence.
-
- // TODO: Could insert earlier and schedule more liberally with operations
- // that only use caller preserved registers.
- MachineBasicBlock &EntryBB = MF.front();
- MachineBasicBlock::iterator I = EntryBB.begin();
- for (MachineBasicBlock::iterator E = EntryBB.end();
- I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
- ;
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
- if (ST->hasVscnt())
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
-
- Modified = true;
- }
-
return Modified;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 7ce042b67aba..e39f52875f1f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -113,7 +113,7 @@ class InstSI <dag outs, dag ins, string asm = "",
// This field indicates that FLAT instruction accesses FLAT_GLBL segment.
// Must be 0 for non-FLAT instructions.
- field bit IsFlatGlobal = 0;
+ field bit FlatGlobal = 0;
// Reads the mode register, usually for FP environment.
field bit ReadsModeReg = 0;
@@ -133,7 +133,13 @@ class InstSI <dag outs, dag ins, string asm = "",
// This field indicates that FLAT instruction accesses FLAT_SCRATCH segment.
// Must be 0 for non-FLAT instructions.
- field bit IsFlatScratch = 0;
+ field bit FlatScratch = 0;
+
+ // Atomic without a return.
+ field bit IsAtomicNoRet = 0;
+
+ // Atomic with return.
+ field bit IsAtomicRet = 0;
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
@@ -193,7 +199,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{50} = D16Buf;
- let TSFlags{51} = IsFlatGlobal;
+ let TSFlags{51} = FlatGlobal;
let TSFlags{52} = FPDPRounding;
@@ -203,7 +209,11 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{55} = IsDOT;
- let TSFlags{56} = IsFlatScratch;
+ let TSFlags{56} = FlatScratch;
+
+ let TSFlags{57} = IsAtomicNoRet;
+
+ let TSFlags{58} = IsAtomicRet;
let SchedRW = [Write32Bit];
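
The two new flags land in TSFlags bits 57 and 58, which is what lets queries like SIInstrInfo::isAtomicRet() replace the getAtomicRetOp/getAtomicNoRetOp table lookups used in SIInsertWaitcnts above. A sketch of a test of that shape (bit positions taken from the patch; the standalone helper is illustrative):

#include <cstdint>

constexpr uint64_t IsAtomicNoRetFlag = UINT64_C(1) << 57;
constexpr uint64_t IsAtomicRetFlag   = UINT64_C(1) << 58;

constexpr bool isAtomicRet(uint64_t TSFlags) {
  return (TSFlags & IsAtomicRetFlag) != 0;
}
static_assert(isAtomicRet(IsAtomicRetFlag), "returning atomic");
static_assert(!isAtomicRet(IsAtomicNoRetFlag), "no-return atomic");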
@@ -251,6 +261,13 @@ class Enc64 {
int Size = 8;
}
+def CPolBit {
+ int GLC = 0;
+ int SLC = 1;
+ int DLC = 2;
+ int SCC = 4;
+}
+
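
CPolBit gives the individual cache-policy controls fixed positions inside a single packed cpol operand (GLC=0, SLC=1, DLC=2, SCC=4), which the reworked MIMG encodings below index instead of carrying separate glc/slc/dlc fields. A small model of pulling one policy bit out of a packed value (the helper is illustrative):

// Bit indices copied from the CPolBit definition above.
constexpr unsigned GLC = 0, SLC = 1, DLC = 2, SCC = 4;

constexpr bool cpolBit(unsigned CPol, unsigned Bit) {
  return ((CPol >> Bit) & 1) != 0;
}
static_assert(cpolBit(0b00001, GLC), "glc only");
static_assert(cpolBit(0b10010, SLC) && cpolBit(0b10010, SCC), "slc + scc");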
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
class VINTRPe <bits<2> op> : Enc32 {
@@ -268,27 +285,25 @@ class VINTRPe <bits<2> op> : Enc32 {
}
class MIMGe : Enc64 {
- bits<8> vdata;
+ bits<10> vdata;
bits<4> dmask;
bits<1> unorm;
- bits<1> glc;
+ bits<5> cpol;
bits<1> r128;
bits<1> tfe;
bits<1> lwe;
- bits<1> slc;
bit d16;
bits<7> srsrc;
bits<7> ssamp;
let Inst{11-8} = dmask;
let Inst{12} = unorm;
- let Inst{13} = glc;
+ let Inst{13} = cpol{CPolBit.GLC};
let Inst{15} = r128;
- let Inst{16} = tfe;
let Inst{17} = lwe;
- let Inst{25} = slc;
+ let Inst{25} = cpol{CPolBit.SLC};
let Inst{31-26} = 0x3c;
- let Inst{47-40} = vdata;
+ let Inst{47-40} = vdata{7-0};
let Inst{52-48} = srsrc{6-2};
let Inst{57-53} = ssamp{6-2};
let Inst{63} = d16;
@@ -299,7 +314,21 @@ class MIMGe_gfx6789 <bits<8> op> : MIMGe {
bits<1> da;
let Inst{0} = op{7};
+ let Inst{7} = cpol{CPolBit.SCC};
+ let Inst{14} = da;
+ let Inst{16} = tfe;
+ let Inst{24-18} = op{6-0};
+ let Inst{39-32} = vaddr;
+}
+
+class MIMGe_gfx90a <bits<8> op> : MIMGe {
+ bits<8> vaddr;
+ bits<1> da;
+
+ let Inst{0} = op{7};
+ let Inst{7} = cpol{CPolBit.SCC};
let Inst{14} = da;
+ let Inst{16} = vdata{9}; // ACC bit
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr;
}
@@ -308,13 +337,13 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe {
bits<8> vaddr0;
bits<3> dim;
bits<2> nsa;
- bits<1> dlc;
bits<1> a16;
let Inst{0} = op{7};
let Inst{2-1} = nsa;
let Inst{5-3} = dim;
- let Inst{7} = dlc;
+ let Inst{7} = cpol{CPolBit.DLC};
+ let Inst{16} = tfe;
let Inst{24-18} = op{6-0};
let Inst{39-32} = vaddr0;
let Inst{62} = a16;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index dfd0075bf03a..7ab0f7a100c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
@@ -107,20 +108,26 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const {
- // TODO: The generic check fails for VALU instructions that should be
- // rematerializable due to implicit reads of exec. We really want all of the
- // generic logic for this except for this.
- switch (MI.getOpcode()) {
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO:
- case AMDGPU::V_ACCVGPR_READ_B32_e64:
- case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
- // No implicit operands.
- return MI.getNumOperands() == MI.getDesc().getNumOperands();
- default:
- return false;
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) {
+ // Normally VALU use of exec would block the rematerialization, but that
+ // is OK in this case to have an implicit exec read as all VALU do.
+ // We really want all of the generic logic for this except for this.
+
+ // Another potential implicit use is mode register. The core logic of
+ // the RA will not attempt rematerialization if mode is set anywhere
+ // in the function, otherwise it is safe since mode is not changed.
+ return !MI.hasImplicitDef() &&
+ MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
+ !MI.mayRaiseFPException();
}
+
+ return false;
+}
+
+bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
+ // Any implicit use of exec by VALU is not a real register read.
+ return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
+ isVALU(*MO.getParent());
}
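
Instead of whitelisting a handful of moves, rematerialization is now allowed for any VOP1/VOP2/VOP3/SDWA instruction whose only implicit operands are the uses its descriptor already declares (in particular the exec read every VALU has) and which cannot raise an FP exception; isIgnorableUse then tells the generic logic that this exec read is not a real register dependency. A compact model of that predicate (field names are illustrative, not the MachineInstr API):

struct MIModel {
  bool IsVALU;                      // VOP1/VOP2/VOP3/SDWA
  bool HasImplicitDef;
  unsigned NumImplicitOperands;     // implicit operands actually present
  unsigned NumDeclaredImplicitUses; // implicit uses from the descriptor
  bool MayRaiseFPException;
};

bool isTriviallyRematerializable(const MIModel &MI) {
  if (!MI.IsVALU)
    return false;
  // The implicit exec read is fine; anything beyond the declared implicit
  // uses, an implicit def, or a possible FP exception blocks remat.
  return !MI.HasImplicitDef &&
         MI.NumImplicitOperands == MI.NumDeclaredImplicitUses &&
         !MI.MayRaiseFPException;
}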
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
@@ -313,39 +320,22 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
}
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
- const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
- if (SOffset && SOffset->isReg()) {
- // We can only handle this if it's a stack access, as any other resource
- // would require reporting multiple base registers.
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (AddrReg && !AddrReg->isFI())
- return false;
-
- const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
- const SIMachineFunctionInfo *MFI
- = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
- if (RSrc->getReg() != MFI->getScratchRSrcReg())
- return false;
-
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOps.push_back(RSrc);
- BaseOps.push_back(SOffset);
- Offset = OffsetImm->getImm();
- } else {
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
- if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
- return false;
+ const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
+ return false;
+ BaseOps.push_back(RSrc);
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp && !BaseOp->isFI())
BaseOps.push_back(BaseOp);
-
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (BaseOp)
- BaseOps.push_back(BaseOp);
-
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- Offset = OffsetImm->getImm();
- if (SOffset) // soffset can be an inline immediate.
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetImm->getImm();
+ const MachineOperand *SOffset =
+ getNamedOperand(LdSt, AMDGPU::OpName::soffset);
+ if (SOffset) {
+ if (SOffset->isReg())
+ BaseOps.push_back(SOffset);
+ else
Offset += SOffset->getImm();
}
// Get appropriate operand, and compute width accordingly.
@@ -576,15 +566,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
if (!Tmp)
report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
RS.setRegUsed(Tmp);
- // Only loop through if there are any free registers left, otherwise
- // scavenger may report a fatal error without emergency spill slot
- // or spill with the slot.
- while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
- Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
- if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
- break;
- Tmp = Tmp2;
- RS.setRegUsed(Tmp);
+
+ if (!TII.getSubtarget().hasGFX90AInsts()) {
+ // Only loop through if there are any free registers left, otherwise
+ // scavenger may report a fatal error without emergency spill slot
+ // or spill with the slot.
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+ Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+ break;
+ Tmp = Tmp2;
+ RS.setRegUsed(Tmp);
+ }
}
// Insert copy to temporary VGPR.
@@ -782,7 +775,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
-
if (RC == &AMDGPU::AGPR_32RegClass) {
if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
@@ -790,6 +782,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
// FIXME: Pass should maintain scavenger to avoid scan through the block on
// every AGPR spill.
RegScavenger RS;
@@ -797,7 +795,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (RI.getRegSizeInBits(*RC) == 16) {
+ const unsigned Size = RI.getRegSizeInBits(*RC);
+ if (Size == 16) {
assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
@@ -863,9 +862,27 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
+ if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
+ if (ST.hasPackedFP32Ops()) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
+ return;
+ }
+ }
+
const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
if (RI.isSGPRClass(RC)) {
- if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
+ if (!RI.isSGPRClass(SrcRC)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -873,12 +890,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.hasAGPRs(RC)) {
- Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ?
+ Opcode = (RI.hasVGPRs(SrcRC)) ?
AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
- } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
+ } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
+ } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
+ (RI.isProperlyAlignedRC(*RC) &&
+ (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
+ // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
+ if (ST.hasPackedFP32Ops()) {
+ Opcode = AMDGPU::V_PK_MOV_B32;
+ EltSize = 8;
+ }
}
// For the cases where we need an intermediate instruction/temporary register
@@ -890,7 +916,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
RS.reset(new RegScavenger());
- ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, 4);
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
// If there is an overlap, we can't kill the super-register on the last
// instruction, since it will also kill the components made live by this def.
@@ -911,6 +937,23 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
ImpDefSuper, ImpUseSuper);
+ } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
+ Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
+ Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ if (Idx == 0)
+ MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
} else {
MachineInstrBuilder Builder =
BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
@@ -969,7 +1012,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
.addImm(Value);
return;
}
- if (RegClass == &AMDGPU::VReg_64RegClass) {
+ if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
.addImm(Value);
return;
@@ -1301,6 +1344,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S160_SAVE;
case 24:
return AMDGPU::SI_SPILL_S192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@@ -1326,6 +1371,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V160_SAVE;
case 24:
return AMDGPU::SI_SPILL_V192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_V224_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@@ -1351,6 +1398,8 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_A160_SAVE;
case 24:
return AMDGPU::SI_SPILL_A192_SAVE;
+ case 28:
+ return AMDGPU::SI_SPILL_A224_SAVE;
case 32:
return AMDGPU::SI_SPILL_A256_SAVE;
case 64:
@@ -1434,6 +1483,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_S192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_S224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@@ -1459,6 +1510,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_V192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_V224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
@@ -1484,6 +1537,8 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_A160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_A192_RESTORE;
+ case 28:
+ return AMDGPU::SI_SPILL_A224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_A256_RESTORE;
case 64:
@@ -1590,6 +1645,7 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
@@ -1640,6 +1696,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_ANDN2_B32));
break;
+ case AMDGPU::S_AND_B64_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_B64));
+ break;
+
+ case AMDGPU::S_AND_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -1650,20 +1718,49 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
assert(!SrcOp.isFPImm());
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addImm(Imm.getLoBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit | RegState::Define);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addImm(Imm.getHiBits(32).getZExtValue())
- .addReg(Dst, RegState::Implicit | RegState::Define);
+ APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+ APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+ if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addImm(Lo.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addImm(Hi.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ }
} else {
assert(SrcOp.isReg());
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
- .addReg(Dst, RegState::Implicit | RegState::Define);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
- .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
- .addReg(Dst, RegState::Implicit | RegState::Define);
+ if (ST.hasPackedFP32Ops() &&
+ !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
+ .addImm(SISrcMods::OP_SEL_1) // src0_mod
+ .addReg(SrcOp.getReg())
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
+ .addReg(SrcOp.getReg())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ }
}
MI.eraseFromParent();
break;
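A minimal standalone sketch of the single-packed-move eligibility test used in the V_MOV_B64_PSEUDO expansion above, where isInline is an assumed stand-in for the 32-bit isInlineConstant check:

// Sketch only: both halves of the 64-bit immediate must match and be an
// inline constant before a lone V_PK_MOV_B32 can replace the two V_MOV_B32s.
#include <cstdint>

bool canUseSinglePackedMov(uint64_t Imm, bool HasPackedFP32Ops,
                           bool (*isInline)(int32_t)) {
  int32_t Lo = static_cast<int32_t>(Imm);       // bits [31:0]
  int32_t Hi = static_cast<int32_t>(Imm >> 32); // bits [63:32]
  return HasPackedFP32Ops && Lo == Hi && isInline(Lo);
}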
@@ -1672,11 +1769,35 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandMovDPP64(MI);
break;
}
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ assert(!SrcOp.isFPImm());
+ APInt Imm(64, SrcOp.getImm());
+ if (Imm.isIntN(32) || isInlineConstant(Imm)) {
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+ APInt Lo(32, Imm.getLoBits(32).getZExtValue());
+ APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
+ .addImm(Lo.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
+ .addImm(Hi.getSExtValue())
+ .addReg(Dst, RegState::Implicit | RegState::Define);
+ MI.eraseFromParent();
+ break;
+ }
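When the immediate is neither a 32-bit value nor an inline constant, the pseudo above is split into two S_MOV_B32 writes of the sub0/sub1 halves; the halves are derived exactly as in this small standalone sketch (plain integer code, no LLVM types):

// The low half is written to sub0, the high half to sub1, mirroring the
// getLoBits/getHiBits arithmetic above.
#include <cstdint>
#include <utility>

std::pair<int32_t, int32_t> splitImm64(uint64_t Imm) {
  int32_t Lo = static_cast<int32_t>(Imm & 0xffffffffu);
  int32_t Hi = static_cast<int32_t>(Imm >> 32);
  return {Lo, Hi};
}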
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+ FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
@@ -1687,8 +1808,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+ FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
.add(MI.getOperand(2));
@@ -1848,16 +1969,29 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::ENTER_WWM: {
+ case AMDGPU::ENTER_STRICT_WWM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
- // WWM is entered.
+ // Whole Wave Mode is entered.
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64));
break;
}
- case AMDGPU::EXIT_WWM: {
+ case AMDGPU::ENTER_STRICT_WQM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
- // WWM is exited.
+ // STRICT_WQM is entered.
+ const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
+ const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
+ BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
+
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::EXIT_STRICT_WWM:
+ case AMDGPU::EXIT_STRICT_WQM: {
+ // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+    // WWM/STRICT_WQM is exited.
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
@@ -1877,7 +2011,6 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
unsigned Part = 0;
MachineInstr *Split[2];
-
for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
if (Dst.isPhysical()) {
@@ -2098,32 +2231,36 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// s_getpc_b64. Insert pc arithmetic code before last terminator.
MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
- // TODO: Handle > 32-bit block address.
- if (BrOffset >= 0) {
- BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
- .addReg(PCReg, RegState::Define, AMDGPU::sub0)
- .addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
- BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
- .addReg(PCReg, RegState::Define, AMDGPU::sub1)
- .addReg(PCReg, 0, AMDGPU::sub1)
- .addImm(0);
- } else {
- // Backwards branch.
- BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+ auto &MCCtx = MF->getContext();
+ MCSymbol *PostGetPCLabel =
+ MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
+ GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
+
+ MCSymbol *OffsetLo =
+ MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
+ MCSymbol *OffsetHi =
+ MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
- .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
- BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+ .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
- .addImm(0);
- }
+ .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
// Insert the indirect branch after the other terminator.
BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
.addReg(PCReg);
+ auto ComputeBlockSize = [](const TargetInstrInfo *TII,
+ const MachineBasicBlock &MBB) {
+ unsigned Size = 0;
+ for (const MachineInstr &MI : MBB)
+ Size += TII->getInstSizeInBytes(MI);
+ return Size;
+ };
+
// FIXME: If spilling is necessary, this will fail because this scavenger has
// no emergency stack slots. It is non-trivial to spill in this situation,
// because the restore code needs to be specially placed after the
@@ -2168,7 +2305,16 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MRI.clearVirtRegs();
RS->setRegUsed(Scav);
- return 4 + 8 + 4 + 4;
+  // Now the branch distance can be defined.
+ auto *Offset = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
+ MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
+ // Add offset assignments.
+ auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
+ OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
+ auto *ShAmt = MCConstantExpr::create(32, MCCtx);
+ OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
+ return ComputeBlockSize(this, MBB);
}
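The new long-branch lowering leaves the distance to the MC layer: the offset expression is DestBB minus the post_getpc label, OffsetLo takes its low 32 bits and OffsetHi its arithmetic shift right by 32, so the S_ADD_U32/S_ADDC_U32 pair rebuilds the full 64-bit PC. A small sketch of that split on an already-resolved delta (illustrative only; the real values are MCExprs resolved at emission time):

// Mirrors the MCBinaryExpr arithmetic above: And with 0xFFFFFFFF for the low
// word, AShr by 32 for the high word.
#include <cstdint>

struct FarBranchImms {
  uint32_t Lo; // consumed by S_ADD_U32
  uint32_t Hi; // consumed by S_ADDC_U32
};

FarBranchImms splitPcRelDelta(int64_t Delta) {
  FarBranchImms R;
  R.Lo = static_cast<uint32_t>(Delta & 0xFFFFFFFFLL);
  R.Hi = static_cast<uint32_t>(Delta >> 32); // arithmetic shift keeps the sign
  return R;
}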
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
@@ -2263,18 +2409,18 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// Skip over the instructions that are artificially terminators for special
// exec management.
- while (I != E && !I->isBranch() && !I->isReturn() &&
- I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
+ while (I != E && !I->isBranch() && !I->isReturn()) {
switch (I->getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH:
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
+ case AMDGPU::S_AND_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
+ case AMDGPU::S_AND_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
@@ -2292,34 +2438,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
if (I == E)
return false;
- if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
- return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
-
- ++I;
-
- // TODO: Should be able to treat as fallthrough?
- if (I == MBB.end())
- return true;
-
- if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
- return true;
-
- MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
-
- // Specifically handle the case where the conditional branch is to the same
- // destination as the mask branch. e.g.
- //
- // si_mask_branch BB8
- // s_cbranch_execz BB8
- // s_cbranch BB9
- //
- // This is required to understand divergent loops which may need the branches
- // to be relaxed.
- if (TBB != MaskBrDest || Cond.empty())
- return true;
-
- auto Pred = Cond[0].getImm();
- return (Pred != EXECZ && Pred != EXECNZ);
+ return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
@@ -2330,11 +2449,6 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
unsigned RemovedSize = 0;
while (I != MBB.end()) {
MachineBasicBlock::iterator Next = std::next(I);
- if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
- I = Next;
- continue;
- }
-
RemovedSize += getInstSizeInBytes(*I);
I->eraseFromParent();
++Count;
@@ -2400,6 +2514,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineInstr *CondBr =
BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
+ fixImplicitOperands(*CondBr);
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(FBB);
@@ -2593,6 +2708,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
case AMDGPU::COPY:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
case AMDGPU::V_ACCVGPR_READ_B32_e64:
+ case AMDGPU::V_ACCVGPR_MOV_B32:
return true;
default:
return false;
@@ -2983,7 +3099,9 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
- Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
+ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
+ bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
switch (Opc) {
default:
@@ -2994,13 +3112,15 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F64_e64:
break;
case AMDGPU::V_MAC_F16_e32:
case AMDGPU::V_FMAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
- case AMDGPU::V_FMAC_F32_e32: {
+ case AMDGPU::V_FMAC_F32_e32:
+ case AMDGPU::V_FMAC_F64_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -3026,7 +3146,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
MachineInstrBuilder MIB;
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
@@ -3074,7 +3194,9 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64)
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
+ : IsF64 ? AMDGPU::V_FMA_F64_e64
+ : AMDGPU::V_FMA_F32_e64)
: (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
@@ -3262,6 +3384,10 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
@@ -3271,6 +3397,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -3382,6 +3509,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
}
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+ // GFX90A does not have V_MUL_LEGACY_F32_e32.
+ if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
+ return false;
+
int Op32 = AMDGPU::getVOPe32(Opcode);
if (Op32 == -1)
return false;
@@ -3439,6 +3570,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F64_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
@@ -3663,7 +3795,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
- if (MI.getOperand(i).isFPImm()) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
"all fp values to integers.";
return false;
@@ -3690,8 +3823,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
- const MachineOperand &MO = MI.getOperand(i);
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
ErrInfo = "Illegal immediate value for operand.";
return false;
@@ -3712,12 +3845,37 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
continue;
}
- if (!MI.getOperand(i).isReg())
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
continue;
+ // FIXME: Ideally we would have separate instruction definitions with the
+ // aligned register constraint.
+ // FIXME: We do not verify inline asm operands, but custom inline asm
+ // verification is broken anyway
+ if (ST.needsAlignedVGPRs()) {
+ const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
+ const bool IsVGPR = RI.hasVGPRs(RC);
+ const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
+ if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
+ const TargetRegisterClass *SubRC =
+ RI.getSubRegClass(RC, MO.getSubReg());
+ RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
+ if (RC)
+ RC = SubRC;
+ }
+
+ // Check that this is the aligned version of the class.
+ if (!RC || !RI.isProperlyAlignedRC(*RC)) {
+ ErrInfo = "Subtarget requires even aligned vector registers";
+ return false;
+ }
+ }
+
if (RegClass != -1) {
- Register Reg = MI.getOperand(i).getReg();
- if (Reg == AMDGPU::NoRegister || Reg.isVirtual())
+ if (Reg.isVirtual())
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -3864,7 +4022,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
- unsigned LiteralCount = 0;
+ bool UsesLiteral = false;
+ const MachineOperand *LiteralVal = nullptr;
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
@@ -3886,8 +4045,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
SGPRsUsed.push_back(SGPRUsed);
}
} else {
- ++ConstantBusCount;
- ++LiteralCount;
+ if (!UsesLiteral) {
+ ++ConstantBusCount;
+ UsesLiteral = true;
+ LiteralVal = &MO;
+ } else if (!MO.isIdenticalTo(*LiteralVal)) {
+ assert(isVOP3(MI));
+ ErrInfo = "VOP3 instruction uses more than one literal";
+ return false;
+ }
}
}
}
@@ -3911,15 +4077,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
- if (isVOP3(MI) && LiteralCount) {
- if (!ST.hasVOP3Literal()) {
- ErrInfo = "VOP3 instruction uses literal";
- return false;
- }
- if (LiteralCount > 1) {
- ErrInfo = "VOP3 instruction uses more than one literal";
- return false;
- }
+ if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
+ ErrInfo = "VOP3 instruction uses literal";
+ return false;
}
}
@@ -4113,25 +4273,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
IsA16 = A16->getImm() != 0;
}
- bool PackDerivatives = IsA16 || BaseOpcode->G16;
bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
- unsigned AddrWords = BaseOpcode->NumExtraArgs;
- unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
- if (IsA16)
- AddrWords += (AddrComponents + 1) / 2;
- else
- AddrWords += AddrComponents;
-
- if (BaseOpcode->Gradients) {
- if (PackDerivatives)
- // There are two gradients per coordinate, we pack them separately.
- // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
- AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2;
- else
- AddrWords += Dim->NumGradients;
- }
+ unsigned AddrWords =
+ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
unsigned VAddrWords;
if (IsNSA) {
@@ -4141,12 +4286,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
if (AddrWords > 8)
AddrWords = 16;
- else if (AddrWords > 4)
- AddrWords = 8;
- else if (AddrWords == 4)
- AddrWords = 4;
- else if (AddrWords == 3)
- AddrWords = 3;
}
if (VAddrWords != AddrWords) {
@@ -4187,8 +4326,89 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
ST.getGeneration() < AMDGPUSubtarget::GFX10) {
+ if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
+ DC <= DppCtrl::ROW_NEWBCAST_LAST &&
+ !ST.hasGFX90AInsts()) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "row_newbroadcast/row_share is not supported before "
+ "GFX90A/GFX10";
+ return false;
+ } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
+ ErrInfo = "Invalid dpp_ctrl value: "
+ "row_share and row_xmask are not supported before GFX10";
+ return false;
+ }
+ }
+
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+
+ if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
+ ((DstIdx >= 0 &&
+ (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
+ Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
+ ((Src0Idx >= 0 &&
+ (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
+ Desc.OpInfo[Src0Idx].RegClass ==
+ AMDGPU::VReg_64_Align2RegClassID)))) &&
+ !AMDGPU::isLegal64BitDPPControl(DC)) {
ErrInfo = "Invalid dpp_ctrl value: "
- "row_share and row_xmask are not supported before GFX10";
+                "64 bit dpp only supports row_newbcast";
+ return false;
+ }
+ }
+
+ if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
+ const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+ uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata;
+ const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
+ const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
+ if (Data && !Data->isReg())
+ Data = nullptr;
+
+ if (ST.hasGFX90AInsts()) {
+ if (Dst && Data &&
+ (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
+ ErrInfo = "Invalid register class: "
+                  "vdata and vdst should both be VGPR or AGPR";
+ return false;
+ }
+ if (Data && Data2 &&
+ (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
+ ErrInfo = "Invalid register class: "
+ "both data operands should be VGPR or AGPR";
+ return false;
+ }
+ } else {
+ if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
+ (Data && RI.isAGPR(MRI, Data->getReg())) ||
+ (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
+ ErrInfo = "Invalid register class: "
+ "agpr loads and stores not supported on this GPU";
+ return false;
+ }
+ }
+ }
+
+ if (ST.needsAlignedVGPRs() &&
+ (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
+ MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+ MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
+ const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
+ Register Reg = Op->getReg();
+ bool Aligned = true;
+ if (Reg.isPhysical()) {
+ Aligned = !(RI.getHWRegIndex(Reg) & 1);
+ } else {
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
+ Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
+ !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
+ }
+
+ if (!Aligned) {
+ ErrInfo = "Subtarget requires even aligned vector registers "
+ "for DS_GWS instructions";
return false;
}
}
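Both the generic operand check and the DS_GWS check above reduce "properly aligned" to the register tuple starting on an even 32-bit lane. A one-line sketch of that predicate, where FirstLaneIndex is an assumed stand-in for getHWRegIndex (physical registers) or getChannelFromSubReg (virtual registers):

// Even hardware index == properly aligned, for subtargets that require it.
inline bool isEvenAlignedTuple(unsigned FirstLaneIndex) {
  return (FirstLaneIndex & 1) == 0;
}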
@@ -4205,7 +4425,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
- case AMDGPU::WWM: return AMDGPU::WWM;
+ case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
+ case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
case AMDGPU::S_MOV_B32: {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
return MI.getOperand(1).isReg() ||
@@ -4276,6 +4497,59 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
"Unexpected scalar opcode without corresponding vector one!");
}
+static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
+ const MachineRegisterInfo &MRI,
+ const MCInstrDesc &TID,
+ unsigned RCID,
+ bool IsAllocatable) {
+ if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+ (TID.mayLoad() || TID.mayStore() ||
+ (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
+ switch (RCID) {
+ case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
+ case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
+ case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
+ case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
+ case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
+ default:
+ break;
+ }
+ }
+ return RCID;
+}
+
+const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
+ unsigned OpNum, const TargetRegisterInfo *TRI,
+ const MachineFunction &MF)
+ const {
+ if (OpNum >= TID.getNumOperands())
+ return nullptr;
+ auto RegClass = TID.OpInfo[OpNum].RegClass;
+ bool IsAllocatable = false;
+ if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
+ // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
+    // vdst and vdata should both be VGPR or AGPR, same for the DS instructions
+    // with two data operands. Request a register class constrained to VGPR only
+    // if both operands are present, as Machine Copy Propagation (and possibly
+    // other passes) cannot check this constraint.
+ // The check is limited to FLAT and DS because atomics in non-flat encoding
+ // have their vdst and vdata tied to be the same register.
+ const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
+ AMDGPU::OpName::vdst);
+ const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
+ (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
+ : AMDGPU::OpName::vdata);
+ if (DataIdx != -1) {
+ IsAllocatable = VDstIdx != -1 ||
+ AMDGPU::getNamedOperandIdx(TID.Opcode,
+ AMDGPU::OpName::data1) != -1;
+ }
+ }
+ RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
+ IsAllocatable);
+ return RI.getRegClass(RegClass);
+}
+
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -4290,6 +4564,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
}
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+ RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
return RI.getRegClass(RCID);
}
@@ -4308,8 +4583,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
- if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
- VRC = &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
+ if (RI.getCommonSubClass(VRC64, VRC))
+ VRC = VRC64;
else
VRC = &AMDGPU::VGPR_32RegClass;
@@ -4466,7 +4742,40 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
if (MO->isReg()) {
assert(DefinedRC);
- return isLegalRegOperand(MRI, OpInfo, *MO);
+ if (!isLegalRegOperand(MRI, OpInfo, *MO))
+ return false;
+ bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
+ if (IsAGPR && !ST.hasMAIInsts())
+ return false;
+ unsigned Opc = MI.getOpcode();
+ if (IsAGPR &&
+ (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
+ (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
+ return false;
+ // Atomics should have both vdst and vdata either vgpr or agpr.
+ const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
+ isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
+ if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
+ MI.getOperand(DataIdx).isReg() &&
+ RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
+ return false;
+ if ((int)OpIdx == DataIdx) {
+ if (VDstIdx != -1 &&
+ RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
+ return false;
+ // DS instructions with 2 src operands also must have tied RC.
+ const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::data1);
+ if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
+ RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
+ return false;
+ }
+ if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
+ RI.isSGPRReg(MRI, MO->getReg()))
+ return false;
+ return true;
}
// Handle non-register types that are treated like immediates.
@@ -4740,6 +5049,86 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
}
}
+bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
+ unsigned Opc = Inst.getOpcode();
+ int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
+ if (OldSAddrIdx < 0)
+ return false;
+
+ assert(isSegmentSpecificFLAT(Inst));
+
+ int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
+ if (NewOpc < 0)
+ NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
+ if (NewOpc < 0)
+ return false;
+
+ MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
+ MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
+ if (RI.isSGPRReg(MRI, SAddr.getReg()))
+ return false;
+
+ int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
+ if (NewVAddrIdx < 0)
+ return false;
+
+ int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+
+  // Check vaddr; it must be zero or absent.
+ MachineInstr *VAddrDef = nullptr;
+ if (OldVAddrIdx >= 0) {
+ MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
+ VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
+ if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
+ !VAddrDef->getOperand(1).isImm() ||
+ VAddrDef->getOperand(1).getImm() != 0)
+ return false;
+ }
+
+ const MCInstrDesc &NewDesc = get(NewOpc);
+ Inst.setDesc(NewDesc);
+
+  // Callers expect the iterator to be valid after this call, so modify the
+ // instruction in place.
+ if (OldVAddrIdx == NewVAddrIdx) {
+ MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
+ // Clear use list from the old vaddr holding a zero register.
+ MRI.removeRegOperandFromUseList(&NewVAddr);
+ MRI.moveOperands(&NewVAddr, &SAddr, 1);
+ Inst.RemoveOperand(OldSAddrIdx);
+ // Update the use list with the pointer we have just moved from vaddr to
+    // saddr position. Otherwise the new vaddr will be missing from the use list.
+ MRI.removeRegOperandFromUseList(&NewVAddr);
+ MRI.addRegOperandToUseList(&NewVAddr);
+ } else {
+ assert(OldSAddrIdx == NewVAddrIdx);
+
+ if (OldVAddrIdx >= 0) {
+ int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
+ AMDGPU::OpName::vdst_in);
+
+      // RemoveOperand doesn't try to fix up tied operand indexes as it goes, so
+ // it asserts. Untie the operands for now and retie them afterwards.
+ if (NewVDstIn != -1) {
+ int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
+ Inst.untieRegOperand(OldVDstIn);
+ }
+
+ Inst.RemoveOperand(OldVAddrIdx);
+
+ if (NewVDstIn != -1) {
+ int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
+ Inst.tieOperands(NewVDst, NewVDstIn);
+ }
+ }
+ }
+
+ if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
+ VAddrDef->eraseFromParent();
+
+ return true;
+}
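The SADDR-to-VADDR rewrite above only fires when the old vaddr is absent or provably zero, because the pointer moved out of saddr takes over the vaddr slot. A hedged restatement of that precondition as a standalone helper; HasVAddrOperand and VMovB32Opcode are parameters of this sketch, not part of the patch:

#include "llvm/CodeGen/MachineInstr.h"

// VAddrDef is the unique definition of the old vaddr register, as in
// moveFlatAddrToVGPR; a missing or non-zero definition blocks the rewrite.
static bool vaddrAllowsSaddrRewrite(bool HasVAddrOperand,
                                    const llvm::MachineInstr *VAddrDef,
                                    unsigned VMovB32Opcode) {
  if (!HasVAddrOperand)
    return true; // no vaddr operand at all: nothing to prove
  return VAddrDef && VAddrDef->getOpcode() == VMovB32Opcode &&
         VAddrDef->getOperand(1).isImm() &&
         VAddrDef->getOperand(1).getImm() == 0;
}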
+
// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
@@ -4752,6 +5141,9 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
return;
+ if (moveFlatAddrToVGPR(MI))
+ return;
+
Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
SAddr->setReg(ToSGPR);
}
@@ -4905,7 +5297,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
.addReg(Exec)
.addReg(SaveExec);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
@@ -5316,17 +5708,10 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
.add(*SOffset)
.add(*Offset);
- // Atomics do not have this operand.
- if (const MachineOperand *GLC =
- getNamedOperand(MI, AMDGPU::OpName::glc)) {
- MIB.addImm(GLC->getImm());
+ if (const MachineOperand *CPol =
+ getNamedOperand(MI, AMDGPU::OpName::cpol)) {
+ MIB.addImm(CPol->getImm());
}
- if (const MachineOperand *DLC =
- getNamedOperand(MI, AMDGPU::OpName::dlc)) {
- MIB.addImm(DLC->getImm());
- }
-
- MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
if (const MachineOperand *TFE =
getNamedOperand(MI, AMDGPU::OpName::tfe)) {
@@ -5346,7 +5731,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
.addReg(NewSRsrc)
.add(*SOffset)
.add(*Offset)
- .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
+ .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
.cloneMemRefs(MI);
}
@@ -5449,6 +5834,11 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
Inst.eraseFromParent();
continue;
+ case AMDGPU::S_BREV_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+ Inst.eraseFromParent();
+ continue;
+
case AMDGPU::S_NOT_B64:
splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
Inst.eraseFromParent();
@@ -5654,6 +6044,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
// Only propagate through live-def of SCC.
if (Op.isDef() && !Op.isDead())
addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
+ if (Op.isUse())
+ addSCCDefsToVALUWorklist(Op, Worklist);
Inst.RemoveOperand(i);
}
}
@@ -5999,7 +6391,7 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
void SIInstrInfo::splitScalar64BitUnaryOp(
SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode) const {
+ unsigned Opcode, bool Swap) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6032,6 +6424,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+ if (Swap)
+ std::swap(DestSub0, DestSub1);
+
Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
@@ -6341,7 +6736,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
case AMDGPU::COPY:
case AMDGPU::WQM:
case AMDGPU::SOFT_WQM:
- case AMDGPU::WWM:
+ case AMDGPU::STRICT_WWM:
+ case AMDGPU::STRICT_WQM:
case AMDGPU::REG_SEQUENCE:
case AMDGPU::PHI:
case AMDGPU::INSERT_SUBREG:
@@ -6485,6 +6881,32 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
}
}
+// Instructions that use SCC may be converted to VALU instructions. When that
+// happens, the SCC register is changed to VCC_LO. The instruction that defines
+// SCC must be changed to an instruction that defines VCC. This function makes
+// sure that the instruction that defines SCC is added to the moveToVALU
+// worklist.
+void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
+ SetVectorType &Worklist) const {
+ assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
+
+ MachineInstr *SCCUseInst = Op.getParent();
+  // Look for a preceding instruction that either defines VCC or SCC. If VCC
+ // then there is nothing to do because the defining instruction has been
+ // converted to a VALU already. If SCC then that instruction needs to be
+ // converted to a VALU.
+ for (MachineInstr &MI :
+ make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
+ SCCUseInst->getParent()->rend())) {
+ if (MI.modifiesRegister(AMDGPU::VCC, &RI))
+ break;
+ if (MI.definesRegister(AMDGPU::SCC, &RI)) {
+ Worklist.insert(&MI);
+ break;
+ }
+ }
+}
+
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
const MachineInstr &Inst) const {
const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
@@ -6499,7 +6921,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
case AMDGPU::SOFT_WQM:
- case AMDGPU::WWM: {
+ case AMDGPU::STRICT_WWM:
+ case AMDGPU::STRICT_WQM: {
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
if (RI.hasAGPRs(SrcRC)) {
if (RI.hasAGPRs(NewDstRC))
@@ -6614,7 +7037,7 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
- return (22ULL << 44) | // IMG_FORMAT_32_FLOAT
+ return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
(1ULL << 56) | // RESOURCE_LEVEL = 1
(3ULL << 60); // OOB_SELECT = 3
}
@@ -6786,11 +7209,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
}
switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::DBG_VALUE:
- case TargetOpcode::EH_LABEL:
- return 0;
case TargetOpcode::BUNDLE:
return getInstBundleSize(MI);
case TargetOpcode::INLINEASM:
@@ -6800,6 +7218,8 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
}
default:
+ if (MI.isMetaInstruction())
+ return 0;
return DescSize;
}
}
@@ -7026,36 +7446,92 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
+// Depending on the used address space and instructions, some immediate offsets
+// are allowed and some are not.
+// In general, flat instruction offsets can only be non-negative, global and
+// scratch instruction offsets can also be negative.
+//
+// There are several bugs related to these offsets:
+// On gfx10.1, flat instructions that go into the global address space cannot
+// use an offset.
+//
+// For scratch instructions, the address can be either an SGPR or a VGPR.
+// The following offsets can be used, depending on the architecture (x means
+// cannot be used):
+// +----------------------------+------+------+
+// | Address-Mode | SGPR | VGPR |
+// +----------------------------+------+------+
+// | gfx9 | | |
+// | negative, 4-aligned offset | x | ok |
+// | negative, unaligned offset | x | ok |
+// +----------------------------+------+------+
+// | gfx10 | | |
+// | negative, 4-aligned offset | ok | ok |
+// | negative, unaligned offset | ok | x |
+// +----------------------------+------+------+
+// | gfx10.3 | | |
+// | negative, 4-aligned offset | ok | ok |
+// | negative, unaligned offset | ok | ok |
+// +----------------------------+------+------+
+//
+// This function ignores the addressing mode, so if an offset cannot be used in
+// one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
- bool Signed) const {
+ uint64_t FlatVariant) const {
// TODO: Should 0 be special cased?
if (!ST.hasFlatInstOffsets())
return false;
- if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+ if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
+ (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
+ AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
+ return false;
+
+ bool Signed = FlatVariant != SIInstrFlags::FLAT;
+ if (ST.hasNegativeScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch)
+ Signed = false;
+ if (ST.hasNegativeUnalignedScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
+ (Offset % 4) != 0) {
return false;
+ }
unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
}
-std::pair<int64_t, int64_t> SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
- unsigned AddrSpace,
- bool IsSigned) const {
+// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
+std::pair<int64_t, int64_t>
+SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
+ uint64_t FlatVariant) const {
int64_t RemainderOffset = COffsetVal;
int64_t ImmField = 0;
- const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned);
- if (IsSigned) {
+ bool Signed = FlatVariant != SIInstrFlags::FLAT;
+ if (ST.hasNegativeScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch)
+ Signed = false;
+
+ const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
+ if (Signed) {
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << (NumBits - 1);
RemainderOffset = (COffsetVal / D) * D;
ImmField = COffsetVal - RemainderOffset;
+
+ if (ST.hasNegativeUnalignedScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
+ (ImmField % 4) != 0) {
+ // Make ImmField a multiple of 4
+ RemainderOffset += ImmField % 4;
+ ImmField -= ImmField % 4;
+ }
} else if (COffsetVal >= 0) {
ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
RemainderOffset = COffsetVal - ImmField;
}
- assert(isLegalFLATOffset(ImmField, AddrSpace, IsSigned));
+ assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
assert(RemainderOffset + ImmField == COffsetVal);
return {ImmField, RemainderOffset};
}
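splitFlatOffset truncates toward zero with a signed division by 2^(NumBits-1), then on subtargets with the negative-unaligned-scratch bug pushes the unaligned tail of a negative immediate back into the remainder. The same arithmetic on plain integers, with NumBits and the bug flag taken as assumptions of the sketch:

#include <cstdint>
#include <utility>

// Returns {immediate field, remainder}; their sum always equals COffsetVal.
std::pair<int64_t, int64_t> splitSignedFlatOffset(int64_t COffsetVal,
                                                  unsigned NumBits,
                                                  bool NeedAlign4IfNegative) {
  int64_t D = 1LL << (NumBits - 1);
  int64_t Remainder = (COffsetVal / D) * D; // signed division truncates to 0
  int64_t Imm = COffsetVal - Remainder;
  if (NeedAlign4IfNegative && Imm < 0 && (Imm % 4) != 0) {
    Remainder += Imm % 4; // move the unaligned tail back into the remainder
    Imm -= Imm % 4;       // keep the immediate a multiple of 4
  }
  return {Imm, Remainder};
}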
@@ -7069,7 +7545,8 @@ enum SIEncodingFamily {
GFX80 = 4,
GFX9 = 5,
GFX10 = 6,
- SDWA10 = 7
+ SDWA10 = 7,
+ GFX90A = 8
};
static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
@@ -7141,6 +7618,15 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
if (MCOp == -1)
return Opcode;
+ if (ST.hasGFX90AInsts()) {
+ uint16_t NMCOp = (uint16_t)-1;
+ NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
+ if (NMCOp == (uint16_t)-1)
+ NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
+ if (NMCOp != (uint16_t)-1)
+ MCOp = NMCOp;
+ }
+
// (uint16_t)-1 means that Opcode is a pseudo instruction that has
// no encoding in the given subtarget generation.
if (MCOp == (uint16_t)-1)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ce59fe86c688..fc5e5be03541 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -96,7 +96,8 @@ private:
unsigned Opcode) const;
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
- MachineInstr &Inst, unsigned Opcode) const;
+ MachineInstr &Inst, unsigned Opcode,
+ bool Swap = false) const;
void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
@@ -122,6 +123,8 @@ private:
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const;
+ void addSCCDefsToVALUWorklist(MachineOperand &Op,
+ SetVectorType &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -158,8 +161,7 @@ public:
// MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
MO_REL32_HI = 5,
- MO_LONG_BRANCH_FORWARD = 6,
- MO_LONG_BRANCH_BACKWARD = 7,
+ MO_FAR_BRANCH_OFFSET = 6,
MO_ABS32_LO = 8,
MO_ABS32_HI = 9,
@@ -171,9 +173,15 @@ public:
return RI;
}
+ const GCNSubtarget &getSubtarget() const {
+ return ST;
+ }
+
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const override;
+ bool isIgnorableUse(const MachineOperand &MO) const override;
+
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
int64_t &Offset2) const override;
@@ -501,28 +509,28 @@ public:
// i.e. global_* or scratch_*.
static bool isSegmentSpecificFLAT(const MachineInstr &MI) {
auto Flags = MI.getDesc().TSFlags;
- return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch);
+ return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
}
bool isSegmentSpecificFLAT(uint16_t Opcode) const {
auto Flags = get(Opcode).TSFlags;
- return Flags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch);
+ return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
}
static bool isFLATGlobal(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::IsFlatGlobal;
+ return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal;
}
bool isFLATGlobal(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::IsFlatGlobal;
+ return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal;
}
static bool isFLATScratch(const MachineInstr &MI) {
- return MI.getDesc().TSFlags & SIInstrFlags::IsFlatScratch;
+ return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch;
}
bool isFLATScratch(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::IsFlatScratch;
+ return get(Opcode).TSFlags & SIInstrFlags::FlatScratch;
}
// Any FLAT encoded instruction, including global_* and scratch_*.
@@ -538,6 +546,32 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::EXP;
}
+ static bool isAtomicNoRet(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet;
+ }
+
+ bool isAtomicNoRet(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsAtomicNoRet;
+ }
+
+ static bool isAtomicRet(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet;
+ }
+
+ bool isAtomicRet(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet;
+ }
+
+ static bool isAtomic(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & (SIInstrFlags::IsAtomicRet |
+ SIInstrFlags::IsAtomicNoRet);
+ }
+
+ bool isAtomic(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet |
+ SIInstrFlags::IsAtomicNoRet);
+ }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
@@ -915,6 +949,10 @@ public:
MachineBasicBlock *
legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
+ /// Change SADDR form of a FLAT \p Inst to its VADDR form if saddr operand
+  /// was moved to a VGPR. \returns true if it succeeded.
+ bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
+
/// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary. If present, \p MDT is updated.
@@ -1039,13 +1077,13 @@ public:
/// encoded instruction. If \p Signed, this is for an instruction that
/// interprets the offset as signed.
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
- bool Signed) const;
+ uint64_t FlatVariant) const;
/// Split \p COffsetVal into {immediate offset field, remainder offset}
/// values.
std::pair<int64_t, int64_t> splitFlatOffset(int64_t COffsetVal,
unsigned AddrSpace,
- bool IsSigned) const;
+ uint64_t FlatVariant) const;
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
@@ -1059,11 +1097,7 @@ public:
const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
const TargetRegisterInfo *TRI,
const MachineFunction &MF)
- const override {
- if (OpNum >= TID.getNumOperands())
- return nullptr;
- return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
- }
+ const override;
void fixImplicitOperands(MachineInstr &MI) const;
@@ -1166,26 +1200,39 @@ namespace AMDGPU {
int getMUBUFNoLdsInst(uint16_t Opcode);
LLVM_READONLY
- int getAtomicRetOp(uint16_t Opcode);
-
- LLVM_READONLY
int getAtomicNoRetOp(uint16_t Opcode);
LLVM_READONLY
int getSOPKOp(uint16_t Opcode);
+ /// \returns SADDR form of a FLAT Global instruction given an \p Opcode
+ /// of a VADDR form.
LLVM_READONLY
int getGlobalSaddrOp(uint16_t Opcode);
+ /// \returns VADDR form of a FLAT Global instruction given an \p Opcode
+ /// of a SADDR form.
+ LLVM_READONLY
+ int getGlobalVaddrOp(uint16_t Opcode);
+
LLVM_READONLY
int getVCMPXNoSDstOp(uint16_t Opcode);
+ /// \returns ST form with only immediate offset of a FLAT Scratch instruction
+ /// given an \p Opcode of an SS (SADDR) form.
LLVM_READONLY
int getFlatScratchInstSTfromSS(uint16_t Opcode);
+ /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
+ /// of an SV (VADDR) form.
LLVM_READONLY
int getFlatScratchInstSSfromSV(uint16_t Opcode);
+ /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
+ /// of an SS (SADDR) form.
+ LLVM_READONLY
+ int getFlatScratchInstSVfromSS(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 5adc9e817d41..25b647d34ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -16,7 +16,7 @@ class GCNPredicateControl : PredicateControl {
Predicate VIAssemblerPredicate = isGFX8GFX9;
}
-// Execpt for the NONE field, this must be kept in sync with the
+// Except for the NONE field, this must be kept in sync with the
// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
def SIEncodingFamily {
int NONE = -1;
@@ -28,6 +28,7 @@ def SIEncodingFamily {
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
+ int GFX90A = 8;
}
//===----------------------------------------------------------------------===//
@@ -186,6 +187,8 @@ def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
+def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
+def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -265,21 +268,25 @@ class isFloatType<ValueType SrcVT> {
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v4f16.Value),
!eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v2f64.Value));
+ !eq(SrcVT.Value, v2f64.Value),
+ !eq(SrcVT.Value, v4f64.Value));
}
class isIntType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
- !eq(SrcVT.Value, i64.Value));
+ !eq(SrcVT.Value, i64.Value),
+ !eq(SrcVT.Value, v2i32.Value));
}
class isPackedType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
!eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v4f16.Value));
+ !eq(SrcVT.Value, v4f16.Value),
+ !eq(SrcVT.Value, v2f32.Value));
}
+
//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -629,6 +636,11 @@ def add_ctpop : PatFrag <
(add (ctpop $src0), $src1)
>;
+def xnor : PatFrag <
+ (ops node:$src0, node:$src1),
+ (not (xor $src0, $src1))
+>;
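The xnor PatFrag matches the not-of-xor idiom; on a scalar it is just the bitwise identity below, shown in C++ only to make the matched pattern concrete.

#include <cstdint>

// Scalar equivalent of the pattern the xnor PatFrag matches.
uint32_t xnor32(uint32_t a, uint32_t b) { return ~(a ^ b); }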
+
foreach I = 1-4 in {
def shl#I#_add : PatFrag <
(ops node:$src0, node:$src1),
@@ -802,26 +814,28 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
}], getNegV2I16Imm>;
-//===----------------------------------------------------------------------===//
-// MUBUF/SMEM Patterns
-//===----------------------------------------------------------------------===//
-def extract_glc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
+def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
+ return fp16SrcZerosHighBits(N->getOpcode());
}]>;
-def extract_slc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
-}]>;
-def extract_dlc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
+//===----------------------------------------------------------------------===//
+// MUBUF/SMEM Patterns
+//===----------------------------------------------------------------------===//
+
+def extract_cpol : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & AMDGPU::CPol::ALL, SDLoc(N), MVT::i8);
}]>;
def extract_swz : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
}]>;
+def set_glc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
+}]>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -1074,6 +1088,12 @@ class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> :
let ParserMatchClass = MatchClass;
}
+class NamedOperandU32Default1<string Name, AsmOperandClass MatchClass> :
+ OperandWithDefaultOps<i32, (ops (i32 1))> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
@@ -1097,18 +1117,14 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>;
def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>;
-def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
-def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>;
-
-def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
-def GLC_0 : NamedOperandBit_0<"GLC", NamedMatchClass<"GLC">>;
-def GLC_1 : NamedOperandBit_1<"GLC", NamedMatchClass<"GLC_1">>;
-
-def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
-def SLC_0 : NamedOperandBit_0<"SLC", NamedMatchClass<"SLC">>;
+def CPol : NamedOperandU32<"CPol", NamedMatchClass<"CPol">>;
+def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>;
+def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def TFE_0 : NamedOperandBit_0<"TFE", NamedMatchClass<"TFE">>;
def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
+def SWZ_0 : NamedOperandBit_0<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
@@ -1243,7 +1259,7 @@ def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>;
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isVReg32";
+ let PredicateMethod = "isVRegWithInputMods";
}
def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
@@ -1270,7 +1286,7 @@ def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isVReg32";
+ let PredicateMethod = "isVRegWithInputMods";
}
def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
@@ -1363,11 +1379,6 @@ def DSTOMOD {
int NONE = 0;
}
-def TRAPID{
- int LLVM_TRAP = 2;
- int LLVM_DEBUG_TRAP = 3;
-}
-
def HWREG {
int MODE = 1;
int STATUS = 2;
@@ -1507,8 +1518,12 @@ class getVOP3SrcForVT<ValueType VT> {
VSrc_128,
!if(!eq(VT.Size, 64),
!if(isFP,
- VSrc_f64,
- VSrc_b64),
+ !if(!eq(VT.Value, v2f32.Value),
+ VSrc_v2f32,
+ VSrc_f64),
+ !if(!eq(VT.Value, v2i32.Value),
+ VSrc_v2b32,
+ VSrc_b64)),
!if(!eq(VT.Value, i1.Value),
SSrc_i1,
!if(isFP,
@@ -1541,7 +1556,9 @@ class isModifierType<ValueType SrcVT> {
!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v2i16.Value));
+ !eq(SrcVT.Value, v2i16.Value),
+ !eq(SrcVT.Value, v2f32.Value),
+ !eq(SrcVT.Value, v2i32.Value));
}
// Return type of input modifiers operand for specified input operand
@@ -1598,8 +1615,11 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 1),
!if (HasModifiers,
// VOP1 with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- clampmod0:$clamp, omod0:$omod)
+ !if(HasOMod,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod0:$clamp, omod0:$omod),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod0:$clamp))
/* else */,
// VOP1 without modifiers
!if (HasClamp,
@@ -1695,7 +1715,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/, 0>.ret;
}
-class getInsDPPBase <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
@@ -1705,45 +1725,45 @@ class getInsDPPBase <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass
!if (!eq(NumSrcArgs, 1),
!if (HasModifiers,
// VOP1_DPP with modifiers
- (ins DstRC:$old, Src0Mod:$src0_modifiers,
+ (ins OldRC:$old, Src0Mod:$src0_modifiers,
Src0RC:$src0)
/* else */,
// VOP1_DPP without modifiers
- (ins DstRC:$old, Src0RC:$src0)
+ (ins OldRC:$old, Src0RC:$src0)
/* endif */),
!if (HasModifiers,
// VOP2_DPP with modifiers
- (ins DstRC:$old,
+ (ins OldRC:$old,
Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1)
/* else */,
// VOP2_DPP without modifiers
- (ins DstRC:$old,
+ (ins OldRC:$old,
Src0RC:$src0, Src1RC:$src1)
)));
}
-class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
-class getInsDPP16 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPP<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins FI:$fi));
}
-class getInsDPP8 <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod> {
- dag ret = !con(getInsDPPBase<DstRC, Src0RC, Src1RC, NumSrcArgs,
+ dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
@@ -1846,7 +1866,7 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers,
// instruction.
class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
bit HasClamp, ValueType DstVT = i32> {
- string dst = " $vdst";
+ string dst = "$vdst";
string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1",
@@ -1867,7 +1887,7 @@ class getAsmVOP3OpSel <int NumSrcArgs,
bit Src0HasMods,
bit Src1HasMods,
bit Src2HasMods> {
- string dst = " $vdst";
+ string dst = "$vdst";
string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
string isrc1 = !if(!eq(NumSrcArgs, 1), "",
@@ -1972,14 +1992,29 @@ class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs,
string ret = dst#args#sdwa;
}
+class getHas64BitOps <int NumSrcArgs, ValueType DstVT, ValueType Src0VT,
+ ValueType Src1VT> {
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0,
+ !if(!eq(DstVT.Size, 64),
+ 1,
+ !if(!eq(Src0VT.Size, 64),
+ 1,
+ !if(!eq(Src1VT.Size, 64),
+ 1,
+ 0
+ )
+ )
+ )
+ );
+}
-// Function that checks if instruction supports DPP and SDWA
-class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
- ValueType Src1VT = i32> {
+class getHasSDWA <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
bit ret = !if(!eq(NumSrcArgs, 3),
- 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+ 0, // NumSrcArgs == 3 - No SDWA for VOP3
!if(!eq(DstVT.Size, 64),
- 0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+ 0, // 64-bit dst - No SDWA for 64-bit operands
!if(!eq(Src0VT.Size, 64),
0, // 64-bit src0
!if(!eq(Src1VT.Size, 64),
@@ -1993,8 +2028,42 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
- bit ret = !if(!eq(NumSrcArgs, 0), 0,
- getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+ bit ret = !if(!eq(NumSrcArgs, 3),
+ 0, // NumSrcArgs == 3 - No DPP for VOP3
+ 1);
+}
+
+class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !and(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+ getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
+// Function that checks if instruction supports DPP and SDWA
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !or(getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret,
+ getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
+// Return an AGPR+VGPR operand class for the given VGPR register class.
+class getLdStRegisterOperand<RegisterClass RC> {
+ RegisterOperand ret =
+ !if(!eq(RC.Size, 32), AVLdSt_32,
+ !if(!eq(RC.Size, 64), AVLdSt_64,
+ !if(!eq(RC.Size, 96), AVLdSt_96,
+ !if(!eq(RC.Size, 128), AVLdSt_128,
+ !if(!eq(RC.Size, 160), AVLdSt_160,
+ RegisterOperand<VReg_1> // invalid register
+ )))));
+}
+
+class BitOr<bit a, bit b> {
+ bit ret = !if(a, 1, !if(b, 1, 0));
+}
+
+class BitAnd<bit a, bit b> {
+ bit ret = !if(a, !if(b, 1, 0), 0);
}
def PatGenMode {
@@ -2037,6 +2106,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
+ field bit EmitDstSel = EmitDst;
field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value);
field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value);
@@ -2077,12 +2147,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
- field bit HasExtSDWA = HasExt;
- field bit HasExtSDWA9 = HasExt;
+ field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasExtSDWA9 = HasExtSDWA;
field int NeedPatGen = PatGenMode.NoPattern;
field bit IsMAI = 0;
field bit IsDOT = 0;
+ field bit IsSingle = 0;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -2134,7 +2206,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field string AsmDPP = !if(HasExtDPP,
getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
field string AsmDPP16 = getAsmDPP16<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
- field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0, DstVT>.ret;
+ // DPP8 encoding has no fields for modifiers; this is enforced by selecting
+ // the plain asm operand names via the HasModifiers flag passed here.
+ field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
@@ -2144,6 +2218,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -2191,6 +2266,7 @@ def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
+def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>;
def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -2234,6 +2310,16 @@ def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;
+def VOP_V4F64_F64_F64_V4F64 : VOPProfile <[v4f64, f64, f64, v4f64]>;
+def VOP_V1F64_F64_F64_V1F64 : VOPProfile <[v1f64, f64, f64, v1f64]>;
+
+def VOP_V2F32_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, v2f32]>;
+def VOP_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, untyped]>;
+def VOP_V2I32_V2I32_V2I32 : VOPProfile <[v2i32, v2i32, v2i32, untyped]>;
+def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
+def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
+def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
@@ -2372,7 +2458,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
- [!cast<string>(SIEncodingFamily.SDWA10)]];
+ [!cast<string>(SIEncodingFamily.SDWA10)],
+ [!cast<string>(SIEncodingFamily.GFX90A)]];
}
// Get equivalent SOPK instruction.
@@ -2408,15 +2495,6 @@ def getMUBUFNoLdsInst : InstrMapping {
let ValueCols = [["0"]];
}
-// Maps an atomic opcode to its version with a return value.
-def getAtomicRetOp : InstrMapping {
- let FilterClass = "AtomicNoRet";
- let RowFields = ["NoRetOp"];
- let ColFields = ["IsRet"];
- let KeyCol = ["0"];
- let ValueCols = [["1"]];
-}
-
// Maps an atomic opcode to its returnless version.
def getAtomicNoRetOp : InstrMapping {
let FilterClass = "AtomicNoRet";
@@ -2435,6 +2513,15 @@ def getGlobalSaddrOp : InstrMapping {
let ValueCols = [["1"]];
}
+// Maps a GLOBAL SADDR to its VADDR form.
+def getGlobalVaddrOp : InstrMapping {
+ let FilterClass = "GlobalSaddrTable";
+ let RowFields = ["SaddrOp"];
+ let ColFields = ["IsSaddr"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
// Maps a v_cmpx opcode with sdst to opcode without sdst.
def getVCMPXNoSDstOp : InstrMapping {
let FilterClass = "VCMPXNoSDstTable";
@@ -2470,6 +2557,14 @@ def getFlatScratchInstSSfromSV : InstrMapping {
let ValueCols = [["SS"]];
}
+def getFlatScratchInstSVfromSS : InstrMapping {
+ let FilterClass = "FlatScratchInst";
+ let RowFields = ["SVOp"];
+ let ColFields = ["Mode"];
+ let KeyCol = ["SS"];
+ let ValueCols = [["SV"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7c1cbd67c993..fbf4634bfc94 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -41,18 +41,21 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m <
(i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;
-let OtherPredicates = [has32BankLDS] in {
+let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {
defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS]
+} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
+let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
+ Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
-} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+} // End OtherPredicates = [has16BankLDS, isNotGFX90APlus],
+ // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
@@ -73,6 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
[(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
+} // End OtherPredicates = [isNotGFX90APlus]
+
} // End Uses = [MODE, M0, EXEC]
//===----------------------------------------------------------------------===//
@@ -86,11 +91,6 @@ def ATOMIC_FENCE : SPseudoInstSI<
let maybeAtomic = 1;
}
-def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
- let HasExt = 1;
- let HasExtDPP = 1;
-}
-
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
@@ -104,13 +104,31 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64:$src0)>;
+ (ins VSrc_b64:$src0)> {
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+ let SchedRW = [Write64Bit];
+ let Size = 16; // Needs at most 2 v_mov_b32 instructions, 8 bytes each.
+}
// 64-bit vector move with dpp. Expanded post-RA.
-def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
+def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}
+// 64-bit scalar move immediate instruction. This is used to avoid subreg
+// initialization and allow rematerialization.
+def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
+ (ins i64imm:$src0)> {
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+ let isMoveImm = 1;
+ let SchedRW = [WriteSALU, Write64Bit];
+ let Size = 16; // Needs at most 2 s_mov_b32 instructions, 8 bytes each.
+ let Uses = [];
+}
+
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
@@ -119,17 +137,32 @@ def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
// turned into a copy by WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
-// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
-// the instruction that defines $src0 (which is run in WWM) doesn't
+// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
-def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
-def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
+ let Uses = [EXEC];
+ let Defs = [EXEC, SCC];
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
+def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
let Uses = [EXEC];
let Defs = [EXEC, SCC];
let hasSideEffects = 0;
@@ -137,7 +170,7 @@ def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
let mayStore = 0;
}
-def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
+def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -145,6 +178,7 @@ def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
+let Defs = [SCC] in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VGPR_32: $src, VSrc_b32:$inactive),
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
@@ -156,6 +190,7 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
[(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
let Constraints = "$src = $vdst";
}
+} // End Defs = [SCC]
let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
@@ -230,6 +265,7 @@ def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
}
let WaveSizePredicate = isWave32 in {
@@ -237,6 +273,7 @@ def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
}
@@ -255,19 +292,6 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-// Dummy terminator instruction to use after control flow instructions
-// replaced with exec mask operations.
-def SI_MASK_BRANCH : VPseudoInstSI <
- (outs), (ins brtarget:$target)> {
- let isBranch = 0;
- let isTerminator = 1;
- let isBarrier = 0;
- let SchedRW = [];
- let hasNoSchedulingInfo = 1;
- let FixedSize = 1;
- let Size = 0;
-}
-
let isTerminator = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
@@ -294,6 +318,14 @@ def SI_ELSE : CFPseudoInstSI <
let hasSideEffects = 1;
}
+def SI_WATERFALL_LOOP : CFPseudoInstSI <
+ (outs),
+ (ins brtarget:$target), [], 1> {
+ let Size = 8;
+ let isBranch = 1;
+ let Defs = [];
+}
+
def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_1:$saved, brtarget:$target),
[(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
@@ -337,24 +369,22 @@ multiclass PseudoInstKill <dag ins> {
// required in degenerate cases (when V_CMPX cannot be used due to constant
// bus limitations) and because it allows us to avoid having to track SCC
// liveness across basic blocks.
- let Defs = [EXEC,VCC,SCC] in
+ let Defs = [EXEC,SCC] in
def _PSEUDO : PseudoInstSI <(outs), ins> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
- let Defs = [EXEC,VCC,SCC] in
+ let Defs = [EXEC,SCC] in
def _TERMINATOR : SPseudoInstSI <(outs), ins> {
let isTerminator = 1;
}
}
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
+let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
@@ -376,6 +406,18 @@ def SI_PS_LIVE : PseudoInstSI <
let SALU = 1;
}
+let Uses = [EXEC] in {
+def SI_LIVE_MASK : PseudoInstSI <
+ (outs SReg_1:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_live_mask))]> {
+ let SALU = 1;
+}
+let Defs = [EXEC,SCC] in {
+// Demote: Turn a pixel shader thread into a helper lane.
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
+} // End Defs = [EXEC,SCC]
+} // End Uses = [EXEC]
+
def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
[(int_amdgcn_unreachable)],
"; divergent unreachable"> {
@@ -463,7 +505,7 @@ def SI_CALL : SPseudoInstSI <
// Tail call handling pseudo
def SI_TCRETURN : SPseudoInstSI <(outs),
- (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff),
+ (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff),
[(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let isCall = 1;
@@ -476,6 +518,11 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
let isConvergent = 1;
}
+// Handle selecting indirect tail calls
+def : GCNPat<
+ (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
+ (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff)
+>;
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
@@ -654,6 +701,7 @@ defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
+defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -697,6 +745,7 @@ defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
+defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -707,6 +756,7 @@ defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
@@ -749,6 +799,16 @@ def : Pat <
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
+def : Pat <
+ (int_amdgcn_wqm_demote i1:$src),
+ (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
+>;
+
+def : Pat <
+ (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+ (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
+>;
+
// TODO: we could add more variants for other types of conditionals
def : Pat <
@@ -1021,6 +1081,38 @@ foreach Index = 0-4 in {
>;
}
+foreach Index = 0-5 in {
+ def Extract_Element_v6i32_#Index : Extract_Element <
+ i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v6i32_#Index : Insert_Element <
+ i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v6f32_#Index : Extract_Element <
+ f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v6f32_#Index : Insert_Element <
+ f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
+foreach Index = 0-6 in {
+ def Extract_Element_v7i32_#Index : Extract_Element <
+ i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v7i32_#Index : Insert_Element <
+ i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v7f32_#Index : Extract_Element <
+ f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v7f32_#Index : Insert_Element <
+ f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1171,8 +1263,32 @@ def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
-def : BitConvert <v5i32, v5f32, SGPR_160>;
-def : BitConvert <v5f32, v5i32, SGPR_160>;
+def : BitConvert <v5i32, v5f32, SReg_160>;
+def : BitConvert <v5f32, v5i32, SReg_160>;
+def : BitConvert <v5i32, v5f32, VReg_160>;
+def : BitConvert <v5f32, v5i32, VReg_160>;
+
+// 192-bit bitcast
+def : BitConvert <v6i32, v6f32, SReg_192>;
+def : BitConvert <v6f32, v6i32, SReg_192>;
+def : BitConvert <v6i32, v6f32, VReg_192>;
+def : BitConvert <v6f32, v6i32, VReg_192>;
+def : BitConvert <v3i64, v3f64, VReg_192>;
+def : BitConvert <v3f64, v3i64, VReg_192>;
+def : BitConvert <v3i64, v6i32, VReg_192>;
+def : BitConvert <v3i64, v6f32, VReg_192>;
+def : BitConvert <v3f64, v6i32, VReg_192>;
+def : BitConvert <v3f64, v6f32, VReg_192>;
+def : BitConvert <v6i32, v3i64, VReg_192>;
+def : BitConvert <v6f32, v3i64, VReg_192>;
+def : BitConvert <v6i32, v3f64, VReg_192>;
+def : BitConvert <v6f32, v3f64, VReg_192>;
+
+// 224-bit bitcast
+def : BitConvert <v7i32, v7f32, SReg_224>;
+def : BitConvert <v7f32, v7i32, SReg_224>;
+def : BitConvert <v7i32, v7f32, VReg_224>;
+def : BitConvert <v7f32, v7i32, VReg_224>;
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
@@ -1349,6 +1465,19 @@ def : GCNPat <
// sub1)
// >;
+// COPY_TO_REGCLASS is needed so that the SCC output of S_XOR_B32 is not
+// used in place of the real result value.
+def : GCNPat <
+ (fneg (v2f32 SReg_64:$src)),
+ (v2f32 (REG_SEQUENCE SReg_64,
+ (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
+ (i32 (S_MOV_B32 (i32 0x80000000)))),
+ SReg_32)), sub0,
+ (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
+ (i32 (S_MOV_B32 (i32 0x80000000)))),
+ SReg_32)), sub1))
+>;
+
} // End let AddedComplexity = 1
def : GCNPat <
@@ -1414,6 +1543,15 @@ def : GCNPat <
>;
def : GCNPat <
+ (getDivergentFrag<fneg>.ret (v2f32 VReg_64:$src)),
+ (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src,
+ 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, 0,
+ 0, 0, 0, 0, 0)
+> {
+ let SubtargetPredicate = HasPackedFP32Ops;
+}
+
+def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
@@ -1532,9 +1670,16 @@ def : GCNPat <
/********** Intrinsic Patterns **********/
/********** ================== **********/
+let OtherPredicates = [isNotGFX90APlus] in
// FIXME: Should use _e64 and select source modifiers.
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
+let OtherPredicates = [isGFX90APlus] in
+def : GCNPat <
+ (fpow f32:$src0, f32:$src1),
+ (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0))
+>;
+
def : GCNPat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
@@ -1793,6 +1938,8 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
+def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
@@ -1930,11 +2077,19 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
-def : GCNPat <
- (i32 (AMDGPUfp16_zext f16:$src)),
- (COPY $src)
->;
+// Eliminate a zero extension from an fp16 operation if it already
+// zeros the high bits of the 32-bit register.
+//
+// This is complicated on gfx9+. Some instructions maintain the legacy
+// zeroing behavior, but others preserve the high bits. Some have a
+// control bit to change the behavior. We can't simply say with
+// certainty what the source behavior is without more context on how
+// the src is lowered. e.g. fptrunc + fma may be lowered to a
+// v_fma_mix* instruction, which may or may not zero the high bits.
+def : GCNPat<
+ (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
+ (COPY VSrc_b16:$src)>;
def : GCNPat <
(i32 (trunc i64:$a)),
@@ -2141,6 +2296,17 @@ def : GCNPat <
SRCMODS.NONE, $src2)
>;
+let SubtargetPredicate = isGFX90APlus in
+def : GCNPat <
+ (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
+ (f64 (VOP3NoMods f64:$src2))),
+ (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+
+// The COPY is a workaround for a tablegen bug with multiple outputs
+// arising from S_LSHL_B32's implicit scc def.
def : GCNPat <
(v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i16 16))
@@ -2207,9 +2373,13 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+def : GCNPat <
+ (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+ (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+ (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
+>;
} // End SubtargetPredicate = HasVOP3PInsts
-
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -2233,7 +2403,7 @@ def : GCNPat <
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+ (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
(as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
(as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
@@ -2242,7 +2412,7 @@ def : GCNPat <
def : GCNPat <
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+ (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
(as_i32timm $row_mask), (as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
>;
@@ -2573,6 +2743,24 @@ def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
}
}
+def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to the explicit
// operands.
@@ -2614,6 +2802,8 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
new file mode 100644
index 000000000000..d560b477b8ba
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -0,0 +1,231 @@
+//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass mainly lowers early terminate pseudo instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-late-branch-lowering"
+
+namespace {
+
+class SILateBranchLowering : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+
+ void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
+
+public:
+ static char ID;
+
+ unsigned MovOpc;
+ Register ExecReg;
+
+ SILateBranchLowering() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Final Branch Preparation";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char SILateBranchLowering::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID;
+
+static void generateEndPgm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ const SIInstrInfo *TII, MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ // Check if hardware has been configured to expect color or depth exports.
+ bool HasExports =
+ AMDGPU::getHasColorExport(F) || AMDGPU::getHasDepthExport(F);
+
+ // Prior to GFX10, hardware always expects at least one export for PS.
+ bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());
+
+ if (IsPS && (HasExports || MustExport)) {
+ // Generate "null export" if hardware is expecting PS to export.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(AMDGPU::Exp::ET_NULL)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+ }
+
+ // s_endpgm
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+}
+
+static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineDominatorTree *MDT) {
+ MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
+
+ // Update dominator tree
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ MDT->getBase().applyUpdates(DTUpdates);
+}
+
+void SILateBranchLowering::earlyTerm(MachineInstr &MI,
+ MachineBasicBlock *EarlyExitBlock) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc DL = MI.getDebugLoc();
+
+ auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+ .addMBB(EarlyExitBlock);
+ auto Next = std::next(MI.getIterator());
+
+ if (Next != MBB.end() && !Next->isTerminator())
+ splitBlock(MBB, *BranchMI, MDT);
+
+ MBB.addSuccessor(EarlyExitBlock);
+ MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+}
+
+bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+
+ MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ SmallVector<MachineInstr *, 4> EarlyTermInstrs;
+ SmallVector<MachineInstr *, 1> EpilogInstrs;
+ bool MadeChange = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_BRANCH:
+ // Optimize out branches to the next block.
+ // This only occurs in -O0 when BranchFolding is not executed.
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ assert(&MI == &MBB.back());
+ MI.eraseFromParent();
+ MadeChange = true;
+ }
+ break;
+
+ case AMDGPU::SI_EARLY_TERMINATE_SCC0:
+ EarlyTermInstrs.push_back(&MI);
+ break;
+
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ EpilogInstrs.push_back(&MI);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ // Lower any early exit branches first
+ if (!EarlyTermInstrs.empty()) {
+ MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
+ DebugLoc DL;
+
+ MF.insert(MF.end(), EarlyExitBlock);
+ BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
+ ExecReg)
+ .addImm(0);
+ generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);
+
+ for (MachineInstr *Instr : EarlyTermInstrs) {
+ // Early termination in GS does nothing
+ if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+ earlyTerm(*Instr, EarlyExitBlock);
+ Instr->eraseFromParent();
+ }
+
+ EarlyTermInstrs.clear();
+ MadeChange = true;
+ }
+
+ // Now check that return-to-epilog instructions occur at the function end
+ if (!EpilogInstrs.empty()) {
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // If there are multiple returns to the epilog then all of them will
+ // become jumps to a new empty block at the end of the function.
+ if (EpilogInstrs.size() > 1) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ for (auto MI : EpilogInstrs) {
+ auto MBB = MI->getParent();
+ if (MBB == &MF.back() && MI == &MBB->back())
+ continue;
+
+ // SI_RETURN_TO_EPILOG is not the last instruction.
+ // Jump to empty block at function end.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB->addSuccessor(EmptyMBBAtEnd);
+ MDT->getBase().insertEdge(MBB, EmptyMBBAtEnd);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ MI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ EpilogInstrs.clear();
+ }
+
+ return MadeChange;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b39420f3c7db..493c1ad87f93 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -104,9 +104,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
unsigned BaseOff;
unsigned DMask;
InstClassEnum InstClass;
- bool GLC;
- bool SLC;
- bool DLC;
+ unsigned CPol = 0;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -199,6 +197,7 @@ private:
const CombineInfo &Paired);
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired);
+ const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
SmallVectorImpl<MachineInstr *> &InstsToMove);
@@ -304,6 +303,16 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
+ case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ return 1;
+ case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return 2;
default:
return 0;
}
@@ -521,11 +530,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
Offset &= 0xffff;
} else if (InstClass != MIMG) {
- GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
- if (InstClass != S_BUFFER_LOAD_IMM) {
- SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
- }
- DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+ CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
}
AddressRegs Regs = getRegs(Opc, TII);
@@ -675,10 +680,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
return false;
// Check other optional immediate operands for equality.
- unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
- AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
- AMDGPU::OpName::da, AMDGPU::OpName::r128,
- AMDGPU::OpName::a16, AMDGPU::OpName::dlc};
+ unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
+ AMDGPU::OpName::unorm, AMDGPU::OpName::da,
+ AMDGPU::OpName::r128, AMDGPU::OpName::a16};
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -725,6 +729,16 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
return NewFormatInfo->Format;
}
+// Return the value in the inclusive range [Lo,Hi] that is aligned to the
+// highest power of two. Note that the result is well defined for all inputs
+// including corner cases like:
+// - if Lo == Hi, return that value
+// - if Lo == 0, return 0 (even though the "- 1" below underflows
+// - if Lo > Hi, return 0 (as if the range wrapped around)
+static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
+ return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
+}
+
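A compilable sketch of the computation above with the listed corner cases checked; __builtin_clz and the mask expression stand in for llvm::countLeadingZeros and llvm::maskLeadingOnes, and the inputs are made-up examples:

#include <cassert>
#include <cstdint>

// Standalone restatement of mostAlignedValueInRange, for illustration only.
static uint32_t mostAligned(uint32_t Lo, uint32_t Hi) {
  uint32_t Diff = (Lo - 1) ^ Hi;
  unsigned N = Diff ? __builtin_clz(Diff) + 1 : 32; // number of leading bits to keep
  uint32_t Mask = N >= 32 ? ~0u : ~(~0u >> N);      // mask of the N leading bits
  return Hi & Mask;
}

int main() {
  assert(mostAligned(5, 9) == 8);  // 8 is the most aligned value in [5,9]
  assert(mostAligned(6, 6) == 6);  // Lo == Hi returns that value
  assert(mostAligned(0, 9) == 0);  // Lo == 0 returns 0 despite the "- 1" wrap
  assert(mostAligned(10, 3) == 0); // Lo > Hi behaves as a wrapped range
  return 0;
}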
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
const GCNSubtarget &STI,
CombineInfo &Paired,
@@ -764,20 +778,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
return false;
}
- unsigned EltOffset0 = CI.Offset / CI.EltSize;
- unsigned EltOffset1 = Paired.Offset / CI.EltSize;
+ uint32_t EltOffset0 = CI.Offset / CI.EltSize;
+ uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
CI.UseST64 = false;
CI.BaseOff = 0;
- // Handle DS instructions.
+ // Handle all non-DS instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
- CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
- (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
+ CI.CPol == Paired.CPol &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
}
- // Handle SMEM and VMEM instructions.
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -800,22 +813,36 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
}
// Try to shift base address to decrease offsets.
- unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
- CI.BaseOff = std::min(CI.Offset, Paired.Offset);
+ uint32_t Min = std::min(EltOffset0, EltOffset1);
+ uint32_t Max = std::max(EltOffset0, EltOffset1);
- if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
+ const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
+ if (((Max - Min) & ~Mask) == 0) {
if (Modify) {
- CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
- Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ // From the range of values we could use for BaseOff, choose the one that
+ // is aligned to the highest power of two, to maximise the chance that
+ // the same offset can be reused for other load/store pairs.
+ uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
+ // Copy the low bits of the offsets, so that when we adjust them by
+ // subtracting BaseOff they will be multiples of 64.
+ BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
+ CI.BaseOff = BaseOff * CI.EltSize;
+ CI.Offset = (EltOffset0 - BaseOff) / 64;
+ Paired.Offset = (EltOffset1 - BaseOff) / 64;
CI.UseST64 = true;
}
return true;
}
- if (isUInt<8>(OffsetDiff)) {
+ if (isUInt<8>(Max - Min)) {
if (Modify) {
- CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
- Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ // From the range of values we could use for BaseOff, choose the one that
+ // is aligned to the highest power of two, to maximise the chance that
+ // the same offset can be reused for other load/store pairs.
+ uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
+ CI.BaseOff = BaseOff * CI.EltSize;
+ CI.Offset = EltOffset0 - BaseOff;
+ Paired.Offset = EltOffset1 - BaseOff;
}
return true;
}
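A worked example of the stride-64 re-basing above for a hypothetical DS pair whose element offsets are 64 apart; the helper repeats the earlier sketch of mostAlignedValueInRange, and every concrete number is illustrative rather than taken from the pass:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t mostAligned(uint32_t Lo, uint32_t Hi) {
  uint32_t Diff = (Lo - 1) ^ Hi;
  unsigned N = Diff ? __builtin_clz(Diff) + 1 : 32;
  return Hi & (N >= 32 ? ~0u : ~(~0u >> N));
}

int main() {
  const uint32_t EltSize = 4;                  // assume 32-bit DS accesses
  uint32_t EltOffset0 = 0x11000, EltOffset1 = 0x11040;
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = 0xffu * 64;            // 8-bit offset field at stride 64
  assert(((Max - Min) & ~Mask) == 0);          // the pair qualifies for the st64 form

  // Most aligned base in range, plus the low 6 bits of Min so the rebased
  // offsets stay multiples of 64.
  uint32_t BaseOff = mostAligned(Max - 0xffu * 64, Min); // 0x10000, 64K aligned
  BaseOff |= Min & 0x3f;

  assert((EltOffset0 - BaseOff) % 64 == 0 && (EltOffset1 - BaseOff) % 64 == 0);
  assert((EltOffset0 - BaseOff) / 64 == 64);   // would become CI.Offset
  assert((EltOffset1 - BaseOff) / 64 == 65);   // would become Paired.Offset
  assert(BaseOff * EltSize == 0x40000);        // byte base, i.e. CI.BaseOff
  return 0;
}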
@@ -841,6 +868,26 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
+const TargetRegisterClass *
+SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
+ if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+ return TRI->getRegClassForReg(*MRI, Dst->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ return TRI->getRegClassForReg(*MRI, Dst->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ return nullptr;
+}
+
/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
CombineInfo &CI, CombineInfo &Paired,
@@ -873,6 +920,9 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
+ const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
+ bool IsAGPR = TRI->hasAGPRs(DataRC);
+
MachineBasicBlock::iterator E = std::next(Paired.I);
MachineBasicBlock::iterator MBBI = std::next(CI.I);
MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -941,6 +991,17 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;
if (&*MBBI == &*Paired.I) {
+ if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
+ return false;
+ // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
+ // operands. However we are reporting that ds_write2 shall have
+ // only VGPR data so that machine copy propagation does not
+ // create an illegal instruction with VGPR and AGPR sources.
+ // Consequently, if we create such an instruction the verifier
+ // will complain.
+ if (IsAGPR && CI.InstClass == DS_WRITE)
+ return false;
+
// We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
@@ -1014,8 +1075,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const MCInstrDesc &Read2Desc = TII->get(Opc);
- const TargetRegisterClass *SuperRC =
- (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
@@ -1229,8 +1289,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.DLC) // dlc
+ .addImm(CI.CPol) // cpol
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
@@ -1289,10 +1348,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1356,10 +1413,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1436,10 +1491,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1536,18 +1589,12 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
case 16:
return &AMDGPU::SGPR_512RegClass;
}
- } else {
- switch (CI.Width + Paired.Width) {
- default:
- return nullptr;
- case 2:
- return &AMDGPU::VReg_64RegClass;
- case 3:
- return &AMDGPU::VReg_96RegClass;
- case 4:
- return &AMDGPU::VReg_128RegClass;
- }
}
+
+ unsigned BitWidth = 32 * (CI.Width + Paired.Width);
+ return TRI->hasAGPRs(getDataRegClass(*CI.I))
+ ? TRI->getAGPRClassForBitWidth(BitWidth)
+ : TRI->getVGPRClassForBitWidth(BitWidth);
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
@@ -1596,10 +1643,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1671,7 +1716,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
- Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
MachineInstr *FullBase =
BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5839e59b4d7f..0f2836e1e7fb 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -72,10 +72,9 @@ private:
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
- SmallSet<MachineInstr *, 16> NeedsKillCleanup;
+ SmallSet<MachineBasicBlock *, 4> KillBlocks;
const TargetRegisterClass *BoolRC = nullptr;
- bool InsertKillCleanups;
unsigned AndOpc;
unsigned OrOpc;
unsigned XorOpc;
@@ -86,6 +85,8 @@ private:
unsigned OrSaveExecOpc;
unsigned Exec;
+ bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
+
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
@@ -163,8 +164,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
-static bool hasKill(const MachineBasicBlock *Begin,
- const MachineBasicBlock *End, const SIInstrInfo *TII) {
+bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
+ const MachineBasicBlock *End) {
DenseSet<const MachineBasicBlock*> Visited;
SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
@@ -173,9 +174,8 @@ static bool hasKill(const MachineBasicBlock *Begin,
if (MBB == End || !Visited.insert(MBB).second)
continue;
- for (auto &Term : MBB->terminators())
- if (TII->isKillTerminator(Term.getOpcode()))
- return true;
+ if (KillBlocks.contains(MBB))
+ return true;
Worklist.append(MBB->succ_begin(), MBB->succ_end());
}
@@ -211,32 +211,11 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// just cleared bits.
bool SimpleIf = isSimpleIf(MI, MRI);
- if (InsertKillCleanups) {
- // Check for SI_KILL_*_TERMINATOR on full path of control flow and
- // flag the associated SI_END_CF for insertion of a kill cleanup.
- auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
- while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
- assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
- assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
- MachineOperand &NextExec = UseMI->getOperand(0);
- Register NextExecReg = NextExec.getReg();
- if (NextExec.isDead()) {
- assert(!SimpleIf);
- break;
- }
- UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
- }
- if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
- if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
- NeedsKillCleanup.insert(&*UseMI);
- SimpleIf = false;
- }
- }
- } else if (SimpleIf) {
+ if (SimpleIf) {
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
// if there is any such terminator simplifications are not safe.
auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
- SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+ SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
}
// Add an implicit def of exec to discourage scheduling VALU after this which
@@ -451,8 +430,6 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
auto E = B->end();
for ( ; It != E; ++It) {
- if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
- continue;
if (TII->mayReadEXEC(*MRI, *It))
break;
}
@@ -505,18 +482,8 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
LoweredEndCf.insert(NewMI);
- // If this ends control flow which contains kills (as flagged in emitIf)
- // then insert an SI_KILL_CLEANUP immediately following the exec mask
- // manipulation. This can be lowered to early termination if appropriate.
- MachineInstr *CleanUpMI = nullptr;
- if (NeedsKillCleanup.count(&MI))
- CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
-
- if (LIS) {
+ if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- if (CleanUpMI)
- LIS->InsertMachineInstrInMaps(*CleanUpMI);
- }
MI.eraseFromParent();
@@ -633,6 +600,10 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
emitLoop(MI);
break;
+ case AMDGPU::SI_WATERFALL_LOOP:
+ MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
+ break;
+
case AMDGPU::SI_END_CF:
SplitBB = emitEndCf(MI);
break;
@@ -811,8 +782,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
- InsertKillCleanups =
- MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
@@ -836,7 +805,27 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
Exec = AMDGPU::EXEC;
}
- SmallVector<MachineInstr *, 32> Worklist;
+ // Compute set of blocks with kills
+ const bool CanDemote =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+ for (auto &MBB : MF) {
+ bool IsKillBlock = false;
+ for (auto &Term : MBB.terminators()) {
+ if (TII->isKillTerminator(Term.getOpcode())) {
+ KillBlocks.insert(&MBB);
+ IsKillBlock = true;
+ break;
+ }
+ }
+ if (CanDemote && !IsKillBlock) {
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+ KillBlocks.insert(&MBB);
+ break;
+ }
+ }
+ }
+ }
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin();
@@ -853,18 +842,12 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
- SplitMBB = process(MI);
- break;
-
case AMDGPU::SI_ELSE:
case AMDGPU::SI_IF_BREAK:
+ case AMDGPU::SI_WATERFALL_LOOP:
case AMDGPU::SI_LOOP:
case AMDGPU::SI_END_CF:
- // Only build worklist if SI_IF instructions must be processed first.
- if (InsertKillCleanups)
- Worklist.push_back(&MI);
- else
- SplitMBB = process(MI);
+ SplitMBB = process(MI);
break;
// FIXME: find a better place for this
@@ -886,14 +869,11 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
}
}
- for (MachineInstr *MI : Worklist)
- process(*MI);
-
optimizeEndCf();
LoweredEndCf.clear();
LoweredIf.clear();
- NeedsKillCleanup.clear();
+ KillBlocks.clear();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 9570680ad9cb..672266f0c11e 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -598,6 +598,11 @@ void SILowerI1Copies::lowerPhis() {
MachineBasicBlock *PostDomBound =
PDT->findNearestCommonDominator(DomBlocks);
+
+ // FIXME: This fails to find irreducible cycles. If we have a def (other
+ // than a constant) in a pair of blocks that end up looping back to each
+ // other, it will be mishandled. Due to structurization this shouldn't occur
+ // in practice.
unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
SSAUpdater.Initialize(DstReg);
@@ -732,6 +737,9 @@ bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
const MachineInstr *MI;
for (;;) {
MI = MRI->getUniqueVRegDef(Reg);
+ if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
+ return true;
+
if (MI->getOpcode() != AMDGPU::COPY)
break;
@@ -808,9 +816,9 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DstReg,
unsigned PrevReg, unsigned CurReg) {
- bool PrevVal;
+ bool PrevVal = false;
bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
- bool CurVal;
+ bool CurVal = false;
bool CurConstant = isConstantLaneMask(CurReg, CurVal);
if (PrevConstant && CurConstant) {
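The PrevVal/CurVal initializers above matter because isConstantLaneMask() can now return true for an IMPLICIT_DEF without ever writing its out-parameter. A generic, self-contained illustration of that pattern (not AMDGPU-specific; the names below are made up):

#include <cstdio>

// The callee may report "constant" while leaving Val untouched, exactly like
// the IMPLICIT_DEF early-return added above.
static bool getConstantValue(int X, bool &Val) {
  if (X < 0)
    return true; // constant, but Val deliberately not written
  Val = (X != 0);
  return true;
}

int main() {
  bool Val = false; // without this initializer the read below would be UB
  if (getConstantValue(-1, Val))
    std::printf("constant value: %d\n", Val);
  return 0;
}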
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 30405059530e..38b9d85b653b 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -31,12 +31,6 @@ using MBBVector = SmallVector<MachineBasicBlock *, 4>;
namespace {
-static cl::opt<bool> EnableSpillVGPRToAGPR(
- "amdgpu-spill-vgpr-to-agpr",
- cl::desc("Enable spilling VGPRs to AGPRs"),
- cl::ReallyHidden,
- cl::init(true));
-
class SILowerSGPRSpills : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI = nullptr;
@@ -71,6 +65,7 @@ char SILowerSGPRSpills::ID = 0;
INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
@@ -88,6 +83,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
for (const CalleeSavedInfo &CS : CSI) {
// Insert the spill to the stack frame.
MCRegister Reg = CS.getReg();
@@ -96,8 +93,13 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
- TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
- TRI);
+ // If this value was already live-in, we probably have a direct use of the
+ // incoming register value, so don't kill at the spill point. This happens
+ // since we pass some special inputs (workgroup IDs) in the callee saved
+ // range.
+ const bool IsLiveIn = MRI.isLiveIn(Reg);
+ TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
+ RC, TRI);
if (LIS) {
assert(std::distance(MIS.begin(), I) == 1);
@@ -255,13 +257,10 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF,
if (!LowestAvailableVGPR)
LowestAvailableVGPR = PreReservedVGPR;
- const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- Optional<int> FI;
- // Check if we are reserving a CSR. Create a stack object for a possible spill
- // in the function prologue.
- if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
- FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+ // Create a stack object for a possible spill in the function prologue.
+ // Note non-CSR VGPRs also need this as we may overwrite inactive lanes.
+ Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
// Find saved info about the pre-reserved register.
const auto *ReservedVGPRInfoItr =
@@ -291,6 +290,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
VRM = getAnalysisIfAvailable<VirtRegMap>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
assert(SaveBlocks.empty() && RestoreBlocks.empty());
@@ -300,29 +300,28 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
bool HasCSRs = spillCalleeSavedRegs(MF);
MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
if (!MFI.hasStackObjects() && !HasCSRs) {
SaveBlocks.clear();
RestoreBlocks.clear();
+ if (FuncInfo->VGPRReservedForSGPRSpill) {
+ // Free the reserved VGPR for later possible use by frame lowering.
+ FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
+ MRI.freezeReservedRegs(MF);
+ }
return false;
}
- MachineRegisterInfo &MRI = MF.getRegInfo();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
- && EnableSpillVGPRToAGPR;
-
bool MadeChange = false;
-
- const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
- std::unique_ptr<RegScavenger> RS;
-
bool NewReservedRegs = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
const bool HasSGPRSpillToVGPR = TRI->spillSGPRToVGPR() &&
(HasCSRs || FuncInfo->hasSpilledSGPRs());
- if (HasSGPRSpillToVGPR || SpillVGPRToAGPR) {
+ if (HasSGPRSpillToVGPR) {
// Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
// are spilled to VGPRs, in which case we can eliminate the stack usage.
//
@@ -331,33 +330,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
lowerShiftReservedVGPR(MF, ST);
+ // To track the spill frame indices handled in this pass.
+ BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
+
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator Next;
for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
MachineInstr &MI = *I;
Next = std::next(I);
- if (SpillToAGPR && TII->isVGPRSpill(MI)) {
- // Try to eliminate stack used by VGPR spills before frame
- // finalization.
- unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::vaddr);
- int FI = MI.getOperand(FIOp).getIndex();
- Register VReg =
- TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
- if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
- TRI->isAGPR(MRI, VReg))) {
- NewReservedRegs = true;
- if (!RS)
- RS.reset(new RegScavenger());
-
- // FIXME: change to enterBasicBlockEnd()
- RS->enterBasicBlock(MBB);
- TRI->eliminateFrameIndex(MI, 0, FIOp, RS.get());
- continue;
- }
- }
-
if (!TII->isSGPRSpill(MI))
continue;
@@ -365,24 +346,32 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
NewReservedRegs = true;
- bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI,
+ nullptr, LIS);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+ SpillFIs.set(FI);
}
}
}
+ // FIXME: Adding to live-ins redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(SSpill.VGPR);
-
- for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
- MBB.addLiveIn(Reg);
-
- for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
- MBB.addLiveIn(Reg);
-
MBB.sortUniqueLiveIns();
+
+ // FIXME: The dead frame indices are replaced with a null register from
+ // the debug value instructions. We should instead update them with the
+ // correct register value. But it is not clear the register value alone is
+ // adequate to lower the DIExpression; this should be worked out later.
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
+ SpillFIs[MI.getOperand(0).getIndex()]) {
+ MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
+ MI.getOperand(0).setIsDebug();
+ }
+ }
}
MadeChange = true;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a0cdc7b1f4d..85cfe36df16a 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,6 +8,22 @@
#include "SIMachineFunctionInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
+#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include <cassert>
+#include <vector>
#define MAX_LANES 64
@@ -49,6 +65,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
// have any calls.
const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CC != CallingConv::AMDGPU_Gfx &&
(!isEntryFunction() || HasCalls);
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
@@ -61,6 +78,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
if (!isEntryFunction()) {
+ if (UseFixedABI)
+ ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
+
// TODO: Pick a high register, and shift down, similar to a kernel.
FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
@@ -119,13 +139,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (WorkItemIDZ)
WorkItemIDY = true;
- PrivateSegmentWaveByteOffset = true;
+ if (!ST.flatScratchIsArchitected()) {
+ PrivateSegmentWaveByteOffset = true;
- // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
- (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- ArgInfo.PrivateSegmentWaveByteOffset =
- ArgDescriptor::createRegister(AMDGPU::SGPR5);
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ ArgInfo.PrivateSegmentWaveByteOffset =
+ ArgDescriptor::createRegister(AMDGPU::SGPR5);
+ }
}
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
@@ -156,13 +178,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
if (ST.hasFlatAddressSpace() && isEntryFunction() &&
- (isAmdHsaOrMesa || ST.enableFlatScratch())) {
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
- if (HasCalls || HasStackObjects || ST.enableFlatScratch())
- FlatScratchInit = true;
+ (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
+ (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ !ST.flatScratchIsArchitected()) {
+ FlatScratchInit = true;
}
Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
@@ -285,8 +308,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
assert(Size >= 4 && "invalid sgpr spill size");
assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
- const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
-
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
@@ -309,16 +330,24 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
// partially spill the SGPR to VGPRs.
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
+
+#if 0
+ DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
+ "VGPRs for SGPR spilling",
+ 0, DS_Error);
+ MF.getFunction().getContext().diagnose(DiagOutOfRegs);
+#endif
return false;
}
- Optional<int> CSRSpillFI;
- if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
- isCalleeSavedReg(CSRegs, LaneVGPR)) {
- CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
+ Optional<int> SpillFI;
+ // We need to preserve inactive lanes, so we always save, even for
+ // caller-saved registers.
+ if (!isEntryFunction()) {
+ SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}
- SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
+ SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI));
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
@@ -344,7 +373,7 @@ bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
if (LaneVGPR == Register())
return false;
- SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None));
+ SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None));
FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
return true;
}
@@ -437,6 +466,21 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
}
}
+int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
+ const SIRegisterInfo &TRI) {
+ if (ScavengeFI)
+ return *ScavengeFI;
+ if (isEntryFunction()) {
+ ScavengeFI = MFI.CreateFixedObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+ } else {
+ ScavengeFI = MFI.CreateStackObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
+ }
+ return *ScavengeFI;
+}
+
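getScavengeFI() above creates the emergency slot lazily and memoizes it in ScavengeFI. A hedged sketch of how a later consumer, e.g. frame lowering, could hand that slot to the register scavenger; the helper name and call site below are illustrative, not upstream code:

static void registerScavengeSlot(MachineFunction &MF, RegScavenger &RS) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo &TRI =
      *MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  // Created on first call, reused afterwards: a fixed object at offset 0 for
  // entry functions, a regular spill object otherwise (see getScavengeFI).
  int FI = FuncInfo->getScavengeFI(MF.getFrameInfo(), TRI);
  RS.addScavengingFrameIndex(FI);
}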
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -529,7 +573,8 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
}
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
- const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI)
+ const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
+ const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
@@ -543,6 +588,9 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+ auto SFI = MFI.getOptionalScavengeFI();
+ if (SFI)
+ ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}
void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
@@ -550,7 +598,8 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
}
bool SIMachineFunctionInfo::initializeBaseYamlFields(
- const yaml::SIMachineFunctionInfo &YamlMFI) {
+ const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
+ PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
LDSSize = YamlMFI.LDSSize;
@@ -563,6 +612,24 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+
+ if (YamlMFI.ScavengeFI) {
+ auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
+ if (!FIOrErr) {
+ // Create a diagnostic for the frame index.
+ const MemoryBuffer &Buffer =
+ *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
+
+ Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
+ SourceMgr::DK_Error, toString(FIOrErr.takeError()),
+ "", None, None);
+ SourceRange = YamlMFI.ScavengeFI->SourceRange;
+ return true;
+ }
+ ScavengeFI = *FIOrErr;
+ } else {
+ ScavengeFI = None;
+ }
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 35fb43162199..fb6d4f8841ab 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -17,6 +17,7 @@
#include "AMDGPUMachineFunction.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/raw_ostream.h"
@@ -288,10 +289,12 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
Optional<SIArgumentInfo> ArgInfo;
SIMode Mode;
+ Optional<FrameIndex> ScavengeFI;
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
- const TargetRegisterInfo &TRI);
+ const TargetRegisterInfo &TRI,
+ const llvm::MachineFunction &MF);
void mappingImpl(yaml::IO &YamlIO) override;
~SIMachineFunctionInfo() = default;
@@ -321,6 +324,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
+ YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
}
};
@@ -445,15 +449,15 @@ public:
bool hasReg() { return VGPR != 0;}
};
- struct SGPRSpillVGPRCSR {
+ struct SGPRSpillVGPR {
// VGPR used for SGPR spills
Register VGPR;
- // If the VGPR is a CSR, the stack slot used to save/restore it in the
- // prolog/epilog.
+ // If the VGPR is used for SGPR spills in a non-entrypoint function, the
+ // stack slot used to save/restore it in the prolog/epilog.
Optional<int> FI;
- SGPRSpillVGPRCSR(Register V, Optional<int> F) : VGPR(V), FI(F) {}
+ SGPRSpillVGPR(Register V, Optional<int> F) : VGPR(V), FI(F) {}
};
struct VGPRSpillToAGPR {
@@ -461,16 +465,16 @@ public:
bool FullyAllocated = false;
};
- SparseBitVector<> WWMReservedRegs;
-
- void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); }
+ // Map WWM VGPR to a stack slot that is used to save/restore it in the
+ // prolog/epilog.
+ MapVector<Register, Optional<int>> WWMReservedRegs;
private:
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
- SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
+ SmallVector<SGPRSpillVGPR, 2> SpillVGPRs;
DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
@@ -480,6 +484,10 @@ private:
// VGPRs used for AGPR spills.
SmallVector<MCPhysReg, 32> SpillVGPR;
+ // Emergency stack slot. Sometimes, we create this before finalizing the stack
+ // frame, so save it here and add it to the RegScavenger later.
+ Optional<int> ScavengeFI;
+
public: // FIXME
/// If this is set, an SGPR used for save/restore of the register used for the
/// frame pointer.
@@ -497,7 +505,14 @@ public: // FIXME
public:
SIMachineFunctionInfo(const MachineFunction &MF);
- bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI);
+ bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI,
+ const MachineFunction &MF,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange);
+
+ void reserveWWMRegister(Register Reg, Optional<int> FI) {
+ WWMReservedRegs.insert(std::make_pair(Reg, FI));
+ }
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
auto I = SGPRToVGPRSpills.find(FrameIndex);
@@ -505,9 +520,7 @@ public:
ArrayRef<SpilledReg>() : makeArrayRef(I->second);
}
- ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const {
- return SpillVGPRs;
- }
+ ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; }
void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
SpillVGPRs[Index].VGPR = NewVGPR;
@@ -538,6 +551,9 @@ public:
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
+ int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI);
+ Optional<int> getOptionalScavengeFI() const { return ScavengeFI; }
+
bool hasCalculatedTID() const { return TIDReg != 0; };
Register getTIDReg() const { return TIDReg; };
void setTIDReg(Register Reg) { TIDReg = Reg; }
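With WWMReservedRegs now a MapVector, each WWM VGPR carries an optional stack slot for saving its inactive lanes in the prolog/epilog. A short usage sketch under that assumption; the helper below is illustrative rather than an upstream call site:

static void reserveWWMWithSlot(MachineFunction &MF, Register WWMReg) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  // Non-entry functions must preserve inactive lanes of the WWM VGPR, so pair
  // the register with a 4-byte spill slot; entry functions can pass None.
  Optional<int> FI;
  if (!FuncInfo->isEntryFunction())
    FI = FrameInfo.CreateSpillStackObject(4, Align(4));
  FuncInfo->reserveWWMRegister(WWMReg, FI);
}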
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3caa75e4d958..71be73c2f0e4 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -84,22 +84,6 @@ enum class SIAtomicAddrSpace {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
-/// Sets named bit \p BitName to "true" if present in instruction \p MI.
-/// \returns Returns true if \p MI is modified, false otherwise.
-template <uint16_t BitName>
-bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
- int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
- if (BitIdx == -1)
- return false;
-
- MachineOperand &Bit = MI->getOperand(BitIdx);
- if (Bit.getImm() != 0)
- return false;
-
- Bit.setImm(1);
- return true;
-}
-
class SIMemOpInfo final {
private:
@@ -129,12 +113,43 @@ private:
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile),
IsNonTemporal(IsNonTemporal) {
+
+ if (Ordering == AtomicOrdering::NotAtomic) {
+ assert(Scope == SIAtomicScope::NONE &&
+ OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
+ !IsCrossAddressSpaceOrdering &&
+ FailureOrdering == AtomicOrdering::NotAtomic);
+ return;
+ }
+
+ assert(Scope != SIAtomicScope::NONE &&
+ (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
+ SIAtomicAddrSpace::NONE &&
+ (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
+ SIAtomicAddrSpace::NONE &&
+ !isStrongerThan(FailureOrdering, Ordering));
+
// There is also no cross address space ordering if the ordering
// address space is the same as the instruction address space and
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
this->IsCrossAddressSpaceOrdering = false;
+
+ // Limit the scope to the maximum supported by the instruction's address
+ // spaces.
+ if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
+ SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
+ } else if ((InstrAddrSpace &
+ ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
+ SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
+ } else if ((InstrAddrSpace &
+ ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
+ SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::AGENT);
+ }
}
public:
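The clamping added to the constructor above bounds the requested scope by what the instruction's address spaces can actually observe, relying on the scope enum being ordered SINGLETHREAD < WAVEFRONT < WORKGROUP < AGENT < SYSTEM. A small worked example (illustrative values):

// A DS atomic whose memory operands cover only LDS (and possibly SCRATCH)
// cannot be observed outside its work-group, so an "agent" request is clamped:
//   InstrAddrSpace = LDS | SCRATCH
//   (InstrAddrSpace & ~(SCRATCH | LDS)) == NONE
//   => Scope = std::min(SIAtomicScope::AGENT, SIAtomicScope::WORKGROUP)
//            = SIAtomicScope::WORKGROUP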
@@ -202,12 +217,12 @@ private:
void reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const;
- /// Inspects the target synchonization scope \p SSID and determines
+ /// Inspects the target synchronization scope \p SSID and determines
/// the SI atomic scope it corresponds to, the address spaces it
/// covers, and whether the memory ordering applies between address
/// spaces.
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
- toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+ toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
/// \return Return a bit set of the address spaces accessed by \p AS.
SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
@@ -257,6 +272,11 @@ protected:
SICacheControl(const GCNSubtarget &ST);
+ /// Sets named bit \p Bit to "true" if present in instruction \p MI.
+ /// \returns Returns true if \p MI is modified, false otherwise.
+ bool enableNamedBit(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Bit) const;
+
public:
/// Create a cache control for the subtarget \p ST.
@@ -269,6 +289,20 @@ public:
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const = 0;
+ /// Update \p MI memory store instruction to bypass any caches up to
+ /// the \p Scope memory scope for address spaces \p
+ /// AddrSpace. Return true iff the instruction was modified.
+ virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
+ /// Update \p MI memory read-modify-write instruction to bypass any caches up
+ /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
+ /// iff the instruction was modified.
+ virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const = 0;
+
/// Update \p MI memory instruction of kind \p Op associated with address
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
/// true iff the instruction was modified.
@@ -324,13 +358,13 @@ protected:
/// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit<AMDGPU::OpName::glc>(MI);
+ return enableNamedBit(MI, AMDGPU::CPol::GLC);
}
/// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit<AMDGPU::OpName::slc>(MI);
+ return enableNamedBit(MI, AMDGPU::CPol::SLC);
}
public:
@@ -341,6 +375,14 @@ public:
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
@@ -377,13 +419,54 @@ public:
};
+class SIGfx90ACacheControl : public SIGfx7CacheControl {
+public:
+
+ SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+
+ bool insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
/// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise.
bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
- return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+ return enableNamedBit(MI, AMDGPU::CPol::DLC);
}
public:
@@ -424,7 +507,7 @@ private:
/// Return true iff instruction \p MI is an atomic instruction that
/// returns a result.
bool isAtomicRet(const MachineInstr &MI) const {
- return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+ return SIInstrInfo::isAtomicRet(MI);
}
/// Removes all processed atomic pseudo instructions from the current
@@ -476,7 +559,7 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
- SIAtomicAddrSpace InstrScope) const {
+ SIAtomicAddrSpace InstrAddrSpace) const {
if (SSID == SyncScope::System)
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC,
@@ -499,23 +582,23 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
true);
if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
return None;
}
@@ -557,7 +640,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
IsVolatile |= MMO->isVolatile();
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
- AtomicOrdering OpOrdering = MMO->getOrdering();
+ AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
if (OpOrdering != AtomicOrdering::NotAtomic) {
const auto &IsSyncScopeInclusion =
MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
@@ -568,9 +651,9 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
- Ordering =
- isStrongerThan(Ordering, OpOrdering) ?
- Ordering : MMO->getOrdering();
+ Ordering = isStrongerThan(Ordering, OpOrdering)
+ ? Ordering
+ : MMO->getSuccessOrdering();
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
FailureOrdering =
@@ -591,7 +674,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
ScopeOrNone.getValue();
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
+ ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
reportUnsupported(MI, "Unsupported atomic address space");
return None;
}
@@ -659,7 +743,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
- IsCrossAddressSpaceOrdering);
+ IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -682,9 +766,21 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}
+bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Bit) const {
+ MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
+ if (!CPol)
+ return false;
+
+ CPol->setImm(CPol->getImm() | Bit);
+ return true;
+}
+
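With glc/slc/dlc folded into a single cpol operand, every enable*Bit helper reduces to OR-ing a CPol flag into that operand, so multiple bits compose in one operand write. A hedged sketch of combining two bits (the helper name is illustrative, not an upstream API):

static bool setNonTemporalHints(const SIInstrInfo *TII, MachineInstr &MI) {
  MachineOperand *CPol = TII->getNamedOperand(MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false; // Instruction has no cache-policy operand.
  // GLC requests L1 MISS_EVICT and SLC requests L2 STREAM; both live in the
  // same immediate, so they compose with a plain bitwise OR.
  CPol->setImm(CPol->getImm() | AMDGPU::CPol::GLC | AMDGPU::CPol::SLC);
  return true;
}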
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (ST.hasGFX90AInsts())
+ return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
@@ -725,6 +821,32 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
return Changed;
}
+bool SIGfx6CacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ /// The L1 cache is write through so does not need to be bypassed. There is no
+ /// bypass control for the L2 cache at the isa level.
+
+ return Changed;
+}
+
+bool SIGfx6CacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ /// The L1 cache is write through so does not need to be bypassed. There is no
+ /// bypass control for the L2 cache at the isa level.
+
+ return Changed;
+}
+
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal) const {
@@ -968,6 +1090,292 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx90ACacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed.
+      if (ST.isTgSplitEnabled())
+        Changed |= enableGLCBit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ /// Do not set glc for store atomic operations as they implicitly write
+ /// through the L1 cache.
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass. Store atomics implicitly write through the L1
+ // cache.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ /// Do not set glc for RMW atomic operations as they implicitly bypass
+ /// the L1 cache, and the glc bit is instead used to indicate if they are
+ /// return or no-return.
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write instructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile, so we cannot
+ // sensibly handle them here without pessimizing all atomics. They also do
+ // not support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ if (Op == SIMemOp::LOAD) {
+ Changed |= enableGLCBit(MI);
+ }
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
+ Changed |= enableGLCBit(MI);
+ Changed |= enableSLCBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ if (ST.isTgSplitEnabled()) {
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to wait for global or GDS memory operations
+ // to complete to ensure they are visible to waves in the other CUs.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are on
+ // the same CU, so no need to wait for global memory as all waves in the
+ // work-group access the same L1, nor wait for GDS as accesses are ordered
+ // on a CU.
+ if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
+ (Scope == SIAtomicScope::WORKGROUP)) {
+ // Same as GFX7 using agent scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ // In threadgroup split mode LDS cannot be allocated so no need to wait for
+ // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS;
+ }
+ return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
+ IsCrossAddrSpaceOrdering, Pos);
+}
+
+bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+ // CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // Same as GFX7.
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ // different CUs. Therefore need to invalidate the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be invalidated.
+ if (ST.isTgSplitEnabled()) {
+ // Same as GFX7 using agent scope.
+ Scope = SIAtomicScope::AGENT;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Same as GFX7.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
+
+ return Changed;
+}
+
+bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by the
+ // same wave. An "S_WAITCNT vmcnt(0)" is needed afterwards to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
+ // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
+ // vmcnt(0)" needed by the "BUFFER_WBL2".
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Same as GFX7.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ Changed |=
+ SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
+ IsCrossAddrSpaceOrdering, Pos);
+
+ return Changed;
+}
+
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
@@ -1292,6 +1700,13 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
+ MOI.getOrderingAddrSpace());
+ }
+
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertRelease(MI, MOI.getScope(),
@@ -1336,7 +1751,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
Position::BEFORE);
// TODO: If both release and invalidate are happening they could be combined
- // to use the single "BUFFER_WBL2" instruction. This could be done by
+ // to use the single "BUFFER_WBINV*" instruction. This could be done by
// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
// track cache invalidate and write back instructions.
@@ -1360,6 +1775,15 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
+ if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+ MOI.getOrdering() == AtomicOrdering::Acquire ||
+ MOI.getOrdering() == AtomicOrdering::Release ||
+ MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+ MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+ Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
+ MOI.getInstrAddrSpace());
+ }
+
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
@@ -1375,7 +1799,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
+ MOI.getInstrAddrSpace(),
isAtomicRet(*MI) ? SIMemOp::LOAD :
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
@@ -1401,7 +1825,7 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
// Unbundle instructions after the post-RA scheduler.
- if (MI->isBundle()) {
+ if (MI->isBundle() && MI->mayLoadOrStore()) {
MachineBasicBlock::instr_iterator II(MI->getIterator());
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
I != E && I->isBundledWithPred(); ++I) {
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 54f20912d0a9..b9c839fe28ba 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -220,6 +220,18 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
return true;
}
+ case AMDGPU::S_AND_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+ return true;
+ }
+ case AMDGPU::S_AND_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+ return true;
+ }
default:
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 162e96655df2..5f89f3826683 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -416,15 +416,20 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
continue;
Register SavedExec = I->getOperand(0).getReg();
- if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec) &&
- MRI->use_instr_nodbg_begin(SavedExec)->getParent() ==
- I->getParent()) {
- LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
- LIS->RemoveMachineInstrFromMaps(*I);
- I->eraseFromParent();
- MRI->replaceRegWith(SavedExec, ExecReg);
- LIS->removeInterval(SavedExec);
- Changed = true;
+ if (SavedExec.isVirtual() && MRI->hasOneNonDBGUse(SavedExec)) {
+ MachineInstr *SingleExecUser = &*MRI->use_instr_nodbg_begin(SavedExec);
+ int Idx = SingleExecUser->findRegisterUseOperandIdx(SavedExec);
+ assert(Idx != -1);
+ if (SingleExecUser->getParent() == I->getParent() &&
+ !SingleExecUser->getOperand(Idx).isImplicit() &&
+ TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) {
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
+ LIS->RemoveMachineInstrFromMaps(*I);
+ I->eraseFromParent();
+ MRI->replaceRegWith(SavedExec, ExecReg);
+ LIS->removeInterval(SavedExec);
+ Changed = true;
+ }
}
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
new file mode 100644
index 000000000000..307c9eba9d3b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -0,0 +1,637 @@
+//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else
+/// structures and waterfall loops.
+///
+/// When we do structurization, we usually transform an if-else into two
+/// successive if-then (with a flow block to do predicate inversion). Consider a
+/// simple case after structurization: A divergent value %a was defined before
+/// if-else and used in both THEN (use in THEN is optional) and ELSE part:
+/// bb.if:
+/// %a = ...
+/// ...
+/// bb.then:
+/// ... = op %a
+/// ... // %a can be dead here
+/// bb.flow:
+/// ...
+/// bb.else:
+/// ... = %a
+/// ...
+/// bb.endif
+///
+/// As the register allocator has no idea of the thread control flow, it will
+/// just assume %a is alive in the whole range of bb.then because of a later
+/// use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect
+/// to exec mask. For this if-else case, the lanes active in bb.then will be
+/// inactive in bb.else, and vice-versa. So we are safe to say that %a was dead
+/// after the last use in bb.then until the end of the block. The reason is
+/// the instructions in bb.then will only overwrite lanes that will never be
+/// accessed in bb.else.
+///
+/// This pass aims to tell the register allocator that %a is in fact dead,
+/// through inserting a phi-node in bb.flow saying that %a is undef when coming
+/// from bb.then, and then replace the uses in the bb.else with the result of
+/// newly inserted phi.
+///
+/// Two key conditions must be met to ensure correctness:
+/// 1.) The def-point should be in the same loop-level as if-else-endif to make
+/// sure the second loop iteration still gets correct data.
+/// 2.) There should be no further uses after the IF-ELSE region.
+///
+///
+/// Waterfall loops get inserted around instructions that use divergent values
+/// but can only be executed with a uniform value. For example an indirect call
+/// to a divergent address:
+/// bb.start:
+/// %a = ...
+/// %fun = ...
+/// ...
+/// bb.loop:
+/// call %fun (%a)
+/// ... // %a can be dead here
+/// loop %bb.loop
+///
+/// The loop block is executed multiple times, but it is run exactly once for
+/// each active lane. Similar to the if-else case, the register allocator
+/// assumes that %a is live throughout the loop as it is used again in the next
+/// iteration. If %a is a VGPR that is unused after the loop, it does not need
+/// to be live after its last use in the loop block. By inserting a phi-node at
+/// the start of bb.loop that is undef when coming from bb.loop, the register
+/// allocator knows that the value of %a does not need to be preserved through
+/// iterations of the loop.
+///
+//
+//===----------------------------------------------------------------------===//
+
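As a companion to the comment above, a condensed sketch of the rewrite at the MachineIR level: build a PHI in the flow block whose incoming value from the THEN side is undef, then redirect the ELSE-region uses to the PHI result. Register-class and use-rewriting details are elided; this is an illustration, not the pass's exact code.

// Illustrative helper: build "%new = PHI %reg, %bb.if, undef %undef, %bb.then"
// at the top of the flow block. Uses of %reg in the ELSE region would then be
// rewritten to %new, so %reg need not stay live through bb.then.
static Register insertUndefIncomingPhi(MachineRegisterInfo &MRI,
                                       const SIInstrInfo *TII, Register Reg,
                                       MachineBasicBlock *If,
                                       MachineBasicBlock *Flow) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  Register NewReg = MRI.createVirtualRegister(RC);
  Register UndefReg = MRI.createVirtualRegister(RC);
  MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
                                    TII->get(TargetOpcode::PHI), NewReg);
  for (MachineBasicBlock *Pred : Flow->predecessors()) {
    if (Pred == If)
      PHI.addReg(Reg).addMBB(Pred); // Real value arriving from bb.if.
    else
      PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); // Dead on THEN path.
  }
  return NewReg;
}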
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-opt-vgpr-liverange"
+
+namespace {
+
+class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ LiveVariables *LV = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+ const MachineLoopInfo *Loops = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+public:
+ static char ID;
+
+ MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const;
+
+ void collectElseRegionBlocks(MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &) const;
+
+ void
+ collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+ SmallVectorImpl<Register> &CandidateRegs) const;
+
+ void collectWaterfallCandidateRegisters(
+ MachineBasicBlock *Loop,
+ SmallSetVector<Register, 16> &CandidateRegs) const;
+
+ void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &Uses) const;
+
+ void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If,
+ MachineBasicBlock *Flow) const;
+
+ void updateLiveRangeInElseRegion(
+ Register Reg, Register NewReg, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+ void
+ optimizeLiveRange(Register Reg, MachineBasicBlock *If,
+ MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+ void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const;
+
+ SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Optimize VGPR LiveRange";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveVariables>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+};
+
+} // end anonymous namespace
+
+// Check whether the MBB is an else flow block and get the branching target,
+// which is the Endif block.
+MachineBasicBlock *
+SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
+ for (auto &BR : MBB->terminators()) {
+ if (BR.getOpcode() == AMDGPU::SI_ELSE)
+ return BR.getOperand(2).getMBB();
+ }
+ return nullptr;
+}
+
+void SIOptimizeVGPRLiveRange::collectElseRegionBlocks(
+ MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &Blocks) const {
+ assert(Flow != Endif);
+
+ MachineBasicBlock *MBB = Endif;
+ unsigned Cur = 0;
+ while (MBB) {
+ for (auto *Pred : MBB->predecessors()) {
+ if (Pred != Flow && !Blocks.contains(Pred))
+ Blocks.insert(Pred);
+ }
+
+ if (Cur < Blocks.size())
+ MBB = Blocks[Cur++];
+ else
+ MBB = nullptr;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Found Else blocks: ";
+ for (auto *MBB : Blocks)
+ dbgs() << printMBBReference(*MBB) << ' ';
+ dbgs() << '\n';
+ });
+}
+
+/// Find the instructions (excluding phis) in \p MBB that use \p Reg.
+void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock(
+ Register Reg, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &Uses) const {
+ for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ if (UseMI.getParent() == MBB && !UseMI.isPHI())
+ Uses.push_back(&UseMI);
+ }
+}
+
+/// Collect the killed registers in the ELSE region which are not alive through
+/// the whole THEN region.
+void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
+ MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+ SmallVectorImpl<Register> &CandidateRegs) const {
+
+ SmallSet<Register, 8> KillsInElse;
+
+ for (auto *Else : ElseBlocks) {
+ for (auto &MI : Else->instrs()) {
+ if (MI.isDebugInstr())
+ continue;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || MO.isDef())
+ continue;
+
+ Register MOReg = MO.getReg();
+ // We can only optimize AGPR/VGPR virtual registers
+ if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+ continue;
+
+ if (MO.readsReg()) {
+ LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+ // Make sure two conditions are met:
+ // a.) the value is defined before/in the IF block
+ // b.) it is defined at the same loop level.
+ if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+ Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If)) {
+ // Check if the register is live into the endif block. If not,
+ // consider it killed in the else region.
+ LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+ if (!VI.isLiveIn(*Endif, MOReg, *MRI)) {
+ KillsInElse.insert(MOReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "Excluding " << printReg(MOReg, TRI)
+ << " as Live in Endif\n");
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Check the phis in the Endif, looking for value coming from the ELSE
+ // region. Make sure the phi-use is the last use.
+ for (auto &MI : Endif->phis()) {
+ for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+ auto &MO = MI.getOperand(Idx);
+ auto *Pred = MI.getOperand(Idx + 1).getMBB();
+ if (Pred == Flow)
+ continue;
+ assert(ElseBlocks.contains(Pred) && "Should be from Else region\n");
+
+ if (!MO.isReg() || !MO.getReg() || MO.isUndef())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() || !TRI->isVectorRegister(*MRI, Reg))
+ continue;
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+
+ if (VI.isLiveIn(*Endif, Reg, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI)
+ << " as Live in Endif\n");
+ continue;
+ }
+ // Make sure two conditions are met:
+ // a.) the value is defined before/in the IF block
+ // b.) it is defined at the same loop level as the IF block.
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent();
+ if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+ Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+ KillsInElse.insert(Reg);
+ }
+ }
+
+ auto IsLiveThroughThen = [&](Register Reg) {
+ for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+ ++I) {
+ if (!I->readsReg())
+ continue;
+ auto *UseMI = I->getParent();
+ auto *UseMBB = UseMI->getParent();
+ if (UseMBB == Flow || UseMBB == Endif) {
+ if (!UseMI->isPHI())
+ return true;
+
+ auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB();
+ // The register is live through the path If->Flow or Flow->Endif;
+ // we should not optimize such cases.
+ if ((UseMBB == Flow && IncomingMBB != If) ||
+ (UseMBB == Endif && IncomingMBB == Flow))
+ return true;
+ }
+ }
+ return false;
+ };
+
+ for (auto Reg : KillsInElse) {
+ if (!IsLiveThroughThen(Reg))
+ CandidateRegs.push_back(Reg);
+ }
+}
+
+/// Collect the registers used in the waterfall loop block that are defined
+/// before the loop.
+void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
+ MachineBasicBlock *Loop,
+ SmallSetVector<Register, 16> &CandidateRegs) const {
+
+ for (auto &MI : Loop->instrs()) {
+ if (MI.isDebugInstr())
+ continue;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || MO.isDef())
+ continue;
+
+ Register MOReg = MO.getReg();
+ // We can only optimize AGPR/VGPR virtual registers
+ if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+ continue;
+
+ if (MO.readsReg()) {
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+ // Make sure the value is defined before the LOOP block
+ if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) {
+ // If the variable is used after the loop, the register coalescer will
+ // merge the newly created register and remove the phi node again.
+ // Just do nothing in that case.
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg);
+ bool IsUsed = false;
+ for (auto *Succ : Loop->successors()) {
+ if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
+ IsUsed = true;
+ break;
+ }
+ }
+ if (!IsUsed) {
+ LLVM_DEBUG(dbgs() << "Found candidate reg: "
+ << printReg(MOReg, TRI, 0, MRI) << '\n');
+ CandidateRegs.insert(MOReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "Reg is used after loop, ignoring: "
+ << printReg(MOReg, TRI, 0, MRI) << '\n');
+ }
+ }
+ }
+ }
+ }
+}
+
+// Re-calculate the liveness of \p Reg in the THEN-region
+void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
+ Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
+
+ SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
+
+ MachineBasicBlock *ThenEntry = nullptr;
+ for (auto *Succ : If->successors()) {
+ if (Succ != Flow) {
+ ThenEntry = Succ;
+ break;
+ }
+ }
+ assert(ThenEntry && "No successor in Then region?");
+
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+ df_iterator_default_set<MachineBasicBlock *, 16> Visited;
+
+ for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+ if (MBB == Flow)
+ break;
+
+ // Clear Live bit, as we will recalculate afterwards
+ LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
+ << '\n');
+ OldVarInfo.AliveBlocks.reset(MBB->getNumber());
+ }
+
+ // Get the blocks the Reg should be alive through
+ for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+ ++I) {
+ auto *UseMI = I->getParent();
+ if (UseMI->isPHI() && I->readsReg()) {
+ if (Visited.contains(UseMI->getParent()))
+ PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
+ }
+ }
+
+ Visited.clear();
+
+ for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+ if (MBB == Flow)
+ break;
+
+ SmallVector<MachineInstr *> Uses;
+ // PHI instructions have been processed above.
+ findNonPHIUsesInBlock(Reg, MBB, Uses);
+
+ if (Uses.size() == 1) {
+ LLVM_DEBUG(dbgs() << "Found one Non-PHI use in "
+ << printMBBReference(*MBB) << '\n');
+ LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin()));
+ } else if (Uses.size() > 1) {
+ // Process the instructions in-order
+ LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in "
+ << printMBBReference(*MBB) << '\n');
+ for (MachineInstr &MI : *MBB) {
+ if (llvm::is_contained(Uses, &MI))
+ LV->HandleVirtRegUse(Reg, MBB, MI);
+ }
+ }
+
+ // Mark Reg alive through the block if this is a PHI incoming block
+ if (PHIIncoming.contains(MBB))
+ LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(),
+ MBB);
+ }
+
+ // Set the isKilled flag if we get new Kills in the THEN region.
+ for (auto *MI : OldVarInfo.Kills) {
+ if (Visited.contains(MI->getParent()))
+ MI->addRegisterKilled(Reg, TRI);
+ }
+}
+
+void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion(
+ Register Reg, Register NewReg, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+ // Transfer AliveBlocks from Reg to NewReg
+ for (auto *MBB : ElseBlocks) {
+ unsigned BBNum = MBB->getNumber();
+ if (OldVarInfo.AliveBlocks.test(BBNum)) {
+ NewVarInfo.AliveBlocks.set(BBNum);
+ LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB)
+ << '\n');
+ OldVarInfo.AliveBlocks.reset(BBNum);
+ }
+ }
+
+ // Transfer the possible Kills in ElseBlocks from Reg to NewReg
+ auto I = OldVarInfo.Kills.begin();
+ while (I != OldVarInfo.Kills.end()) {
+ if (ElseBlocks.contains((*I)->getParent())) {
+ NewVarInfo.Kills.push_back(*I);
+ I = OldVarInfo.Kills.erase(I);
+ } else {
+ ++I;
+ }
+ }
+}
+
+void SIOptimizeVGPRLiveRange::optimizeLiveRange(
+ Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+ // Insert a new PHI, marking the value from the THEN region as
+ // undef.
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+ const auto *RC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ Register UndefReg = MRI->createVirtualRegister(RC);
+ MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ for (auto *Pred : Flow->predecessors()) {
+ if (Pred == If)
+ PHI.addReg(Reg).addMBB(Pred);
+ else
+ PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+ }
+
+ // Replace all uses in the ELSE region or in the PHIs of the ENDIF block.
+ // Use early increment range because setReg() will update the linked list.
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
+ auto *UseMI = O.getParent();
+ auto *UseBlock = UseMI->getParent();
+ // Replace uses in Endif block
+ if (UseBlock == Endif) {
+ assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
+ O.setReg(NewReg);
+ continue;
+ }
+
+ // Replace uses in Else region
+ if (ElseBlocks.contains(UseBlock))
+ O.setReg(NewReg);
+ }
+
+ // The optimized Reg is not alive through Flow blocks anymore.
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+ OldVarInfo.AliveBlocks.reset(Flow->getNumber());
+
+ updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks);
+ updateLiveRangeInThenRegion(Reg, If, Flow);
+}
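
For orientation, the core of the rewrite in optimizeLiveRange above is the PHI inserted at the Flow block: its incoming value from the THEN side is undef, so the original register stops being live across the THEN region and its VGPR becomes reusable there. A condensed sketch of that step, using the same names and MachineIR APIs as the pass (liveness bookkeeping omitted):

    // Before:  %v is defined above the IF and used only in the ELSE region or
    //          the Endif PHIs, so %v stays live across the whole THEN region.
    // After:   the Flow block gets  %new = PHI %v, %bb.if, undef, %bb.then
    //          and the ELSE/Endif uses read %new, so %v dies at the IF block.
    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
    Register NewReg = MRI->createVirtualRegister(RC);
    Register UndefReg = MRI->createVirtualRegister(RC);
    MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
                                      TII->get(TargetOpcode::PHI), NewReg);
    for (MachineBasicBlock *Pred : Flow->predecessors()) {
      if (Pred == If)
        PHI.addReg(Reg).addMBB(Pred);           // real value from the IF side
      else
        PHI.addReg(UndefReg, RegState::Undef)   // THEN side contributes nothing
            .addMBB(Pred);
    }
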
+
+void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
+ Register Reg, MachineBasicBlock *Loop) const {
+ // Insert a new PHI, marking the value from the last loop iteration undef.
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+ const auto *RC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ Register UndefReg = MRI->createVirtualRegister(RC);
+
+ // Replace all uses in the LOOP region
+ // Use early increment range because setReg() will update the linked list.
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
+ auto *UseMI = O.getParent();
+ auto *UseBlock = UseMI->getParent();
+ // Replace uses in Loop block
+ if (UseBlock == Loop)
+ O.setReg(NewReg);
+ }
+
+ MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ for (auto *Pred : Loop->predecessors()) {
+ if (Pred == Loop)
+ PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+ else
+ PHI.addReg(Reg).addMBB(Pred);
+ }
+
+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+ // collectWaterfallCandidateRegisters only collects registers that are dead
+ // after the loop. So we know that the old reg is not live throughout the
+ // whole block anymore.
+ OldVarInfo.AliveBlocks.reset(Loop->getNumber());
+
+ // Mark the last use as kill
+ for (auto &MI : reverse(Loop->instrs())) {
+ if (MI.readsRegister(NewReg, TRI)) {
+ MI.addRegisterKilled(NewReg, TRI);
+ NewVarInfo.Kills.push_back(&MI);
+ break;
+ }
+ }
+ assert(!NewVarInfo.Kills.empty() &&
+ "Failed to find last usage of register in loop");
+}
+
+char SIOptimizeVGPRLiveRange::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+ "SI Optimize VGPR LiveRange", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+ "SI Optimize VGPR LiveRange", false, false)
+
+char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID;
+
+FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() {
+ return new SIOptimizeVGPRLiveRange();
+}
+
+bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ LV = &getAnalysis<LiveVariables>();
+ MRI = &MF.getRegInfo();
+
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ bool MadeChange = false;
+
+ // TODO: we need to think about the order of visiting the blocks to get
+ // optimal results for nested if-else cases.
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &MI : MBB.terminators()) {
+ // Detect the if-else blocks
+ if (MI.getOpcode() == AMDGPU::SI_IF) {
+ MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
+ auto *Endif = getElseTarget(IfTarget);
+ if (!Endif)
+ continue;
+
+ SmallSetVector<MachineBasicBlock *, 16> ElseBlocks;
+ SmallVector<Register> CandidateRegs;
+
+ LLVM_DEBUG(dbgs() << "Checking IF-ELSE-ENDIF: "
+ << printMBBReference(MBB) << ' '
+ << printMBBReference(*IfTarget) << ' '
+ << printMBBReference(*Endif) << '\n');
+
+ // Collect all the blocks in the ELSE region
+ collectElseRegionBlocks(IfTarget, Endif, ElseBlocks);
+
+ // Collect the registers that can be optimized
+ collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks,
+ CandidateRegs);
+ MadeChange |= !CandidateRegs.empty();
+ // Now we are safe to optimize.
+ for (auto Reg : CandidateRegs)
+ optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
+ } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) {
+ LLVM_DEBUG(dbgs() << "Checking Waterfall loop: "
+ << printMBBReference(MBB) << '\n');
+
+ SmallSetVector<Register, 16> CandidateRegs;
+ collectWaterfallCandidateRegisters(&MBB, CandidateRegs);
+ MadeChange |= !CandidateRegs.empty();
+ // Now we are safe to optimize.
+ for (auto Reg : CandidateRegs)
+ optimizeWaterfallLiveRange(Reg, &MBB);
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index ab05081e55d5..e05aafe5e291 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -48,10 +48,18 @@ private:
SmallSet<Register, 16> Defs;
- bool isDependentLoad(const MachineInstr &MI) const;
+ void collectUsedRegUnits(const MachineInstr &MI,
+ BitVector &UsedRegUnits) const;
+ bool isBundleCandidate(const MachineInstr &MI) const;
+ bool isDependentLoad(const MachineInstr &MI) const;
+ bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
};
+constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
+ SIInstrFlags::SMRD | SIInstrFlags::DS |
+ SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+
} // End anonymous namespace.
INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)
@@ -80,55 +88,125 @@ bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
return false;
}
+void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI,
+ BitVector &UsedRegUnits) const {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.readsReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ assert(!Op.getSubReg() &&
+ "subregister indexes should not be present after RA");
+
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ UsedRegUnits.set(*Units);
+ }
+}
+
+bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const {
+ const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags;
+ return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled();
+}
+
+bool SIPostRABundler::canBundle(const MachineInstr &MI,
+ const MachineInstr &NextMI) const {
+ const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags;
+
+ return (IMemFlags != 0 && MI.mayLoadOrStore() && !NextMI.isBundled() &&
+ NextMI.mayLoad() == MI.mayLoad() && NextMI.mayStore() == MI.mayStore() &&
+ ((NextMI.getDesc().TSFlags & MemFlags) == IMemFlags) &&
+ !isDependentLoad(NextMI));
+}
+
bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
- bool Changed = false;
- const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
- SIInstrFlags::SMRD | SIInstrFlags::DS |
- SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+ BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
+ BitVector KillUsedRegUnits(TRI->getNumRegUnits());
+ bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
MachineBasicBlock::instr_iterator B = MBB.instr_begin();
MachineBasicBlock::instr_iterator E = MBB.instr_end();
+
for (auto I = B; I != E; I = Next) {
Next = std::next(I);
+ if (!isBundleCandidate(*I))
+ continue;
+
+ assert(Defs.empty());
+
+ if (I->getNumExplicitDefs() != 0)
+ Defs.insert(I->defs().begin()->getReg());
+
+ MachineBasicBlock::instr_iterator BundleStart = I;
+ MachineBasicBlock::instr_iterator BundleEnd = I;
+ unsigned ClauseLength = 1;
+ for (I = Next; I != E; I = Next) {
+ Next = std::next(I);
+
+ assert(BundleEnd != I);
+ if (canBundle(*BundleEnd, *I)) {
+ BundleEnd = I;
+ if (I->getNumExplicitDefs() != 0)
+ Defs.insert(I->defs().begin()->getReg());
+ ++ClauseLength;
+ } else if (!I->isMetaInstruction()) {
+ // Allow meta instructions in between bundle candidates, but do not
+ // start or end a bundle on one.
+ //
+ // TODO: It may be better to move meta instructions like dbg_value
+ // after the bundle. We're relying on the memory legalizer to unbundle
+ // these.
+ break;
+ }
+ }
+
+ Next = std::next(BundleEnd);
+ if (ClauseLength > 1) {
+ Changed = true;
+
+ // Before register allocation, kills are inserted after potential soft
+ // clauses to hint register allocation. Look for kills that look like
+ // this, and erase them.
+ if (Next != E && Next->isKill()) {
+
+ // TODO: Should maybe back-propagate kill flags to the bundle.
+ for (const MachineInstr &BundleMI : make_range(BundleStart, Next))
+ collectUsedRegUnits(BundleMI, BundleUsedRegUnits);
+
+ BundleUsedRegUnits.flip();
- const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags;
+ while (Next != E && Next->isKill()) {
+ MachineInstr &Kill = *Next;
+ collectUsedRegUnits(Kill, KillUsedRegUnits);
- if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() ||
- B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
- ((B->getDesc().TSFlags & MemFlags) != IMemFlags) ||
- isDependentLoad(*I)) {
+ KillUsedRegUnits &= BundleUsedRegUnits;
- if (B != I) {
- if (std::next(B) != I) {
- finalizeBundle(MBB, B, I);
- Changed = true;
+ // Erase the kill if it's a subset of the used registers.
+ //
+ // TODO: Should we just remove all kills? Is there any real reason to
+ // keep them after RA?
+ if (KillUsedRegUnits.none()) {
+ ++Next;
+ Kill.eraseFromParent();
+ } else
+ break;
+
+ KillUsedRegUnits.reset();
}
- Next = I;
+
+ BundleUsedRegUnits.reset();
}
- B = Next;
- Defs.clear();
- continue;
+ finalizeBundle(MBB, BundleStart, Next);
}
- if (I->getNumExplicitDefs() == 0)
- continue;
-
- Defs.insert(I->defs().begin()->getReg());
- }
-
- if (B != E && std::next(B) != E) {
- finalizeBundle(MBB, B, E);
- Changed = true;
+ Defs.clear();
}
-
- Defs.clear();
}
return Changed;
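
The kill-erasure step above is a register-unit subset test: a KILL that follows a formed bundle can be dropped when every unit it reads is already read somewhere inside the bundle, which is what flipping BundleUsedRegUnits and intersecting checks. A standalone model of that test, using std::bitset in place of llvm::BitVector and made-up unit numbers:

    #include <bitset>
    #include <cassert>

    int main() {
      std::bitset<64> BundleUsed, KillUsed;
      BundleUsed.set(3).set(4).set(10); // reg units read by the bundled mem ops
      KillUsed.set(3).set(4);           // reg units named by the trailing KILL

      // Equivalent to: BundleUsedRegUnits.flip();
      //                KillUsedRegUnits &= BundleUsedRegUnits;
      bool Redundant = (KillUsed & ~BundleUsed).none();
      assert(Redundant && "KILL only repeats units the bundle already uses");
      return 0;
    }
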
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index dc08d9dcb9bb..c2e2875ed6bf 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -38,6 +38,9 @@ private:
RegisterClassInfo RegClassInfo;
std::vector<unsigned> RegsToRewrite;
+#ifndef NDEBUG
+ void printWWMInfo(const MachineInstr &MI);
+#endif
public:
static char ID;
@@ -139,13 +142,26 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
}
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
for (unsigned Reg : RegsToRewrite) {
LIS->removeInterval(Reg);
const Register PhysReg = VRM->getPhys(Reg);
assert(PhysReg != 0);
- MFI->ReserveWWMRegister(PhysReg);
+
+ // Check if PhysReg is already reserved
+ if (!MFI->WWMReservedRegs.count(PhysReg)) {
+ Optional<int> FI;
+ if (!MFI->isEntryFunction()) {
+ // Create a stack object for a possible spill in the function prologue.
+ // Note: Non-CSR VGPR also need this as we may overwrite inactive lanes.
+ const TargetRegisterClass *RC = TRI->getPhysRegClass(PhysReg);
+ FI = FrameInfo.CreateSpillStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlign(*RC));
+ }
+ MFI->reserveWWMRegister(PhysReg, FI);
+ }
}
RegsToRewrite.clear();
@@ -154,6 +170,31 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
MRI->freezeReservedRegs(MF);
}
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void
+SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) {
+
+ unsigned Opc = MI.getOpcode();
+
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) {
+ dbgs() << "Entering ";
+ } else {
+ assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM);
+ dbgs() << "Exiting ";
+ }
+
+ if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) {
+ dbgs() << "Strict WWM ";
+ } else {
+ assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM);
+ dbgs() << "Strict WQM ";
+ }
+
+ dbgs() << "region: " << MI;
+}
+
+#endif
+
bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
@@ -185,21 +226,23 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
RegsAssigned |= processDef(MI.getOperand(0));
- if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
- LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+ if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
+ MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) {
+ LLVM_DEBUG(printWWMInfo(MI));
InWWM = true;
continue;
}
- if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
- LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+ if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM ||
+ MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) {
+ LLVM_DEBUG(printWWMInfo(MI));
InWWM = false;
}
if (!InWWM)
continue;
- LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+ LLVM_DEBUG(dbgs() << "Processing " << MI);
for (MachineOperand &DefOpnd : MI.defs()) {
RegsAssigned |= processDef(DefOpnd);
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 9ca43512cd91..dce0f4b0df5f 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -14,13 +14,20 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;
#define DEBUG_TYPE "si-pre-emit-peephole"
+static unsigned SkipThreshold;
+
+static cl::opt<unsigned, true> SkipThresholdFlag(
+ "amdgpu-skip-threshold", cl::Hidden,
+ cl::desc(
+ "Number of instructions before jumping over divergent control flow"),
+ cl::location(SkipThreshold), cl::init(12));
+
namespace {
class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +37,13 @@ private:
bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+ bool getBlockDestinations(MachineBasicBlock &SrcMBB,
+ MachineBasicBlock *&TrueMBB,
+ MachineBasicBlock *&FalseMBB,
+ SmallVectorImpl<MachineOperand> &Cond);
+ bool mustRetainExeczBranch(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const;
+ bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
public:
static char ID;
@@ -219,8 +233,11 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
return false;
// Scan back to find an identical S_SET_GPR_IDX_ON
- for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
- E = MI.getIterator(); I != E; ++I) {
+ for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
+ E = MI.getIterator();
+ I != E; ++I) {
+ if (I->isBundle())
+ continue;
switch (I->getOpcode()) {
case AMDGPU::S_SET_GPR_IDX_MODE:
return false;
@@ -249,9 +266,77 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
}
}
- MI.eraseFromParent();
+ MI.eraseFromBundle();
for (MachineInstr *RI : ToRemove)
- RI->eraseFromParent();
+ RI->eraseFromBundle();
+ return true;
+}
+
+bool SIPreEmitPeephole::getBlockDestinations(
+ MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
+ MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
+ if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
+ return false;
+
+ if (!FalseMBB)
+ FalseMBB = SrcMBB.getNextNode();
+
+ return true;
+}
+
+bool SIPreEmitPeephole::mustRetainExeczBranch(
+ const MachineBasicBlock &From, const MachineBasicBlock &To) const {
+ unsigned NumInstr = 0;
+ const MachineFunction *MF = From.getParent();
+
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might never be taken when EXEC = 0.
+ // Hence we should retain the cbranch out of the loop lest the loop become
+ // infinite.
+ if (I->isConditionalBranch())
+ return true;
+
+ if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+ return true;
+
+ // These instructions are potentially expensive even if EXEC = 0.
+ if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+ TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
+ return true;
+
+ ++NumInstr;
+ if (NumInstr >= SkipThreshold)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Returns true if the skip branch instruction is removed.
+bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
+ MachineBasicBlock &SrcMBB) {
+ MachineBasicBlock *TrueMBB = nullptr;
+ MachineBasicBlock *FalseMBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+
+ if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
+ return false;
+
+ // Consider only the forward branches.
+ if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
+ mustRetainExeczBranch(*FalseMBB, *TrueMBB))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+ MI.eraseFromParent();
+ SrcMBB.removeSuccessor(TrueMBB);
+
return true;
}
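
A skip branch is kept whenever the region it would jump over is unsafe or expensive to execute with EXEC = 0, or long enough that branching beats falling through; otherwise removeExeczBranch deletes it. A minimal standalone model of that decision (the instruction properties and the default threshold of 12, taken from amdgpu-skip-threshold above, are illustrative):

    #include <vector>

    struct InstrModel {
      bool IsConditionalBranch;  // e.g. a loop backedge branch
      bool UnsafeWhenExecZero;   // memory ops, s_waitcnt, other side effects
    };

    // Returns true if the execz branch over Region must be kept.
    bool mustRetainBranch(const std::vector<InstrModel> &Region,
                          unsigned SkipThreshold = 12) {
      unsigned NumInstr = 0;
      for (const InstrModel &I : Region) {
        if (I.IsConditionalBranch || I.UnsafeWhenExecZero)
          return true;
        if (++NumInstr >= SkipThreshold)
          return true;
      }
      return false;
    }
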
@@ -259,52 +344,25 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
bool Changed = false;
+ MF.RenumberBlocks();
+
for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
- MachineBasicBlock::iterator TermI = MBBE;
- // Check first terminator for VCC branches to optimize
+ MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
+ // Check first terminator for branches to optimize
if (TermI != MBB.end()) {
MachineInstr &MI = *TermI;
switch (MI.getOpcode()) {
case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
Changed |= optimizeVccBranch(MI);
- continue;
- default:
+ break;
+ case AMDGPU::S_CBRANCH_EXECZ:
+ Changed |= removeExeczBranch(MI, MBB);
break;
}
}
- // Check all terminators for SI_RETURN_TO_EPILOG
- // FIXME: This is not an optimization and should be moved somewhere else.
- while (TermI != MBB.end()) {
- MachineInstr &MI = *TermI;
- if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (&MBB != &MF.back() || &MI != &MBB.back()) {
- // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
- // at the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- MI.eraseFromParent();
- MBBE = MBB.getFirstTerminator();
- TermI = MBBE;
- continue;
- }
- }
- TermI++;
- }
if (!ST.hasVGPRIndexMode())
continue;
@@ -315,10 +373,10 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
// Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
// second is not needed. Do expensive checks in the optimizeSetGPR()
// and limit the distance to 20 instructions for compile time purposes.
- for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
- MachineInstr &MI = *MBBI;
- ++MBBI;
-
+ // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
+ // may be bundled with the instructions they modify.
+ for (auto &MI :
+ make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
if (Count == Threshold)
SetGPRMI = nullptr;
else
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 9b72d0829d80..b13afceba20e 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,10 +41,13 @@ struct SIProgramInfo {
uint32_t ScratchBlocks = 0;
uint64_t ComputePGMRSrc2 = 0;
+ uint64_t ComputePGMRSrc3GFX90A = 0;
uint32_t NumVGPR = 0;
uint32_t NumArchVGPR = 0;
uint32_t NumAccVGPR = 0;
+ uint32_t AccumOffset = 0;
+ uint32_t TgSplit = 0;
uint32_t NumSGPR = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7a45d8c54f9a..bba5bf7fdbc3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -43,6 +43,233 @@ std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
+namespace llvm {
+
+// A temporary struct to spill SGPRs.
+// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
+// just v_writelane and v_readlane.
+//
+// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
+// is saved to scratch (or the other way around for loads).
+// For this, a VGPR is required where the needed lanes can be clobbered. The
+// RegScavenger can provide a VGPR where currently active lanes can be
+// clobbered, but we still need to save inactive lanes.
+// The high-level steps are:
+// - Try to scavenge SGPR(s) to save exec
+// - Try to scavenge VGPR
+// - Save needed, all or inactive lanes of a TmpVGPR
+// - Spill/Restore SGPRs using TmpVGPR
+// - Restore TmpVGPR
+//
+// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
+// cannot scavenge temporary SGPRs to save exec, we use the following code:
+// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
+// s_not exec, exec
+// buffer_store_dword TmpVGPR ; save inactive lanes
+// s_not exec, exec
+struct SGPRSpillBuilder {
+ struct PerVGPRData {
+ unsigned PerVGPR;
+ unsigned NumVGPRs;
+ int64_t VGPRLanes;
+ };
+
+ // The SGPR to save
+ Register SuperReg;
+ MachineBasicBlock::iterator MI;
+ ArrayRef<int16_t> SplitParts;
+ unsigned NumSubRegs;
+ bool IsKill;
+ const DebugLoc &DL;
+
+ /* When spilling to stack */
+ // The SGPRs are written into this VGPR, which is then written to scratch
+ // (or vice versa for loads).
+ Register TmpVGPR = AMDGPU::NoRegister;
+ // Temporary spill slot to save TmpVGPR to.
+ int TmpVGPRIndex = 0;
+ // If TmpVGPR is live before the spill or if it is scavenged.
+ bool TmpVGPRLive = false;
+ // Scavenged SGPR to save EXEC.
+ Register SavedExecReg = AMDGPU::NoRegister;
+ // Stack index to write the SGPRs to.
+ int Index;
+ unsigned EltSize = 4;
+
+ RegScavenger *RS;
+ MachineBasicBlock &MBB;
+ MachineFunction &MF;
+ SIMachineFunctionInfo &MFI;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ bool IsWave32;
+ Register ExecReg;
+ unsigned MovOpc;
+ unsigned NotOpc;
+
+ SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
+ bool IsWave32, MachineBasicBlock::iterator MI, int Index,
+ RegScavenger *RS)
+ : SuperReg(MI->getOperand(0).getReg()), MI(MI),
+ IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
+ RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
+ MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
+ IsWave32(IsWave32) {
+ const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
+ SplitParts = TRI.getRegSplitParts(RC, EltSize);
+ NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+ if (IsWave32) {
+ ExecReg = AMDGPU::EXEC_LO;
+ MovOpc = AMDGPU::S_MOV_B32;
+ NotOpc = AMDGPU::S_NOT_B32;
+ } else {
+ ExecReg = AMDGPU::EXEC;
+ MovOpc = AMDGPU::S_MOV_B64;
+ NotOpc = AMDGPU::S_NOT_B64;
+ }
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
+ }
+
+ PerVGPRData getPerVGPRData() {
+ PerVGPRData Data;
+ Data.PerVGPR = IsWave32 ? 32 : 64;
+ Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
+ Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
+ return Data;
+ }
+
+ // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
+ // free.
+ // Writes these instructions if an SGPR can be scavenged:
+ // s_mov_b64 s[6:7], exec ; Save exec
+ // s_mov_b64 exec, 3 ; Wanted lanemask
+ // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
+ //
+ // Writes these instructions if no SGPR can be scavenged:
+ // buffer_store_dword v0 ; Only if no free VGPR was found
+ // s_not_b64 exec, exec
+ // buffer_store_dword v0 ; Save inactive lanes
+ // ; exec stays inverted, it is flipped back in
+ // ; restore.
+ void prepare() {
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ // FIXME: The liveness analysis is limited and does not tell if a register
+ // is in use in lanes that are currently inactive. We can never be sure if
+ // a register is actually in use in another lane, so we need to save all
+ // used lanes of the chosen VGPR.
+ assert(RS && "Cannot spill SGPR to memory without RegScavenger");
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
+
+ // Reserve temporary stack slot
+ TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
+ if (TmpVGPR) {
+ // Found a register that is dead in the currently active lanes, we only
+ // need to spill inactive lanes.
+ TmpVGPRLive = false;
+ } else {
+ // Pick v0 because it doesn't make a difference.
+ TmpVGPR = AMDGPU::VGPR0;
+ TmpVGPRLive = true;
+ }
+
+ // Try to scavenge SGPRs to save exec
+ assert(!SavedExecReg && "Exec is already saved, refuse to save again");
+ const TargetRegisterClass &RC =
+ IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
+ RS->setRegUsed(SuperReg);
+ SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);
+
+ int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
+
+ if (SavedExecReg) {
+ RS->setRegUsed(SavedExecReg);
+ // Set exec to needed lanes
+ BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
+ auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
+ if (!TmpVGPRLive)
+ I.addReg(TmpVGPR, RegState::ImplicitDefine);
+ // Spill needed lanes
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
+ } else {
+ // Spill active lanes
+ if (TmpVGPRLive)
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
+ /*IsKill*/ false);
+ // Spill inactive lanes
+ auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ if (!TmpVGPRLive)
+ I.addReg(TmpVGPR, RegState::ImplicitDefine);
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
+ }
+ }
+
+ // Writes these instructions if an SGPR can be scavenged:
+ // buffer_load_dword v1 ; Restore scavenged VGPR from emergency slot
+ // s_waitcnt vmcnt(0) ; If a free VGPR was found
+ // s_mov_b64 exec, s[6:7] ; Restore exec
+ //
+ // Writes these instructions if no SGPR can be scavenged:
+ // buffer_load_dword v0 ; Restore inactive lanes
+ // s_waitcnt vmcnt(0) ; If a free VGPR was found
+ // s_not_b64 exec, exec
+ // buffer_load_dword v0 ; Only if no free VGPR was found
+ void restore() {
+ if (SavedExecReg) {
+ // Restore used lanes
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
+ /*IsKill*/ false);
+ // Restore exec
+ auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
+ .addReg(SavedExecReg, RegState::Kill);
+ // Add an implicit use of the load so it is not dead.
+ // FIXME This inserts an unnecessary waitcnt
+ if (!TmpVGPRLive) {
+ I.addReg(TmpVGPR, RegState::ImplicitKill);
+ }
+ } else {
+ // Restore inactive lanes
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
+ /*IsKill*/ false);
+ auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ if (!TmpVGPRLive) {
+ I.addReg(TmpVGPR, RegState::ImplicitKill);
+ }
+ // Restore active lanes
+ if (TmpVGPRLive)
+ TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
+ }
+ }
+
+ // Write TmpVGPR to memory or read TmpVGPR from memory.
+ // Either using a single buffer_load/store if exec is set to the needed mask
+ // or using
+ // buffer_load
+ // s_not exec, exec
+ // buffer_load
+ // s_not exec, exec
+ void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
+ if (SavedExecReg) {
+ // Spill needed lanes
+ TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
+ } else {
+ // Spill active lanes
+ TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
+ /*IsKill*/ false);
+ // Spill inactive lanes
+ BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
+ BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
+ }
+ }
+};
+
+} // namespace llvm
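
The lane math in getPerVGPRData is easy to misread: a wave64 target packs up to 64 SGPR dwords into one VGPR, so an N-dword SGPR tuple needs ceil(N/64) temporary VGPRs and an exec mask with min(N, 64) low bits set. A tiny standalone check of that arithmetic (the 4-dword tuple is just an example):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      unsigned NumSubRegs = 4;   // e.g. spilling an s[4:7] tuple
      unsigned PerVGPR = 64;     // wave64; 32 on wave32
      unsigned NumVGPRs = (NumSubRegs + PerVGPR - 1) / PerVGPR;
      int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
      assert(NumVGPRs == 1 && VGPRLanes == 0xF); // lanes 0-3 hold the 4 dwords
      return 0;
    }
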
+
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
@@ -122,7 +349,9 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::AMDGPU_Gfx:
- return CSR_AMDGPU_HighRegs_SaveList;
+ return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+ ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
+ : CSR_AMDGPU_HighRegs_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -143,7 +372,9 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::AMDGPU_Gfx:
- return CSR_AMDGPU_HighRegs_RegMask;
+ return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
+ ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
+ : CSR_AMDGPU_HighRegs_RegMask;
default:
return nullptr;
}
@@ -172,7 +403,7 @@ bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// When we need stack realignment, we can't reference off of the
// stack pointer, so we reserve a base pointer.
const MachineFrameInfo &MFI = MF.getFrameInfo();
- return MFI.getNumFixedObjects() && needsStackRealignment(MF);
+ return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
@@ -181,6 +412,14 @@ const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
return CSR_AMDGPU_AllVGPRs_RegMask;
}
+const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
+ return CSR_AMDGPU_AllAGPRs_RegMask;
+}
+
+const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
+ return CSR_AMDGPU_AllVectorRegs_RegMask;
+}
+
const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
@@ -263,6 +502,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
+ // TODO: In an entry function that makes no calls and uses no AGPRs, it is
+ // possible to use the whole register budget for VGPRs. Moreover, it should
+ // be possible to estimate the maximum AGPR/VGPR pressure and split the
+ // register file accordingly.
+ if (ST.hasGFX90AInsts())
+ MaxNumVGPRs /= 2;
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
@@ -323,9 +568,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
}
- for (MCRegister Reg : MFI->WWMReservedRegs) {
- reserveRegisterTuples(Reserved, Reg);
+ for (auto Reg : MFI->WWMReservedRegs) {
+ reserveRegisterTuples(Reserved, Reg.first);
+ }
+
+ // Reserve VGPRs used for SGPR spilling.
+ // Note we treat freezeReservedRegs unusually because we run register
+ // allocation in two phases. It's OK to re-freeze with new registers for the
+ // second run.
+#if 0
+ for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
+ for (auto &SpilledVGPR : SpilledFI.second)
+ reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
}
+#endif
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
@@ -340,7 +596,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// On entry, the base address is 0, so it can't possibly need any more
// alignment.
@@ -350,7 +606,7 @@ bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
if (Info->isEntryFunction())
return false;
- return TargetRegisterInfo::canRealignStack(MF);
+ return TargetRegisterInfo::shouldRealignStack(MF);
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -408,7 +664,7 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
}
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
- if (!MI->mayLoadOrStore())
+ if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return false;
int64_t FullOffset = Offset + getScratchInstrOffset(MI);
@@ -417,7 +673,8 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
const SIInstrInfo *TII = ST.getInstrInfo();
- return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
+ return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch);
}
Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -457,7 +714,7 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx);
if (ST.enableFlatScratch() ) {
- BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
+ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
.addReg(OffsetReg, RegState::Kill)
.addReg(FIReg);
return BaseReg;
@@ -500,7 +757,8 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
if (IsFlat) {
- assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
+ assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch) &&
"offset should be legal");
FIOp->ChangeToRegister(BaseReg, false);
OffsetOp->setImm(NewOffset);
@@ -531,7 +789,8 @@ bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
const SIInstrInfo *TII = ST.getInstrInfo();
- return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
+ return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch);
}
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -566,6 +825,13 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_A256_SAVE:
case AMDGPU::SI_SPILL_A256_RESTORE:
return 8;
+ case AMDGPU::SI_SPILL_S224_SAVE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
+ case AMDGPU::SI_SPILL_V224_SAVE:
+ case AMDGPU::SI_SPILL_V224_RESTORE:
+ case AMDGPU::SI_SPILL_A224_SAVE:
+ case AMDGPU::SI_SPILL_A224_RESTORE:
+ return 7;
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_V192_SAVE:
@@ -667,13 +933,11 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
+ MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- int Index,
- unsigned Lane,
- unsigned ValueReg,
- bool IsKill) {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MI->getParent()->getParent();
+ int Index, unsigned Lane,
+ unsigned ValueReg, bool IsKill) {
+ MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -691,8 +955,8 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
: AMDGPU::V_ACCVGPR_READ_B32_e64;
- auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
- .addReg(Src, getKillRegState(IsKill));
+ auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+ .addReg(Src, getKillRegState(IsKill));
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
return MIB;
}
@@ -716,7 +980,7 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
+ if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
return true;
MachineInstrBuilder NewMI =
@@ -725,10 +989,8 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
.addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
+ .addImm(0) // cpol
.addImm(0) // tfe
- .addImm(0) // dlc
.addImm(0) // swz
.cloneMemRefs(*MI);
@@ -774,23 +1036,20 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
return LoadStoreOp;
}
-void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp,
- int Index,
- Register ValueReg,
- bool IsKill,
- MCRegister ScratchOffsetReg,
- int64_t InstOffset,
- MachineMemOperand *MMO,
- RegScavenger *RS) const {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MI->getParent()->getParent();
+void SIRegisterInfo::buildSpillLoadStore(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
+ MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
+ RegScavenger *RS, LivePhysRegs *LiveRegs) const {
+ assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");
+
+ MachineFunction *MF = MBB.getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
- const DebugLoc &DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
bool IsStore = Desc->mayStore();
bool IsFlat = TII->isFLATScratch(LoadStoreOp);
@@ -798,7 +1057,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MCRegister SOffset = ScratchOffsetReg;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
- const bool IsAGPR = hasAGPRs(RC);
+ // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
+ const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
// Always use 4 byte operations for AGPRs because we need to scavenge
@@ -823,9 +1083,10 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
assert((IsFlat || ((Offset % EltSize) == 0)) &&
"unexpected VGPR spill offset");
- bool IsOffsetLegal = IsFlat
- ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
- : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
+ bool IsOffsetLegal =
+ IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)
+ : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
SOffset = MCRegister();
@@ -836,9 +1097,17 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
Offset *= ST.getWavefrontSize();
// We don't have access to the register scavenger if this function is called
- // during PEI::scavengeFrameVirtualRegs().
- if (RS)
+ // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
+ if (RS) {
SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
+ } else if (LiveRegs) {
+ for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
+ if (LiveRegs->available(MF->getRegInfo(), Reg)) {
+ SOffset = Reg;
+ break;
+ }
+ }
+ }
if (!SOffset) {
// There are no free SGPRs, and since we are in the process of spilling
@@ -860,10 +1129,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
report_fatal_error("could not scavenge SGPR to spill in entry function");
if (ScratchOffsetReg == AMDGPU::NoRegister) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
- .addImm(Offset);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
.addReg(ScratchOffsetReg)
.addImm(Offset);
}
@@ -916,7 +1184,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
Register Sub = IsSubReg
? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
: ValueReg;
- auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
+ auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
if (!MIB.getInstr())
break;
if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
@@ -962,9 +1230,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RS->setRegUsed(TmpReg);
}
if (IsStore) {
- auto AccRead = BuildMI(*MBB, MI, DL,
- TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
- .addReg(SubReg, getKillRegState(IsKill));
+ auto AccRead = BuildMI(MBB, MI, DL,
+ TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
+ .addReg(SubReg, getKillRegState(IsKill));
if (NeedSuperRegDef)
AccRead.addReg(ValueReg, RegState::ImplicitDefine);
AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
@@ -977,9 +1245,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
commonAlignment(Alignment, RemRegOffset));
- auto MIB = BuildMI(*MBB, MI, DL, *Desc)
- .addReg(SubReg,
- getDefRegState(!IsStore) | getKillRegState(IsKill));
+ auto MIB =
+ BuildMI(MBB, MI, DL, *Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
if (!IsFlat)
MIB.addReg(FuncInfo->getScratchRSrcReg());
@@ -990,11 +1258,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MIB.addReg(SOffset, SOffsetRegState);
}
MIB.addImm(Offset + RemRegOffset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0); // tfe for MUBUF or dlc for FLAT
+ .addImm(0); // cpol
if (!IsFlat)
- MIB.addImm(0) // dlc
+ MIB.addImm(0) // tfe
.addImm(0); // swz
MIB.addMemOperand(NewMMO);
@@ -1002,9 +1268,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MIB.addReg(ValueReg, RegState::ImplicitDefine);
if (!IsStore && TmpReg != AMDGPU::NoRegister) {
- MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
+ MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
FinalReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpReg, RegState::Kill);
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
@@ -1014,321 +1280,239 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
.addReg(SOffset)
- .addImm(ScratchOffsetRegDelta);
+ .addImm(-ScratchOffsetRegDelta);
}
}
-// Generate a VMEM access which loads or stores the VGPR containing an SGPR
-// spill such that all the lanes set in VGPRLanes are loaded or stored.
-// This generates exec mask manipulation and will use SGPRs available in MI
-// or VGPR lanes in the VGPR to save and restore the exec mask.
-void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
- int Index, int Offset,
- unsigned EltSize, Register VGPR,
- int64_t VGPRLanes,
- RegScavenger *RS,
- bool IsLoad) const {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MBB->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- Register SuperReg = MI->getOperand(0).getReg();
- const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
- unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- unsigned FirstPart = Offset * 32;
- unsigned ExecLane = 0;
-
- bool IsKill = MI->getOperand(0).isKill();
- const DebugLoc &DL = MI->getDebugLoc();
-
- // Cannot handle load/store to EXEC
- assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
- SuperReg != AMDGPU::EXEC && "exec should never spill");
-
- // On Wave32 only handle EXEC_LO.
- // On Wave64 only update EXEC_HI if there is sufficent space for a copy.
- bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
-
- unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- Register SavedExecReg;
-
- // Backup EXEC
- if (OnlyExecLo) {
- SavedExecReg =
- NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
- } else {
- // If src/dst is an odd size it is possible subreg0 is not aligned.
- for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
- SavedExecReg = getMatchingSuperReg(
- getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
- &AMDGPU::SReg_64_XEXECRegClass);
- if (SavedExecReg)
- break;
- }
- }
- assert(SavedExecReg);
- BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
-
- // Setup EXEC
- BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
-
+void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
+ int Offset, bool IsLoad,
+ bool IsKill) const {
// Load/store VGPR
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
- Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
- ? getBaseRegister()
- : getFrameRegister(*MF);
+ Register FrameReg =
+ FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
+ ? getBaseRegister()
+ : getFrameRegister(SB.MF);
Align Alignment = FrameInfo.getObjectAlign(Index);
- MachinePointerInfo PtrInfo =
- MachinePointerInfo::getFixedStack(*MF, Index);
- MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
+ MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
- EltSize, Alignment);
+ SB.EltSize, Alignment);
if (IsLoad) {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc,
- Index,
- VGPR, false,
- FrameReg,
- Offset * EltSize, MMO,
- RS);
+ buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
+ Offset * SB.EltSize, MMO, SB.RS);
} else {
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc, Index, VGPR,
- IsKill, FrameReg,
- Offset * EltSize, MMO, RS);
+ buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
+ Offset * SB.EltSize, MMO, SB.RS);
// This only ever adds one VGPR spill
- MFI->addToSpilledVGPRs(1);
- }
-
- // Restore EXEC
- BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
- .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
-
- // Restore clobbered SGPRs
- if (IsLoad) {
- // Nothing to do; register will be overwritten
- } else if (!IsKill) {
- // Restore SGPRs from appropriate VGPR lanes
- if (!OnlyExecLo) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
- getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
- .addReg(VGPR)
- .addImm(ExecLane + 1);
- }
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
- NumSubRegs == 1 ? SavedExecReg
- : Register(getSubReg(
- SuperReg, SplitParts[FirstPart + ExecLane])))
- .addReg(VGPR, RegState::Kill)
- .addImm(ExecLane);
+ SB.MFI.addToSpilledVGPRs(1);
}
}
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
+ LiveIntervals *LIS,
bool OnlyToVGPR) const {
- MachineBasicBlock *MBB = MI->getParent();
- MachineFunction *MF = MBB->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
- = MFI->getSGPRToVGPRSpills(Index);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
+ SB.MFI.getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- Register SuperReg = MI->getOperand(0).getReg();
- bool IsKill = MI->getOperand(0).isKill();
- const DebugLoc &DL = MI->getDebugLoc();
-
- assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
- SuperReg != MFI->getFrameOffsetReg()));
-
- assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
- SuperReg != AMDGPU::EXEC && "exec should never spill");
-
- unsigned EltSize = 4;
- const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-
- ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
- unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
+ SB.SuperReg != SB.MFI.getFrameOffsetReg()));
if (SpillToVGPR) {
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- bool UseKill = IsKill && i == NumSubRegs - 1;
+ bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;
// Mark the "old value of vgpr" input undef only if this is the first sgpr
// spill to this specific vgpr in the first basic block.
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
- .addReg(SubReg, getKillRegState(UseKill))
- .addImm(Spill.Lane)
- .addReg(Spill.VGPR);
+ auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg, getKillRegState(UseKill))
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR);
+ if (LIS) {
+ if (i == 0)
+ LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
+ else
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
- if (i == 0 && NumSubRegs > 1) {
+ if (i == 0 && SB.NumSubRegs > 1) {
// We may be spilling a super-register which is only partially defined,
// and need to ensure later spills think the value is defined.
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
}
- if (NumSubRegs > 1)
- MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
+ if (SB.NumSubRegs > 1)
+ MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
// it are fixed.
}
} else {
- // Scavenged temporary VGPR to use. It must be scavenged once for any number
- // of spilled subregs.
- Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- RS->setRegUsed(TmpVGPR);
+ SB.prepare();
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+ // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
+ unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
- unsigned PerVGPR = 32;
- unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
- int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+ // Per VGPR helper data
+ auto PVD = SB.getPerVGPRData();
- for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
unsigned TmpVGPRFlags = RegState::Undef;
// Write sub registers into the VGPR
- for (unsigned i = Offset * PerVGPR,
- e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ for (unsigned i = Offset * PVD.PerVGPR,
+ e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
MachineInstrBuilder WriteLane =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
+ BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
+ SB.TmpVGPR)
.addReg(SubReg, SubKillState)
- .addImm(i % PerVGPR)
- .addReg(TmpVGPR, TmpVGPRFlags);
+ .addImm(i % PVD.PerVGPR)
+ .addReg(SB.TmpVGPR, TmpVGPRFlags);
TmpVGPRFlags = 0;
+ if (LIS) {
+ if (i == 0)
+ LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane);
+ else
+ LIS->InsertMachineInstrInMaps(*WriteLane);
+ }
+
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
- if (NumSubRegs > 1) {
- // The last implicit use of the SuperReg carries the "Kill" flag.
+ if (SB.NumSubRegs > 1) {
+        // The last implicit use of SB.SuperReg carries the "Kill" flag.
unsigned SuperKillState = 0;
- if (i + 1 == NumSubRegs)
- SuperKillState |= getKillRegState(IsKill);
- WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ if (i + 1 == SB.NumSubRegs)
+ SuperKillState |= getKillRegState(SB.IsKill);
+ WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
}
}
// Write out VGPR
- buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
- RS, false);
+ SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
}
+
+ SB.restore();
}
MI->eraseFromParent();
- MFI->addToSpilledSGPRs(NumSubRegs);
+ SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
+
+ if (LIS)
+ LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
+
return true;
}
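Note on the lane bookkeeping: the SGPRSpillBuilder used above is assumed to encapsulate the same arithmetic the removed PerVGPR/NumVGPRs/VGPRLanes code spelled out, namely that each temporary VGPR holds up to 32 SGPR lanes, an N-dword tuple needs ceil(N/32) VGPR round trips, and the lane mask for one group is (1 << min(32, N)) - 1. A minimal standalone sketch of that arithmetic in plain C++ (illustrative only, not the in-tree SGPRSpillBuilder::getPerVGPRData):

// Sketch of the per-VGPR grouping math assumed behind getPerVGPRData().
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

struct PerVGPRData {
  unsigned PerVGPR;   // SGPR lanes stored per temporary VGPR
  unsigned NumVGPRs;  // VGPR round trips needed for the whole tuple
  int64_t VGPRLanes;  // lane mask covering one full group
};

static PerVGPRData getPerVGPRData(unsigned NumSubRegs) {
  PerVGPRData PVD;
  PVD.PerVGPR = 32;
  PVD.NumVGPRs = (NumSubRegs + (PVD.PerVGPR - 1)) / PVD.PerVGPR;
  PVD.VGPRLanes = (1LL << std::min(PVD.PerVGPR, NumSubRegs)) - 1LL;
  return PVD;
}

int main() {
  // e.g. an SGPR_224 tuple: 7 dwords, one temporary VGPR, lanes 0..6 used.
  PerVGPRData PVD = getPerVGPRData(7);
  assert(PVD.NumVGPRs == 1 && PVD.VGPRLanes == 0x7f);
  std::printf("VGPRs=%u lanes=0x%llx\n", PVD.NumVGPRs,
              (unsigned long long)PVD.VGPRLanes);
  return 0;
}

The same grouping drives both the spill loop above and the restore loop below, which is why the new 224-bit tuples still take a single temporary VGPR.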
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
+ LiveIntervals *LIS,
bool OnlyToVGPR) const {
- MachineFunction *MF = MI->getParent()->getParent();
- MachineBasicBlock *MBB = MI->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
- = MFI->getSGPRToVGPRSpills(Index);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
+ SB.MFI.getSGPRToVGPRSpills(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
- const SIInstrInfo *TII = ST.getInstrInfo();
- const DebugLoc &DL = MI->getDebugLoc();
-
- Register SuperReg = MI->getOperand(0).getReg();
-
- assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
- SuperReg != AMDGPU::EXEC && "exec should never spill");
-
- unsigned EltSize = 4;
-
- const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
-
- ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
- unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
-
if (SpillToVGPR) {
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
- auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane);
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ auto MIB =
+ BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane);
+ if (SB.NumSubRegs > 1 && i == 0)
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
+ if (LIS) {
+ if (i == e - 1)
+ LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
+ else
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
+
}
} else {
- Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- RS->setRegUsed(TmpVGPR);
+ SB.prepare();
- unsigned PerVGPR = 32;
- unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
- int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+ // Per VGPR helper data
+ auto PVD = SB.getPerVGPRData();
- for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Load in VGPR data
- buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
- RS, true);
+ SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
// Unpack lanes
- for (unsigned i = Offset * PerVGPR,
- e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ for (unsigned i = Offset * PVD.PerVGPR,
+ e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
i < e; ++i) {
- Register SubReg = NumSubRegs == 1
- ? SuperReg
- : Register(getSubReg(SuperReg, SplitParts[i]));
+ Register SubReg =
+ SB.NumSubRegs == 1
+ ? SB.SuperReg
+ : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
bool LastSubReg = (i + 1 == e);
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
- .addReg(TmpVGPR, getKillRegState(LastSubReg))
- .addImm(i);
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ auto MIB = BuildMI(SB.MBB, MI, SB.DL,
+ SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
+ .addImm(i);
+ if (SB.NumSubRegs > 1 && i == 0)
+ MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
+ if (LIS) {
+ if (i == e - 1)
+ LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
+ else
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
}
}
+
+ SB.restore();
}
MI->eraseFromParent();
+
+ if (LIS)
+ LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
+
return true;
}
@@ -1338,28 +1522,31 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI,
int FI,
- RegScavenger *RS) const {
+ RegScavenger *RS,
+ LiveIntervals *LIS) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S224_SAVE:
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
- return spillSGPR(MI, FI, RS, true);
+ return spillSGPR(MI, FI, RS, LIS, true);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
- return restoreSGPR(MI, FI, RS, true);
+ return restoreSGPR(MI, FI, RS, LIS, true);
default:
llvm_unreachable("not an SGPR spill instruction");
}
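For reference, the SI_SPILL_*224* pseudos added to these switches cover seven 32-bit lanes; every SI_SPILL_<W>_* opcode spans W/32 dwords. A trivial illustrative helper (plain C++, not the in-tree getNumSubRegsForSpillOp) makes that relationship explicit:

// Illustrative only: dword count implied by a spill pseudo's bit width.
#include <cassert>

static unsigned numDwordsForSpillWidth(unsigned Bits) {
  assert(Bits % 32 == 0 && "spill pseudos are dword multiples");
  return Bits / 32;
}

int main() {
  assert(numDwordsForSpillWidth(224) == 7);
  assert(numDwordsForSpillWidth(1024) == 32);
  return 0;
}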
@@ -1389,6 +1576,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S224_SAVE:
case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
@@ -1403,6 +1591,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S224_RESTORE:
case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
@@ -1417,6 +1606,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V224_SAVE:
case AMDGPU::SI_SPILL_V192_SAVE:
case AMDGPU::SI_SPILL_V160_SAVE:
case AMDGPU::SI_SPILL_V128_SAVE:
@@ -1426,6 +1616,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A256_SAVE:
+ case AMDGPU::SI_SPILL_A224_SAVE:
case AMDGPU::SI_SPILL_A192_SAVE:
case AMDGPU::SI_SPILL_A160_SAVE:
case AMDGPU::SI_SPILL_A128_SAVE:
@@ -1439,13 +1630,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc,
- Index,
- VData->getReg(), VData->isKill(),
- FrameReg,
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
- *MI->memoperands_begin(),
- RS);
+ auto *MBB = MI->getParent();
+ buildSpillLoadStore(
+ *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(), RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
MI->eraseFromParent();
break;
@@ -1456,6 +1645,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V160_RESTORE:
case AMDGPU::SI_SPILL_V192_RESTORE:
+ case AMDGPU::SI_SPILL_V224_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE:
case AMDGPU::SI_SPILL_V1024_RESTORE:
@@ -1465,6 +1655,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_A128_RESTORE:
case AMDGPU::SI_SPILL_A160_RESTORE:
case AMDGPU::SI_SPILL_A192_RESTORE:
+ case AMDGPU::SI_SPILL_A224_RESTORE:
case AMDGPU::SI_SPILL_A256_RESTORE:
case AMDGPU::SI_SPILL_A512_RESTORE:
case AMDGPU::SI_SPILL_A1024_RESTORE: {
@@ -1475,18 +1666,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
- buildSpillLoadStore(MI, Opc,
- Index,
- VData->getReg(), VData->isKill(),
- FrameReg,
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
- *MI->memoperands_begin(),
- RS);
+ auto *MBB = MI->getParent();
+ buildSpillLoadStore(
+ *MBB, MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(), RS);
MI->eraseFromParent();
break;
}
default: {
+ // Other access to frame index
const DebugLoc &DL = MI->getDebugLoc();
int64_t Offset = FrameInfo.getObjectOffset(Index);
@@ -1507,7 +1697,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
int64_t NewOffset = Offset + OffsetOp->getImm();
if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
- true)) {
+ SIInstrFlags::FlatScratch)) {
OffsetOp->setImm(NewOffset);
if (FrameReg)
return;
@@ -1580,9 +1770,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
FIOp.setIsKill(false);
}
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
- .addReg(FrameReg)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
+ .addReg(FrameReg)
+ .addImm(Offset);
if (!UseSGPR)
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
@@ -1590,10 +1780,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (TmpSReg == FrameReg) {
// Undo frame register modification.
- BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
+ BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
FrameReg)
- .addReg(FrameReg)
- .addImm(Offset);
+ .addReg(FrameReg)
+ .addImm(-Offset);
}
return;
@@ -1667,17 +1857,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
.addReg(ScaledReg, RegState::Kill);
// If there were truly no free SGPRs, we need to undo everything.
if (!TmpScaledReg.isValid()) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(-Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
.addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
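The rewritten sequences above materialize the scaled offset directly into an SGPR and, when no spare register could be scavenged, undo the change after the use by adding the negated immediate and re-scaling by the wavefront size, rather than issuing S_SUB_U32. A minimal model of that in-place rewrite and its undo, with plain integers standing in for SGPR contents and the example frame value chosen wave-size aligned (not the LLVM API):

// Sketch of the scale / add-offset / use / undo sequence emitted above.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned WavefrontSizeLog2 = 6; // wave64 assumed for this example
  uint32_t FrameReg = 0x4000;           // value held by the frame register
  const int32_t Offset = 0x120;         // frame object offset

  // Materialize (FrameReg >> log2(wavesize)) + Offset in the same register.
  uint32_t Scaled = FrameReg >> WavefrontSizeLog2; // S_LSHR_B32
  Scaled += Offset;                                // S_ADD_I32 ..., Offset
  uint32_t Materialized = Scaled;                  // consumed by the user
  (void)Materialized;

  // Undo everything when no free SGPR was available.
  Scaled += -Offset;                               // S_ADD_I32 ..., -Offset
  Scaled <<= WavefrontSizeLog2;                    // S_LSHL_B32
  assert(Scaled == FrameReg && "frame register restored");
  return 0;
}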
@@ -1735,14 +1925,8 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
return AMDGPUInstPrinter::getRegisterName(Reg);
}
-const TargetRegisterClass *
-SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth == 1)
- return &AMDGPU::VReg_1RegClass;
- if (BitWidth <= 16)
- return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
- return &AMDGPU::VGPR_32RegClass;
+static const TargetRegisterClass *
+getAnyVGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 64)
return &AMDGPU::VReg_64RegClass;
if (BitWidth <= 96)
@@ -1753,6 +1937,8 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::VReg_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::VReg_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::VReg_256RegClass;
if (BitWidth <= 512)
@@ -1763,12 +1949,44 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
return nullptr;
}
+static const TargetRegisterClass *
+getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 64)
+ return &AMDGPU::VReg_64_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::VReg_96_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::VReg_128_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::VReg_160_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::VReg_192_Align2RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::VReg_224_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::VReg_256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::VReg_512_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::VReg_1024_Align2RegClass;
+
+ return nullptr;
+}
+
const TargetRegisterClass *
-SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
+SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth == 1)
+ return &AMDGPU::VReg_1RegClass;
if (BitWidth <= 16)
- return &AMDGPU::AGPR_LO16RegClass;
+ return &AMDGPU::VGPR_LO16RegClass;
if (BitWidth <= 32)
- return &AMDGPU::AGPR_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
+ return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
+ : getAnyVGPRClassForBitWidth(BitWidth);
+}
+
+static const TargetRegisterClass *
+getAnyAGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 64)
return &AMDGPU::AReg_64RegClass;
if (BitWidth <= 96)
@@ -1779,6 +1997,8 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::AReg_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::AReg_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::AReg_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::AReg_256RegClass;
if (BitWidth <= 512)
@@ -1789,6 +2009,40 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
return nullptr;
}
+static const TargetRegisterClass *
+getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 64)
+ return &AMDGPU::AReg_64_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::AReg_96_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::AReg_128_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::AReg_160_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::AReg_192_Align2RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::AReg_224_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::AReg_256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::AReg_512_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::AReg_1024_Align2RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth <= 16)
+ return &AMDGPU::AGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::AGPR_32RegClass;
+ return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
+ : getAnyAGPRClassForBitWidth(BitWidth);
+}
+
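Both ladders are a simple first-fit over the supported tuple widths, now including 224 bits, and getVGPRClassForBitWidth/getAGPRClassForBitWidth pick the _Align2 flavour when the subtarget requires even VGPR alignment. A hedged standalone sketch of that selection, returning class names as strings and omitting the sub-32-bit special cases (illustrative, not the in-tree code):

// First-fit width-to-class selection with the aligned/unaligned dispatch.
#include <cstdio>
#include <string>

static const unsigned TupleBits[] = {64, 96, 128, 160, 192, 224,
                                     256, 512, 1024};

static std::string vgprClassForBitWidth(unsigned BitWidth, bool NeedsAligned) {
  if (BitWidth <= 32)
    return "VGPR_32";
  for (unsigned Bits : TupleBits)
    if (BitWidth <= Bits)
      return "VReg_" + std::to_string(Bits) + (NeedsAligned ? "_Align2" : "");
  return "<none>"; // wider than 1024 bits is unsupported
}

int main() {
  // On subtargets where needsAlignedVGPRs() is true (gfx90a variants):
  std::printf("%s\n", vgprClassForBitWidth(224, /*NeedsAligned=*/true).c_str());
  // prints: VReg_224_Align2
  return 0;
}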
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 16)
@@ -1805,6 +2059,8 @@ SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
return &AMDGPU::SGPR_160RegClass;
if (BitWidth <= 192)
return &AMDGPU::SGPR_192RegClass;
+ if (BitWidth <= 224)
+ return &AMDGPU::SGPR_224RegClass;
if (BitWidth <= 256)
return &AMDGPU::SGPR_256RegClass;
if (BitWidth <= 512)
@@ -1827,29 +2083,51 @@ SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass,
+ &AMDGPU::AGPR_32RegClass,
+ &AMDGPU::VReg_64_Align2RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
+ &AMDGPU::AReg_64_Align2RegClass,
&AMDGPU::AReg_64RegClass,
+ &AMDGPU::VReg_96_Align2RegClass,
&AMDGPU::VReg_96RegClass,
&AMDGPU::SReg_96RegClass,
+ &AMDGPU::AReg_96_Align2RegClass,
&AMDGPU::AReg_96RegClass,
+ &AMDGPU::VReg_128_Align2RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
+ &AMDGPU::AReg_128_Align2RegClass,
&AMDGPU::AReg_128RegClass,
+ &AMDGPU::VReg_160_Align2RegClass,
&AMDGPU::VReg_160RegClass,
&AMDGPU::SReg_160RegClass,
+ &AMDGPU::AReg_160_Align2RegClass,
&AMDGPU::AReg_160RegClass,
+ &AMDGPU::VReg_192_Align2RegClass,
&AMDGPU::VReg_192RegClass,
&AMDGPU::SReg_192RegClass,
+ &AMDGPU::AReg_192_Align2RegClass,
&AMDGPU::AReg_192RegClass,
+ &AMDGPU::VReg_224_Align2RegClass,
+ &AMDGPU::VReg_224RegClass,
+ &AMDGPU::SReg_224RegClass,
+ &AMDGPU::AReg_224_Align2RegClass,
+ &AMDGPU::AReg_224RegClass,
+ &AMDGPU::VReg_256_Align2RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
+ &AMDGPU::AReg_256_Align2RegClass,
&AMDGPU::AReg_256RegClass,
+ &AMDGPU::VReg_512_Align2RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
+ &AMDGPU::AReg_512_Align2RegClass,
&AMDGPU::AReg_512RegClass,
&AMDGPU::SReg_1024RegClass,
+ &AMDGPU::VReg_1024_Align2RegClass,
&AMDGPU::VReg_1024RegClass,
+ &AMDGPU::AReg_1024_Align2RegClass,
&AMDGPU::AReg_1024RegClass,
&AMDGPU::SCC_CLASSRegClass,
&AMDGPU::Pseudo_SReg_32RegClass,
@@ -1949,6 +2227,16 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return RC;
}
+const TargetRegisterClass *
+SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
+ const TargetRegisterClass *SubRC,
+ unsigned SubIdx) const {
+ // Ensure this subregister index is aligned in the super register.
+ const TargetRegisterClass *MatchRC =
+ getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
+ return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
+}
+
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
@@ -2147,6 +2435,12 @@ MCRegister SIRegisterInfo::getVCC() const {
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
+const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
+ // VGPR tuples have an alignment requirement on gfx90a variants.
+ return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+ : &AMDGPU::VReg_64RegClass;
+}
+
const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
switch ((int)RCID) {
@@ -2234,6 +2528,18 @@ MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
return AMDGPU::NoRegister;
}
+bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
+ if (!ST.needsAlignedVGPRs())
+ return true;
+
+ if (hasVGPRs(&RC))
+ return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
+ if (hasAGPRs(&RC))
+ return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
+
+ return true;
+}
+
bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
switch (PhysReg) {
case AMDGPU::SGPR_NULL:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 963da9b3536b..2a92051e5fb2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -21,7 +21,9 @@ namespace llvm {
class GCNSubtarget;
class LiveIntervals;
+class LivePhysRegs;
class RegisterBank;
+struct SGPRSpillBuilder;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
@@ -79,7 +81,7 @@ public:
bool hasBasePointer(const MachineFunction &MF) const;
Register getBaseRegister() const;
- bool canRealignStack(const MachineFunction &MF) const override;
+ bool shouldRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -106,18 +108,18 @@ public:
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
- void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
- int Offset, unsigned EltSize, Register VGPR,
- int64_t VGPRLanes, RegScavenger *RS,
- bool IsLoad) const;
+ void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
+ bool IsLoad, bool IsKill = true) const;
/// If \p OnlyToVGPR is true, this will only succeed if this
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
+ LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
bool restoreSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
+ LiveIntervals *LIS = nullptr,
bool OnlyToVGPR = false) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
@@ -125,7 +127,8 @@ public:
RegScavenger *RS) const override;
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS) const;
+ int FI, RegScavenger *RS,
+ LiveIntervals *LIS = nullptr) const;
StringRef getRegAsmName(MCRegister Reg) const override;
@@ -134,8 +137,13 @@ public:
return getEncodingValue(Reg) & 0xff;
}
- static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth);
- static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth);
+ LLVM_READONLY
+ const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
+ const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
/// Return the 'base' register class for this register.
@@ -182,12 +190,21 @@ public:
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
- /// \returns The register class that is used for a sub-register of \p RC for
- /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
- /// be returned.
+ /// \returns The canonical register class that is used for a sub-register of
+ /// \p RC for the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC
+ /// will be returned.
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;
+ /// Returns a register class which is compatible with \p SuperRC, such that a
+ /// subregister exists with class \p SubRC with subregister index \p
+ /// SubIdx. If this is impossible (e.g., an unaligned subregister index within
+ /// a register tuple), return null.
+ const TargetRegisterClass *
+ getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
+ const TargetRegisterClass *SubRC,
+ unsigned SubIdx) const;
+
bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
@@ -268,6 +285,10 @@ public:
: &AMDGPU::SReg_64_XEXECRegClass;
}
+ // Return the appropriate register class to use for 64-bit VGPRs for the
+ // subtarget.
+ const TargetRegisterClass *getVGPR64Class() const;
+
MCRegister getVCC() const;
const TargetRegisterClass *getRegClass(unsigned RCID) const;
@@ -279,6 +300,8 @@ public:
LiveIntervals *LIS) const;
const uint32_t *getAllVGPRRegMask() const;
+ const uint32_t *getAllAGPRRegMask() const;
+ const uint32_t *getAllVectorRegMask() const;
const uint32_t *getAllAllocatableSRegMask() const;
// \returns number of 32 bit registers covered by a \p LM
@@ -306,6 +329,10 @@ public:
// \returns \p Reg otherwise.
MCPhysReg get32BitRegister(MCPhysReg Reg) const;
+ // Returns true if a given register class is properly aligned for
+ // the subtarget.
+ bool isProperlyAlignedRC(const TargetRegisterClass &RC) const;
+
/// Return all SGPR128 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
@@ -318,16 +345,16 @@ public:
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
-private:
- void buildSpillLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp,
- int Index,
- Register ValueReg,
- bool ValueIsKill,
- MCRegister ScratchOffsetReg,
- int64_t InstrOffset,
- MachineMemOperand *MMO,
- RegScavenger *RS) const;
+ // Insert spill or restore instructions.
+ // When lowering spill pseudos, the RegScavenger should be set.
+ // For creating spill instructions during frame lowering, where no scavenger
+ // is available, LiveRegs can be used.
+ void buildSpillLoadStore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
+ int Index, Register ValueReg, bool ValueIsKill,
+ MCRegister ScratchOffsetReg, int64_t InstrOffset,
+ MachineMemOperand *MMO, RegScavenger *RS,
+ LivePhysRegs *LiveRegs = nullptr) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 92390f1f3297..6e3c4e8775f3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -36,12 +36,12 @@ foreach Index = 1...31 in {
foreach Size = {2...6,8,16} in {
foreach Index = Indexes<!sub(33, Size)>.slice in {
- def !foldl("", Indexes<Size>.slice, acc, cur,
- !strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
+ def !interleave(!foreach(cur, Indexes<Size>.slice, "sub"#!add(cur, Index)),
+ "_") :
SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
let CoveringSubRegIndices =
- !foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur,
- !listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))]));
+ !foreach(cur, Indexes<Size>.slice,
+ !cast<SubRegIndex>(sub#!add(cur, Index)));
}
}
}
@@ -58,6 +58,7 @@ class getSubRegs<int size> {
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
+ list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
@@ -77,9 +78,10 @@ class getSubRegs<int size> {
!if(!eq(size, 4), ret4,
!if(!eq(size, 5), ret5,
!if(!eq(size, 6), ret6,
- !if(!eq(size, 8), ret8,
- !if(!eq(size, 16), ret16,
- ret32)))))));
+ !if(!eq(size, 7), ret7,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16,
+ ret32))))))));
}
// Generates list of sequential register tuple names.
@@ -350,9 +352,12 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
-// SGPR 192-bit registers
+// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs.
def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
+// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs.
+def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">;
+
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@@ -368,6 +373,7 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
let isAllocatable = 0;
}
+// Trap handler TMP 16-bit registers
def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
(add (sequence "TTMP%u_LO16", 0, 15))> {
let Size = 16;
@@ -377,11 +383,25 @@ def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
// Trap handler TMP 64-bit registers
def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;
+// Trap handler TMP 96-bit registers
+def TTMP_96Regs : SIRegisterTuples<getSubRegs<3>.ret, TTMP_32, 15, 3, 3, "ttmp">;
+
// Trap handler TMP 128-bit registers
def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">;
+// Trap handler TMP 160-bit registers
+def TTMP_160Regs : SIRegisterTuples<getSubRegs<5>.ret, TTMP_32, 15, 4, 5, "ttmp">;
+
+// Trap handler TMP 192-bit registers
+def TTMP_192Regs : SIRegisterTuples<getSubRegs<6>.ret, TTMP_32, 15, 4, 6, "ttmp">;
+
+// Trap handler TMP 224-bit registers
+def TTMP_224Regs : SIRegisterTuples<getSubRegs<7>.ret, TTMP_32, 15, 4, 7, "ttmp">;
+
+// Trap handler TMP 256-bit registers
def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;
+// Trap handler TMP 512-bit registers
def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;
class TmpRegTuplesBase<int index, int size,
@@ -508,6 +528,9 @@ def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
// VGPR 192-bit registers
def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+// VGPR 224-bit registers
+def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">;
+
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@@ -547,6 +570,9 @@ def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">;
// AGPR 192-bit registers
def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
+// AGPR 224-bit registers
+def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">;
+
// AGPR 256-bit registers
def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
@@ -682,111 +708,53 @@ def SReg_1 : RegisterClass<"AMDGPU", [i1], 32,
let isAllocatable = 0;
}
-// Requires 2 s_mov_b64 to copy
-let CopyCost = 2 in {
-
-// There are no 3-component scalar instructions, but this is needed
-// for symmetry with VGPRs.
-def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96Regs)> {
- let AllocationPriority = 14;
-}
-
-def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96)> {
- let AllocationPriority = 14;
-}
-
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
- (add SGPR_128Regs)> {
- let AllocationPriority = 15;
-}
-
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
- (add TTMP_128Regs)> {
- let isAllocatable = 0;
-}
-
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add SGPR_128, TTMP_128)> {
- let isAllocatable = 0;
-}
-
-} // End CopyCost = 2
-
-// There are no 5-component scalar instructions, but this is needed
-// for symmetry with VGPRs.
-def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160Regs)> {
- let AllocationPriority = 16;
-}
-
-def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160)> {
- // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
- // subclasses of SGPR_160 to be marked unallocatable too.
-}
-
-def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
- let Size = 192;
- let AllocationPriority = 17;
-}
-
-def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
- let Size = 192;
- let isAllocatable = 0;
-}
-
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
- let AllocationPriority = 18;
-}
-
-def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
- let isAllocatable = 0;
-}
+multiclass SRegClass<int numRegs, int priority,
+ list<ValueType> regTypes,
+ SIRegisterTuples regList,
+ SIRegisterTuples ttmpList = regList,
+ int copyCost = !sra(!add(numRegs, 1), 1)> {
+ defvar hasTTMP = !ne(regList, ttmpList);
+ defvar suffix = !cast<string>(!mul(numRegs, 32));
+ defvar sgprName = !strconcat("SGPR_", suffix);
+ defvar ttmpName = !strconcat("TTMP_", suffix);
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
- (add SGPR_256, TTMP_256)> {
- // Requires 4 s_mov_b64 to copy
- let CopyCost = 4;
- let isAllocatable = 0;
-}
+ let AllocationPriority = priority, CopyCost = copyCost in {
+ def "" # sgprName : RegisterClass<"AMDGPU", regTypes, 32, (add regList)> {
+ }
-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
- (add SGPR_512Regs)> {
- let AllocationPriority = 19;
-}
+ if hasTTMP then {
+ def "" # ttmpName : RegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> {
+ let isAllocatable = 0;
+ }
+ }
-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
- (add TTMP_512Regs)> {
- let isAllocatable = 0;
+ def SReg_ # suffix :
+ RegisterClass<"AMDGPU", regTypes, 32,
+ !con(!dag(add, [!cast<RegisterClass>(sgprName)], ["sgpr"]),
+ !if(hasTTMP,
+ !dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]),
+ (add)))> {
+ let isAllocatable = 0;
+ }
+ }
}
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
- (add SGPR_512, TTMP_512)> {
- // Requires 8 s_mov_b64 to copy
- let CopyCost = 8;
- let isAllocatable = 0;
-}
+defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
+defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
+defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
+defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
- (add SGPR_1024Regs)> {
- let AllocationPriority = 20;
-}
-
-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
- (add SGPR_1024)> {
- let CopyCost = 16;
- let isAllocatable = 0;
-}
-
// Register class for all vector registers (VGPRs + Interpolation Registers)
-class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
RegisterClass<"AMDGPU", regTypes, 32, regList> {
let Size = !mul(numRegs, 32);
@@ -796,31 +764,48 @@ class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> :
let Weight = numRegs;
}
-def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
- (add VGPR_64)>;
-def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, i128], (add VGPR_128)>;
-def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
-def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
-def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
-def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
-def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+// Define a register tuple class, along with one requiring an even-aligned
+// base register.
+multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
-class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> :
- VRegClass<numRegs, regTypes, regList> {
- // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr
- let CopyCost = !add(numRegs, numRegs, 1);
+ // Define 2-aligned variant
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
+}
+
+defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+ (add VGPR_64)>;
+defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
+defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
+
+defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
+defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
+defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+
+multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
+ let CopyCost = !add(numRegs, numRegs, 1) in {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
+
+ // Define 2-aligned variant
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
+ }
}
-def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
+defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
-def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
-def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
-def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
-def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
-def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
-def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
+defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
+defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
+defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
+defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
+defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
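Each VRegClass/ARegClass instantiation now also emits an _Align2 variant built from (decimate regList, 2), keeping only every second tuple so that the base VGPR/AGPR index is even, as subtargets with needsAlignedVGPRs() require; the ARegClass variants keep the 2n+1 copy cost (n v_accvgpr_write plus n v_accvgpr_read plus one scratch VGPR) from the replaced classes. A rough model of what the decimation means, with tuples represented only by their starting register index (illustrative, not TableGen semantics in full):

// Keep every second tuple so the starting register index stays even.
#include <cstddef>
#include <cstdio>
#include <vector>

static std::vector<unsigned> decimate2(const std::vector<unsigned> &Bases) {
  std::vector<unsigned> Kept;
  for (std::size_t I = 0; I < Bases.size(); I += 2)
    Kept.push_back(Bases[I]);
  return Kept;
}

int main() {
  // VReg_64-style tuples start at v0, v1, v2, ...; the Align2 class keeps
  // v[0:1], v[2:3], v[4:5], ...
  std::vector<unsigned> Bases = {0, 1, 2, 3, 4, 5, 6, 7};
  for (unsigned B : decimate2(Bases))
    std::printf("v[%u:%u] ", B, B + 1);
  std::printf("\n");
  return 0;
}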
} // End GeneratePressureSet = 0
@@ -847,21 +832,36 @@ def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let isAllocatable = 0;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
}
-def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32,
(add AGPR_32, VGPR_32)> {
let isAllocatable = 0;
}
-def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
+def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32,
(add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
} // End GeneratePressureSet = 0
+def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32,
+ (add AReg_96, VReg_96)> {
+ let isAllocatable = 0;
+}
+
+def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32,
+ (add AReg_128, VReg_128)> {
+ let isAllocatable = 0;
+}
+
+def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32,
+ (add AReg_160, VReg_160)> {
+ let isAllocatable = 0;
+}
+
//===----------------------------------------------------------------------===//
// Register operands
//===----------------------------------------------------------------------===//
@@ -912,21 +912,38 @@ multiclass SIRegOperand32 <string rc, string MatchName, string opType,
}
}
-multiclass SIRegOperand <string rc, string MatchName, string opType> :
- SIRegOperand32<rc, MatchName, opType> {
+multiclass SIRegOperand64 <string rc, string MatchName, string opType,
+ string rc_suffix = "_64", bit Vectors = 1> {
let OperandNamespace = "AMDGPU" in {
- def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_INT64";
let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
}
- def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+ def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
let OperandType = opType#"_FP64";
let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
}
+
+ if Vectors then
+ def _v2f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+ let OperandType = opType#"_V2FP32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">;
+ let DecoderMethod = "decodeOperand_VSrcV232";
+ }
+ if Vectors then
+ def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
+ let OperandType = opType#"_V2INT32";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">;
+ let DecoderMethod = "decodeOperand_VSrcV232";
+ }
}
}
+multiclass SIRegOperand <string rc, string MatchName, string opType> :
+ SIRegOperand32<rc, MatchName, opType>,
+ SIRegOperand64<rc, MatchName, opType>;
+
// FIXME: 64-bit sources can sometimes use 32-bit constants.
multiclass RegImmOperand <string rc, string MatchName>
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
@@ -938,10 +955,18 @@ multiclass RegInlineOperand32 <string rc, string MatchName,
string rc_suffix = "_32">
: SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+multiclass RegInlineOperand64 <string rc, string MatchName,
+ string rc_suffix = "_64">
+ : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+
multiclass RegInlineOperandAC <string rc, string MatchName,
string rc_suffix = "_32">
: SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;
+multiclass RegInlineOperandAC64 <string rc, string MatchName,
+ string rc_suffix = "_64">
+ : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>;
+
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
@@ -971,7 +996,7 @@ def VSrc_128 : RegisterOperand<VReg_128> {
}
//===----------------------------------------------------------------------===//
-// VSrc_* Operands with an VGPR
+// VRegSrc_* Operands with a VGPR
//===----------------------------------------------------------------------===//
// This is for operands with the enum(9), VSrc encoding restriction,
@@ -1001,6 +1026,13 @@ defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
//===----------------------------------------------------------------------===//
defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
+let DecoderMethod = "decodeOperand_VReg_64" in
+defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">;
+defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">;
+let DecoderMethod = "decodeOperand_VReg_256" in
+defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">;
+defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">;
+defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;
//===----------------------------------------------------------------------===//
// AVSrc_* Operands with an AGPR or VGPR
@@ -1016,6 +1048,31 @@ def AVSrc_64 : RegisterOperand<AV_64> {
let EncoderMethod = "getAVOperandEncoding";
}
+def AVLdSt_32 : RegisterOperand<AV_32> {
+ let DecoderMethod = "DecodeAVLdSt_32RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_64 : RegisterOperand<AV_64> {
+ let DecoderMethod = "DecodeAVLdSt_64RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_96 : RegisterOperand<AV_96> {
+ let DecoderMethod = "DecodeAVLdSt_96RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_128 : RegisterOperand<AV_128> {
+ let DecoderMethod = "DecodeAVLdSt_128RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
+def AVLdSt_160 : RegisterOperand<AV_160> {
+ let DecoderMethod = "DecodeAVLdSt_160RegisterClass";
+ let EncoderMethod = "getAVOperandEncoding";
+}
+
//===----------------------------------------------------------------------===//
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
@@ -1024,3 +1081,8 @@ defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">;
defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">;
defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">;
defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;
+
+let DecoderMethod = "decodeOperand_AReg_64" in
+defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">;
+let DecoderMethod = "decodeOperand_AReg_256" in
+defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">;
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
deleted file mode 100644
index d30ff4a3fd15..000000000000
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass optmizes the s_cbranch_execz instructions.
-/// The pass removes this skip instruction for short branches,
-/// if there is no unwanted sideeffect in the fallthrough code sequence.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-remove-short-exec-branches"
-
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
- "amdgpu-skip-threshold", cl::Hidden,
- cl::desc(
- "Number of instructions before jumping over divergent control flow"),
- cl::location(SkipThreshold), cl::init(12));
-
-namespace {
-
-class SIRemoveShortExecBranches : public MachineFunctionPass {
-private:
- const SIInstrInfo *TII = nullptr;
- bool getBlockDestinations(MachineBasicBlock &SrcMBB,
- MachineBasicBlock *&TrueMBB,
- MachineBasicBlock *&FalseMBB,
- SmallVectorImpl<MachineOperand> &Cond);
- bool mustRetainExeczBranch(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
- bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-
-public:
- static char ID;
-
- SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
- initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
- "SI remove short exec branches", false, false)
-
-char SIRemoveShortExecBranches::ID = 0;
-
-char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
-
-bool SIRemoveShortExecBranches::getBlockDestinations(
- MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
- MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
- if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
- return false;
-
- if (!FalseMBB)
- FalseMBB = SrcMBB.getNextNode();
-
- return true;
-}
-
-bool SIRemoveShortExecBranches::mustRetainExeczBranch(
- const MachineBasicBlock &From, const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might never be taken when EXEC = 0.
- // Hence we should retain cbranch out of the loop lest it become infinite.
- if (I->isConditionalBranch())
- return true;
-
- if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
- return true;
-
- if (TII->isKillTerminator(I->getOpcode()))
- return true;
-
- // These instructions are potentially expensive even if EXEC = 0.
- if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
- I->getOpcode() == AMDGPU::S_WAITCNT)
- return true;
-
- ++NumInstr;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
- MachineBasicBlock &SrcMBB) {
- MachineBasicBlock *TrueMBB = nullptr;
- MachineBasicBlock *FalseMBB = nullptr;
- SmallVector<MachineOperand, 1> Cond;
-
- if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
- return false;
-
- // Consider only the forward branches.
- if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
- mustRetainExeczBranch(*FalseMBB, *TrueMBB))
- return false;
-
- LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
- MI.eraseFromParent();
- SrcMBB.removeSuccessor(TrueMBB);
-
- return true;
-}
-
-bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- MF.RenumberBlocks();
- bool Changed = false;
-
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
- if (MBBI == MBB.end())
- continue;
-
- MachineInstr &MI = *MBBI;
- switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- Changed = removeExeczBranch(MI, MBB);
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index db4a009e08d7..b24c061af7ab 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -54,10 +54,15 @@ def WriteTrans64 : SchedWrite;
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
+// Integer multiplications.
+def WriteIntMul : SchedWrite;
+
// mAI multipass instructions.
def Write2PassMAI : SchedWrite;
def Write8PassMAI : SchedWrite;
def Write16PassMAI : SchedWrite;
+def Write4PassDGEMM : SchedWrite;
+def Write8PassDGEMM : SchedWrite;
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
@@ -80,6 +85,7 @@ class SISchedMachineModel : SchedMachineModel {
def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
+def SIDPFullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
@@ -101,6 +107,9 @@ def HWVMEM : ProcResource<1> {
def HWVALU : ProcResource<1> {
let BufferSize = 1;
}
+def HWTransVALU : ProcResource<1> { // Transcendental VALU
+ let BufferSize = 1;
+}
def HWRC : ProcResource<1> { // Register destination cache
let BufferSize = 1;
}
@@ -137,11 +146,13 @@ multiclass SICommonWriteRes {
def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
def : HWVALUWriteRes<Write32Bit, 1>;
- def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteFloatCvt, 4>;
def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+ def : HWVALUWriteRes<Write4PassDGEMM, 4>;
+ def : HWVALUWriteRes<Write8PassDGEMM, 16>;
+
let ResourceCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
let ResourceCycles = [8] in
@@ -150,7 +161,6 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
def : ReadAdvance<MIVGPRRead, -2>;
- def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
// Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
@@ -159,10 +169,6 @@ multiclass SICommonWriteRes {
// need to consume 2 or 4 more vgprs to be initialized before the acc
// write sequence. Just assume worst case here.
def : ReadAdvance<MIMFMARead, -4>;
-
- def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
- def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
- def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
}
def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
@@ -176,11 +182,13 @@ let SchedModel = SIFullSpeedModel in {
defm : SICommonWriteRes;
-def : HWVALUWriteRes<WriteFloatFMA, 1>;
-def : HWVALUWriteRes<WriteDouble, 4>;
-def : HWVALUWriteRes<WriteDoubleAdd, 2>;
-def : HWVALUWriteRes<WriteDoubleCvt, 4>;
-def : HWVALUWriteRes<WriteTrans64, 4>;
+def : HWVALUWriteRes<Write64Bit, 2>;
+def : HWVALUWriteRes<WriteIntMul, 4>;
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 4>;
+def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -190,16 +198,44 @@ let SchedModel = SIQuarterSpeedModel in {
defm : SICommonWriteRes;
-def : HWVALUWriteRes<WriteFloatFMA, 16>;
-def : HWVALUWriteRes<WriteDouble, 16>;
-def : HWVALUWriteRes<WriteDoubleAdd, 8>;
-def : HWVALUWriteRes<WriteDoubleCvt, 4>;
-def : HWVALUWriteRes<WriteTrans64, 16>;
+def : HWVALUWriteRes<Write64Bit, 2>;
+def : HWVALUWriteRes<WriteIntMul, 4>;
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble, 16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 16>;
def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
} // End SchedModel = SIQuarterSpeedModel
+let SchedModel = SIDPFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 1>;
+def : HWVALUWriteRes<WriteDoubleAdd, 1>;
+def : HWVALUWriteRes<WriteDoubleCvt, 1>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
+def : HWVALUWriteRes<WriteIntMul, 1>;
+def : HWVALUWriteRes<Write64Bit, 1>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X")>;
+def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
+def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
+
+} // End SchedModel = SIDPFullSpeedModel
+
let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
@@ -207,13 +243,14 @@ let SchedModel = GFX10SpeedModel in {
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
-def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>;
-def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 24>;
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 2628070f219c..45dd57ea1be4 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -75,17 +75,19 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
MachineOperand &MovSrc = Def->getOperand(1);
bool ConstantFolded = false;
- if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
- isUInt<32>(MovSrc.getImm()))) {
- Src0.ChangeToImmediate(MovSrc.getImm());
- ConstantFolded = true;
- } else if (MovSrc.isFI()) {
- Src0.ChangeToFrameIndex(MovSrc.getIndex());
- ConstantFolded = true;
- } else if (MovSrc.isGlobal()) {
- Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
- MovSrc.getTargetFlags());
- ConstantFolded = true;
+ if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
+ if (MovSrc.isImm() &&
+ (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
+ Src0.ChangeToImmediate(MovSrc.getImm());
+ ConstantFolded = true;
+ } else if (MovSrc.isFI()) {
+ Src0.ChangeToFrameIndex(MovSrc.getIndex());
+ ConstantFolded = true;
+ } else if (MovSrc.isGlobal()) {
+ Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+ MovSrc.getTargetFlags());
+ ConstantFolded = true;
+ }
}
if (ConstantFolded) {
@@ -230,9 +232,14 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
RC = &AMDGPU::VReg_96RegClass;
} else if (Info->VAddrDwords == 4) {
RC = &AMDGPU::VReg_128RegClass;
- } else if (Info->VAddrDwords <= 8) {
+ } else if (Info->VAddrDwords == 5) {
+ RC = &AMDGPU::VReg_160RegClass;
+ } else if (Info->VAddrDwords == 6) {
+ RC = &AMDGPU::VReg_192RegClass;
+ } else if (Info->VAddrDwords == 7) {
+ RC = &AMDGPU::VReg_224RegClass;
+ } else if (Info->VAddrDwords == 8) {
RC = &AMDGPU::VReg_256RegClass;
- NewAddrDwords = 8;
} else {
RC = &AMDGPU::VReg_512RegClass;
NewAddrDwords = 16;
@@ -571,7 +578,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
dropInstructionKeepingImpDefs(*MovY, TII);
MachineInstr *Next = &*std::next(MovT.getIterator());
- if (MRI.use_nodbg_empty(T)) {
+ if (T.isVirtual() && MRI.use_nodbg_empty(T)) {
dropInstructionKeepingImpDefs(MovT, TII);
} else {
Xop.setIsKill(false);
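
Aside (editorial, not part of the patch): a minimal standalone C++ sketch of what the shrinkMIMG hunk above changes, namely that 5-7 VADDR dwords now get an exact register class (VReg_160/192/224) instead of being rounded up to 8. The helper names are made up, and the cases below 3 dwords are not shown in the hunk and are assumed here.

    #include <cassert>

    // Old behaviour: 5-8 address dwords all shared the 8-dword class.
    static unsigned allocatedVAddrDwordsOld(unsigned VAddrDwords) {
      if (VAddrDwords <= 4)
        return VAddrDwords;   // exact class up to VReg_128 (assumed for 1-2)
      if (VAddrDwords <= 8)
        return 8;             // 5-7 dwords were rounded up to VReg_256
      return 16;              // VReg_512
    }

    // New behaviour: exact classes cover every size up to 8 dwords.
    static unsigned allocatedVAddrDwordsNew(unsigned VAddrDwords) {
      if (VAddrDwords <= 8)
        return VAddrDwords;   // VReg_160/192/224 now cover 5-7 exactly
      return 16;
    }

    int main() {
      assert(allocatedVAddrDwordsOld(6) == 8); // two dwords wasted
      assert(allocatedVAddrDwordsNew(6) == 6); // exact VReg_192
      return 0;
    }
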
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 0640e24b37ec..38548eaf9478 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -7,14 +7,17 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass adds instructions to enable whole quad mode for pixel
-/// shaders, and whole wavefront mode for all programs.
+/// This pass adds instructions to enable whole quad mode (strict or non-strict)
+/// for pixel shaders, and strict whole wavefront mode for all programs.
+///
+/// The "strict" prefix indicates that inactive lanes do not take part in
+/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
+/// always be enabled irrespective of control flow decisions. Conversely, in
+/// non-strict WQM inactive lanes may be disabled by control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). It ensures that WQM is
+/// enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
@@ -28,12 +31,21 @@
/// ...
/// S_MOV_B64 EXEC, Tmp
///
-/// We also compute when a sequence of instructions requires Whole Wavefront
-/// Mode (WWM) and insert instructions to save and restore it:
+/// We also compute when a sequence of instructions requires strict whole
+/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
+///
+/// S_OR_SAVEEXEC_B64 Tmp, -1
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
+/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
+/// we use a similar save and restore mechanism and force whole quad mode for
+/// those instructions:
///
-/// S_OR_SAVEEXEC_B64 Tmp, -1
-/// ...
-/// S_MOV_B64 EXEC, Tmp
+/// S_MOV_B64 Tmp, EXEC
+/// S_WQM_B64 EXEC, EXEC
+/// ...
+/// S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
@@ -62,8 +74,10 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"
@@ -76,8 +90,10 @@ namespace {
enum {
StateWQM = 0x1,
- StateWWM = 0x2,
- StateExact = 0x4,
+ StateStrictWWM = 0x2,
+ StateStrictWQM = 0x4,
+ StateExact = 0x8,
+ StateStrict = StateStrictWWM | StateStrictWQM,
};
struct PrintState {
@@ -89,19 +105,23 @@ public:
#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
- if (PS.State & StateWQM)
- OS << "WQM";
- if (PS.State & StateWWM) {
- if (PS.State & StateWQM)
- OS << '|';
- OS << "WWM";
- }
- if (PS.State & StateExact) {
- if (PS.State & (StateWQM | StateWWM))
- OS << '|';
- OS << "Exact";
- }
+ static const std::pair<char, const char *> Mapping[] = {
+ std::make_pair(StateWQM, "WQM"),
+ std::make_pair(StateStrictWWM, "StrictWWM"),
+ std::make_pair(StateStrictWQM, "StrictWQM"),
+ std::make_pair(StateExact, "Exact")};
+ char State = PS.State;
+ for (auto M : Mapping) {
+ if (State & M.first) {
+ OS << M.second;
+ State &= ~M.first;
+
+ if (State)
+ OS << '|';
+ }
+ }
+ assert(State == 0);
return OS;
}
#endif
@@ -116,6 +136,8 @@ struct BlockInfo {
char Needs = 0;
char InNeeds = 0;
char OutNeeds = 0;
+ char InitialState = 0;
+ bool NeedsLowering = false;
};
struct WorkItem {
@@ -129,23 +151,33 @@ struct WorkItem {
class SIWholeQuadMode : public MachineFunctionPass {
private:
- CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *PDT;
unsigned AndOpc;
- unsigned XorTermrOpc;
+ unsigned AndN2Opc;
+ unsigned XorOpc;
+ unsigned AndSaveExecOpc;
unsigned OrSaveExecOpc;
- unsigned Exec;
+ unsigned WQMOpc;
+ Register Exec;
+ Register LiveMaskReg;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
- SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+ // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
+ DenseMap<const MachineInstr *, char> StateTransition;
+
+ SmallVector<MachineInstr *, 2> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+ SmallVector<MachineInstr *, 4> KillInstrs;
void printInfo();
@@ -153,6 +185,8 @@ private:
std::vector<WorkItem> &Worklist);
void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
+ void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
+ std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
@@ -167,17 +201,27 @@ private:
MachineBasicBlock::iterator Last, bool PreferLast,
bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SaveWQM, unsigned LiveMaskReg);
+ Register SaveWQM);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SavedWQM);
- void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SaveOrig);
- void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SavedOrig);
- void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
-
- void lowerLiveMaskQueries(unsigned LiveMaskReg);
+ Register SavedWQM);
+ void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ Register SaveOrig, char StrictStateNeeded);
+ void fromStrictMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before, Register SavedOrig,
+ char NonStrictState, char CurrentStrictState);
+
+ MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
+
+ MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
+ bool IsWQM);
+ MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
+
+ void lowerBlock(MachineBasicBlock &MBB);
+ void processBlock(MachineBasicBlock &MBB, bool IsEntry);
+
+ void lowerLiveMaskQueries();
void lowerCopyInstrs();
+ void lowerKillInstrs(bool IsWQM);
public:
static char ID;
@@ -193,9 +237,17 @@ public:
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
};
} // end anonymous namespace
@@ -205,6 +257,8 @@ char SIWholeQuadMode::ID = 0;
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
@@ -241,8 +295,6 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
assert(!(Flag & StateExact) && Flag != 0);
- LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
-
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -254,6 +306,7 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
if ((II.Needs & Flag) == Flag)
return;
+ LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
II.Needs |= Flag;
Worklist.push_back(&MI);
}
@@ -262,108 +315,167 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
Register Reg, unsigned SubReg, char Flag,
std::vector<WorkItem> &Worklist) {
- assert(!MRI->isSSA());
-
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
- if (!UseLRQ.valueIn())
+ const VNInfo *Value = UseLRQ.valueIn();
+ if (!Value)
return;
- SmallPtrSet<const VNInfo *, 4> Visited;
- SmallVector<const VNInfo *, 4> ToProcess;
- ToProcess.push_back(UseLRQ.valueIn());
+ // Note: this code assumes that lane masks on AMDGPU completely
+ // cover registers.
+ const LaneBitmask UseLanes =
+ SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+ : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
+ : LaneBitmask::getNone());
+
+ // Perform a depth-first iteration of the LiveRange graph marking defs.
+ // Stop processing of a given branch when all use lanes have been defined.
+ // The first definition stops processing for a physical register.
+ struct PhiEntry {
+ const VNInfo *Phi;
+ unsigned PredIdx;
+ LaneBitmask DefinedLanes;
+
+ PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
+ : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
+ };
+ using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
+ SmallVector<PhiEntry, 2> PhiStack;
+ SmallSet<VisitKey, 4> Visited;
+ LaneBitmask DefinedLanes;
+ unsigned NextPredIdx = 0; // Only used for processing phi nodes
do {
- const VNInfo *Value = ToProcess.pop_back_val();
- Visited.insert(Value);
+ const VNInfo *NextValue = nullptr;
+ const VisitKey Key(Value, DefinedLanes);
+
+ if (!Visited.count(Key)) {
+ Visited.insert(Key);
+      // On the first visit to a phi, start processing its first predecessor
+ NextPredIdx = 0;
+ }
if (Value->isPHIDef()) {
- // Need to mark all defs used in the PHI node
+ // Each predecessor node in the phi must be processed as a subgraph
const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
assert(MBB && "Phi-def has no defining MBB");
- for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end();
- PI != PE; ++PI) {
+
+ // Find next predecessor to process
+ unsigned Idx = NextPredIdx;
+ auto PI = MBB->pred_begin() + Idx;
+ auto PE = MBB->pred_end();
+ for (; PI != PE && !NextValue; ++PI, ++Idx) {
if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
- if (!Visited.count(VN))
- ToProcess.push_back(VN);
+ if (!Visited.count(VisitKey(VN, DefinedLanes)))
+ NextValue = VN;
}
}
+
+      // If there are more predecessors to process, add the phi to the stack
+ if (PI != PE)
+ PhiStack.emplace_back(Value, Idx, DefinedLanes);
} else {
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
- markInstruction(*MI, Flag, Worklist);
- // Iterate over all operands to find relevant definitions
- for (const MachineOperand &Op : MI->operands()) {
- if (!(Op.isReg() && Op.getReg() == Reg))
- continue;
+ if (Reg.isVirtual()) {
+ // Iterate over all operands to find relevant definitions
+ bool HasDef = false;
+ for (const MachineOperand &Op : MI->operands()) {
+ if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
+ continue;
+
+ // Compute lanes defined and overlap with use
+ LaneBitmask OpLanes =
+ Op.isUndef() ? LaneBitmask::getAll()
+ : TRI->getSubRegIndexLaneMask(Op.getSubReg());
+ LaneBitmask Overlap = (UseLanes & OpLanes);
- // Does this def cover whole register?
- bool DefinesFullReg =
- Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
- if (!DefinesFullReg) {
- // Partial definition; need to follow and mark input value
+          // Record if this instruction defined any lanes of the use
+ HasDef |= Overlap.any();
+
+ // Mark any lanes defined
+ DefinedLanes |= OpLanes;
+ }
+
+ // Check if all lanes of use have been defined
+ if ((DefinedLanes & UseLanes) != UseLanes) {
+ // Definition not complete; need to process input value
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
if (const VNInfo *VN = LRQ.valueIn()) {
- if (!Visited.count(VN))
- ToProcess.push_back(VN);
+ if (!Visited.count(VisitKey(VN, DefinedLanes)))
+ NextValue = VN;
}
}
+
+ // Only mark the instruction if it defines some part of the use
+ if (HasDef)
+ markInstruction(*MI, Flag, Worklist);
+ } else {
+ // For physical registers simply mark the defining instruction
+ markInstruction(*MI, Flag, Worklist);
}
}
- } while (!ToProcess.empty());
-}
-/// Mark all instructions defining the uses in \p MI with \p Flag.
-void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
- std::vector<WorkItem> &Worklist) {
+ if (!NextValue && !PhiStack.empty()) {
+      // Reached the end of a chain; revert to processing the last phi
+ PhiEntry &Entry = PhiStack.back();
+ NextValue = Entry.Phi;
+ NextPredIdx = Entry.PredIdx;
+ DefinedLanes = Entry.DefinedLanes;
+ PhiStack.pop_back();
+ }
- LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
- << MI);
+ Value = NextValue;
+ } while (Value);
+}
- for (const MachineOperand &Use : MI.uses()) {
- if (!Use.isReg() || !Use.isUse())
- continue;
+void SIWholeQuadMode::markOperand(const MachineInstr &MI,
+ const MachineOperand &Op, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ assert(Op.isReg());
+ Register Reg = Op.getReg();
- Register Reg = Use.getReg();
+ // Ignore some hardware registers
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ return;
+ default:
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
+ << " for " << MI);
+ if (Reg.isVirtual()) {
+ LiveRange &LR = LIS->getInterval(Reg);
+ markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
+ } else {
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- if (!Reg.isVirtual()) {
- if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
+ for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
+ ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value)
continue;
- for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
- ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
- const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
+ markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
+ }
+ }
+}
- if (MRI->isSSA()) {
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
- markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
- Worklist);
- } else {
- markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
- }
- }
+/// Mark all instructions defining the uses in \p MI with \p Flag.
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
+ << MI);
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
continue;
- }
-
- if (MRI->isSSA()) {
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, Flag, Worklist);
- } else {
- LiveRange &LR = LIS->getInterval(Reg);
- markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
- }
+ markOperand(MI, Use, Flag, Worklist);
}
}
@@ -392,6 +504,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
char Flags = 0;
if (TII->isWQM(Opcode)) {
+      // If LOD is not supported, WQM is not needed.
+ if (!ST->hasExtendedImageInsts())
+ continue;
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
@@ -407,27 +522,31 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
LowerToCopyInstrs.push_back(&MI);
SoftWQMInstrs.push_back(&MI);
continue;
- } else if (Opcode == AMDGPU::WWM) {
- // The WWM intrinsic doesn't make the same guarantee, and plus it needs
- // to be executed in WQM or Exact so that its copy doesn't clobber
- // inactive lanes.
- markInstructionUses(MI, StateWWM, Worklist);
- GlobalFlags |= StateWWM;
+ } else if (Opcode == AMDGPU::STRICT_WWM) {
+ // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
+ // it needs to be executed in WQM or Exact so that its copy doesn't
+ // clobber inactive lanes.
+ markInstructionUses(MI, StateStrictWWM, Worklist);
+ GlobalFlags |= StateStrictWWM;
+ LowerToMovInstrs.push_back(&MI);
+ continue;
+ } else if (Opcode == AMDGPU::STRICT_WQM) {
+      // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
+      // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
+ // quads that have at least one active thread.
+ markInstructionUses(MI, StateStrictWQM, Worklist);
+ GlobalFlags |= StateStrictWQM;
LowerToMovInstrs.push_back(&MI);
continue;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
- III.Disabled = StateWWM;
+ III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(2);
if (Inactive.isReg()) {
if (Inactive.isUndef()) {
LowerToCopyInstrs.push_back(&MI);
} else {
- Register Reg = Inactive.getReg();
- if (Reg.isVirtual()) {
- for (MachineInstr &DefMI : MRI->def_instructions(Reg))
- markInstruction(DefMI, StateWWM, Worklist);
- }
+ markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
}
SetInactiveInstrs.push_back(&MI);
@@ -439,15 +558,21 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Worklist.push_back(&MBB);
}
GlobalFlags |= StateExact;
- III.Disabled = StateWQM | StateWWM;
+ III.Disabled = StateWQM | StateStrict;
continue;
} else {
- if (Opcode == AMDGPU::SI_PS_LIVE) {
+ if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
LiveMaskQueries.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+ Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+ Opcode == AMDGPU::SI_DEMOTE_I1) {
+ KillInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.
+ // FIXME: is this still valid?
for (const MachineOperand &MO : MI.defs()) {
if (!MO.isReg())
continue;
@@ -510,7 +635,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
- char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
+ char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -526,10 +651,12 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
- // Ensure we process a block containing WWM, even if it does not require any
- // WQM transitions.
- if (II.Needs & StateWWM)
- BI.Needs |= StateWWM;
+ // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
+ // not require any WQM transitions.
+ if (II.Needs & StateStrictWWM)
+ BI.Needs |= StateStrictWWM;
+ if (II.Needs & StateStrictWQM)
+ BI.Needs |= StateStrictWQM;
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -604,6 +731,339 @@ SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
return Restore;
}
+MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
+ MachineInstr *TermMI) {
+ LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
+ << *TermMI << "\n");
+
+ MachineBasicBlock *SplitBB =
+ BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
+
+ // Convert last instruction in block to a terminator.
+ // Note: this only covers the expected patterns
+ unsigned NewOpcode = 0;
+ switch (TermMI->getOpcode()) {
+ case AMDGPU::S_AND_B32:
+ NewOpcode = AMDGPU::S_AND_B32_term;
+ break;
+ case AMDGPU::S_AND_B64:
+ NewOpcode = AMDGPU::S_AND_B64_term;
+ break;
+ case AMDGPU::S_MOV_B32:
+ NewOpcode = AMDGPU::S_MOV_B32_term;
+ break;
+ case AMDGPU::S_MOV_B64:
+ NewOpcode = AMDGPU::S_MOV_B64_term;
+ break;
+ default:
+ break;
+ }
+ if (NewOpcode)
+ TermMI->setDesc(TII->get(NewOpcode));
+
+ if (SplitBB != BB) {
+ // Update dominator trees
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
+ if (MDT)
+ MDT->getBase().applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->getBase().applyUpdates(DTUpdates);
+
+ // Link blocks
+ MachineInstr *MI =
+ BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(SplitBB);
+ LIS->InsertMachineInstrInMaps(*MI);
+ }
+
+ return SplitBB;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
+ MachineInstr &MI) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opcode = 0;
+
+ assert(MI.getOperand(0).isReg());
+
+  // Comparison is for live lanes; however, here we compute the inverse
+  // (killed lanes). This is because VCMP will always generate 0 bits
+  // for inactive lanes, so a mask of live lanes would not be correct
+ // inside control flow.
+ // Invert the comparison by swapping the operands and adjusting
+ // the comparison codes.
+
+ switch (MI.getOperand(2).getImm()) {
+ case ISD::SETUEQ:
+ Opcode = AMDGPU::V_CMP_LG_F32_e64;
+ break;
+ case ISD::SETUGT:
+ Opcode = AMDGPU::V_CMP_GE_F32_e64;
+ break;
+ case ISD::SETUGE:
+ Opcode = AMDGPU::V_CMP_GT_F32_e64;
+ break;
+ case ISD::SETULT:
+ Opcode = AMDGPU::V_CMP_LE_F32_e64;
+ break;
+ case ISD::SETULE:
+ Opcode = AMDGPU::V_CMP_LT_F32_e64;
+ break;
+ case ISD::SETUNE:
+ Opcode = AMDGPU::V_CMP_EQ_F32_e64;
+ break;
+ case ISD::SETO:
+ Opcode = AMDGPU::V_CMP_O_F32_e64;
+ break;
+ case ISD::SETUO:
+ Opcode = AMDGPU::V_CMP_U_F32_e64;
+ break;
+ case ISD::SETOEQ:
+ case ISD::SETEQ:
+ Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
+ break;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ Opcode = AMDGPU::V_CMP_NLT_F32_e64;
+ break;
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Opcode = AMDGPU::V_CMP_NLE_F32_e64;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ Opcode = AMDGPU::V_CMP_NGT_F32_e64;
+ break;
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Opcode = AMDGPU::V_CMP_NGE_F32_e64;
+ break;
+ case ISD::SETONE:
+ case ISD::SETNE:
+ Opcode = AMDGPU::V_CMP_NLG_F32_e64;
+ break;
+ default:
+ llvm_unreachable("invalid ISD:SET cond code");
+ }
+
+ // Pick opcode based on comparison type.
+ MachineInstr *VcmpMI;
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
+ if (TRI->isVGPR(*MRI, Op0.getReg())) {
+ Opcode = AMDGPU::getVOPe32(Opcode);
+ VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
+ } else {
+ VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .addReg(AMDGPU::VCC, RegState::Define)
+ .addImm(0) // src0 modifiers
+ .add(Op1)
+ .addImm(0) // src1 modifiers
+ .add(Op0)
+ .addImm(0); // omod
+ }
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
+ MachineInstr *MaskUpdateMI =
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(VCC);
+
+  // The state of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0 then no lanes will be alive anymore.
+ MachineInstr *EarlyTermMI =
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+ MachineInstr *ExecMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
+
+ assert(MBB.succ_size() == 1);
+ MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
+
+ // Update live intervals
+ LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
+ MBB.remove(&MI);
+
+ LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+ LIS->InsertMachineInstrInMaps(*ExecMaskMI);
+ LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+ LIS->InsertMachineInstrInMaps(*NewTerm);
+
+ return NewTerm;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
+ MachineInstr &MI, bool IsWQM) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstr *MaskUpdateMI = nullptr;
+
+ const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
+ const MachineOperand &Op = MI.getOperand(0);
+ int64_t KillVal = MI.getOperand(1).getImm();
+ MachineInstr *ComputeKilledMaskMI = nullptr;
+ Register CndReg = !Op.isImm() ? Op.getReg() : Register();
+ Register TmpReg;
+
+ // Is this a static or dynamic kill?
+ if (Op.isImm()) {
+ if (Op.getImm() == KillVal) {
+ // Static: all active lanes are killed
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(Exec);
+ } else {
+ // Static: kill does nothing
+ MachineInstr *NewTerm = nullptr;
+ if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ } else {
+ assert(MBB.succ_size() == 1);
+ NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
+ LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+ }
+ MBB.remove(&MI);
+ return NewTerm;
+ }
+ } else {
+ if (!KillVal) {
+ // Op represents live lanes after kill,
+ // so exec mask needs to be factored in.
+ TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ ComputeKilledMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(TmpReg);
+ } else {
+ // Op represents lanes to kill
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .add(Op);
+ }
+ }
+
+  // The state of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0 then no lanes will be alive anymore.
+ MachineInstr *EarlyTermMI =
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+  // If we got this far, some lanes are still live;
+  // update EXEC to deactivate lanes as appropriate.
+ MachineInstr *NewTerm;
+ MachineInstr *WQMMaskMI = nullptr;
+ Register LiveMaskWQM;
+ if (IsDemote) {
+    // Demotes deactivate quads with only helper lanes
+ LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
+ WQMMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
+ NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
+ .addReg(Exec)
+ .addReg(LiveMaskWQM);
+ } else {
+ // Kills deactivate lanes
+ if (Op.isImm()) {
+ unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+ } else if (!IsWQM) {
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+ .addReg(Exec)
+ .addReg(LiveMaskReg);
+ } else {
+ unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
+ NewTerm =
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+ }
+ }
+
+ // Update live intervals
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MBB.remove(&MI);
+ assert(EarlyTermMI);
+ assert(MaskUpdateMI);
+ assert(NewTerm);
+ if (ComputeKilledMaskMI)
+ LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
+ LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+ LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+ if (WQMMaskMI)
+ LIS->InsertMachineInstrInMaps(*WQMMaskMI);
+ LIS->InsertMachineInstrInMaps(*NewTerm);
+
+ if (CndReg) {
+ LIS->removeInterval(CndReg);
+ LIS->createAndComputeVirtRegInterval(CndReg);
+ }
+ if (TmpReg)
+ LIS->createAndComputeVirtRegInterval(TmpReg);
+ if (LiveMaskWQM)
+ LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
+
+ return NewTerm;
+}
+
+// Replace (or supplement) instructions accessing the live mask.
+// This can only happen once all of the live mask registers have been created
+// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end())
+ return;
+
+ const BlockInfo &BI = BII->second;
+ if (!BI.NeedsLowering)
+ return;
+
+ LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
+
+ SmallVector<MachineInstr *, 4> SplitPoints;
+ char State = BI.InitialState;
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ while (II != IE) {
+ auto Next = std::next(II);
+ MachineInstr &MI = *II;
+
+ if (StateTransition.count(&MI))
+ State = StateTransition[&MI];
+
+ MachineInstr *SplitPoint = nullptr;
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_DEMOTE_I1:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
+ break;
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ SplitPoint = lowerKillF32(MBB, MI);
+ break;
+ default:
+ break;
+ }
+ if (SplitPoint)
+ SplitPoints.push_back(SplitPoint);
+
+ II = Next;
+ }
+
+ // Perform splitting after instruction scan to simplify iteration.
+ if (!SplitPoints.empty()) {
+ MachineBasicBlock *BB = &MBB;
+ for (MachineInstr *MI : SplitPoints) {
+ BB = splitBlock(BB, MI);
+ }
+ }
+}
+
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
@@ -680,93 +1140,108 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SaveWQM, unsigned LiveMaskReg) {
+ Register SaveWQM) {
MachineInstr *MI;
if (SaveWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
- SaveWQM)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
.addReg(LiveMaskReg);
} else {
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
- Exec)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
}
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateExact;
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SavedWQM) {
+ Register SavedWQM) {
MachineInstr *MI;
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (SavedWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
}
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateWQM;
}
-void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator Before,
- unsigned SaveOrig) {
+void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ Register SaveOrig, char StrictStateNeeded) {
MachineInstr *MI;
-
assert(SaveOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
- .addImm(-1);
+ assert(StrictStateNeeded == StateStrictWWM ||
+ StrictStateNeeded == StateStrictWQM);
+
+ if (StrictStateNeeded == StateStrictWWM) {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
+ SaveOrig)
+ .addImm(-1);
+ } else {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
+ SaveOrig)
+ .addImm(-1);
+ }
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateStrictWWM;
}
-void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator Before,
- unsigned SavedOrig) {
+void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ Register SavedOrig, char NonStrictState,
+ char CurrentStrictState) {
MachineInstr *MI;
assert(SavedOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
- ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
- .addReg(SavedOrig);
+ assert(CurrentStrictState == StateStrictWWM ||
+ CurrentStrictState == StateStrictWQM);
+
+ if (CurrentStrictState == StateStrictWWM) {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
+ Exec)
+ .addReg(SavedOrig);
+ } else {
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
+ Exec)
+ .addReg(SavedOrig);
+ }
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = NonStrictState;
}
-void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
- bool isEntry) {
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())
return;
- const BlockInfo &BI = BII->second;
+ BlockInfo &BI = BII->second;
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
+ if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
+ BI.InitialState = StateWQM;
return;
+ }
LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
<< ":\n");
- unsigned SavedWQMReg = 0;
- unsigned SavedNonWWMReg = 0;
- bool WQMFromExec = isEntry;
- char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
- char NonWWMState = 0;
+ Register SavedWQMReg;
+ Register SavedNonStrictReg;
+ bool WQMFromExec = IsEntry;
+ char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ char NonStrictState = 0;
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- if (isEntry) {
+ if (IsEntry) {
// Skip the instruction that saves LiveMask
if (II != IE && II->getOpcode() == AMDGPU::COPY)
++II;
@@ -776,22 +1251,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
// Exact or vice versa.
MachineBasicBlock::iterator FirstWQM = IE;
- // This stores the first instruction where it's safe to switch from WWM to
- // Exact/WQM or to switch to WWM. It must always be the same as, or after,
- // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
- // switch to/from WQM as well.
- MachineBasicBlock::iterator FirstWWM = IE;
+ // This stores the first instruction where it's safe to switch from Strict
+ // mode to Exact/WQM or to switch to Strict mode. It must always be the same
+ // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
+ // be safe to switch to/from WQM as well.
+ MachineBasicBlock::iterator FirstStrict = IE;
+
+  // Record the initial state in the block information.
+ BI.InitialState = State;
for (;;) {
MachineBasicBlock::iterator Next = II;
- char Needs = StateExact | StateWQM; // WWM is disabled by default
+ char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
char OutNeeds = 0;
if (FirstWQM == IE)
FirstWQM = II;
- if (FirstWWM == IE)
- FirstWWM = II;
+ if (FirstStrict == IE)
+ FirstStrict = II;
// First, figure out the allowed states (Needs) based on the propagated
// flags.
@@ -801,8 +1279,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
- if (III->second.Needs & StateWWM)
- Needs = StateWWM;
+ if (III->second.Needs & StateStrictWWM)
+ Needs = StateStrictWWM;
+ else if (III->second.Needs & StateStrictWQM)
+ Needs = StateStrictWQM;
else if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
@@ -811,8 +1291,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
}
} else {
// If the instruction doesn't actually need a correct EXEC, then we can
- // safely leave WWM enabled.
- Needs = StateExact | StateWQM | StateWWM;
+ // safely leave Strict mode enabled.
+ Needs = StateExact | StateWQM | StateStrict;
}
if (MI.isTerminator() && OutNeeds == StateExact)
@@ -832,32 +1312,56 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
// Now, transition if necessary.
if (!(Needs & State)) {
MachineBasicBlock::iterator First;
- if (State == StateWWM || Needs == StateWWM) {
- // We must switch to or from WWM
- First = FirstWWM;
+ if (State == StateStrictWWM || Needs == StateStrictWWM ||
+ State == StateStrictWQM || Needs == StateStrictWQM) {
+ // We must switch to or from Strict mode.
+ First = FirstStrict;
} else {
- // We only need to switch to/from WQM, so we can use FirstWQM
+ // We only need to switch to/from WQM, so we can use FirstWQM.
First = FirstWQM;
}
+ // Whether we need to save SCC depends on start and end states.
+ bool SaveSCC = false;
+ switch (State) {
+ case StateExact:
+ case StateStrictWWM:
+ case StateStrictWQM:
+ // Exact/Strict -> Strict: save SCC
+ // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
+ // Exact/Strict -> Exact: no save
+ SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
+ break;
+ case StateWQM:
+ // WQM -> Exact/Strict: save SCC
+ SaveSCC = !(Needs & StateWQM);
+ break;
+ default:
+ llvm_unreachable("Unknown state");
+ break;
+ }
MachineBasicBlock::iterator Before =
- prepareInsertion(MBB, First, II, Needs == StateWQM,
- Needs == StateExact || WQMFromExec);
-
- if (State == StateWWM) {
- assert(SavedNonWWMReg);
- fromWWM(MBB, Before, SavedNonWWMReg);
- LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
- SavedNonWWMReg = 0;
- State = NonWWMState;
+ prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
+
+ if (State & StateStrict) {
+ assert(State == StateStrictWWM || State == StateStrictWQM);
+ assert(SavedNonStrictReg);
+ fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
+
+ LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
+ SavedNonStrictReg = 0;
+ State = NonStrictState;
}
- if (Needs == StateWWM) {
- NonWWMState = State;
- assert(!SavedNonWWMReg);
- SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
- toWWM(MBB, Before, SavedNonWWMReg);
- State = StateWWM;
+ if (Needs & StateStrict) {
+ NonStrictState = State;
+ assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
+ assert(!SavedNonStrictReg);
+ SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
+
+ toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
+ State = Needs;
+
} else {
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM)) {
@@ -865,7 +1369,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
SavedWQMReg = MRI->createVirtualRegister(BoolRC);
}
- toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ toExact(MBB, Before, SavedWQMReg);
State = StateExact;
} else if (State == StateExact && (Needs & StateWQM) &&
!(Needs & StateExact)) {
@@ -879,17 +1383,18 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
}
State = StateWQM;
} else {
- // We can get here if we transitioned from WWM to a non-WWM state that
- // already matches our needs, but we shouldn't need to do anything.
+ // We can get here if we transitioned from StrictWWM to a
+ // non-StrictWWM state that already matches our needs, but we
+ // shouldn't need to do anything.
assert(Needs & State);
}
}
}
- if (Needs != (StateExact | StateWQM | StateWWM)) {
+ if (Needs != (StateExact | StateWQM | StateStrict)) {
if (Needs != (StateExact | StateWQM))
FirstWQM = IE;
- FirstWWM = IE;
+ FirstStrict = IE;
}
if (II == IE)
@@ -898,10 +1403,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
II = Next;
}
assert(!SavedWQMReg);
- assert(!SavedNonWWMReg);
+ assert(!SavedNonStrictReg);
}
-void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+void SIWholeQuadMode::lowerLiveMaskQueries() {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
Register Dest = MI->getOperand(0).getReg();
@@ -931,9 +1436,12 @@ void SIWholeQuadMode::lowerCopyInstrs() {
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
- // And make it implicitly depend on exec (like all VALU movs should do).
- MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- } else if (!MRI->isSSA()) {
+ // Check that it already implicitly depends on exec (like all VALU movs
+ // should do).
+ assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
+ return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
+ }));
+ } else {
// Remove early-clobber and exec dependency from simple SGPR copies.
// This allows some to be eliminated during/post RA.
LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
@@ -969,13 +1477,38 @@ void SIWholeQuadMode::lowerCopyInstrs() {
}
}
+void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
+ for (MachineInstr *MI : KillInstrs) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *SplitPoint = nullptr;
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_DEMOTE_I1:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
+ break;
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ SplitPoint = lowerKillF32(*MBB, *MI);
+ break;
+ default:
+ continue;
+ }
+ if (SplitPoint)
+ splitBlock(MBB, SplitPoint);
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
+ << " ------------- \n");
+ LLVM_DEBUG(MF.dump(););
+
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
- CallingConv = MF.getFunction().getCallingConv();
+ KillInstrs.clear();
+ StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -983,64 +1516,72 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
- XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B32;
+ XorOpc = AMDGPU::S_XOR_B32;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ WQMOpc = AMDGPU::S_WQM_B32;
Exec = AMDGPU::EXEC_LO;
} else {
AndOpc = AMDGPU::S_AND_B64;
- XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B64;
+ XorOpc = AMDGPU::S_XOR_B64;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ WQMOpc = AMDGPU::S_WQM_B64;
Exec = AMDGPU::EXEC;
}
- char GlobalFlags = analyzeFunction(MF);
- unsigned LiveMaskReg = 0;
- if (!(GlobalFlags & StateWQM)) {
- lowerLiveMaskQueries(Exec);
- if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
- return !LiveMaskQueries.empty();
- } else {
- // Store a copy of the original live mask when required
- MachineBasicBlock &Entry = MF.front();
- MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
-
- if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
- LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
- MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
- TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(Exec);
- LIS->InsertMachineInstrInMaps(*MI);
- }
+ const char GlobalFlags = analyzeFunction(MF);
+ const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
+
+ LiveMaskReg = Exec;
- lowerLiveMaskQueries(LiveMaskReg);
+  // Shader is simple and does not need any state changes or any complex lowering
+ if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
+ LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ lowerLiveMaskQueries();
+ return !LiveMaskQueries.empty();
+ }
- if (GlobalFlags == StateWQM) {
- // For a shader that needs only WQM, we can just set it once.
- auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
- TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
- : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
- LIS->InsertMachineInstrInMaps(*MI);
+ MachineBasicBlock &Entry = MF.front();
+ MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
- lowerCopyInstrs();
- // EntryMI may become invalid here
- return true;
- }
+ // Store a copy of the original live mask when required
+ if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
+ LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ MachineInstr *MI =
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
}
LLVM_DEBUG(printInfo());
+ lowerLiveMaskQueries();
lowerCopyInstrs();
- // Handle the general case
- for (auto BII : Blocks)
- processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+ // Shader only needs WQM
+ if (GlobalFlags == StateWQM) {
+ auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
+ lowerKillInstrs(true);
+ } else {
+ for (auto BII : Blocks)
+ processBlock(*BII.first, BII.first == &Entry);
+    // Lowering blocks causes block splitting, so perform it as a second pass.
+ for (auto BII : Blocks)
+ lowerBlock(*BII.first);
+ }
- if (LiveMaskReg)
+ // Compute live range for live mask
+ if (LiveMaskReg != Exec)
LIS->createAndComputeVirtRegInterval(LiveMaskReg);
// Physical registers like SCC aren't tracked by default anyway, so just
@@ -1048,5 +1589,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// the analysis results.
LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ // If we performed any kills then recompute EXEC
+ if (!KillInstrs.empty())
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+
return true;
}
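
Aside (editorial, not part of the patch): a minimal standalone C++ sketch of the reworked state tracking in SIWholeQuadMode above. The enum values mirror the flags added in the diff; the helper name needsTransition is made up, but its test matches the "if (!(Needs & State))" check used in processBlock.

    #include <cassert>

    // Mirrors the state flags added by the patch.
    enum : char {
      StateWQM = 0x1,
      StateStrictWWM = 0x2,
      StateStrictWQM = 0x4,
      StateExact = 0x8,
      StateStrict = StateStrictWWM | StateStrictWQM,
    };

    // A mode switch is needed when the current state is not among the states
    // the next instruction tolerates (its "Needs" mask).
    static bool needsTransition(char Needs, char State) {
      return !(Needs & State);
    }

    int main() {
      // Instructions that do not read EXEC tolerate every mode, so strict
      // modes can stay enabled across them.
      const char Tolerant = StateExact | StateWQM | StateStrict;
      assert(!needsTransition(Tolerant, StateStrictWQM));

      // An instruction that needs StrictWWM forces a switch out of plain WQM.
      assert(needsTransition(StateStrictWWM, StateWQM));
      return 0;
    }
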
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 5b8896c21832..8502ed61b366 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -57,10 +57,19 @@ class SM_Real <SM_Pseudo ps>
Instruction Opcode = !cast<Instruction>(NAME);
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let LGKM_CNT = ps.LGKM_CNT;
+ let SMRD = ps.SMRD;
+ let mayStore = ps.mayStore;
+ let mayLoad = ps.mayLoad;
+ let hasSideEffects = ps.hasSideEffects;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let SMRD = ps.SMRD;
+ let SchedRW = ps.SchedRW;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
+
+ let TSFlags = ps.TSFlags;
bit is_buffer = ps.is_buffer;
@@ -69,6 +78,7 @@ class SM_Real <SM_Pseudo ps>
bits<7> sdst;
bits<32> offset;
bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
+ bits<5> cpol;
}
class SM_Probe_Pseudo <string opName, dag ins, bit isImm>
@@ -120,8 +130,8 @@ multiclass SM_Pseudo_Loads<string opName,
RegisterClass dstClass> {
def _IMM : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
- " $sdst, $sbase, $offset$glc$dlc", []> {
+ (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol),
+ " $sdst, $sbase, $offset$cpol", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let PseudoInstr = opName # "_IMM";
@@ -131,8 +141,8 @@ multiclass SM_Pseudo_Loads<string opName,
def _SGPR : SM_Load_Pseudo <opName,
(outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
- " $sdst, $sbase, $offset$glc$dlc", []> {
+ (ins baseClass:$sbase, SReg_32:$soff, CPol:$cpol),
+ " $sdst, $sbase, $offset$cpol", []> {
let BaseClass = baseClass;
let PseudoInstr = opName # "_SGPR";
let has_glc = 1;
@@ -144,8 +154,8 @@ multiclass SM_Pseudo_Stores<string opName,
RegisterClass baseClass,
RegisterClass srcClass> {
def _IMM : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc, i1imm:$dlc),
- " $sdata, $sbase, $offset$glc$dlc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, CPol:$cpol),
+ " $sdata, $sbase, $offset$cpol", []> {
let offset_is_imm = 1;
let BaseClass = baseClass;
let SrcClass = srcClass;
@@ -153,8 +163,8 @@ multiclass SM_Pseudo_Stores<string opName,
}
def _SGPR : SM_Store_Pseudo <opName,
- (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc, i1imm:$dlc),
- " $sdata, $sbase, $offset$glc$dlc", []> {
+ (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, CPol:$cpol),
+ " $sdata, $sbase, $offset$cpol", []> {
let BaseClass = baseClass;
let SrcClass = srcClass;
let PseudoInstr = opName # "_SGPR";
@@ -227,24 +237,32 @@ class SM_Atomic_Pseudo <string opName,
let ScalarStore = 1;
let hasSideEffects = 1;
let maybeAtomic = 1;
+
+ let IsAtomicNoRet = !not(isRet);
+ let IsAtomicRet = isRet;
+
+ let AsmMatchConverter = "cvtSMEMAtomic";
}
class SM_Pseudo_Atomic<string opName,
RegisterClass baseClass,
RegisterClass dataClass,
bit isImm,
- bit isRet> :
+ bit isRet,
+ string opNameWithSuffix = opName # !if(isImm,
+ !if(isRet, "_IMM_RTN", "_IMM"),
+ !if(isRet, "_SGPR_RTN", "_SGPR")),
+ Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> :
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, DLC:$dlc),
- (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)),
- !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc",
- isRet> {
+ (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, CPolTy:$cpol),
+ (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, CPolTy:$cpol)),
+ !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset$cpol",
+ isRet>,
+ AtomicNoRet <opNameWithSuffix, isRet> {
let offset_is_imm = isImm;
- let PseudoInstr = opName # !if(isImm,
- !if(isRet, "_IMM_RTN", "_IMM"),
- !if(isRet, "_SGPR_RTN", "_SGPR"));
+ let PseudoInstr = opNameWithSuffix;
let Constraints = !if(isRet, "$sdst = $sdata", "");
let DisableEncoding = !if(isRet, "$sdata", "");
@@ -456,13 +474,13 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps,
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_si : SMRD_Real_si <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol);
}
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _SGPR_si : SMRD_Real_si <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -490,32 +508,31 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
: SM_Real<ps>
, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
, Enc64 {
- bit glc;
-
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
- let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
let Inst{17} = imm;
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
// VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed.
// Offset value is corrected accordingly when offset is encoded/decoded.
- let Inst{52-32} = !if(ps.has_offset, offset{20-0}, ?);
+ let Inst{38-32} = !if(ps.has_offset, offset{6-0}, ?);
+ let Inst{52-39} = !if(ps.has_offset, !if(imm, offset{20-7}, ?), ?);
}
multiclass SM_Real_Loads_vi<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -533,11 +550,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -589,15 +606,16 @@ defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">;
//===----------------------------------------------------------------------===//
class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
- : SMEM_Real_vi <op, ps> {
+ : SMEM_Real_vi <op, ps>,
+ AtomicNoRet <!subst("_RTN","",NAME), ps.glc> {
bits<7> sdata;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let glc = ps.glc;
- let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+ let cpol{CPolBit.GLC} = ps.glc;
+ let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
}
multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
@@ -686,13 +704,7 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
let AssemblerPredicate = isGFX7Only;
let DecoderNamespace = "GFX7";
- let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc);
-
- let LGKM_CNT = ps.LGKM_CNT;
- let mayLoad = ps.mayLoad;
- let mayStore = ps.mayStore;
- let hasSideEffects = ps.hasSideEffects;
- let SchedRW = ps.SchedRW;
+ let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, CPol:$cpol);
let Inst{7-0} = 0xff;
let Inst{8} = 0;
@@ -764,26 +776,26 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
>;
// 2. 32-bit IMM offset on CI
def : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0, 0))> {
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isGFX7Only];
}
// 3. SGPR offset
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
>;
// 4. No offset
def : GCNPat <
(vt (smrd_load (i64 SReg_64:$sbase))),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0, 0))
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
>;
}
@@ -791,8 +803,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 1. Offset as an immediate
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_glc $cachepolicy),
- (extract_dlc $cachepolicy)))> {
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
let AddedComplexity = 2;
}
@@ -800,7 +811,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
def : GCNPat <
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
- (extract_glc $cachepolicy), (extract_dlc $cachepolicy))> {
+ (extract_cpol $cachepolicy))> {
let OtherPredicates = [isGFX7Only];
let AddedComplexity = 1;
}
@@ -808,8 +819,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 3. Offset loaded in an 32bit SGPR
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_glc $cachepolicy),
- (extract_dlc $cachepolicy)))
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_cpol $cachepolicy)))
>;
}
@@ -858,14 +868,16 @@ def : GCNPat <
>;
} // let OtherPredicates = [HasSMemTimeInst]
-let OtherPredicates = [HasNoSMemTimeInst] in {
+let OtherPredicates = [HasShaderCyclesRegister] in {
def : GCNPat <
(i64 (readcyclecounter)),
(REG_SEQUENCE SReg_64,
(S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0,
- (S_MOV_B32 (i32 0)), sub1)
->;
-} // let OtherPredicates = [HasNoSMemTimeInst]
+ (S_MOV_B32 (i32 0)), sub1)> {
+ // Prefer this to s_memtime because it has lower and more predictable latency.
+ let AddedComplexity = 1;
+}
+} // let OtherPredicates = [HasShaderCyclesRegister]
//===----------------------------------------------------------------------===//
// GFX10.
@@ -873,16 +885,13 @@ def : GCNPat <
class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 {
- bit glc;
- bit dlc;
-
let AssemblerPredicate = isGFX10Plus;
let DecoderNamespace = "GFX10";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
- let Inst{14} = !if(ps.has_dlc, dlc, ?);
- let Inst{16} = !if(ps.has_glc, glc, ?);
+ let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?);
+ let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
let Inst{25-18} = op;
let Inst{31-26} = 0x3d;
let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?);
@@ -894,10 +903,10 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -914,11 +923,11 @@ multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
}
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol);
}
}
@@ -973,18 +982,18 @@ defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">;
defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">;
class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
- : SMEM_Real_gfx10 <op, ps> {
+ : SMEM_Real_gfx10 <op, ps>,
+ AtomicNoRet <!subst("_RTN","",NAME), ps.glc> {
bits<7> sdata;
- bit dlc;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let glc = ps.glc;
+ let cpol{CPolBit.GLC} = ps.glc;
- let Inst{14} = !if(ps.has_dlc, dlc, 0);
- let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0});
+ let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
+ let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
}
multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 7426af931a62..e9697017aac0 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -59,6 +59,8 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> :
real_name # " " # ps.AsmOperands, []>,
Enc32 {
+ let SALU = 1;
+ let SOP1 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Size = 4;
@@ -66,6 +68,9 @@ class SOP1_Real<bits<8> op, SOP1_Pseudo ps, string real_name = ps.Mnemonic> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
// encoding
bits<7> sdst;
@@ -157,7 +162,7 @@ let isMoveImm = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def S_MOV_B32 : SOP1_32 <"s_mov_b32">;
def S_MOV_B64 : SOP1_64 <"s_mov_b64">;
- } // End isRematerializeable = 1
+ } // End isReMaterializable = 1
let Uses = [SCC] in {
def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">;
@@ -192,10 +197,14 @@ def : GCNPat <
>;
}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def S_BREV_B32 : SOP1_32 <"s_brev_b32",
[(set i32:$sdst, (bitreverse i32:$src0))]
>;
-def S_BREV_B64 : SOP1_64 <"s_brev_b64">;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64",
+ [(set i64:$sdst, (bitreverse i64:$src0))]
+>;
+} // End isReMaterializable = 1, isAsCheapAsAMove = 1
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
@@ -208,6 +217,7 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
>;
} // End Defs = [SCC]
+let isReMaterializable = 1 in {
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
@@ -235,11 +245,13 @@ def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
[(set i32:$sdst, (sext_inreg i32:$src0, i16))]
>;
+} // End isReMaterializable = 1
def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32", [], 1>;
def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
+
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
[(set i64:$sdst, (int_amdgcn_s_getpc))]
>;
@@ -291,7 +303,9 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
let Defs = [SCC] in {
-def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
+def S_ABS_I32 : SOP1_32 <"s_abs_i32",
+ [(set i32:$sdst, (abs i32:$src0))]
+ >;
} // End Defs = [SCC]
let SubtargetPredicate = HasVGPRIndexMode in {
@@ -309,6 +323,7 @@ let SubtargetPredicate = isGFX9Plus in {
def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">;
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
+ let isReMaterializable = 1 in
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
} // End SubtargetPredicate = isGFX9Plus
@@ -363,14 +378,19 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # " " # ps.AsmOperands, []>,
Enc32 {
+ let SALU = 1;
+ let SOP2 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let TSFlags = ps.TSFlags;
+ let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
// encoding
bits<7> sdst;
@@ -596,6 +616,7 @@ def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
>;
} // End Defs = [SCC]
+let isReMaterializable = 1 in {
def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
[(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
@@ -605,7 +626,7 @@ def S_MUL_I32 : SOP2_32 <"s_mul_i32",
[(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
let isCommutable = 1;
}
-
+} // End isReMaterializable = 1
} // End AddedComplexity = 1
let Defs = [SCC] in {
@@ -640,9 +661,11 @@ let SubtargetPredicate = isGFX8GFX9 in {
}
let SubtargetPredicate = isGFX9Plus in {
- def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
- def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
- def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+ let isReMaterializable = 1 in {
+ def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
+ def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
+ def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+ } // End isReMaterializable = 1
let Defs = [SCC] in {
def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32",
@@ -659,12 +682,12 @@ let SubtargetPredicate = isGFX9Plus in {
>;
} // End Defs = [SCC]
- let isCommutable = 1 in {
+ let isCommutable = 1, isReMaterializable = 1 in {
def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32",
[(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>;
def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32",
[(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>;
- }
+ } // End isCommutable = 1, isReMaterializable = 1
} // End SubtargetPredicate = isGFX9Plus
//===----------------------------------------------------------------------===//
@@ -693,6 +716,8 @@ class SOPK_Pseudo <string opName, dag outs, dag ins,
class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList,
ps.Mnemonic # " " # ps.AsmOperands, []> {
+ let SALU = 1;
+ let SOPK = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -701,6 +726,11 @@ class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
let AsmMatchConverter = ps.AsmMatchConverter;
let DisableEncoding = ps.DisableEncoding;
let Constraints = ps.Constraints;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let isBranch = ps.isBranch;
+ let isCall = ps.isCall;
// encoding
bits<7> sdst;
@@ -947,15 +977,20 @@ class SOPC_Real<bits<7> op, SOPC_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # " " # ps.AsmOperands, []>,
Enc32 {
+ let SALU = 1;
+ let SOPC = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let OtherPredicates = ps.OtherPredicates;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let TSFlags = ps.TSFlags;
+ let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
// encoding
bits<8> src0;
@@ -1075,15 +1110,20 @@ class SOPPRelaxTable <bit isRelaxed, string keyName, string gfxip> {
class SOPP_Real<bits<7> op, SOPP_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
real_name # ps.AsmOperands, []> {
+ let SALU = 1;
+ let SOPP = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
// copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let OtherPredicates = ps.OtherPredicates;
- let AsmMatchConverter = ps.AsmMatchConverter;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
+ let AsmMatchConverter = ps.AsmMatchConverter;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let TSFlags = ps.TSFlags;
+ let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
bits <16> simm16;
}
@@ -1226,7 +1266,8 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
-def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i16imm:$simm16), "$simm16">;
+def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
+ [(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
// On SI the documentation says sleep for approximately 64 * low 2
@@ -1433,8 +1474,9 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
//===----------------------------------------------------------------------===//
multiclass SOP1_Real_gfx10<bits<8> op> {
- def _gfx10 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx10 : SOP1_Real<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
@@ -1462,8 +1504,9 @@ defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
- def _gfx6_gfx7 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP1_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOP1_Real<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
@@ -1524,8 +1567,9 @@ defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx10<bits<7> op> {
- def _gfx10 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP2_Pseudo>(NAME);
+ def _gfx10 : SOP2_Real<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
@@ -1543,8 +1587,9 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
- def _gfx6_gfx7 : SOP2_Real<op, !cast<SOP_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOP_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOP_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOP2_Real<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
@@ -1600,13 +1645,15 @@ defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
//===----------------------------------------------------------------------===//
multiclass SOPK_Real32_gfx10<bits<5> op> {
- def _gfx10 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx10 : SOPK_Real32<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
multiclass SOPK_Real64_gfx10<bits<5> op> {
- def _gfx10 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx10 : SOPK_Real64<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
defm S_VERSION : SOPK_Real32_gfx10<0x001>;
@@ -1623,13 +1670,15 @@ defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
//===----------------------------------------------------------------------===//
multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
- def _gfx6_gfx7 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPK_Real32<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
- def _gfx6_gfx7 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPK_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPK_Real64<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
@@ -1665,21 +1714,24 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
//===----------------------------------------------------------------------===//
multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic> {
- def _gfx6_gfx7 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPP_Real_32<op, ps, real_name>,
+ Select_gfx6_gfx7<ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _vi : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _vi : SOPP_Real_32<op, ps, real_name>,
+ Select_vi<ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_32_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _gfx10 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx10 : SOPP_Real_32<op, ps, real_name>,
+ Select_gfx10<ps.Mnemonic>,
+ SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
}
multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
@@ -1693,21 +1745,24 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name =
//64 bit encodings, for Relaxation
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _gfx6_gfx7 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx6_gfx7<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx6_gfx7">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPP_Real_64<op, ps, real_name>,
+ Select_gfx6_gfx7<ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _vi : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_vi<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_vi">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _vi : SOPP_Real_64<op, ps, real_name>,
+ Select_vi<ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_64_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> {
- def _gfx10 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>,
- Select_gfx10<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
- SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx10">;
+ defvar ps = !cast<SOPP_Pseudo>(NAME);
+ def _gfx10 : SOPP_Real_64<op, ps, real_name>,
+ Select_gfx10<ps.Mnemonic>,
+ SOPPRelaxTable<1, ps.KeyName, "_gfx10">;
}
multiclass SOPP_Real_64_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> :
@@ -1727,18 +1782,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>;
defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">;
-defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>;
defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
-defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>;
-defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>;
-defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>;
-defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>;
-defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>;
-defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>;
-defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>;
-defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>;
-defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>;
-defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>;
defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>;
defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>;
@@ -1765,23 +1809,40 @@ defm S_ROUND_MODE : SOPP_Real_32_gfx10<0x024>;
defm S_DENORM_MODE : SOPP_Real_32_gfx10<0x025>;
defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>;
+let isBranch = 1 in {
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>;
+defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>;
+defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>;
+defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>;
+defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x01A>;
+}
+
//===----------------------------------------------------------------------===//
// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
//===----------------------------------------------------------------------===//
multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
- def _gfx6_gfx7 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx6_gfx7<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _gfx6_gfx7 : SOPC_Real<op, ps>,
+ Select_gfx6_gfx7<ps.Mnemonic>;
}
multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
- def _vi : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_vi<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _vi : SOPC_Real<op, ps>,
+ Select_vi<ps.Mnemonic>;
}
multiclass SOPC_Real_gfx10<bits<7> op> {
- def _gfx10 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx10<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ defvar ps = !cast<SOPC_Pseudo>(NAME);
+ def _gfx10 : SOPC_Real<op, ps>,
+ Select_gfx10<ps.Mnemonic>;
}
multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c8a85d76a55b..0bee9022975e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -15,19 +15,19 @@ namespace AMDGPU {
namespace SendMsg {
// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
-const char* const IdSymbolic[] = {
+const char *const IdSymbolic[ID_GAPS_LAST_] = {
nullptr,
"MSG_INTERRUPT",
"MSG_GS",
"MSG_GS_DONE",
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- nullptr,
+ "MSG_SAVEWAVE",
+ "MSG_STALL_WAVE_GEN",
+ "MSG_HALT_WAVES",
+ "MSG_ORDERED_PS_DONE",
+ "MSG_EARLY_PRIM_DEALLOC",
"MSG_GS_ALLOC_REQ",
"MSG_GET_DOORBELL",
- nullptr,
+ "MSG_GET_DDID",
nullptr,
nullptr,
nullptr,
@@ -35,7 +35,7 @@ const char* const IdSymbolic[] = {
};
// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
-const char* const OpSysSymbolic[] = {
+const char *const OpSysSymbolic[OP_SYS_LAST_] = {
nullptr,
"SYSMSG_OP_ECC_ERR_INTERRUPT",
"SYSMSG_OP_REG_RD",
@@ -43,7 +43,7 @@ const char* const OpSysSymbolic[] = {
"SYSMSG_OP_TTRACE_PC"
};
-const char* const OpGsSymbolic[] = {
+const char *const OpGsSymbolic[OP_GS_LAST_] = {
"GS_OP_NOP",
"GS_OP_CUT",
"GS_OP_EMIT",
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index 3eb27c5e5f42..d1deb570a938 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#include "SIDefines.h"
+
namespace llvm {
class StringLiteral;
@@ -17,9 +19,9 @@ namespace AMDGPU {
namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
-extern const char* const IdSymbolic[];
-extern const char* const OpSysSymbolic[];
-extern const char* const OpGsSymbolic[];
+extern const char *const IdSymbolic[ID_GAPS_LAST_];
+extern const char *const OpSysSymbolic[OP_SYS_LAST_];
+extern const char *const OpGsSymbolic[OP_GS_LAST_];
} // namespace SendMsg
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4c1e4dec7ecb..29bbf50cbfdc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -30,7 +30,8 @@
static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
"amdhsa-code-object-version", llvm::cl::Hidden,
- llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
+ llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
+ llvm::cl::ZeroOrMore);
namespace {
@@ -96,23 +97,36 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
case 3:
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ case 4:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
default:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
+ Twine(AmdhsaCodeObjectVersion));
}
}
bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
return false;
}
bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
- if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
- return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ return false;
+}
+
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
+ if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
return false;
}
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
+ return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+}
+
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
@@ -141,6 +155,34 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
return NewInfo ? NewInfo->Opcode : -1;
}
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported) {
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += divideCeil(AddrComponents, 2);
+ else
+ AddrWords += AddrComponents;
+
+ // Note: For subtargets that support A16 but not G16, enabling A16 also
+ // enables 16-bit gradients.
+ // For subtargets that support A16 (operand) and G16 (done with a different
+ // instruction encoding), they are independent.
+
+ if (BaseOpcode->Gradients) {
+ if ((IsA16 && !IsG16Supported) || BaseOpcode->G16)
+ // There are two gradients per coordinate; we pack them separately.
+ // For the 3D case,
+ // we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
+ AddrWords += alignTo<2>(Dim->NumGradients / 2);
+ else
+ AddrWords += Dim->NumGradients;
+ }
+ return AddrWords;
+}
+
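The packing note inside getAddrSizeMIMGOp above is easiest to see with concrete numbers. The following standalone sketch mirrors the arithmetic with assumed 2D dimension parameters (2 coordinates, 4 gradients); the real values come from the MIMGDimInfo and MIMGBaseOpcodeInfo tables, and the real code also packs gradients when the base opcode is itself a G16 variant.

// Standalone sketch of the address-word computation described above.
// The dimension and opcode parameters are assumed for illustration only.
#include <cstdio>

static unsigned divideCeil(unsigned N, unsigned D) { return (N + D - 1) / D; }
static unsigned alignTo2(unsigned N) { return (N + 1) & ~1u; }

static unsigned addrWords(unsigned NumExtraArgs, unsigned NumCoords,
                          unsigned NumGradients, bool HasLodOrClampOrMip,
                          bool HasGradients, bool IsA16, bool IsG16Supported) {
  unsigned AddrWords = NumExtraArgs;
  unsigned AddrComponents = NumCoords + (HasLodOrClampOrMip ? 1 : 0);
  AddrWords += IsA16 ? divideCeil(AddrComponents, 2) : AddrComponents;
  if (HasGradients) {
    // 16-bit gradients are packed two per dword. (The real code also takes
    // this path for explicit G16 opcodes.)
    if (IsA16 && !IsG16Supported)
      AddrWords += alignTo2(NumGradients / 2);
    else
      AddrWords += NumGradients;
  }
  return AddrWords;
}

int main() {
  // Hypothetical 2D sample with derivatives and an LOD clamp, A16 on, no G16:
  // 2 coords + 1 clamp packed into 2 dwords, 4 gradients packed into 2 dwords.
  std::printf("%u address words\n",
              addrWords(/*NumExtraArgs=*/0, /*NumCoords=*/2, /*NumGradients=*/4,
                        /*HasLodOrClampOrMip=*/true, /*HasGradients=*/true,
                        /*IsA16=*/true, /*IsG16Supported=*/false));
  return 0;
}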
struct MUBUFInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
@@ -148,6 +190,7 @@ struct MUBUFInfo {
bool has_vaddr;
bool has_srsrc;
bool has_soffset;
+ bool IsBufferInv;
};
struct MTBUFInfo {
@@ -164,12 +207,23 @@ struct SMInfo {
bool IsBuffer;
};
+struct VOPInfo {
+ uint16_t Opcode;
+ bool IsSingle;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#define GET_SMInfoTable_DECL
#define GET_SMInfoTable_IMPL
+#define GET_VOP1InfoTable_DECL
+#define GET_VOP1InfoTable_IMPL
+#define GET_VOP2InfoTable_DECL
+#define GET_VOP2InfoTable_IMPL
+#define GET_VOP3InfoTable_DECL
+#define GET_VOP3InfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -232,11 +286,31 @@ bool getMUBUFHasSoffset(unsigned Opc) {
return Info ? Info->has_soffset : false;
}
+bool getMUBUFIsBufferInv(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->IsBufferInv : false;
+}
+
bool getSMEMIsBuffer(unsigned Opc) {
const SMInfo *Info = getSMEMOpcodeHelper(Opc);
return Info ? Info->IsBuffer : false;
}
+bool getVOP1IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP1OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP2IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP2OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
+bool getVOP3IsSingle(unsigned Opc) {
+ const VOPInfo *Info = getVOP3OpcodeHelper(Opc);
+ return Info ? Info->IsSingle : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -247,7 +321,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
- : XnackSetting(TargetIDSetting::Any), SramEccSetting(TargetIDSetting::Any) {
+ : STI(STI), XnackSetting(TargetIDSetting::Any),
+ SramEccSetting(TargetIDSetting::Any) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -334,25 +409,109 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
}
}
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
- auto TargetTriple = STI->getTargetTriple();
- auto Version = getIsaVersion(STI->getCPU());
+std::string AMDGPUTargetID::toString() const {
+ std::string StringRep = "";
+ raw_string_ostream StreamRep(StringRep);
- Stream << TargetTriple.getArchName() << '-'
- << TargetTriple.getVendorName() << '-'
- << TargetTriple.getOSName() << '-'
- << TargetTriple.getEnvironmentName() << '-'
- << "gfx"
- << Version.Major
- << Version.Minor
- << Version.Stepping;
+ auto TargetTriple = STI.getTargetTriple();
+ auto Version = getIsaVersion(STI.getCPU());
- if (hasXNACK(*STI))
- Stream << "+xnack";
- if (hasSRAMECC(*STI))
- Stream << "+sramecc";
+ StreamRep << TargetTriple.getArchName() << '-'
+ << TargetTriple.getVendorName() << '-'
+ << TargetTriple.getOSName() << '-'
+ << TargetTriple.getEnvironmentName() << '-';
- Stream.flush();
+ std::string Processor = "";
+ // TODO: The following else statement is present because we used various
+ // alias names for GPUs up until GFX9 (e.g. 'fiji' is the same as 'gfx803').
+ // Remove once all aliases are removed from GCNProcessors.td.
+ if (Version.Major >= 9)
+ Processor = STI.getCPU().str();
+ else
+ Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) +
+ Twine(Version.Stepping))
+ .str();
+
+ std::string Features = "";
+ if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
+ switch (*HsaAbiVersion) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ // Code object V2 only supported specific processors and had fixed
+ // settings for XNACK.
+ if (Processor == "gfx600") {
+ } else if (Processor == "gfx601") {
+ } else if (Processor == "gfx602") {
+ } else if (Processor == "gfx700") {
+ } else if (Processor == "gfx701") {
+ } else if (Processor == "gfx702") {
+ } else if (Processor == "gfx703") {
+ } else if (Processor == "gfx704") {
+ } else if (Processor == "gfx705") {
+ } else if (Processor == "gfx801") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx802") {
+ } else if (Processor == "gfx803") {
+ } else if (Processor == "gfx805") {
+ } else if (Processor == "gfx810") {
+ if (!isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " without XNACK");
+ } else if (Processor == "gfx900") {
+ if (isXnackOnOrAny())
+ Processor = "gfx901";
+ } else if (Processor == "gfx902") {
+ if (isXnackOnOrAny())
+ Processor = "gfx903";
+ } else if (Processor == "gfx904") {
+ if (isXnackOnOrAny())
+ Processor = "gfx905";
+ } else if (Processor == "gfx906") {
+ if (isXnackOnOrAny())
+ Processor = "gfx907";
+ } else if (Processor == "gfx90c") {
+ if (isXnackOnOrAny())
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor +
+ " with XNACK being ON or ANY");
+ } else {
+ report_fatal_error(
+ "AMD GPU code object V2 does not support processor " + Processor);
+ }
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ // xnack.
+ if (isXnackOnOrAny())
+ Features += "+xnack";
+ // In code object v2 and v3, the "sramecc" feature was spelled with a
+ // hyphen ("sram-ecc").
+ if (isSramEccOnOrAny())
+ Features += "+sram-ecc";
+ break;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ // sramecc.
+ if (getSramEccSetting() == TargetIDSetting::Off)
+ Features += ":sramecc-";
+ else if (getSramEccSetting() == TargetIDSetting::On)
+ Features += ":sramecc+";
+ // xnack.
+ if (getXnackSetting() == TargetIDSetting::Off)
+ Features += ":xnack-";
+ else if (getXnackSetting() == TargetIDSetting::On)
+ Features += ":xnack+";
+ break;
+ default:
+ break;
+ }
+ }
+
+ StreamRep << Processor << Features;
+
+ StreamRep.flush();
+ return StringRep;
}
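toString() above concatenates the target triple, a processor name, and the ABI-version-dependent feature settings. The sketch below only illustrates the resulting string shape for a code object V4 target; the triple components, processor name, and settings are assumed values rather than anything queried from a real MCSubtargetInfo.

// Illustrative sketch of the V4 target ID string shape built by toString().
#include <cstdio>
#include <string>

static std::string buildTargetID(const std::string &Arch,
                                 const std::string &Vendor,
                                 const std::string &OS, const std::string &Env,
                                 const std::string &Processor,
                                 int SramEcc, int Xnack) { // -1 off, 0 any, 1 on
  std::string S = Arch + '-' + Vendor + '-' + OS + '-' + Env + '-' + Processor;
  if (SramEcc)
    S += SramEcc > 0 ? ":sramecc+" : ":sramecc-";
  if (Xnack)
    S += Xnack > 0 ? ":xnack+" : ":xnack-";
  return S;
}

int main() {
  // Prints: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack- (assumed example values)
  std::printf("%s\n", buildTargetID("amdgcn", "amd", "amdhsa", "", "gfx90a",
                                    /*SramEcc=*/1, /*Xnack=*/-1).c_str());
  return 0;
}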
unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
@@ -402,6 +561,8 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
+ if (isGFX90A(*STI))
+ return 8;
if (!isGFX10Plus(*STI))
return 10;
return hasGFX10_3Insts(*STI) ? 16 : 20;
@@ -531,6 +692,9 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
+
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -543,6 +707,8 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 8;
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
@@ -552,12 +718,16 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
if (!isGFX10Plus(*STI))
return 256;
return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+ return 512;
return 256;
}
@@ -653,6 +823,11 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
}
+ if (AMDGPU::isGFX90A(*STI)) {
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
+ }
return KD;
}
@@ -1049,23 +1224,32 @@ int64_t getMsgId(const StringRef Name) {
return ID_UNKNOWN_;
}
-static bool isValidMsgId(int64_t MsgId) {
- return (ID_GAPS_FIRST_ <= MsgId && MsgId < ID_GAPS_LAST_) && IdSymbolic[MsgId];
-}
-
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) {
if (Strict) {
- if (MsgId == ID_GS_ALLOC_REQ || MsgId == ID_GET_DOORBELL)
+ switch (MsgId) {
+ case ID_SAVEWAVE:
+ return isVI(STI) || isGFX9Plus(STI);
+ case ID_STALL_WAVE_GEN:
+ case ID_HALT_WAVES:
+ case ID_ORDERED_PS_DONE:
+ case ID_GS_ALLOC_REQ:
+ case ID_GET_DOORBELL:
return isGFX9Plus(STI);
- else
- return isValidMsgId(MsgId);
+ case ID_EARLY_PRIM_DEALLOC:
+ return isGFX9(STI);
+ case ID_GET_DDID:
+ return isGFX10Plus(STI);
+ default:
+ return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId];
+ }
} else {
return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId);
}
}
StringRef getMsgName(int64_t MsgId) {
- return isValidMsgId(MsgId)? IdSymbolic[MsgId] : "";
+ assert(0 <= MsgId && MsgId < ID_GAPS_LAST_);
+ return IdSymbolic[MsgId];
}
int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
@@ -1080,7 +1264,9 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
return OP_UNKNOWN_;
}
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict) {
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict) {
+ assert(isValidMsgId(MsgId, STI, Strict));
if (!Strict)
return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);
@@ -1103,7 +1289,9 @@ StringRef getMsgOpName(int64_t MsgId, int64_t OpId) {
return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
}
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict) {
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict) {
+ assert(isValidMsgOp(MsgId, OpId, STI, Strict));
if (!Strict)
return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId);
@@ -1156,6 +1344,17 @@ unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
+bool getHasColorExport(const Function &F) {
+ // As a safe default always respond as if PS has color exports.
+ return getIntegerAttribute(
+ F, "amdgpu-color-export",
+ F.getCallingConv() == CallingConv::AMDGPU_PS ? 1 : 0) != 0;
+}
+
+bool getHasDepthExport(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-depth-export", 0) != 0;
+}
+
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
@@ -1259,6 +1458,10 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding];
+}
+
bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
}
@@ -1267,6 +1470,14 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
}
+bool isGFX90A(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+}
+
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
@@ -1374,6 +1585,9 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return true;
default:
return false;
@@ -1413,41 +1627,67 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VReg_64RegClassID:
case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
+ case AMDGPU::VReg_64_Align2RegClassID:
+ case AMDGPU::AReg_64_Align2RegClassID:
return 64;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
case AMDGPU::AReg_96RegClassID:
+ case AMDGPU::VReg_96_Align2RegClassID:
+ case AMDGPU::AReg_96_Align2RegClassID:
+ case AMDGPU::AV_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
case AMDGPU::AReg_128RegClassID:
+ case AMDGPU::VReg_128_Align2RegClassID:
+ case AMDGPU::AReg_128_Align2RegClassID:
+ case AMDGPU::AV_128RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
case AMDGPU::AReg_160RegClassID:
+ case AMDGPU::VReg_160_Align2RegClassID:
+ case AMDGPU::AReg_160_Align2RegClassID:
+ case AMDGPU::AV_160RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
case AMDGPU::VReg_192RegClassID:
case AMDGPU::AReg_192RegClassID:
+ case AMDGPU::VReg_192_Align2RegClassID:
+ case AMDGPU::AReg_192_Align2RegClassID:
return 192;
+ case AMDGPU::SGPR_224RegClassID:
+ case AMDGPU::SReg_224RegClassID:
+ case AMDGPU::VReg_224RegClassID:
+ case AMDGPU::AReg_224RegClassID:
+ case AMDGPU::VReg_224_Align2RegClassID:
+ case AMDGPU::AReg_224_Align2RegClassID:
+ return 224;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
case AMDGPU::AReg_256RegClassID:
+ case AMDGPU::VReg_256_Align2RegClassID:
+ case AMDGPU::AReg_256_Align2RegClassID:
return 256;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
+ case AMDGPU::VReg_512_Align2RegClassID:
+ case AMDGPU::AReg_512_Align2RegClassID:
return 512;
case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
+ case AMDGPU::VReg_1024_Align2RegClassID:
+ case AMDGPU::AReg_1024_Align2RegClassID:
return 1024;
default:
llvm_unreachable("Unexpected register class");
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f9378693cf48..72c872dec5ba 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -44,6 +44,12 @@ bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 3,
/// false otherwise.
bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 4,
+/// false otherwise.
+bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3 or 4,
+/// false otherwise.
+bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
struct GcnBufferFormatInfo {
unsigned Format;
@@ -78,6 +84,7 @@ enum class TargetIDSetting {
class AMDGPUTargetID {
private:
+ const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;
@@ -145,10 +152,10 @@ public:
void setTargetIDFromFeaturesString(StringRef FS);
void setTargetIDFromTargetIDStream(StringRef TargetID);
-};
-/// Streams isa version string for given subtarget \p STI into \p Stream.
-void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
+ /// \returns String representation of an object.
+ std::string toString() const;
+};
/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
@@ -284,6 +291,7 @@ struct MIMGBaseOpcodeInfo {
bool Coordinates;
bool LodOrClampOrMip;
bool HasD16;
+ bool MSAA;
};
LLVM_READONLY
@@ -293,6 +301,7 @@ struct MIMGDimInfo {
MIMGDim Dim;
uint8_t NumCoords;
uint8_t NumGradients;
+ bool MSAA;
bool DA;
uint8_t Encoding;
const char *AsmSuffix;
@@ -338,6 +347,11 @@ int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
+LLVM_READONLY
+unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode,
+ const MIMGDimInfo *Dim, bool IsA16,
+ bool IsG16Supported);
+
struct MIMGInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
@@ -386,9 +400,21 @@ LLVM_READONLY
bool getMUBUFHasSoffset(unsigned Opc);
LLVM_READONLY
+bool getMUBUFIsBufferInv(unsigned Opc);
+
+LLVM_READONLY
bool getSMEMIsBuffer(unsigned Opc);
LLVM_READONLY
+bool getVOP1IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP2IsSingle(unsigned Opc);
+
+LLVM_READONLY
+bool getVOP3IsSingle(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -459,6 +485,14 @@ struct Waitcnt {
return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
}
+ bool hasWaitExceptVsCnt() const {
+ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
+ }
+
+ bool hasWaitVsCnt() const {
+ return VsCnt != ~0u;
+ }
+
bool dominates(const Waitcnt &Other) const {
return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
@@ -627,10 +661,12 @@ LLVM_READNONE
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
-bool isValidMsgOp(int64_t MsgId, int64_t OpId, bool Strict = true);
+bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
+ bool Strict = true);
LLVM_READNONE
-bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, bool Strict = true);
+bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
+ const MCSubtargetInfo &STI, bool Strict = true);
LLVM_READNONE
bool msgRequiresOp(int64_t MsgId);
@@ -653,6 +689,10 @@ uint64_t encodeMsg(uint64_t MsgId,
unsigned getInitialPSInputAddr(const Function &F);
+bool getHasColorExport(const Function &F);
+
+bool getHasDepthExport(const Function &F);
+
LLVM_READNONE
bool isShader(CallingConv::ID CC);
@@ -701,8 +741,11 @@ bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX90A(const MCSubtargetInfo &STI);
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -746,12 +789,17 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
return 8;
case AMDGPU::OPERAND_REG_IMM_INT16:
@@ -847,6 +895,11 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
const GCNSubtarget *Subtarget,
Align Alignment = Align(4));
+LLVM_READNONE
+inline bool isLegal64BitDPPControl(unsigned DC) {
+ return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
+}
+
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 000000000000..da8fcf3900bb
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,355 @@
+//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULDSUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// A helper class for collecting all reachable callees for each kernel defined
+// within the module.
+class CollectReachableCallees {
+ Module &M;
+ CallGraph CG;
+ SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
+
+ // Collect all address taken functions within the module.
+ void collectAddressTakenFunctions() {
+ auto *ECNode = CG.getExternalCallingNode();
+
+ for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
+ auto *CGN = GI->second;
+ auto *F = CGN->getFunction();
+ if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
+ continue;
+ AddressTakenFunctions.insert(CGN);
+ }
+ }
+
+ // For given kernel, collect all its reachable non-kernel functions.
+ SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
+ SmallPtrSet<Function *, 8> ReachableCallees;
+
+ // Call graph node which represents this kernel.
+ auto *KCGN = CG[K];
+
+ // Go through all call graph nodes reachable from the node representing
+ // this kernel and visit all their call sites. If a call site is direct,
+ // add the corresponding callee to the reachable callee set; if it is
+ // indirect, resolve it to the set of potentially reachable callees, add
+ // those to the reachable callee set, and repeat the process for the newly
+ // added callee nodes.
+ //
+ // FIXME: Need to handle bit-casted function pointers.
+ //
+ SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
+ SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+ while (!CGNStack.empty()) {
+ auto *CGN = CGNStack.pop_back_val();
+
+ if (!VisitedCGNodes.insert(CGN).second)
+ continue;
+
+ for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+ auto *RCB = cast<CallBase>(GI->first.getValue());
+ auto *RCGN = GI->second;
+
+ if (auto *DCallee = RCGN->getFunction()) {
+ ReachableCallees.insert(DCallee);
+ } else if (RCB->isIndirectCall()) {
+ auto *RCBFTy = RCB->getFunctionType();
+ for (auto *ACGN : AddressTakenFunctions) {
+ auto *ACallee = ACGN->getFunction();
+ if (ACallee->getFunctionType() == RCBFTy) {
+ ReachableCallees.insert(ACallee);
+ CGNStack.append(df_begin(ACGN), df_end(ACGN));
+ }
+ }
+ }
+ }
+ }
+
+ return ReachableCallees;
+ }
+
+public:
+ explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
+ // Collect address taken functions.
+ collectAddressTakenFunctions();
+ }
+
+ void collectReachableCallees(
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ // Collect reachable callee set for each kernel defined in the module.
+ for (Function &F : M.functions()) {
+ if (!AMDGPU::isKernelCC(&F))
+ continue;
+ Function *K = &F;
+ KernelToCallees[K] = collectReachableCallees(K);
+ }
+ }
+};
+
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
+ CollectReachableCallees CRC{M};
+ CRC.collectReachableCallees(KernelToCallees);
+}
+
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
+ SmallPtrSet<Function *, 8> LDSAccessors;
+ SmallVector<User *, 8> UserStack(GV->users());
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ while (!UserStack.empty()) {
+ auto *U = UserStack.pop_back_val();
+
+ // If `U` has already been visited, continue to the next one.
+ if (!VisitedUsers.insert(U).second)
+ continue;
+
+ // `U` is a global variable which is initialized with LDS. Ignore LDS.
+ if (isa<GlobalValue>(U))
+ return SmallPtrSet<Function *, 8>();
+
+ // Recursively explore constant users.
+ if (isa<Constant>(U)) {
+ append_range(UserStack, U->users());
+ continue;
+ }
+
+ // `U` should be an instruction; if it belongs to a non-kernel function F,
+ // then collect F.
+ Function *F = cast<Instruction>(U)->getFunction();
+ if (!AMDGPU::isKernelCC(F))
+ LDSAccessors.insert(F);
+ }
+
+ return LDSAccessors;
+}
+
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
+ DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
+ SmallVector<User *, 8> UserStack;
+ SmallPtrSet<User *, 8> VisitedUsers;
+
+ UserStack.push_back(U);
+
+ while (!UserStack.empty()) {
+ auto *UU = UserStack.pop_back_val();
+
+ if (!VisitedUsers.insert(UU).second)
+ continue;
+
+ if (isa<GlobalValue>(UU))
+ continue;
+
+ if (isa<Constant>(UU)) {
+ append_range(UserStack, UU->users());
+ continue;
+ }
+
+ auto *I = cast<Instruction>(UU);
+ Function *F = I->getFunction();
+ if (CollectKernelInsts) {
+ if (!AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ } else {
+ if (AMDGPU::isKernelCC(F)) {
+ continue;
+ }
+ }
+
+ FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
+ FunctionToInsts[F].insert(I);
+ }
+
+ return FunctionToInsts;
+}
+
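For illustration, a hedged sketch of how the two collectors above might be combined; the function and variable names are assumptions made for the example:

// Hypothetical sketch (not from the patch): for one LDS global, gather the
// non-kernel functions that access it and the kernel instructions that reach
// it, possibly through constant expressions.
#include "Utils/AMDGPULDSUtils.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static void inspectLDSUses(GlobalVariable *LDSGlobal) {
  // Non-kernel functions that reference the LDS global.
  SmallPtrSet<Function *, 8> NonKernels =
      AMDGPU::collectNonKernelAccessorsOfLDS(LDSGlobal);

  // Instructions in kernels that use the global (CollectKernelInsts = true).
  DenseMap<Function *, SmallPtrSet<Instruction *, 8>> KernelInsts =
      AMDGPU::getFunctionToInstsMap(LDSGlobal, /*CollectKernelInsts=*/true);

  (void)NonKernels;
  (void)KernelInsts;
}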
+bool isKernelCC(const Function *Func) {
+ return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+}
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+ return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+ GV->getValueType());
+}
+
+static void collectFunctionUses(User *U, const Function *F,
+ SetVector<Instruction *> &InstUsers) {
+ SmallVector<User *> Stack{U};
+
+ while (!Stack.empty()) {
+ U = Stack.pop_back_val();
+
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getFunction() == F)
+ InstUsers.insert(I);
+ continue;
+ }
+
+ if (!isa<ConstantExpr>(U))
+ continue;
+
+ append_range(Stack, U->users());
+ }
+}
+
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+ SetVector<Instruction *> InstUsers;
+
+ collectFunctionUses(C, F, InstUsers);
+ for (Instruction *I : InstUsers) {
+ convertConstantExprsToInstructions(I, C);
+ }
+}
+
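A small hedged sketch of the intended call pattern; the wrapper function name is an assumption for the example, not part of the patch:

// Hypothetical sketch (not from the patch): expand a constant-expression use
// of an LDS global (for example a constant GEP) into instructions inside F so
// that the use can subsequently be rewritten like any other instruction use.
#include "Utils/AMDGPULDSUtils.h"
#include "llvm/IR/GlobalVariable.h"

using namespace llvm;

static void expandConstantUsesIn(GlobalVariable *LDSGlobal, Function *F) {
  for (User *U : LDSGlobal->users())
    if (auto *CE = dyn_cast<ConstantExpr>(U))
      AMDGPU::replaceConstantUsesInFunction(CE, F);
}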
+bool hasUserInstruction(const GlobalValue *GV) {
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV->users());
+
+ while (!Stack.empty()) {
+ const User *U = Stack.pop_back_val();
+
+ if (!Visited.insert(U).second)
+ continue;
+
+ if (isa<Instruction>(U))
+ return true;
+
+ append_range(Stack, U->users());
+ }
+
+ return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+ // We are not interested in kernel LDS lowering for module LDS itself.
+ if (F && GV.getName() == "llvm.amdgcn.module.lds")
+ return false;
+
+ bool Ret = false;
+ SmallPtrSet<const User *, 8> Visited;
+ SmallVector<const User *, 16> Stack(GV.users());
+ SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
+
+ assert(!F || isKernelCC(F));
+
+ while (!Stack.empty()) {
+ const User *V = Stack.pop_back_val();
+ Visited.insert(V);
+
+ if (auto *G = dyn_cast<GlobalValue>(V)) {
+ StringRef GName = G->getName();
+ if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+        // For kernel LDS lowering, if G is not the llvm.used or
+        // llvm.compiler.used list, then we cannot lower the LDS GV since we
+        // cannot replace the use of GV within G.
+ return false;
+ }
+ GlobalUsers.insert(G);
+ continue;
+ }
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *UF = I->getFunction();
+ if (UF == F) {
+ // Used from this kernel, we want to put it into the structure.
+ Ret = true;
+ } else if (!F) {
+      // For module LDS lowering, lowering is required if the user instruction
+      // is from a non-kernel function.
+ Ret |= !isKernelCC(UF);
+ }
+ continue;
+ }
+
+ // User V should be a constant, recursively visit users of V.
+ assert(isa<Constant>(V) && "Expected a constant.");
+ append_range(Stack, V->users());
+ }
+
+ if (!F && !Ret) {
+    // For module LDS lowering, we have not yet decided whether to lower GV.
+    // Explore all global users of GV, and check if at least one of these
+    // global users appears as a use within an instruction (possibly a nested
+    // use via a constant expression); if so, conservatively lower the LDS.
+ for (auto *G : GlobalUsers)
+ Ret |= hasUserInstruction(G);
+ }
+
+ return Ret;
+}
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F) {
+ std::vector<llvm::GlobalVariable *> LocalVars;
+ for (auto &GV : M.globals()) {
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+ continue;
+ }
+ if (!GV.hasInitializer()) {
+      // addrspace(3) without an initializer implies cuda/hip extern __shared__.
+      // The semantics for such a variable appear to be that all extern
+      // __shared__ variables alias one another, in which case this transform
+      // is not required.
+ continue;
+ }
+ if (!isa<UndefValue>(GV.getInitializer())) {
+ // Initializers are unimplemented for local address space.
+ // Leave such variables in place for consistent error reporting.
+ continue;
+ }
+ if (GV.isConstant()) {
+ // A constant undef variable can't be written to, and any load is
+ // undef, so it should be eliminated by the optimizer. It could be
+ // dropped by the back end if not. This pass skips over it.
+ continue;
+ }
+ if (!shouldLowerLDSToStruct(GV, F)) {
+ continue;
+ }
+ LocalVars.push_back(&GV);
+ }
+ return LocalVars;
+}
+
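For clarity, a hedged sketch of the two ways this selector is meant to be driven; the function and variable names below are placeholders, not taken from the patch:

// Hypothetical sketch (not from the patch): with no function the helper picks
// candidates for module LDS lowering; with a kernel it picks the variables
// used directly from that kernel.
#include "Utils/AMDGPULDSUtils.h"
#include "llvm/IR/Module.h"
#include <vector>

using namespace llvm;

static void selectLoweringCandidates(Module &M, Function *Kernel) {
  std::vector<GlobalVariable *> ModuleVars = AMDGPU::findVariablesToLower(M);
  std::vector<GlobalVariable *> KernelVars =
      AMDGPU::findVariablesToLower(M, Kernel);
  (void)ModuleVars;
  (void)KernelVars;
}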
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+ SmallVector<GlobalValue *, 32> TmpVec;
+ collectUsedGlobalVariables(M, TmpVec, true);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ TmpVec.clear();
+ collectUsedGlobalVariables(M, TmpVec, false);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ return UsedList;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 000000000000..ffcafb9b76ce
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,70 @@
+//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU LDS related helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+
+namespace llvm {
+
+class ConstantExpr;
+
+namespace AMDGPU {
+
+/// Collect reachable callees for each kernel defined in the module \p M and
+/// return the collected callees in \p KernelToCallees.
+void collectReachableCallees(
+ Module &M,
+ DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees);
+
+/// For the given LDS global \p GV, visit all its users and collect all
+/// non-kernel functions within which \p GV is used, and return the collected
+/// set of such non-kernel functions.
+SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
+
+/// Collect all the instructions to which user \p U belongs. \p U could be an
+/// instruction itself, or it could be a constant expression which is used
+/// within an instruction. If \p CollectKernelInsts is true, collect
+/// instructions only from kernels; otherwise collect instructions only from
+/// non-kernel functions.
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
+getFunctionToInstsMap(User *U, bool CollectKernelInsts);
+
+bool isKernelCC(const Function *Func);
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+/// \returns true if a given global variable \p GV (or its global users) appears
+/// as a use within some instruction (either from a kernel or from a non-kernel
+/// function).
+bool hasUserInstruction(const GlobalValue *GV);
+
+/// \returns true if an LDS global requires lowering to a module LDS structure
+/// if \p F is not given. If \p F is given, it must be a kernel, and the
+/// function \returns true if the LDS global is directly used from that kernel
+/// and it is safe to replace its uses with a kernel LDS structure member.
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
+ const Function *F = nullptr);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+ const Function *F = nullptr);
+
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+/// Replace all uses of constant expression \p C within function \p F with
+/// equivalent instructions.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index b7dd757a8af3..f6b5975f1934 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -41,7 +41,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
}
return;
}
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
if (!NamedMD || !NamedMD->getNumOperands()) {
// Emit msgpack metadata by default
@@ -69,7 +69,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
// Metadata.
bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
BlobType = Type;
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
return setFromLegacyBlob(Blob);
return setFromMsgPackBlob(Blob);
}
@@ -243,6 +243,27 @@ void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
}
+// Set the amount of LDS used in bytes in the metadata.
+void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".lds_size"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used vgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
+// Set the number of used sgprs in the metadata.
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF,
+ unsigned Val) {
+ auto Node = getShaderFunction(MF.getFunction().getName());
+ Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
+}
+
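A hedged sketch of how a caller (for instance the AsmPrinter) might populate these advisory records; the helper name, include path, and resource-count variables are assumptions for the example, not part of the patch:

// Hypothetical sketch (not from the patch): record per-function resource usage
// in the PAL metadata once the caller has computed the numbers.
#include "Utils/AMDGPUPALMetadata.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static void emitFunctionResourceRecords(AMDGPUPALMetadata &MD,
                                        const MachineFunction &MF,
                                        unsigned NumVGPR, unsigned NumSGPR,
                                        unsigned LDSBytes) {
  MD.setFunctionNumUsedVgprs(MF, NumVGPR);
  MD.setFunctionNumUsedSgprs(MF, NumSGPR);
  MD.setFunctionLdsSize(MF, LDSBytes);
}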
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -592,6 +613,41 @@ static const char *getRegisterName(unsigned RegNum) {
{0xa2c1, "VGT_STRMOUT_VTX_STRIDE_3"},
{0xa316, "VGT_VERTEX_REUSE_BLOCK_CNTL"},
+ {0x2e28, "COMPUTE_PGM_RSRC3"},
+ {0x2e2a, "COMPUTE_SHADER_CHKSUM"},
+ {0x2e24, "COMPUTE_USER_ACCUM_0"},
+ {0x2e25, "COMPUTE_USER_ACCUM_1"},
+ {0x2e26, "COMPUTE_USER_ACCUM_2"},
+ {0x2e27, "COMPUTE_USER_ACCUM_3"},
+ {0xa1ff, "GE_MAX_OUTPUT_PER_SUBGROUP"},
+ {0xa2d3, "GE_NGG_SUBGRP_CNTL"},
+ {0xc25f, "GE_STEREO_CNTL"},
+ {0xc262, "GE_USER_VGPR_EN"},
+ {0xc258, "IA_MULTI_VGT_PARAM_PIPED"},
+ {0xa210, "PA_STEREO_CNTL"},
+ {0xa1c2, "SPI_SHADER_IDX_FORMAT"},
+ {0x2c80, "SPI_SHADER_PGM_CHKSUM_GS"},
+ {0x2d00, "SPI_SHADER_PGM_CHKSUM_HS"},
+ {0x2c06, "SPI_SHADER_PGM_CHKSUM_PS"},
+ {0x2c45, "SPI_SHADER_PGM_CHKSUM_VS"},
+ {0x2c88, "SPI_SHADER_PGM_LO_GS"},
+ {0x2cb2, "SPI_SHADER_USER_ACCUM_ESGS_0"},
+ {0x2cb3, "SPI_SHADER_USER_ACCUM_ESGS_1"},
+ {0x2cb4, "SPI_SHADER_USER_ACCUM_ESGS_2"},
+ {0x2cb5, "SPI_SHADER_USER_ACCUM_ESGS_3"},
+ {0x2d32, "SPI_SHADER_USER_ACCUM_LSHS_0"},
+ {0x2d33, "SPI_SHADER_USER_ACCUM_LSHS_1"},
+ {0x2d34, "SPI_SHADER_USER_ACCUM_LSHS_2"},
+ {0x2d35, "SPI_SHADER_USER_ACCUM_LSHS_3"},
+ {0x2c32, "SPI_SHADER_USER_ACCUM_PS_0"},
+ {0x2c33, "SPI_SHADER_USER_ACCUM_PS_1"},
+ {0x2c34, "SPI_SHADER_USER_ACCUM_PS_2"},
+ {0x2c35, "SPI_SHADER_USER_ACCUM_PS_3"},
+ {0x2c72, "SPI_SHADER_USER_ACCUM_VS_0"},
+ {0x2c73, "SPI_SHADER_USER_ACCUM_VS_1"},
+ {0x2c74, "SPI_SHADER_USER_ACCUM_VS_2"},
+ {0x2c75, "SPI_SHADER_USER_ACCUM_VS_3"},
+
{0, nullptr}};
auto Entry = RegInfoTable;
for (; Entry->Num && Entry->Num != RegNum; ++Entry)
@@ -653,7 +709,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
- if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
+ if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
toMsgPackBlob(Blob);
@@ -790,7 +846,7 @@ const char *AMDGPUPALMetadata::getVendor() const {
}
// Get .note record type of metadata blob to be emitted:
-// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned AMDGPUPALMetadata::getType() const {
@@ -799,12 +855,12 @@ unsigned AMDGPUPALMetadata::getType() const {
// Return whether the blob type is legacy PAL metadata.
bool AMDGPUPALMetadata::isLegacy() const {
- return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ return BlobType == ELF::NT_AMD_PAL_METADATA;
}
// Set legacy PAL metadata format.
void AMDGPUPALMetadata::setLegacy() {
- BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
+ BlobType = ELF::NT_AMD_PAL_METADATA;
}
// Erase all PAL metadata.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 8fa1f738487c..7fdd9a8429c1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -80,6 +80,21 @@ public:
// Set the stack frame size of a function in the metadata.
void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
+ // Set the amount of LDS used in bytes in the metadata. This is an optional
+ // advisory record for logging etc; wave dispatch actually uses the rsrc1
+ // register for the shader stage to determine the amount of LDS to allocate.
+ void setFunctionLdsSize(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used vgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of vgprs to allocate.
+ void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val);
+
+ // Set the number of used sgprs in the metadata. This is an optional advisory
+ // record for logging etc; wave dispatch actually uses the rsrc1 register for
+ // the shader stage to determine the number of sgprs to allocate.
+ void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val);
+
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
void setWave32(unsigned CC);
@@ -95,7 +110,7 @@ public:
const char *getVendor() const;
// Get .note record type of metadata blob to be emitted:
- // ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
+ // ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned getType() const;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f1e470031982..35d5fe13ad30 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -60,9 +60,12 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
+ VOP_Real <ps>,
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOP1 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -79,6 +82,10 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -144,6 +151,15 @@ def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
+class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> :
+ VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+
+ let HasOMod = 1;
+}
+def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>;
+def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>;
+def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>;
+
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -187,8 +203,10 @@ def V_READFIRSTLANE_B32 :
let Inst{31-25} = 0x3f; //encoding
}
+let isReMaterializable = 1 in {
let SchedRW = [WriteDoubleCvt] in {
-defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+// OMod clears exceptions when set in this instruction
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
@@ -196,7 +214,8 @@ defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
-defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+// OMod clears exceptions when set in this instruction
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>;
let mayRaiseFPException = 0 in {
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
@@ -213,11 +232,12 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
}
-defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
-defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-let FPDPRounding = 1 in {
+// OMod clears exceptions when set in these two instructions
+defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_uint>;
+defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_sint>;
+let FPDPRounding = 1, isReMaterializable = 0 in {
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
-} // End FPDPRounding = 1
+} // End FPDPRounding = 1, isReMaterializable = 0
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
@@ -268,7 +288,7 @@ defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
let SchedRW = [WriteDoubleAdd] in {
-defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
let FPDPRounding = 1 in {
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
@@ -277,6 +297,7 @@ defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
+} // End isReMaterializable = 1
let VOPAsmPrefer32Bit = 1 in {
defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
@@ -337,6 +358,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_MOVRELS>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>;
} // End Uses = [M0, EXEC]
+let isReMaterializable = 1 in {
let SubtargetPredicate = isGFX6GFX7 in {
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_LOG_CLAMP_F32 :
@@ -351,12 +373,12 @@ let SubtargetPredicate = isGFX6GFX7 in {
VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
- let SchedRW = [WriteDouble] in {
+ let SchedRW = [WriteTrans64] in {
defm V_RCP_CLAMP_F64 :
VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 :
VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
- } // End SchedRW = [WriteDouble]
+ } // End SchedRW = [WriteTrans64]
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX7GFX8GFX9 in {
@@ -374,6 +396,7 @@ let SubtargetPredicate = isGFX7Plus in {
defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = isGFX7Plus
+} // End isReMaterializable = 1
let SubtargetPredicate = Has16BitInsts in {
@@ -381,8 +404,9 @@ let FPDPRounding = 1 in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
} // End FPDPRounding = 1
-defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
-defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+// OMod clears exceptions when set in these two instructions
+defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_uint>;
+defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_sint>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -393,7 +417,7 @@ defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
-defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
@@ -434,11 +458,12 @@ let SubtargetPredicate = isGFX9Plus in {
let SchedRW = [Write64Bit, Write64Bit];
}
+ let isReMaterializable = 1 in
defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
let mayRaiseFPException = 0 in {
- defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
- defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+ defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16_SPECIAL_OMOD>;
+ defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16_SPECIAL_OMOD>;
} // End mayRaiseFPException = 0
} // End SubtargetPredicate = isGFX9Plus
@@ -461,6 +486,18 @@ let SubtargetPredicate = isGFX10Plus in {
} // End Uses = [M0]
} // End SubtargetPredicate = isGFX10Plus
+def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> {
+ let DstRC = RegisterOperand<AGPR_32>;
+ let Src0RC32 = RegisterOperand<AGPR_32>;
+ let Asm32 = " $vdst, $src0";
+}
+
+def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1> {
+ let SubtargetPredicate = isGFX90APlus;
+ let isReMaterializable = 1;
+ let isAsCheapAsAMove = 1;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
@@ -471,6 +508,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
let Uses = ps.Uses;
+ let TRANS = ps.TRANS;
bits<8> vdst;
let Inst{8-0} = 0xfa;
@@ -498,9 +536,6 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
let Inst{16-9} = op;
let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f;
-
- let AssemblerPredicate = HasDPP8;
- let SubtargetPredicate = HasDPP8;
}
//===----------------------------------------------------------------------===//
@@ -823,6 +858,8 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>;
defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>;
defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
+defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>;
+
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 7a334eaadaed..7860b7e7f8a6 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -81,9 +81,12 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
}
class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
+ VOP_Real <ps>,
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOP2 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -101,6 +104,9 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -237,7 +243,9 @@ multiclass VOP2eInst <string opName,
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
- Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+ let isReMaterializable = 1;
+ }
}
}
@@ -267,10 +275,9 @@ class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
(ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
(ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
field bit HasExt = 0;
+ let IsSingle = 1;
- // Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
- field string Asm32 = " $vdst, $src0, $src1, $imm";
+ field string Asm32 = "$vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -280,37 +287,38 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
field bit HasExt = 0;
+ let IsSingle = 1;
- // Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
- field string Asm32 = " $vdst, $src0, $imm, $src1";
+ field string Asm32 = "$vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F32 : VOP_MADMK <f32>;
+class getRegisterOperandForVT<ValueType VT> {
+ RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>;
+}
+
// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
// and processing time but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
- let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- VGPR_32:$src2, // stub argument
+ getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
-
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- VGPR_32:$src2, // stub argument
+ getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
dpp8:$dpp8, FI:$fi);
-
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
- VGPR_32:$src2, // stub argument
+ getVregSrcForVT<Src2VT>.ret:$src2, // stub argument
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
@@ -335,6 +343,8 @@ def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0 in
def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
+let HasExtSDWA = 0, HasExt64BitDPP = 1 in
+def VOP_MAC_F64 : VOP_MAC <f64>;
class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> {
let HasClamp = 0;
@@ -448,6 +458,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -464,6 +475,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let HasExt = 0;
let HasExtDPP = 0;
+ let HasExt64BitDPP = 0;
let HasExtSDWA = 0;
let HasExtSDWA9 = 0;
}
@@ -473,10 +485,11 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
//===----------------------------------------------------------------------===//
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-let SubtargetPredicate = HasMadMacF32Insts in
+let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
+let isReMaterializable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>;
defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
@@ -498,6 +511,7 @@ defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_ls
defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
+} // End isReMaterializable = 1
let mayRaiseFPException = 0 in {
let OtherPredicates = [HasMadMacF32Insts] in {
@@ -510,6 +524,7 @@ defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_MAC_LEGACY_F32>;
} // End Constraints = "$vdst = $src2", DisableEncoding="$src2",
// isConvertibleToThreeAddress = 1
+let isReMaterializable = 1 in
def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
} // End OtherPredicates = [HasMadMacF32Insts]
} // End mayRaiseFPException = 0
@@ -524,7 +539,7 @@ defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "
defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
-let SubtargetPredicate = HasAddNoCarryInsts in {
+let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in {
defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
@@ -543,12 +558,12 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
+let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>;
defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
@@ -572,7 +587,9 @@ defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
} // End SubtargetPredicate = isGFX6GFX7
} // End isCommutable = 1
+} // End isReMaterializable = 1
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
@@ -672,7 +689,8 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
let SubtargetPredicate = HasDLInsts in {
-defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+let isReMaterializable = 1 in
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
let Constraints = "$vdst = $src2",
DisableEncoding = "$src2",
@@ -692,6 +710,14 @@ defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>;
} // End SubtargetPredicate = HasFmaLegacy32
+let SubtargetPredicate = isGFX90APlus,
+ Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1,
+ SchedRW = [WriteDoubleAdd] in
+defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>;
+
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
@@ -735,17 +761,21 @@ let AddedComplexity = 30 in {
}
} // End AddedComplexity = 30
+let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in {
+def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
+
+let isCommutable = 1 in
+def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
+}
+
let SubtargetPredicate = isGFX10Plus in {
-def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">;
-let FPDPRounding = 1 in
+let FPDPRounding = 1 in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
-let isCommutable = 1 in {
-def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">;
-let FPDPRounding = 1 in
+let isCommutable = 1 in
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
-} // End isCommutable = 1
+} // End FPDPRounding = 1
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
@@ -913,8 +943,6 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
let Inst{30-25} = op;
let Inst{31} = 0x0;
- let AssemblerPredicate = HasDPP8;
- let SubtargetPredicate = HasDPP8;
let OtherPredicates = ps.OtherPredicates;
}
@@ -1122,14 +1150,18 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
multiclass VOP3Only_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
- VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ let IsSingle = 1;
+ }
}
//===---------------------------- VOP3beOnly ----------------------------===//
multiclass VOP3beOnly_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
- VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ let IsSingle = 1;
+ }
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
@@ -1177,7 +1209,10 @@ defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
+
+let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
+}
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@@ -1251,20 +1286,20 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
}
- multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
+ multiclass VOP2_Real_e32_gfx6_gfx7<bits<6> op, string opName = NAME> {
def _e32_gfx6_gfx7 :
- VOP2_Real<!cast<VOP2_Pseudo>(PseudoName#"_e32"), SIEncodingFamily.SI>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(PseudoName#"_e32").Pfl>;
+ VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.SI>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl>;
}
- multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
+ multiclass VOP2_Real_e64_gfx6_gfx7<bits<6> op, string opName = NAME> {
def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>,
- VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.SI>,
+ VOP3e_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(opName#"_e64").Pfl>;
}
- multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string PseudoName = NAME> {
+ multiclass VOP2be_Real_e64_gfx6_gfx7<bits<6> op, string opName = NAME> {
def _e64_gfx6_gfx7 :
- VOP3_Real<!cast<VOP3_Pseudo>(PseudoName#"_e64"), SIEncodingFamily.SI>,
- VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(PseudoName#"_e64").Pfl>;
+ VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.SI>,
+ VOP3be_gfx6_gfx7<{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(opName#"_e64").Pfl>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
@@ -1281,16 +1316,16 @@ multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
multiclass VOP2be_Real_gfx6_gfx7_with_name<bits<6> op,
- string PseudoName, string asmName> {
- defvar ps32 = !cast<VOP2_Pseudo>(PseudoName#"_e32");
- defvar ps64 = !cast<VOP3_Pseudo>(PseudoName#"_e64");
+ string opName, string asmName> {
+ defvar ps32 = !cast<VOP2_Pseudo>(opName#"_e32");
+ defvar ps64 = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps32.AsmOperands in {
- defm "" : VOP2_Real_e32_gfx6_gfx7<op, PseudoName>;
+ defm "" : VOP2_Real_e32_gfx6_gfx7<op, opName>;
}
let AsmString = asmName # ps64.AsmOperands in {
- defm "" : VOP2be_Real_e64_gfx6_gfx7<op, PseudoName>;
+ defm "" : VOP2be_Real_e64_gfx6_gfx7<op, opName>;
}
}
@@ -1391,10 +1426,7 @@ multiclass VOP2_Real_e64only_vi <bits<10> op> {
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
- // Hack to stop printing _e64
- VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
- let OutOperandList = (outs VGPR_32:$vdst);
- let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
+ let IsSingle = 1;
}
}
@@ -1525,6 +1557,7 @@ defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
+let AssemblerPredicate = isGCN3ExcludingGFX90A in
defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>;
defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>;
defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>;
@@ -1641,6 +1674,42 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts
+let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
+ multiclass VOP2_Real_e32_gfx90a <bits<6> op> {
+ def _e32_gfx90a :
+ VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX90A>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+ }
+
+ multiclass VOP2_Real_e64_gfx90a <bits<10> op> {
+ def _e64_gfx90a :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+ }
+
+ multiclass Base_VOP2_Real_e32e64_gfx90a <bits<6> op> :
+ VOP2_Real_e32_gfx90a<op>,
+ VOP2_Real_e64_gfx90a<{0, 1, 0, 0, op{5-0}}>;
+
+ multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> :
+ Base_VOP2_Real_e32e64_gfx90a<op> {
+
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx90a :
+ VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>,
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+ let DecoderNamespace = "SDWA9";
+ }
+ }
+} // End AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A"
+
+let SubtargetPredicate = isGFX90APlus in {
+ defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>;
+ let IsSingle = 1 in {
+ defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
+ }
+} // End SubtargetPredicate = isGFX90APlus
+
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 42dc995609f0..ee3b87f487d0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -184,47 +184,24 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
-
- // FIXME: Hack to stop printing _e64
- let Outs64 = (outs DstRC.RegClass:$vdst);
- let Asm64 =
- " " # !if(Features.HasOpSel,
- getAsmVOP3OpSel<NumSrcArgs,
- HasIntClamp,
- P.HasOMod,
- HasSrc0FloatMods,
- HasSrc1FloatMods,
- HasSrc2FloatMods>.ret,
- !if(Features.HasClamp,
- getAsm64<HasDst, NumSrcArgs, HasIntClamp,
- HasModifiers, HasOMod, DstVT>.ret,
- P.Asm64));
- let NeedPatGen = P.NeedPatGen;
+ let IsSingle = 1;
}
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
- let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
-}
-
-def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VGPR_32>;
+ let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
+ let IsSingle = 1;
}
-def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VReg_64>;
-}
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
let HasClamp = 1;
-
- // FIXME: Hack to stop printing _e64
- let DstRC = RegisterOperand<VReg_64>;
+ let IsSingle = 1;
let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
- let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
}
//===----------------------------------------------------------------------===//
@@ -287,7 +264,7 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
let HasOMod = !ne(DstVT.Value, f16.Value);
let HasHigh = 1;
- let Outs64 = (outs VGPR_32:$vdst);
+ let Outs64 = (outs DstRC.RegClass:$vdst);
let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret;
let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret;
}
@@ -298,6 +275,7 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
let isCommutable = 1 in {
+let isReMaterializable = 1 in {
let mayRaiseFPException = 0 in {
let SubtargetPredicate = HasMadMacF32Insts in {
defm V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -325,12 +303,13 @@ defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_l
defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteIntMul] in {
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteIntMul]
+} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
// v_div_fmas_f32:
@@ -351,6 +330,7 @@ defm V_DIV_FMAS_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f64", VOP_F64_F64_F6
} // End isCommutable = 1
+let isReMaterializable = 1 in {
let mayRaiseFPException = 0 in {
defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
@@ -364,22 +344,27 @@ defm V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGP
defm V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
-let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
-defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
-defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
-defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
-defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
-defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
-defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
-defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
-defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
-defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+// XXX - Clearing mayRaiseFPException seems suspect, but the manual doesn't say these raise exceptions
+let mayRaiseFPException = 0 in {
+ let isCommutable = 1 in {
+ defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+ defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+ defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+ defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+ defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+ defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+ } // End isCommutable = 1
+ defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+ defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+ defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
} // End mayRaiseFPException = 0
-defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+let isCommutable = 1 in {
+ defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+ defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
+} // End isCommutable = 1
defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
@@ -388,6 +373,7 @@ let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
+} // End isReMaterializable = 1
let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
@@ -399,6 +385,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1>;
} // End mayRaiseFPException = 0
+let isReMaterializable = 1 in
defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
let Constraints = "@earlyclobber $vdst" in {
@@ -406,6 +393,7 @@ defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64
} // End Constraints = "@earlyclobber $vdst"
+let isReMaterializable = 1 in {
let SchedRW = [WriteDouble] in {
defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>;
} // End SchedRW = [WriteDouble]
@@ -423,12 +411,14 @@ let SchedRW = [Write64Bit] in {
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, ashr_rev>;
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
+} // End isReMaterializable = 1
def : GCNPat<
(i32 (getDivergentFrag<sext>.ret i16:$src)),
(i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
>;
+let isReMaterializable = 1 in {
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
} // End SubtargetPredicate = isGFX6GFX7GFX10
@@ -438,6 +428,7 @@ let SubtargetPredicate = isGFX8Plus in {
defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write32Bit]
+} // End isReMaterializable = 1
let SubtargetPredicate = isGFX7Plus in {
@@ -447,10 +438,10 @@ defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32
} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32]
let isCommutable = 1 in {
-let SchedRW = [WriteQuarterRate32, WriteSALU] in {
+let SchedRW = [WriteIntMul, WriteSALU] in {
defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
-} // End SchedRW = [WriteQuarterRate32, WriteSALU]
+} // End SchedRW = [WriteIntMul, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX7Plus
@@ -476,6 +467,7 @@ let renamedInGFX9 = 1 in {
let FPDPRounding = 1 in {
defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [MODE, M0, EXEC] in {
+ let OtherPredicates = [isNotGFX90APlus] in
// For some reason the intrinsic operands are in a different order
// from the instruction operands.
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
@@ -497,24 +489,24 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
let SubtargetPredicate = isGFX9Plus in {
defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+let OtherPredicates = [isNotGFX90APlus] in
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus
-let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
+// This predicate should only apply to the selection pattern. The
+// instruction still exists and should decode on subtargets with
+// other bank counts.
+let OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
[(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
(i32 timm:$attrchan),
(i32 timm:$attr),
- (i1 timm:$high), M0))]> {
- // This predicate should only apply to the selection pattern. The
- // instruction still exists and should decode on subtargets with
- // other bank counts.
- let OtherPredicates = [has32BankLDS];
-}
-
+ (i1 timm:$high), M0))]>;
+} // End OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1
+let OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
-} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
+} // End OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -527,11 +519,11 @@ def : GCNPat<
), VGPR_32)), sub1)
>;
-let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
+let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
+} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus]
let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
@@ -618,16 +610,16 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
}
let SubtargetPredicate = isGFX9Plus in {
-defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
-defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
+let isCommutable = 1, isReMaterializable = 1 in {
+ defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+ defm V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+} // End isCommutable = 1, isReMaterializable = 1
+// TODO: src0 contains the opsel bit for dst, so if we commute, we need to mask
+// and swap this to the new src0.
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
@@ -649,8 +641,13 @@ defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32
defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-defm V_ADD_I32 : VOP3Inst <"v_add_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+
+let isReMaterializable = 1 in {
defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+} // End isReMaterializable = 1
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
@@ -729,7 +726,9 @@ class PermlaneDiscardVDstIn<SDPatternOperator permlane,
let SubtargetPredicate = isGFX10Plus in {
- defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ let isCommutable = 1, isReMaterializable = 1 in {
+ defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+ } // End isCommutable = 1, isReMaterializable = 1
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
@@ -833,6 +832,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands;
+ let IsSingle = 1;
}
}
multiclass VOP3be_Real_gfx10<bits<10> op> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 64e70b8f64b0..48f5eb1dc272 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -10,71 +10,82 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
-class VOP3PInst<string OpName, VOPProfile P,
- SDPatternOperator node = null_frag,
- bit HasExplicitClamp = 0> :
- VOP3P_Pseudo<OpName, P,
- !if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret)
->;
+// Used for FMA_MIX* and MAD_MIX* insts
+// Their operands are only sort of f16 operands. Depending on
+// op_sel_hi, these may be interpreted as f32. The inline immediate
+// values are really f16 converted to f32, so we treat these as f16
+// operands.
+class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
+ bit useTiedOutput = 0> : VOP3_Profile<P, Features> {
+ bit UseTiedOutput = useTiedOutput;
+
+ dag srcs =
+ (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
+ FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+
+ // FIXME: clampmod0 misbehaves with the non-default vdst_in
+  // following it. For now, work around this by requiring clamp
+ // in tied patterns. This should use undef_tied_input, but it
+ // seems underdeveloped and doesn't apply the right register
+ // class constraints.
+ dag mods = !con(!if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
+ (ins clampmod0:$clamp)),
+ (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
+  // We use Ins64 because it is what populates InOperandList,
+  // due to the logic in class VOP3_Pseudo.
+ let Ins64 = !con(srcs, mods);
+ let Asm64 =
+ "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+}
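// For readability, a hand-expanded sketch (illustration only, not part of the
// patch) of what the !con(srcs, mods) concatenation above evaluates to.
// With UseTiedOutput = 1, Ins64 becomes, in order:
//   (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
//        FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
//        FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
//        clampmod:$clamp, VGPR_32:$vdst_in,
//        op_sel0:$op_sel, op_sel_hi0:$op_sel_hi)
// With UseTiedOutput = 0, clampmod0:$clamp takes the place of the
// clamp/vdst_in pair.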
+
+multiclass VOP3PInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag, bit HasExplicitClamp = 0> {
+ def NAME : VOP3P_Pseudo<OpName, P,
+ !if (P.HasModifiers,
+ getVOP3PModPat<P, node, HasExplicitClamp>.ret,
+ getVOP3Pat<P, node>.ret)>;
+}
+
// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
-class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
- SDPatternOperator node = null_frag> :
- VOP3P_Pseudo<OpName, P> {
- // These operands are only sort of f16 operands. Depending on
- // op_sel_hi, these may be interpreted as f32. The inline immediate
- // values are really f16 converted to f32, so we treat these as f16
- // operands.
- let InOperandList =
- !con(
- !con(
- (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
- FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2),
- // FIXME: clampmod0 misbehaves with the non-default vdst_in
- // following it. For now workaround this by requiring clamp
- // in tied patterns. This should use undef_tied_input, but it
- // seems underdeveloped and doesn't apply the right register
- // class constraints.
- !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
- (ins clampmod0:$clamp))),
- (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
-
- let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
- let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");
- let AsmOperands =
- " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P,
+ SDPatternOperator node = null_frag> {
+ def NAME : VOP3P_Pseudo<OpName, P> {
+ let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
+ }
}
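// Illustration only (not added by this patch): the mix pseudos below opt in to
// the tied output through the profile's third template parameter, e.g.
//   defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16",
//                            VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
// which makes the multiclass emit the "$vdst = $vdst_in" constraint, so the
// half of the destination VGPR that the instruction does not write is preserved.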
let isCommutable = 1 in {
-def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
let FPDPRounding = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
-def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
-def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
+defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
+defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
-def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
-def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
-def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
-def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
-def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
-def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
}
-def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
-def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
-def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
-def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
let SubtargetPredicate = HasVOP3PInsts in {
@@ -169,14 +180,14 @@ let SubtargetPredicate = HasMadMixInsts in {
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments, their location (hi/lo) is controlled by op_sel_hi.
let isCommutable = 1, mayRaiseFPException = 0 in {
-def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
-def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
let ClampLo = 0, ClampHi = 1 in {
-def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
} // End FPDPRounding = 1
}
@@ -188,14 +199,14 @@ defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
-def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
-def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
let ClampLo = 0, ClampHi = 1 in {
-def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
} // End FPDPRounding = 1
}
@@ -287,25 +298,30 @@ class SDot2Pat<Instruction Inst> : GCNPat <
let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
- VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
- AMDGPUfdot2, 1/*ExplicitClamp*/>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
+defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
+defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
-def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
+ VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+ AMDGPUfdot2, 1/*ExplicitClamp*/>;
+defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
-def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
+defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
+defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
+defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
@@ -319,7 +335,7 @@ foreach Type = ["U", "I"] in
def : GCNPat <
!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
(add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
- (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+ (!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
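// For reference, a hand-unrolled sketch (illustration only) of the !foldl
// above for Type = "U":
//   (add_oneuse
//     (add_oneuse
//       (add_oneuse
//         (add_oneuse (i32 i32:$src2), (MulU_Elt0 i32:$src0, i32:$src1)),
//         (MulU_Elt1 i32:$src0, i32:$src1)),
//       (MulU_Elt2 i32:$src0, i32:$src1)),
//     (MulU_Elt3 i32:$src0, i32:$src1))
// i.e. a one-use chain of four per-byte multiplies that selects to V_DOT4_U32_U8.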
foreach Type = ["U", "I"] in
let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
@@ -327,7 +343,7 @@ foreach Type = ["U", "I"] in
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[1, 2, 3, 4, 5, 6, 7], lhs, y,
(NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
- (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+ (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
// Different variants of dot8 code-gen dag patterns are not generated through TableGen due to a huge
// increase in compile time. Directly handle the pattern generated by the frontend here.
@@ -337,12 +353,19 @@ foreach Type = ["U", "I"] in
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[7, 1, 2, 3, 4, 5, 6], lhs, y,
(NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
- (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+ (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
def ADst_32 : VOPDstOperand<AGPR_32>;
+def ADst_64 : VOPDstOperand<AReg_64>;
def ADst_128 : VOPDstOperand<AReg_128>;
+def ADst_256 : VOPDstOperand<AReg_256>;
def ADst_512 : VOPDstOperand<AReg_512>;
def ADst_1024 : VOPDstOperand<AReg_1024>;
+def VDst_64 : VOPDstOperand<VReg_64>;
+def VDst_128 : VOPDstOperand<VReg_128>;
+def VDst_256 : VOPDstOperand<VReg_256>;
+def VDst_512 : VOPDstOperand<VReg_512>;
+def VDst_1024 : VOPDstOperand<VReg_1024>;
def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
let Src0RC64 = ARegSrc_32;
@@ -362,7 +385,10 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
let Src2RC64 = _SrcRC;
let HasOpSel = 0;
let HasClamp = 0;
- let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+ let HasIntClamp = 0;
+ let HasOMod = 0;
+ let HasModifiers = 0;
+ let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
}
@@ -378,6 +404,29 @@ def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, A
def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X4 : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
+def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
+
+def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
+def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
+def VOPProfileMAI_F32_F32_X32_VCD : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, VISrc_1024_f32, VDst_1024>;
+def VOPProfileMAI_I32_I32_X4_VCD : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, VISrc_128_b32, VDst_128>;
+def VOPProfileMAI_I32_I32_X16_VCD : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, VISrc_512_b32, VDst_512>;
+def VOPProfileMAI_I32_I32_X32_VCD : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V2I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, VISrc_128_b32, VDst_128>;
+def VOPProfileMAI_F32_V2I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32, VDst_512>;
+def VOPProfileMAI_F32_V2I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V4F16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4F16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
+def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
let Predicates = [HasMAIInsts] in {
@@ -388,32 +437,57 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
} // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1
-// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
-let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
-defm V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
-defm V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
-defm V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
-defm V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
-defm V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
-defm V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
-defm V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
-defm V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
-defm V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
-defm V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
-defm V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
-defm V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
-defm V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
-defm V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
-defm V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
-defm V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
-defm V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
-defm V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
-defm V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
-defm V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
-} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
+ let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
+ // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
+ defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>;
+
+ let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
+ defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>;
+ } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+}
+
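// For orientation, a hand-expanded sketch (illustration only, not generated
// output): an instantiation such as
//   defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4",
//                                      int_amdgcn_mfma_f32_4x4x1f32>;
// is roughly equivalent to an AGPR-destination pseudo plus a gfx90a-only
// VGPR-destination ("vgprcd") variant:
//   defm V_MFMA_F32_4X4X1F32        : VOP3Inst<"v_mfma_f32_4x4x1f32",
//                                              VOPProfileMAI_F32_F32_X4,
//                                              int_amdgcn_mfma_f32_4x4x1f32>;
//   defm V_MFMA_F32_4X4X1F32_vgprcd : VOP3Inst<"v_mfma_f32_4x4x1f32_vgprcd",
//                                              VOPProfileMAI_F32_F32_X4_VCD>;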
+defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>;
+defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>;
+defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>;
+defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>;
+defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>;
+defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>;
+defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>;
+defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>;
+defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>;
+defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>;
+defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>;
+defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>;
+defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>;
+defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>;
+defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>;
+defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>;
+defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
+defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>;
+defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
+defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
} // End SubtargetPredicate = HasMAIInsts
+let Predicates = [isGFX90APlus] in {
+ defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+ defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+ defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+ defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+ defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+
+ defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
+ defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>;
+} // End Predicates = [isGFX90APlus]
+
+let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
+ defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+ defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+ defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
+
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
@@ -430,23 +504,36 @@ multiclass VOP3P_Real_vi<bits<7> op> {
VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
let AssemblerPredicate = HasVOP3PInsts;
let DecoderNamespace = "GFX8";
+ let VOP3P = 1;
}
}
multiclass VOP3P_Real_MAI<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
- VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
- let Inst{14} = 1; // op_sel_hi(2) default value
- let Inst{59} = 1; // op_sel_hi(0) default value
- let Inst{60} = 1; // op_sel_hi(1) default value
+ let Inst{14} = ?; // op_sel_hi(2)
+ let Inst{59} = ?; // op_sel_hi(0)
+ let Inst{60} = ?; // op_sel_hi(1)
}
}
-multiclass VOP3P_Real_MFMA<bits<7> op> {
+multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
+ let SubtargetPredicate = isGFX90AOnly,
+ AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
+ def _gfx90a_acd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, 1>;
+
+ def _gfx90a_vcd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>,
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>;
+ } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A"
+}
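// Illustration only: an instantiation such as
//   defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
// (see further down) yields two gfx90a encodings of the same opcode,
//   V_MFMA_F64_16X16X4F64_gfx90a_acd  // AGPR vdst/src2, acc_cd bit = 1
//   V_MFMA_F64_16X16X4F64_gfx90a_vcd  // VGPR vdst/src2 via the _vgprcd pseudo, acc_cd bit = 0
// with bit 15 of VOP3Pe_MAI now carrying acc_cd instead of clamp.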
+
+multiclass VOP3P_Real_MFMA<bits<7> op> :
+ VOP3P_Real_MFMA_gfx90a <op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
- VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
}
@@ -494,13 +581,18 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>;
let SubtargetPredicate = HasDot2Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>;
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
@@ -536,16 +628,31 @@ defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>;
} // End SubtargetPredicate = HasMAIInsts
+defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>;
+defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>;
+defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x65>;
+defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx90a <0x66>;
+defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
+defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
+defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
+
+let SubtargetPredicate = HasPackedFP32Ops in {
+ defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
+ defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
+ defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
+ defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;
+} // End SubtargetPredicate = HasPackedFP32Ops
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
+let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in {
multiclass VOP3P_Real_gfx10<bits<7> op> {
def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
+} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1
defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>;
@@ -572,13 +679,18 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>;
let SubtargetPredicate = HasDot2Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>;
defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>;
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 99599c5cd667..c0cc91029d11 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -27,10 +27,6 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
let Inst{24-17} = op;
let Inst{31-25} = 0x3e; // encoding
-
- // VOPC disallows dst_sel and dst_unused as they have no effect on destination
- let Inst{42-40} = 0;
- let Inst{44-43} = 0;
}
class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -56,6 +52,8 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
let Asm32 = "$src0, $src1";
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
+ // VOPC disallows dst_sel and dst_unused as they have no effect on destination
+ let EmitDstSel = 0;
let Outs64 = (outs VOPDstS64orS32:$sdst);
list<SchedReadWrite> Schedule = sched;
}
@@ -106,6 +104,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOPC = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -121,6 +121,9 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -760,7 +763,7 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// We need to use COPY_TO_REGCLASS to work around the problem when ReplaceAllUsesWith()
// complains it cannot replace i1 <-> i64/i32 if the node was not morphed in place.
-multiclass ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
let WaveSizePredicate = isWave64 in
def : GCNPat <
(i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
@@ -807,7 +810,7 @@ defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
-multiclass FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> {
+multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
let WaveSizePredicate = isWave64 in
def : GCNPat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 282c1002d3c9..5f6f664ea3e7 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -140,10 +140,18 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
let VOP3P = 1;
}
+class VOP_Real<VOP_Pseudo ps> {
+ Instruction Opcode = !cast<Instruction>(NAME);
+ bit IsSingle = ps.Pfl.IsSingle;
+}
+
class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
+ VOP_Real <ps>,
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let VOP3 = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let UseNamedOperandTable = 1;
@@ -162,6 +170,10 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
let UseNamedOperandTable = ps.UseNamedOperandTable;
let Uses = ps.Uses;
let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
VOPProfile Pfl = ps.Pfl;
}
@@ -317,7 +329,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -326,14 +338,14 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
}
-class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 {
+class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 {
bits<8> vdst;
bits<10> src0;
bits<10> src1;
@@ -341,14 +353,13 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P> : Enc64 {
bits<3> blgp;
bits<3> cbsz;
bits<4> abid;
- bits<1> clamp;
let Inst{7-0} = vdst;
let Inst{10-8} = !if(P.HasSrc1, cbsz, 0);
let Inst{14-11} = !if(P.HasSrc1, abid, 0);
- let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+ let Inst{15} = acc_cd;
let Inst{22-16} = op;
let Inst{31-23} = 0x1a7; //encoding
@@ -411,8 +422,8 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
bits<1> clamp;
let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
+ let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?);
+ let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
@@ -462,8 +473,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
bits<1> clamp;
bits<2> omod;
- let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
- let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
+ let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?);
+ let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
}
@@ -515,12 +526,13 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> {
+ let VALU = 1;
+ let SDWA = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
@@ -536,17 +548,22 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []> {
+ let VALU = 1;
+ let SDWA = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
@@ -564,6 +581,10 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
@@ -628,8 +649,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
string AsmOperands = P.AsmDPP;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
- let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = HasDPP;
+ let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+ let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -643,12 +664,13 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+ let VALU = 1;
+ let DPP = 1;
let isPseudo = 0;
let isCodeGenOnly = 0;
let Defs = ps.Defs;
let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
@@ -665,6 +687,10 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let TRANS = ps.TRANS;
}
class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
@@ -683,8 +709,8 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
let Size = 8;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
- let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = HasDPP;
+ let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+ let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -795,3 +821,17 @@ include "VOP1Instructions.td"
include "VOP2Instructions.td"
include "VOP3Instructions.td"
include "VOP3PInstructions.td"
+
+
+class VOPInfoTable <string Format> : GenericTable {
+ let FilterClass = Format # "_Real";
+ let CppTypeName = "VOPInfo";
+ let Fields = ["Opcode", "IsSingle"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "get" # Format # "OpcodeHelper";
+}
+
+def VOP1InfoTable : VOPInfoTable<"VOP1">;
+def VOP2InfoTable : VOPInfoTable<"VOP2">;
+def VOP3InfoTable : VOPInfoTable<"VOP3">;
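// A purely illustrative sketch of how a record feeds these tables (the def
// name and opcode below are hypothetical, not from this patch): any real
// encoding deriving from VOP3_Real now also derives VOP_Real, so it carries
// Opcode and IsSingle and matches the VOP3InfoTable FilterClass, e.g.
//   def V_EXAMPLE_e64_gfx10 :
//       VOP3_Real<!cast<VOP3_Pseudo>("V_EXAMPLE_e64"), SIEncodingFamily.GFX10>,
//       VOP3e_gfx10<0x123, !cast<VOP3_Pseudo>("V_EXAMPLE_e64").Pfl>;
// would be emitted as one {Opcode, IsSingle} row, retrievable through the
// generated getVOP3OpcodeHelper lookup.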