Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td  677
1 file changed, 550 insertions(+), 127 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index d84720f820ee..0c4c9e0e9df2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1,4 +1,4 @@
-//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -24,8 +24,38 @@ include "BUFInstructions.td"
 // EXP Instructions
 //===----------------------------------------------------------------------===//
 
-defm EXP : EXP_m<0, AMDGPUexport>;
-defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
+defm EXP : EXP_m<0>;
+defm EXP_DONE : EXP_m<1>;
+
+class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+  (int_amdgcn_exp timm:$tgt, timm:$en,
+                  (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+                  (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+                  done_val, timm:$vm),
+  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+        ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
+>;
+
+class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+  (int_amdgcn_exp_compr timm:$tgt, timm:$en,
+                        (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+                        done_val, timm:$vm),
+  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+        (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
+>;
+
+// FIXME: The generated DAG matcher seems to have strange behavior
+// with a 1-bit literal to match, so use a -1 for checking a true
+// 1-bit value.
+def : ExpPattern<i32, EXP, 0>;
+def : ExpPattern<i32, EXP_DONE, -1>;
+def : ExpPattern<f32, EXP, 0>;
+def : ExpPattern<f32, EXP_DONE, -1>;
+
+def : ExpComprPattern<v2i16, EXP, 0>;
+def : ExpComprPattern<v2i16, EXP_DONE, -1>;
+def : ExpComprPattern<v2f16, EXP, 0>;
+def : ExpComprPattern<v2f16, EXP_DONE, -1>;
 
 //===----------------------------------------------------------------------===//
 // VINTRP Instructions
@@ -34,9 +64,9 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
 // Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
 def VINTRPDst : VINTRPDstOperand <VGPR_32>;
 
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
 
-// FIXME: Specify SchedRW for VINTRP insturctions.
+// FIXME: Specify SchedRW for VINTRP instructions.
 multiclass V_INTERP_P1_F32_m : VINTRP_m <
   0x00000000,
@@ -76,10 +106,10 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
   (outs VINTRPDst:$vdst),
   (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
   "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
-  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc),
+  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
 
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
 
 //===----------------------------------------------------------------------===//
 // Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -136,7 +166,8 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
 def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
-  let Defs = [EXEC];
+  let Uses = [EXEC];
+  let Defs = [EXEC, SCC];
   let hasSideEffects = 0;
   let mayLoad = 0;
   let mayStore = 0;
@@ -162,16 +193,27 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
   let Constraints = "$src = $vdst";
 }
 
+let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
+def V_ADD_U64_PSEUDO : VPseudoInstSI <
+  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+  [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
+>;
+
+def V_SUB_U64_PSEUDO : VPseudoInstSI <
+  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+  [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
+>;
+} // End usesCustomInserter = 1, Defs = [VCC, EXEC]
+
 let usesCustomInserter = 1, Defs = [SCC] in {
 def S_ADD_U64_PSEUDO : SPseudoInstSI <
-  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
-  [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
 >;
 
 def S_SUB_U64_PSEUDO : SPseudoInstSI <
-  (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
-  [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
 >;
 
 def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
@@ -181,6 +223,23 @@ def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
   (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
 >;
 
 def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
   (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
 >;
+
+def S_ADD_CO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_SUB_CO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_UADDO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
+def S_USUBO_PSEUDO : SPseudoInstSI <
+  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
 } // End usesCustomInserter = 1, Defs = [SCC]
 
 let usesCustomInserter = 1 in {
@@ -199,6 +258,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
   let hasSideEffects = base_inst.hasSideEffects;
   let UseNamedOperandTable = base_inst.UseNamedOperandTable;
   let CodeSize = base_inst.CodeSize;
+  let SchedRW = base_inst.SchedRW;
 }
 
 let WaveSizePredicate = isWave64 in {
@@ -214,13 +274,14 @@ def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
 }
 
+
 def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
   [(int_amdgcn_wave_barrier)]> {
   let SchedRW = [];
   let hasNoSchedulingInfo = 1;
   let hasSideEffects = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
   let isConvergent = 1;
   let FixedSize = 1;
   let Size = 0;
@@ -318,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
@@ -386,7 +450,7 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
 
 def : GCNPat <
   (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
   let WaveSizePredicate = isWave32;
 }
 
@@ -413,8 +477,8 @@ def SI_RETURN : SPseudoInstSI <
 
 // Return for returning function calls without output register.
 //
-// This version is only needed so we can fill in the output regiter in
-// the custom inserter.
+// This version is only needed so we can fill in the output register
+// in the custom inserter.
 def SI_CALL_ISEL : SPseudoInstSI <
   (outs), (ins SSrc_b64:$src0, unknown:$callee),
   [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
@@ -426,6 +490,11 @@ def SI_CALL_ISEL : SPseudoInstSI <
   let isConvergent = 1;
 }
 
+def : GCNPat<
+  (AMDGPUcall i64:$src0, (i64 0)),
+  (SI_CALL_ISEL $src0, (i64 0))
+>;
+
 // Wrapper around s_swappc_b64 with extra $callee parameter to track
 // the called function after regalloc.
 def SI_CALL : SPseudoInstSI <
@@ -480,6 +549,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
 
 let Defs = [M0, EXEC, SCC],
   UseNamedOperandTable = 1 in {
 
+// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect
+// addressing implementation.
 class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
   (outs VGPR_32:$vdst), (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
@@ -493,21 +564,81 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
   let usesCustomInserter = 1;
 }
 
-// TODO: We can support indirect SGPR access.
 def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
 def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
 def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
 def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
 def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
 
 def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
 def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
 
 } // End Uses = [EXEC], Defs = [M0, EXEC]
 
+
+// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
+// expecting to be executed with gpr indexing mode enabled)
+// instruction in which the vector operand appears only twice, once as
+// def and once as use. Using this pseudo avoids problems with the Two
+// Address instructions pass.
+class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+                                RegisterOperand val_ty> : PseudoInstSI <
+  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
+  let Constraints = "$vsrc = $vdst";
+  let Uses = [M0];
+}
+
+class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+  INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+  let VALU = 1;
+  let VOP1 = 1;
+  let Uses = [M0, EXEC];
+}
+
+class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+                                  RegisterOperand val_ty> :
+  INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+  let SALU = 1;
+  let SOP1 = 1;
+  let Uses = [M0];
+}
+
+class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+  S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
+  S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
+
+
+def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+
+
 multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
   let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
     def _SAVE : PseudoInstSI <
@@ -535,6 +666,7 @@ defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
 defm SI_SPILL_S96  : SI_SPILL_SGPR <SReg_96>;
 defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
+defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -574,6 +706,7 @@ defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
 defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
 defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
 defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
+defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
 defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
 defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -639,12 +772,6 @@ def : GCNPat<
 >;
 
 def : Pat <
-  // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
-  (AMDGPUkill (i32 -1082130432)),
-  (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
-def : Pat <
   (int_amdgcn_kill i1:$src),
   (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
 >;
@@ -655,11 +782,6 @@ def : Pat <
 >;
 
 def : Pat <
-  (AMDGPUkill i32:$src),
-  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, 0, 3) // 3 means SETOGE
->;
-
-def : Pat <
   (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
@@ -693,14 +815,14 @@ def : RsqPat<V_RSQ_F64_e32, f64>;
 def : GCNPat <
   (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
              (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
-  (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_FRACT_F32_e64 $mods, $x)
 >;
 
 // Convert (x + (-floor(x))) to fract(x)
 def : GCNPat <
   (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
              (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
-  (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_FRACT_F64_e64 $mods, $x)
 >;
 
 } // End OtherPredicates = [UnsafeFPMath]
@@ -709,27 +831,27 @@ def : GCNPat <
 // f16_to_fp patterns
 def : GCNPat <
   (f32 (f16_to_fp i32:$src0)),
-  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0)
 >;
 
 def : GCNPat <
   (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
-  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0)
 >;
 
 def : GCNPat <
   (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
-  (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
 >;
 
 def : GCNPat <
   (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
-  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0)
 >;
 
 def : GCNPat <
   (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
-  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0)
 >;
 
 def : GCNPat <
@@ -740,7 +862,7 @@ def : GCNPat <
 // fp_to_fp16 patterns
 def : GCNPat <
   (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
-  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0)
 >;
 
 def : GCNPat <
@@ -767,20 +889,29 @@ def : GCNPat <
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
 
-multiclass FMADPat <ValueType vt, Instruction inst> {
-  def : GCNPat <
-    (vt (fmad (VOP3NoMods vt:$src0),
-              (VOP3NoMods vt:$src1),
-              (VOP3NoMods vt:$src2))),
+// TODO: Check only no src2 mods?
+class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
+  : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
+                      (vt (VOP3NoMods vt:$src1)),
+                      (vt (VOP3NoMods vt:$src2)))),
     (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
          SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
-  >;
+>;
+
+
+// Prefer mac form when there are no modifiers.
+let AddedComplexity = 9 in {
+def : FMADPat <f32, V_MAC_F32_e64, fmad>;
+def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+
+let SubtargetPredicate = Has16BitInsts in {
+def : FMADPat <f16, V_MAC_F16_e64, fmad>;
+def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
 }
-defm : FMADPat <f16, V_MAC_F16_e64>;
-defm : FMADPat <f32, V_MAC_F32_e64>;
+}
 
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
   : GCNPat<
   (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
                (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -789,24 +920,28 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
          $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
-// FIXME: This should select to V_MAC_F32
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
-def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+let SubtargetPredicate = HasMadMacF32Insts in
+def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
   let SubtargetPredicate = Has16BitInsts;
 }
 
-multiclass SelectPat <ValueType vt> {
-  def : GCNPat <
-    (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
-                          (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
-    (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
-  >;
-}
+class VOPSelectModsPat <ValueType vt> : GCNPat <
+  (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
+                        (VOP3Mods vt:$src2, i32:$src2_mods))),
+  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
+                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
+
+class VOPSelectPat <ValueType vt> : GCNPat <
+  (vt (select i1:$src0, vt:$src1, vt:$src2)),
+  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
 
-defm : SelectPat <i16>;
-defm : SelectPat <i32>;
-defm : SelectPat <f16>;
-defm : SelectPat <f32>;
+def : VOPSelectModsPat <i32>;
+def : VOPSelectModsPat <f32>;
+def : VOPSelectPat <f16>;
+def : VOPSelectPat <i16>;
 
 let AddedComplexity = 1 in {
 def : GCNPat <
@@ -1039,6 +1174,8 @@ def : BitConvert <v4f32, v2f64, VReg_128>;
 def : BitConvert <v4i32, v2f64, VReg_128>;
 def : BitConvert <v2i64, v2f64, VReg_128>;
 def : BitConvert <v2f64, v2i64, VReg_128>;
+def : BitConvert <v4f32, v2i64, VReg_128>;
+def : BitConvert <v2i64, v4f32, VReg_128>;
 
 // 160-bit bitcast
 def : BitConvert <v5i32, v5f32, SGPR_160>;
@@ -1049,14 +1186,46 @@ def : BitConvert <v8i32, v8f32, SReg_256>;
 def : BitConvert <v8f32, v8i32, SReg_256>;
 def : BitConvert <v8i32, v8f32, VReg_256>;
 def : BitConvert <v8f32, v8i32, VReg_256>;
+def : BitConvert <v4i64, v4f64, VReg_256>;
+def : BitConvert <v4f64, v4i64, VReg_256>;
+def : BitConvert <v4i64, v8i32, VReg_256>;
+def : BitConvert <v4i64, v8f32, VReg_256>;
+def : BitConvert <v4f64, v8i32, VReg_256>;
+def : BitConvert <v4f64, v8f32, VReg_256>;
+def : BitConvert <v8i32, v4i64, VReg_256>;
+def : BitConvert <v8f32, v4i64, VReg_256>;
+def : BitConvert <v8i32, v4f64, VReg_256>;
+def : BitConvert <v8f32, v4f64, VReg_256>;
+
 
 // 512-bit bitcast
 def : BitConvert <v16i32, v16f32, VReg_512>;
 def : BitConvert <v16f32, v16i32, VReg_512>;
+def : BitConvert <v8i64, v8f64, VReg_512>;
+def : BitConvert <v8f64, v8i64, VReg_512>;
+def : BitConvert <v8i64, v16i32, VReg_512>;
+def : BitConvert <v8f64, v16i32, VReg_512>;
+def : BitConvert <v16i32, v8i64, VReg_512>;
+def : BitConvert <v16i32, v8f64, VReg_512>;
+def : BitConvert <v8i64, v16f32, VReg_512>;
+def : BitConvert <v8f64, v16f32, VReg_512>;
+def : BitConvert <v16f32, v8i64, VReg_512>;
+def : BitConvert <v16f32, v8f64, VReg_512>;
 
 // 1024-bit bitcast
 def : BitConvert <v32i32, v32f32, VReg_1024>;
 def : BitConvert <v32f32, v32i32, VReg_1024>;
+def : BitConvert <v16i64, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v16i64, VReg_1024>;
+def : BitConvert <v16i64, v32i32, VReg_1024>;
+def : BitConvert <v32i32, v16i64, VReg_1024>;
+def : BitConvert <v16f64, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v16f64, VReg_1024>;
+def : BitConvert <v16i64, v32f32, VReg_1024>;
+def : BitConvert <v32i32, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v32i32, VReg_1024>;
+def : BitConvert <v32f32, v16i64, VReg_1024>;
+
 
 /********** =================== **********/
 /********** Src & Dst modifiers **********/
@@ -1155,7 +1324,7 @@ def : GCNPat <
   (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
 >;
 
-// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled
+// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled
 // def : GCNPat <
 //   (fneg (f64 SReg_64:$src)),
 //   (REG_SEQUENCE SReg_64,
@@ -1176,6 +1345,17 @@ def : GCNPat <
 //    sub1)
 // >;
 
+// FIXME: Use S_BITSET0_B32/B64?
+// def : GCNPat <
+//   (fabs (f64 SReg_64:$src)),
+//   (REG_SEQUENCE SReg_64,
+//     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+//     sub0,
+//     (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+//                (i32 (S_MOV_B32 (i32 0x7fffffff)))),
+//     sub1)
+// >;
+
 } // End let AddedComplexity = 1
 
 def : GCNPat <
@@ -1372,11 +1552,12 @@ class Ext32Pat <SDNode ext> : GCNPat <
 def : Ext32Pat <zext>;
 def : Ext32Pat <anyext>;
 
-// The multiplication scales from [0,1] to the unsigned integer range
+// The multiplication scales from [0,1) to the unsigned integer range,
+// rounding down a bit to avoid unwanted overflow.
 def : GCNPat <
   (AMDGPUurecip i32:$src0),
   (V_CVT_U32_F32_e32
-    (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
+    (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
                    (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
 >;
 
@@ -1421,11 +1602,13 @@ defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
 defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
 defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
 defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
+defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
 
 defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
 defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
 defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
 defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
+defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
 
 //===----------------------------------------------------------------------===//
 // SAD Patterns
@@ -1695,102 +1878,187 @@ def : GCNPat <
 def : GCNPat <
   (i32 (bswap i32:$a)),
   (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
-             (V_ALIGNBIT_B32 $a, $a, (i32 24)),
-             (V_ALIGNBIT_B32 $a, $a, (i32 8)))
+             (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+             (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
 >;
 
-let OtherPredicates = [NoFP16Denormals] in {
-def : GCNPat<
-  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
-  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+  (i64 (bswap i64:$a)),
+  (REG_SEQUENCE VReg_64,
+  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                              (i32 24)),
             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                             (i32 8))),
  sub0,
  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                             (i32 24)),
            (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                            (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                            (i32 8))),
  sub1)
+>;
+
+// FIXME: The AddedComplexity should not be needed, but in GlobalISel
+// the BFI pattern ends up taking precedence without it.
+let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
+// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
+//
+// My reading of the manual suggests we should be using src0 for the
+// register value, but this is what seems to work.
+def : GCNPat <
+  (i32 (bswap i32:$a)),
+  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
 >;
 
-def : GCNPat<
-  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
-  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+  (i64 (bswap i64:$a)),
+  (REG_SEQUENCE VReg_64,
+  (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+              (S_MOV_B32 (i32 0x00010203))),
+  sub0,
+  (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+              (S_MOV_B32 (i32 0x00010203))),
+  sub1)
 >;
 
-def : GCNPat<
-  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
-  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
+// The 12s emit 0s.
+def : GCNPat <
+  (i16 (bswap i16:$a)),
+  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
-}
 
-let OtherPredicates = [FP16Denormals] in {
-def : GCNPat<
-  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
-  (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+def : GCNPat <
+  (i32 (zext (bswap i16:$a))),
+  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
 
-let SubtargetPredicate = HasVOP3PInsts in {
-def : GCNPat<
-  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
-  (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
+def : GCNPat <
+  (v2i16 (bswap v2i16:$a)),
+  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
 >;
+
 }
-}
 
-let OtherPredicates = [NoFP32Denormals] in {
+
+// Prefer selecting to max when legal, but using mul is always valid.
+let AddedComplexity = -5 in {
 def : GCNPat<
-  (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
-  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
+  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
 >;
 
 def : GCNPat<
-  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
-  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0)
+  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
+>;
+
+def : GCNPat<
+  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
 >;
-}
 
-let OtherPredicates = [FP32Denormals] in {
 def : GCNPat<
   (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
-  (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
 >;
-}
 
-let OtherPredicates = [NoFP64Denormals] in {
 def : GCNPat<
-  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
-  (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
+  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
+  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
 >;
-}
 
-let OtherPredicates = [FP64Denormals] in {
+// TODO: Handle fneg like other types.
 def : GCNPat<
   (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
-  (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
+  (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
 >;
+} // End AddedComplexity = -5
+
+multiclass SelectCanonicalizeAsMax<
+  list<Predicate> f32_preds = [],
+  list<Predicate> f64_preds = [],
+  list<Predicate> f16_preds = []> {
+  def : GCNPat<
+    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
+    let OtherPredicates = f32_preds;
+  }
+
+  def : GCNPat<
+    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+    (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+    let OtherPredicates = f64_preds;
+  }
+
+  def : GCNPat<
+    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
+    // FIXME: Should have 16-bit inst subtarget predicate
+    let OtherPredicates = f16_preds;
+  }
+
+  def : GCNPat<
+    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
+    // FIXME: Should have VOP3P subtarget predicate
+    let OtherPredicates = f16_preds;
+  }
 }
 
+// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
+// mode, and would never flush. For f64, it's faster to do implement
+// this with a max. For f16/f32 it's a wash, but prefer max when
+// valid.
+//
+// FIXME: Lowering f32/f16 with max is worse since we can use a
+// smaller encoding if the input is fneg'd. It also adds an extra
+// register use.
+let SubtargetPredicate = HasMinMaxDenormModes in {
+  defm : SelectCanonicalizeAsMax<[], [], []>;
+} // End SubtargetPredicate = HasMinMaxDenormModes
+
+let SubtargetPredicate = NotHasMinMaxDenormModes in {
+  // Use the max lowering if we don't need to flush.
+
+  // FIXME: We don't do use this for f32 as a workaround for the
+  // library being compiled with the default ieee mode, but
+  // potentially being called from flushing kernels. Really we should
+  // not be mixing code expecting different default FP modes, but mul
+  // works in any FP environment.
+  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
+} // End SubtargetPredicate = NotHasMinMaxDenormModes
+
+
 let OtherPredicates = [HasDLInsts] in {
 def : GCNPat <
-  (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+  (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
        (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
        (f32 (VOP3NoMods f32:$src2))),
   (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
-                  SRCMODS.NONE, $src2, $clamp, $omod)
+                  SRCMODS.NONE, $src2)
 >;
 } // End OtherPredicates = [HasDLInsts]
 
 let SubtargetPredicate = isGFX10Plus in
 def : GCNPat <
-  (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+  (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
        (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
        (f16 (VOP3NoMods f32:$src2))),
   (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
-                  SRCMODS.NONE, $src2, $clamp, $omod)
->;
-
-// Allow integer inputs
-class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
-  (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
-  (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
+                  SRCMODS.NONE, $src2)
 >;
 
-def : ExpPattern<AMDGPUexport, i32, EXP>;
-def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-
 // COPY is workaround tablegen bug from multiple outputs
 // from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
@@ -1873,19 +2141,20 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
-                           timm:$bound_ctrl)),
-  (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
-                        (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                        (as_i1imm $bound_ctrl))
+  (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+                           timm:$bank_mask, timm:$bound_ctrl)),
+  (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+                        (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
+                        (as_i32timm $bank_mask),
+                        (as_i1timm $bound_ctrl))
 >;
 
 def : GCNPat <
   (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                               timm:$bank_mask, timm:$bound_ctrl)),
-  (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
-                        (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                        (as_i1imm $bound_ctrl))
+  (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+                        (as_i32timm $row_mask), (as_i32timm $bank_mask),
+                        (as_i1timm $bound_ctrl))
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1901,6 +2170,11 @@ let SubtargetPredicate = isGFX6 in {
 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
 
 // Convert floor(x) to (x - fract(x))
+
+// Don't bother handling this for GlobalISel, it's handled during
+// lowering.
+//
+// FIXME: DAG should also custom lower this.
 def : GCNPat <
   (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
   (V_ADD_F64
@@ -1910,13 +2184,11 @@ def : GCNPat <
       (V_CNDMASK_B64_PSEUDO
         (V_MIN_F64
             SRCMODS.NONE,
-            (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
+            (V_FRACT_F64_e64 $mods, $x),
             SRCMODS.NONE,
-            (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
-            DSTCLAMP.NONE, DSTOMOD.NONE),
+            (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
         $x,
-        (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
-      DSTCLAMP.NONE, DSTOMOD.NONE)
+        (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
 >;
 
 } // End SubtargetPredicates = isGFX6
@@ -2061,13 +2333,164 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src);
+  let hasSideEffects = 0;
+}
+
+class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+}
+
+class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+}
+
+def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+
+class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+  let hasSideEffects = 0;
+  let mayStore = 1;
+}
+
+class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$format,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+  let hasSideEffects = 0;
+  let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
+
+def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1);
+  let hasSideEffects = 0;
+}
+
+def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1);
+  let hasSideEffects = 0;
+}
+
+foreach N = 0-3 in {
+def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0);
+  let hasSideEffects = 0;
+}
+}
+
 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
 // operand Expects a MachineMemOperand in addition to explicit
 // operands.
 def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
   let OutOperandList = (outs type0:$oldval);
-  let InOperandList = (ins ptype1:$addr, type0:$cmpval_nnenwval);
+  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+let Namespace = "AMDGPU" in {
+def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+}
+
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+                           type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
   let hasSideEffects = 0;
   let mayLoad = 1;
   let mayStore = 1;
 }
+
+def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+
+def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
+                           type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
+                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
+// a workaround for the intrinsic being defined as readnone, but
+// really needs a memory operand.
+def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins unknown:$intrin, variable_ops);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+
+  // FIXME: Use separate opcode for atomics.
+  let mayStore = 1;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins unknown:$intrin, variable_ops);
+  let hasSideEffects = 0;
+  let mayStore = 1;
+}