Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td | 570
1 file changed, 370 insertions, 200 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0c4c9e0e9df2..7c1cbd67c993 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -19,43 +19,7 @@ include "VOPInstructions.td"
 include "SMInstructions.td"
 include "FLATInstructions.td"
 include "BUFInstructions.td"
-
-//===----------------------------------------------------------------------===//
-// EXP Instructions
-//===----------------------------------------------------------------------===//
-
-defm EXP : EXP_m<0>;
-defm EXP_DONE : EXP_m<1>;
-
-class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
-  (int_amdgcn_exp timm:$tgt, timm:$en,
-                  (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
-                  (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
-                  done_val, timm:$vm),
-  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
-        ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
->;
-
-class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
-  (int_amdgcn_exp_compr timm:$tgt, timm:$en,
-                        (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
-                        done_val, timm:$vm),
-  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
-        (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
->;
-
-// FIXME: The generated DAG matcher seems to have strange behavior
-// with a 1-bit literal to match, so use a -1 for checking a true
-// 1-bit value.
-def : ExpPattern<i32, EXP, 0>;
-def : ExpPattern<i32, EXP_DONE, -1>;
-def : ExpPattern<f32, EXP, 0>;
-def : ExpPattern<f32, EXP_DONE, -1>;
-
-def : ExpComprPattern<v2i16, EXP, 0>;
-def : ExpComprPattern<v2i16, EXP_DONE, -1>;
-def : ExpComprPattern<v2f16, EXP, 0>;
-def : ExpComprPattern<v2f16, EXP_DONE, -1>;
+include "EXPInstructions.td"
 
 //===----------------------------------------------------------------------===//
 // VINTRP Instructions
 //===----------------------------------------------------------------------===//
@@ -264,6 +228,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
 let WaveSizePredicate = isWave64 in {
 def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
+def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
 }
 
@@ -324,7 +289,7 @@ def SI_IF: CFPseudoInstSI <
 
 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
 }
@@ -356,6 +321,14 @@ def SI_IF_BREAK : CFPseudoInstSI <
   let isReMaterializable = 1;
 }
 
+// Branch to the early termination block of the shader if SCC is 0.
+// This uses SCC from a previous SALU operation, i.e. the update of
+// a mask of live lanes after a kill/demote operation.
+// Only valid in pixel shaders.
+def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
+  let Uses = [EXEC,SCC];
+}
+
 let Uses = [EXEC] in {
 
 multiclass PseudoInstKill <dag ins> {
@@ -426,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
   (outs), (ins i64imm:$src),
   [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-  let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
-  (outs), (ins i32imm:$src), []> {
-  let Defs = [EXEC_LO];
-  let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave32;
 }
 
-// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   (outs), (ins SSrc_b32:$input, i32imm:$shift),
   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
-  let WaveSizePredicate = isWave32;
 }
 
 // Return for returning shaders to a shader variant epilog.
@@ -580,64 +534,97 @@ def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
 
 } // End Uses = [EXEC], Defs = [M0, EXEC]
 
-// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
-// expecting to be executed with gpr indexing mode enabled)
-// instruction in which the vector operand appears only twice, once as
-// def and once as use. Using this pseudo avoids problems with the Two
-// Address instructions pass.
-class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
+class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                 RegisterOperand val_ty> : PseudoInstSI <
   (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
   let Constraints = "$vsrc = $vdst";
   let Uses = [M0];
 }
 
-class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
-  INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
+  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
   let VALU = 1;
   let VOP1 = 1;
   let Uses = [M0, EXEC];
 }
 
-class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                 RegisterOperand val_ty> :
-  INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
   let SALU = 1;
   let SOP1 = 1;
   let Uses = [M0];
 }
 
-class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
-  S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
-class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
-  S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
-
-
-def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
-def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
-def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
-def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
-def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
-def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
-def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
-def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
+  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
+  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;
+
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;
+
+// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
+// pseudos we avoid spills or copies being inserted within indirect sequences
+// that switch the VGPR indexing mode. Spills to accvgprs could be effected by
+// this mode switching.
+
+class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
+  (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
+  let Constraints = "$vsrc = $vdst";
+  let VALU = 1;
+  let Uses = [M0, EXEC];
+  let Defs = [M0];
+}
 
-def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
-def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
-def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
-def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
-def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
-def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
-def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
-def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;
 
-def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
-def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
-def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
-def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
-def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
+  (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
+  let VALU = 1;
+  let Uses = [M0, EXEC];
+  let Defs = [M0];
+}
 
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;
 
 multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
   let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
@@ -671,30 +658,33 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
 
-multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
+// needs to be used and an extra instruction to move between VGPR and AGPR.
+// UsesTmp adds to the total size of an expanded spill in this case.
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
   let UseNamedOperandTable = 1, VGPRSpill = 1,
        SchedRW = [WriteVMEM] in {
     def _SAVE : VPseudoInstSI <
       (outs),
-      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+      (ins vgpr_class:$vdata, i32imm:$vaddr,
            SReg_32:$soffset, i32imm:$offset)> {
       let mayStore = 1;
       let mayLoad = 0;
       // (2 * 4) + (8 * num_subregs) bytes maximum
-      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
       // Size field is unsigned char and cannot fit more.
       let Size = !if(!le(MaxSize, 256), MaxSize, 252);
     }
 
     def _RESTORE : VPseudoInstSI <
       (outs vgpr_class:$vdata),
-      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
-           i32imm:$offset)> {
+      (ins i32imm:$vaddr,
+           SReg_32:$soffset, i32imm:$offset)> {
       let mayStore = 0;
       let mayLoad = 1;
 
       // (2 * 4) + (8 * num_subregs) bytes maximum
-      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
       // Size field is unsigned char and cannot fit more.
       let Size = !if(!le(MaxSize, 256), MaxSize, 252);
     }
@@ -711,42 +701,15 @@ defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
 defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
 
-multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
-  let UseNamedOperandTable = 1, VGPRSpill = 1,
-      Constraints = "@earlyclobber $tmp",
-      SchedRW = [WriteVMEM] in {
-    def _SAVE : VPseudoInstSI <
-      (outs VGPR_32:$tmp),
-      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
-           SReg_32:$soffset, i32imm:$offset)> {
-      let mayStore = 1;
-      let mayLoad = 0;
-      // (2 * 4) + (16 * num_subregs) bytes maximum
-      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
-      // Size field is unsigned char and cannot fit more.
-      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
-    }
-
-    def _RESTORE : VPseudoInstSI <
-      (outs vgpr_class:$vdata, VGPR_32:$tmp),
-      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
-           i32imm:$offset)> {
-      let mayStore = 0;
-      let mayLoad = 1;
-
-      // (2 * 4) + (16 * num_subregs) bytes maximum
-      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
-      // Size field is unsigned char and cannot fit more.
-      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
-    }
-  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
-}
-
-defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
-defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
-defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
-defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
-defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
+defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
+defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
+defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
+defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
+defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
+defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
+defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
+defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
 
 def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
   (outs SReg_64:$dst),
@@ -768,7 +731,7 @@ def : GCNPat<
 
 def : GCNPat<
   (AMDGPUelse i1:$src, bb:$target),
-  (SI_ELSE $src, $target, 0)
+  (SI_ELSE $src, $target)
 >;
 
 def : Pat <
@@ -804,12 +767,9 @@ let OtherPredicates = [UnsafeFPMath] in {
 
-//def : RcpPat<V_RCP_F64_e32, f64>;
-//defm : RsqPat<V_RSQ_F64_e32, f64>;
 //defm : RsqPat<V_RSQ_F32_e32, f32>;
 
 def : RsqPat<V_RSQ_F32_e32, f32>;
-def : RsqPat<V_RSQ_F64_e32, f64>;
 
 // Convert (x - floor(x)) to fract(x)
 def : GCNPat <
@@ -889,7 +849,8 @@ def : GCNPat <
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
 
-// TODO: Check only no src2 mods?
+// NoMods pattern used for mac. If there are any source modifiers then it's
+// better to select mad instead of mac.
 class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
   : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
                       (vt (VOP3NoMods vt:$src1)),
@@ -898,18 +859,41 @@ class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
-
 // Prefer mac form when there are no modifiers.
 let AddedComplexity = 9 in {
+let OtherPredicates = [HasMadMacF32Insts] in {
 def : FMADPat <f32, V_MAC_F32_e64, fmad>;
 def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+} // OtherPredicates = [HasMadMacF32Insts]
+
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select mad instead of mac.
+let SubtargetPredicate = isGFX6GFX7GFX10,
+    OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
+def : GCNPat <
+      (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
+                                    (VOP3NoMods f32:$src1)),
+                 (VOP3NoMods f32:$src2))),
+      (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+                            SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+let SubtargetPredicate = HasFmaLegacy32 in
+def : GCNPat <
+      (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
+                                  (VOP3NoMods f32:$src1),
+                                  (VOP3NoMods f32:$src2))),
+      (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+                             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
 
 let SubtargetPredicate = Has16BitInsts in {
 def : FMADPat <f16, V_MAC_F16_e64, fmad>;
 def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
-}
-
-}
+} // SubtargetPredicate = Has16BitInsts
+} // AddedComplexity = 9
 
 class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
   : GCNPat<
@@ -920,11 +904,20 @@ class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
             $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
-let SubtargetPredicate = HasMadMacF32Insts in
-def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
-def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
-  let SubtargetPredicate = Has16BitInsts;
-}
+let OtherPredicates = [HasMadMacF32Insts] in
+def : FMADModsPat<f32, V_MAD_F32_e64, AMDGPUfmad_ftz>;
+
+let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
+def : GCNPat <
+  (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
+                                (VOP3Mods f32:$src1, i32:$src1_mod)),
+             (VOP3Mods f32:$src2, i32:$src2_mod))),
+  (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
+                        $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let SubtargetPredicate = Has16BitInsts in
+def : FMADModsPat<f16, V_MAD_F16_e64, AMDGPUfmad_ftz>;
 
 class VOPSelectModsPat <ValueType vt> : GCNPat <
   (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
@@ -1241,7 +1234,7 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
 >;
 
 def : ClampPat<V_MAX_F32_e64, f32>;
-def : ClampPat<V_MAX_F64, f64>;
+def : ClampPat<V_MAX_F64_e64, f64>;
 def : ClampPat<V_MAX_F16_e64, f16>;
 
 let SubtargetPredicate = HasVOP3PInsts in {
@@ -1422,12 +1415,12 @@ def : GCNPat <
 
 def : GCNPat <
   (fcopysign f16:$src0, f16:$src1),
-  (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
 >;
 
 def : GCNPat <
   (fcopysign f32:$src0, f16:$src1),
-  (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
      (V_LSHLREV_B32_e64 (i32 16), $src1))
 >;
 
@@ -1435,19 +1428,19 @@ def : GCNPat <
 def : GCNPat <
   (fcopysign f64:$src0, f16:$src1),
   (REG_SEQUENCE SReg_64,
     (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
-    (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
      (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
 >;
 
 def : GCNPat <
   (fcopysign f16:$src0, f32:$src1),
-  (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
      (V_LSHRREV_B32_e64 (i32 16), $src1))
 >;
 
 def : GCNPat <
   (fcopysign f16:$src0, f64:$src1),
-  (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
      (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
 >;
 
@@ -1499,8 +1492,13 @@ def : GCNPat <
 def : GCNPat <
-  (i32 frameindex:$fi),
-  (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+  (p5 frameindex:$fi),
+  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : GCNPat <
+  (p5 frameindex:$fi),
+  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
 >;
 
 def : GCNPat <
@@ -1565,19 +1563,103 @@
 // VOP3 Patterns
 //===----------------------------------------------------------------------===//
 
-def : IMad24Pat<V_MAD_I32_I24, 1>;
-def : UMad24Pat<V_MAD_U32_U24, 1>;
+def : IMad24Pat<V_MAD_I32_I24_e64, 1>;
+def : UMad24Pat<V_MAD_U32_U24_e64, 1>;
+
+// BFI patterns
+
+def BFIImm32 : PatFrag<
+  (ops node:$x, node:$y, node:$z),
+  (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
+  [{
+    auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
+    auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
+    return X && NotX &&
+           ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
+  }]
+>;
+
+// Definition from ISA doc:
+// (y & x) | (z & ~x)
+def : AMDGPUPat <
+  (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+  (V_BFI_B32_e64 $x, $y, $z)
+>;
+
+// (y & C) | (z & ~C)
+def : AMDGPUPat <
+  (BFIImm32 i32:$x, i32:$y, i32:$z),
+  (V_BFI_B32_e64 $x, $y, $z)
+>;
+
+// 64-bit version
+def : AMDGPUPat <
+  (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+  (REG_SEQUENCE SReg_64,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+              (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
+              (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+              (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
+              (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+>;
+
+// SHA-256 Ch function
+// z ^ (x & (y ^ z))
+def : AMDGPUPat <
+  (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+  (V_BFI_B32_e64 $x, $y, $z)
+>;
 
-// FIXME: This should only be done for VALU inputs
-defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
-def : ROTRPattern <V_ALIGNBIT_B32>;
+// 64-bit version
+def : AMDGPUPat <
+  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+  (REG_SEQUENCE SReg_64,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+              (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
+              (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
+    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+              (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
+              (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+>;
+
+def : AMDGPUPat <
+  (fcopysign f32:$src0, f32:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
+>;
+
+def : AMDGPUPat <
+  (fcopysign f32:$src0, f64:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+             (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+  (fcopysign f64:$src0, f64:$src1),
+  (REG_SEQUENCE SReg_64,
+    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
+               (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
+               (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1)
+>;
+
+def : AMDGPUPat <
+  (fcopysign f64:$src0, f32:$src1),
+  (REG_SEQUENCE SReg_64,
+    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
+               (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
+               $src1), sub1)
+>;
+
+def : ROTRPattern <V_ALIGNBIT_B32_e64>;
 
 def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
-          (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+          (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 
 def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
-          (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+          (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 
 /********** ====================== **********/
@@ -1618,7 +1700,7 @@ def : GCNPat <
   (add (sub_oneuse (umax i32:$src0, i32:$src1),
                    (umin i32:$src0, i32:$src1)),
        i32:$src2),
-  (V_SAD_U32 $src0, $src1, $src2, (i1 0))
+  (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
 >;
 
 def : GCNPat <
@@ -1626,7 +1708,7 @@ def : GCNPat <
                       (sub i32:$src0, i32:$src1),
                       (sub i32:$src1, i32:$src0)),
        i32:$src2),
-  (V_SAD_U32 $src0, $src1, $src2, (i1 0))
+  (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1877,9 +1959,9 @@ def : GCNPat <
 
 def : GCNPat <
   (i32 (bswap i32:$a)),
-  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
-             (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
-             (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+             (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+             (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
 >;
 
 // FIXME: This should have been narrowed to i32 during legalization.
@@ -1887,19 +1969,19 @@ def : GCNPat <
   (i64 (bswap i64:$a)),
   (REG_SEQUENCE VReg_64,
-  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
-             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                              (i32 24)),
-             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                              (i32 8))),
   sub0,
-  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
-             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                              (i32 24)),
-             (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+             (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                              (i32 8))),
   sub1)
@@ -1914,7 +1996,7 @@ let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
 // register value, but this is what seems to work.
 def : GCNPat <
   (i32 (bswap i32:$a)),
-  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
+  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
 >;
 
 // FIXME: This should have been narrowed to i32 during legalization.
@@ -1922,10 +2004,10 @@ def : GCNPat <
   (i64 (bswap i64:$a)),
   (REG_SEQUENCE VReg_64,
-  (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+  (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
               (S_MOV_B32 (i32 0x00010203))),
   sub0,
-  (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+  (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
               (S_MOV_B32 (i32 0x00010203))),
   sub1)
 >;
@@ -1934,18 +2016,18 @@ def : GCNPat <
 // The 12s emit 0s.
 def : GCNPat <
   (i16 (bswap i16:$a)),
-  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
+  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
 
 def : GCNPat <
   (i32 (zext (bswap i16:$a))),
-  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
+  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
 
 // Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
 def : GCNPat <
   (v2i16 (bswap v2i16:$a)),
-  (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
+  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
 >;
 
 }
@@ -1981,7 +2063,7 @@ def : GCNPat<
 // TODO: Handle fneg like other types.
 def : GCNPat<
   (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
-  (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
+  (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
 >;
 } // End AddedComplexity = -5
@@ -1997,7 +2079,7 @@ multiclass SelectCanonicalizeAsMax<
 
   def : GCNPat<
     (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
-    (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+    (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
     let OtherPredicates = f64_preds;
   }
 
@@ -2059,14 +2141,22 @@ def : GCNPat <
                   SRCMODS.NONE, $src2)
 >;
 
-// COPY is workaround tablegen bug from multiple outputs
-// from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
   (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i16 16))
 >;
 
 def : GCNPat <
+  (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))),
+  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+  (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))),
+  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+>;
+
+def : GCNPat <
   (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
   (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
 >;
@@ -2177,12 +2267,12 @@ let SubtargetPredicate = isGFX6 in {
 // FIXME: DAG should also custom lower this.
 def : GCNPat <
   (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
-  (V_ADD_F64
+  (V_ADD_F64_e64
       $mods,
       $x,
       SRCMODS.NEG,
       (V_CNDMASK_B64_PSEUDO
-        (V_MIN_F64
+        (V_MIN_F64_e64
             SRCMODS.NONE,
             (V_FRACT_F64_e64 $mods, $x),
             SRCMODS.NONE,
@@ -2213,7 +2303,7 @@ def : GCNPat<
 
 def : GCNPat<
   (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
-  (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
+  (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = NotHasAddNoCarryInsts;
 }
 
@@ -2241,8 +2331,77 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
 defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
 // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
 
-defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
+// Bitfield extract patterns
+
+def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
+  return isMask_32(Imm);
+}]>;
+
+def IMMPopCount : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+def : AMDGPUPat <
+  (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
+                         IMMZeroBasedBitfieldMask:$mask),
+  (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask)))
+>;
+
+// x & ((1 << y) - 1)
+def : AMDGPUPat <
+  (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+  (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x & ~(-1 << y)
+def : AMDGPUPat <
+  (DivergentBinFrag<and> i32:$src,
+                         (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+  (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x & (-1 >> (bitwidth - y))
+def : AMDGPUPat <
+  (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+  (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x << (bitwidth - y) >> (bitwidth - y)
+def : AMDGPUPat <
+  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+                         (sub 32, i32:$width)),
+  (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+def : AMDGPUPat <
+  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+                         (sub 32, i32:$width)),
+  (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
+
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
+def : AMDGPUPat <
+  (DivergentBinFrag<or> (and i32:$x, i32:$z),
+                        (and i32:$y, (or i32:$x, i32:$z))),
+  (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+def : AMDGPUPat <
+  (DivergentBinFrag<or> (and i64:$x, i64:$z),
+                        (and i64:$y, (or i64:$x, i64:$z))),
+  (REG_SEQUENCE SReg_64,
+    (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+                                  (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
+                   (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)),
+                   (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0,
+    (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+                                  (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))),
+                   (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)),
+                   (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1)
+>;
 
 multiclass IntMed3Pat<Instruction med3Inst,
                       SDPatternOperator min,
@@ -2267,8 +2426,8 @@ multiclass IntMed3Pat<Instruction med3Inst,
 >;
 }
 
-defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
-defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
+defm : IntMed3Pat<V_MED3_I32_e64, smin, smax, smin_oneuse, smax_oneuse>;
+defm : IntMed3Pat<V_MED3_U32_e64, umin, umax, umin_oneuse, umax_oneuse>;
 
 // This matches 16 permutations of
 // max(min(x, y), min(max(x, y), z))
@@ -2315,12 +2474,12 @@ multiclass Int16Med3Pat<Instruction med3Inst,
 >;
 }
 
-def : FPMed3Pat<f32, V_MED3_F32>;
+def : FPMed3Pat<f32, V_MED3_F32_e64>;
 
 let OtherPredicates = [isGFX9Plus] in {
-def : FP16Med3Pat<f16, V_MED3_F16>;
-defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
-defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
+def : FP16Med3Pat<f16, V_MED3_F16_e64>;
+defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
+defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, umax_oneuse, umin_oneuse>;
 } // End Predicates = [isGFX9Plus]
 
 class AMDGPUGenericInstruction : GenericInstruction {
@@ -2428,10 +2587,12 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
 let Namespace = "AMDGPU" in {
 def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
 def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
 }
 
-class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
-  let OutOperandList = (outs type0:$dst);
+class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
+  let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                            type2:$soffset, untyped_imm_0:$offset,
                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
@@ -2452,6 +2613,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
 
 def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
   let OutOperandList = (outs type0:$dst);
@@ -2494,3 +2656,11 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
   let hasSideEffects = 0;
   let mayStore = 1;
 }
+
+def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins unknown:$intrin, variable_ops);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+}