Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td | 570
1 file changed, 370 insertions(+), 200 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0c4c9e0e9df2..7c1cbd67c993 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -19,43 +19,7 @@ include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
-
-//===----------------------------------------------------------------------===//
-// EXP Instructions
-//===----------------------------------------------------------------------===//
-
-defm EXP : EXP_m<0>;
-defm EXP_DONE : EXP_m<1>;
-
-class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
- (int_amdgcn_exp timm:$tgt, timm:$en,
- (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
- (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
- done_val, timm:$vm),
- (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
- ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
->;
-
-class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
- (int_amdgcn_exp_compr timm:$tgt, timm:$en,
- (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
- done_val, timm:$vm),
- (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
- (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
->;
-
-// FIXME: The generated DAG matcher seems to have strange behavior
-// with a 1-bit literal to match, so use a -1 for checking a true
-// 1-bit value.
-def : ExpPattern<i32, EXP, 0>;
-def : ExpPattern<i32, EXP_DONE, -1>;
-def : ExpPattern<f32, EXP, 0>;
-def : ExpPattern<f32, EXP_DONE, -1>;
-
-def : ExpComprPattern<v2i16, EXP, 0>;
-def : ExpComprPattern<v2i16, EXP_DONE, -1>;
-def : ExpComprPattern<v2f16, EXP, 0>;
-def : ExpComprPattern<v2f16, EXP_DONE, -1>;
+include "EXPInstructions.td"
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -264,6 +228,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
+def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}
@@ -324,7 +289,7 @@ def SI_IF: CFPseudoInstSI <
def SI_ELSE : CFPseudoInstSI <
(outs SReg_1:$dst),
- (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
}
@@ -356,6 +321,14 @@ def SI_IF_BREAK : CFPseudoInstSI <
let isReMaterializable = 1;
}
+// Branch to the early termination block of the shader if SCC is 0.
+// This uses SCC from a previous SALU operation, i.e. the update of
+// a mask of live lanes after a kill/demote operation.
+// Only valid in pixel shaders.
+def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
+ let Uses = [EXEC,SCC];
+}
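+// For example, a kill might lower to something like (illustrative only):
+//   s_andn2_b64 s[0:1], s[0:1], vcc   ; drop killed lanes; SCC = (mask != 0)
+//   SI_EARLY_TERMINATE_SCC0           ; taken when no live lanes remain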
+
let Uses = [EXEC] in {
multiclass PseudoInstKill <dag ins> {
@@ -426,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
(outs), (ins i64imm:$src),
[(int_amdgcn_init_exec (i64 timm:$src))]> {
let Defs = [EXEC];
- let usesCustomInserter = 1;
- let isAsCheapAsAMove = 1;
- let WaveSizePredicate = isWave64;
-}
-
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
- (outs), (ins i32imm:$src), []> {
- let Defs = [EXEC_LO];
- let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
- let WaveSizePredicate = isWave32;
}
-// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
(outs), (ins SSrc_b32:$input, i32imm:$shift),
[(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
let Defs = [EXEC];
- let usesCustomInserter = 1;
-}
-
-def : GCNPat <
- (int_amdgcn_init_exec timm:$src),
- (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
- let WaveSizePredicate = isWave32;
}
// Return for returning shaders to a shader variant epilog.
@@ -580,64 +534,97 @@ def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
} // End Uses = [EXEC], Defs = [M0, EXEC]
-
-// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
-// expecting to be executed with gpr indexing mode enabled)
-// instruction in which the vector operand appears only twice, once as
-// def and once as use. Using this pseudo avoids problems with the Two
-// Address instructions pass.
-class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
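+// E.g. V_INDIRECT_REG_WRITE_MOVREL_B32_V4 writes the 32-bit lane of a
+// VReg_128 tuple selected by M0, with the tuple appearing exactly once as
+// $vsrc and once as the tied $vdst.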
+class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
RegisterOperand val_ty> : PseudoInstSI <
(outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
let Constraints = "$vsrc = $vdst";
let Uses = [M0];
}
-class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
- INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
+ INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
let VALU = 1;
let VOP1 = 1;
let Uses = [M0, EXEC];
}
-class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
RegisterOperand val_ty> :
- INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+ INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
let SALU = 1;
let SOP1 = 1;
let Uses = [M0];
}
-class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
- S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
-class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
- S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
-
-
-def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
-def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
-def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
-def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
-def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
-def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
-def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
-def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;
+
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;
+
+// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
+// pseudos we avoid spills or copies being inserted within indirect sequences
+// that switch the VGPR indexing mode. Spills to AGPRs could be affected by
+// this mode switching.
+
+class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
+ (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
+ let Constraints = "$vsrc = $vdst";
+ let VALU = 1;
+ let Uses = [M0, EXEC];
+ let Defs = [M0];
+}
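+// A write pseudo like this is expected to expand to roughly (illustrative):
+//   s_set_gpr_idx_on $idx, gpr_idx(DST)  ; enter VGPR indexing mode (writes M0)
+//   v_mov_b32 v[first + subreg], $val
+//   s_set_gpr_idx_off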
-def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
-def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
-def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
-def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
-def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
-def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
-def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
-def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;
-def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
-def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
-def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
-def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
-def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
+ (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
+ let VALU = 1;
+ let Uses = [M0, EXEC];
+ let Defs = [M0];
+}
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
+def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
@@ -671,30 +658,33 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
-multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+// VGPR or AGPR spill instructions. When spilling an AGPR, a temporary VGPR
+// is needed, plus an extra instruction to move between the VGPR and the
+// AGPR. UsesTmp adds this to the total size of an expanded spill.
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
SchedRW = [WriteVMEM] in {
def _SAVE : VPseudoInstSI <
(outs),
- (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+ (ins vgpr_class:$vdata, i32imm:$vaddr,
SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
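+ // E.g. with vgpr_class.Size = 128 (bits): num_subregs = 128 >> 5 = 4, so
+ // MaxSize = (4 << 3) + 8 = 40 bytes, or (4 << 4) + 8 = 72 bytes when
+ // UsesTmp = 1 and each subreg also needs a VGPR<->AGPR move.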
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata),
- (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
- i32imm:$offset)> {
+ (ins i32imm:$vaddr,
+ SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+ int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
// Size field is unsigned char and cannot fit more.
let Size = !if(!le(MaxSize, 256), MaxSize, 252);
}
@@ -711,42 +701,15 @@ defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
-multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
- let UseNamedOperandTable = 1, VGPRSpill = 1,
- Constraints = "@earlyclobber $tmp",
- SchedRW = [WriteVMEM] in {
- def _SAVE : VPseudoInstSI <
- (outs VGPR_32:$tmp),
- (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
- SReg_32:$soffset, i32imm:$offset)> {
- let mayStore = 1;
- let mayLoad = 0;
- // (2 * 4) + (16 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
- // Size field is unsigned char and cannot fit more.
- let Size = !if(!le(MaxSize, 256), MaxSize, 252);
- }
-
- def _RESTORE : VPseudoInstSI <
- (outs vgpr_class:$vdata, VGPR_32:$tmp),
- (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
- i32imm:$offset)> {
- let mayStore = 0;
- let mayLoad = 1;
-
- // (2 * 4) + (16 * num_subregs) bytes maximum
- int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
- // Size field is unsigned char and cannot fit more.
- let Size = !if(!le(MaxSize, 256), MaxSize, 252);
- }
- } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
-}
-
-defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>;
-defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>;
-defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
-defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
-defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
+defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
+defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
+defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
+defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
+defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
+defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
+defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
+defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
@@ -768,7 +731,7 @@ def : GCNPat<
def : GCNPat<
(AMDGPUelse i1:$src, bb:$target),
- (SI_ELSE $src, $target, 0)
+ (SI_ELSE $src, $target)
>;
def : Pat <
@@ -804,12 +767,9 @@ def : Pat <
let OtherPredicates = [UnsafeFPMath] in {
-//def : RcpPat<V_RCP_F64_e32, f64>;
-//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F32_e32, f32>;
-def : RsqPat<V_RSQ_F64_e32, f64>;
// Convert (x - floor(x)) to fract(x)
def : GCNPat <
@@ -889,7 +849,8 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-// TODO: Check only no src2 mods?
+// The NoMods pattern is used for mac. If there are any source modifiers
+// then it's better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
: GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
(vt (VOP3NoMods vt:$src1)),
@@ -898,18 +859,41 @@ class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-
// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
+let OtherPredicates = [HasMadMacF32Insts] in {
def : FMADPat <f32, V_MAC_F32_e64, fmad>;
def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+} // OtherPredicates = [HasMadMacF32Insts]
+
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select mad instead of mac.
+let SubtargetPredicate = isGFX6GFX7GFX10,
+ OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
+def : GCNPat <
+ (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
+ (VOP3NoMods f32:$src1)),
+ (VOP3NoMods f32:$src2))),
+ (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Don't allow source modifiers. If there are any source modifiers then it's
+// better to select fma instead of fmac.
+let SubtargetPredicate = HasFmaLegacy32 in
+def : GCNPat <
+ (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
+ (VOP3NoMods f32:$src1),
+ (VOP3NoMods f32:$src2))),
+ (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
let SubtargetPredicate = Has16BitInsts in {
def : FMADPat <f16, V_MAC_F16_e64, fmad>;
def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
-}
-
-}
+} // SubtargetPredicate = Has16BitInsts
+} // AddedComplexity = 9
class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
@@ -920,11 +904,20 @@ class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-let SubtargetPredicate = HasMadMacF32Insts in
-def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
-def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
- let SubtargetPredicate = Has16BitInsts;
-}
+let OtherPredicates = [HasMadMacF32Insts] in
+def : FMADModsPat<f32, V_MAD_F32_e64, AMDGPUfmad_ftz>;
+
+let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
+def : GCNPat <
+ (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
+ (VOP3Mods f32:$src1, i32:$src1_mod)),
+ (VOP3Mods f32:$src2, i32:$src2_mod))),
+ (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
+ $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let SubtargetPredicate = Has16BitInsts in
+def : FMADModsPat<f16, V_MAD_F16_e64, AMDGPUfmad_ftz>;
class VOPSelectModsPat <ValueType vt> : GCNPat <
(vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
@@ -1241,7 +1234,7 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
>;
def : ClampPat<V_MAX_F32_e64, f32>;
-def : ClampPat<V_MAX_F64, f64>;
+def : ClampPat<V_MAX_F64_e64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
let SubtargetPredicate = HasVOP3PInsts in {
@@ -1422,12 +1415,12 @@ def : GCNPat <
def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
def : GCNPat <
(fcopysign f32:$src0, f16:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;
@@ -1435,19 +1428,19 @@ def : GCNPat <
(fcopysign f64:$src0, f16:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
- (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
(V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;
def : GCNPat <
(fcopysign f16:$src0, f32:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), $src1))
>;
def : GCNPat <
(fcopysign f16:$src0, f64:$src1),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
@@ -1499,8 +1492,13 @@ def : GCNPat <
>;
def : GCNPat <
- (i32 frameindex:$fi),
- (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+ (p5 frameindex:$fi),
+ (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : GCNPat <
+ (p5 frameindex:$fi),
+ (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;
def : GCNPat <
@@ -1565,19 +1563,103 @@ def : GCNPat <
// VOP3 Patterns
//===----------------------------------------------------------------------===//
-def : IMad24Pat<V_MAD_I32_I24, 1>;
-def : UMad24Pat<V_MAD_U32_U24, 1>;
+def : IMad24Pat<V_MAD_I32_I24_e64, 1>;
+def : UMad24Pat<V_MAD_U32_U24_e64, 1>;
+
+// BFI patterns
+
+def BFIImm32 : PatFrag<
+ (ops node:$x, node:$y, node:$z),
+ (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
+ [{
+ auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
+ auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
+ return X && NotX &&
+ ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
+ }]
+>;
+
+// Definition from ISA doc:
+// (y & x) | (z & ~x)
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+ (V_BFI_B32_e64 $x, $y, $z)
+>;
+
+// (y & C) | (z & ~C)
+def : AMDGPUPat <
+ (BFIImm32 i32:$x, i32:$y, i32:$z),
+ (V_BFI_B32_e64 $x, $y, $z)
+>;
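+// The two constants must be exact complements for BFIImm32 to match, e.g.
+// (y & 0x00ff00ff) | (z & 0xff00ff00) becomes
+// V_BFI_B32_e64 0x00ff00ff, y, z, since ~0x00ff00ff == 0xff00ff00.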
+
+// 64-bit version
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+ (REG_SEQUENCE SReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+>;
+
+// SHA-256 Ch function
+// z ^ (x & (y ^ z))
+def : AMDGPUPat <
+ (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+ (V_BFI_B32_e64 $x, $y, $z)
+>;
-// FIXME: This should only be done for VALU inputs
-defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
-def : ROTRPattern <V_ALIGNBIT_B32>;
+// 64-bit version
+def : AMDGPUPat <
+ (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+ (REG_SEQUENCE SReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+>;
+
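+// fcopysign is a bitfield insert of the sign bit: BFI with mask 0x7fffffff
+// keeps the magnitude bits of $src0 and takes bit 31 (the sign) from $src1.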
+def : AMDGPUPat <
+ (fcopysign f32:$src0, f32:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign f32:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+ (fcopysign f64:$src0, f64:$src1),
+ (REG_SEQUENCE SReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign f64:$src0, f32:$src1),
+ (REG_SEQUENCE SReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
+ (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
+ $src1), sub1)
+>;
+
+def : ROTRPattern <V_ALIGNBIT_B32_e64>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
/********** ====================== **********/
@@ -1618,7 +1700,7 @@ def : GCNPat <
(add (sub_oneuse (umax i32:$src0, i32:$src1),
(umin i32:$src0, i32:$src1)),
i32:$src2),
- (V_SAD_U32 $src0, $src1, $src2, (i1 0))
+ (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
>;
def : GCNPat <
@@ -1626,7 +1708,7 @@ def : GCNPat <
(sub i32:$src0, i32:$src1),
(sub i32:$src1, i32:$src0)),
i32:$src2),
- (V_SAD_U32 $src0, $src1, $src2, (i1 0))
+ (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
>;
//===----------------------------------------------------------------------===//
@@ -1877,9 +1959,9 @@ def : GCNPat <
def : GCNPat <
(i32 (bswap i32:$a)),
- (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
- (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+ (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
// FIXME: This should have been narrowed to i32 during legalization.
@@ -1887,19 +1969,19 @@ def : GCNPat <
def : GCNPat <
(i64 (bswap i64:$a)),
(REG_SEQUENCE VReg_64,
- (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 24)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
(i32 8))),
sub0,
- (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 24)),
- (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
(i32 8))),
sub1)
@@ -1914,7 +1996,7 @@ let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
// register value, but this is what seems to work.
def : GCNPat <
(i32 (bswap i32:$a)),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;
// FIXME: This should have been narrowed to i32 during legalization.
@@ -1922,10 +2004,10 @@ def : GCNPat <
def : GCNPat <
(i64 (bswap i64:$a)),
(REG_SEQUENCE VReg_64,
- (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+ (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
(S_MOV_B32 (i32 0x00010203))),
sub0,
- (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+ (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
(S_MOV_B32 (i32 0x00010203))),
sub1)
>;
@@ -1934,18 +2016,18 @@ def : GCNPat <
// The 12s emit 0s.
def : GCNPat <
(i16 (bswap i16:$a)),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
def : GCNPat <
(i32 (zext (bswap i16:$a))),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
(v2i16 (bswap v2i16:$a)),
- (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
+ (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;
}
@@ -1981,7 +2063,7 @@ def : GCNPat<
// TODO: Handle fneg like other types.
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
+ (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
} // End AddedComplexity = -5
@@ -1997,7 +2079,7 @@ multiclass SelectCanonicalizeAsMax<
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+ (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
let OtherPredicates = f64_preds;
}
@@ -2059,14 +2141,22 @@ def : GCNPat <
SRCMODS.NONE, $src2)
>;
-// COPY is workaround tablegen bug from multiple outputs
-// from S_LSHL_B32's multiple outputs from implicit scc def.
def : GCNPat <
(v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
def : GCNPat <
+ (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+ (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+>;
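+// In both build_vector patterns above the kept element is the low half, so
+// zeroing the high 16 bits with a single S_AND_B32 materializes the vector.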
+
+def : GCNPat <
(v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
@@ -2177,12 +2267,12 @@ let SubtargetPredicate = isGFX6 in {
// FIXME: DAG should also custom lower this.
def : GCNPat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
- (V_ADD_F64
+ (V_ADD_F64_e64
$mods,
$x,
SRCMODS.NEG,
(V_CNDMASK_B64_PSEUDO
- (V_MIN_F64
+ (V_MIN_F64_e64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x),
SRCMODS.NONE,
@@ -2213,7 +2303,7 @@ def : GCNPat<
def : GCNPat<
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
- (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
+ (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
let SubtargetPredicate = NotHasAddNoCarryInsts;
}
@@ -2241,8 +2331,77 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
-defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
+// Bitfield extract patterns
+
+def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
+ return isMask_32(Imm);
+}]>;
+
+def IMMPopCount : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
+ MVT::i32);
+}]>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
+ IMMZeroBasedBitfieldMask:$mask),
+ (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask)))
+>;
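+// E.g. (x >> 8) & 0x00ffffff becomes V_BFE_U32_e64 x, 8, 24, since
+// popcount(0x00ffffff) == 24.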
+
+// x & ((1 << y) - 1)
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x & ~(-1 << y)
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src,
+ (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x & (-1 >> (bitwidth - y))
+def : AMDGPUPat <
+ (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// x << (bitwidth - y) >> (bitwidth - y)
+def : AMDGPUPat <
+ (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+ (sub 32, i32:$width)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+ (sub 32, i32:$width)),
+ (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
+
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i32:$x, i32:$z),
+ (and i32:$y, (or i32:$x, i32:$z))),
+ (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<or> (and i64:$x, i64:$z),
+ (and i64:$y, (or i64:$x, i64:$z))),
+ (REG_SEQUENCE SReg_64,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))),
+ (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)),
+ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1)
+>;
multiclass IntMed3Pat<Instruction med3Inst,
SDPatternOperator min,
@@ -2267,8 +2426,8 @@ multiclass IntMed3Pat<Instruction med3Inst,
>;
}
-defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
-defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
+defm : IntMed3Pat<V_MED3_I32_e64, smin, smax, smin_oneuse, smax_oneuse>;
+defm : IntMed3Pat<V_MED3_U32_e64, umin, umax, umin_oneuse, umax_oneuse>;
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
@@ -2315,12 +2474,12 @@ multiclass Int16Med3Pat<Instruction med3Inst,
>;
}
-def : FPMed3Pat<f32, V_MED3_F32>;
+def : FPMed3Pat<f32, V_MED3_F32_e64>;
let OtherPredicates = [isGFX9Plus] in {
-def : FP16Med3Pat<f16, V_MED3_F16>;
-defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
-defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
+def : FP16Med3Pat<f16, V_MED3_F16_e64>;
+defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
+defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, umax_oneuse, umin_oneuse>;
} // End Predicates = [isGFX9Plus]
class AMDGPUGenericInstruction : GenericInstruction {
@@ -2428,10 +2587,12 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}
-class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
- let OutOperandList = (outs type0:$dst);
+class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
+ let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
type2:$soffset, untyped_imm_0:$offset,
untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
@@ -2452,6 +2613,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
@@ -2494,3 +2656,11 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
let hasSideEffects = 0;
let mayStore = 1;
}
+
+def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}