Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td  677
1 file changed, 550 insertions, 127 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index d84720f820ee..0c4c9e0e9df2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1,4 +1,4 @@
-//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -24,8 +24,38 @@ include "BUFInstructions.td"
// EXP Instructions
//===----------------------------------------------------------------------===//
-defm EXP : EXP_m<0, AMDGPUexport>;
-defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
+defm EXP : EXP_m<0>;
+defm EXP_DONE : EXP_m<1>;
+
+class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
+>;
+
+class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp_compr timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
+>;
+
+// FIXME: The generated DAG matcher seems to have strange behavior
+// with a 1-bit literal to match, so use a -1 for checking a true
+// 1-bit value.
+def : ExpPattern<i32, EXP, 0>;
+def : ExpPattern<i32, EXP_DONE, -1>;
+def : ExpPattern<f32, EXP, 0>;
+def : ExpPattern<f32, EXP_DONE, -1>;
+
+def : ExpComprPattern<v2i16, EXP, 0>;
+def : ExpComprPattern<v2i16, EXP_DONE, -1>;
+def : ExpComprPattern<v2f16, EXP, 0>;
+def : ExpComprPattern<v2f16, EXP_DONE, -1>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -34,9 +64,9 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
-// FIXME: Specify SchedRW for VINTRP insturctions.
+// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
@@ -76,10 +106,10 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc),
+ [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -136,7 +166,8 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
- let Defs = [EXEC];
+ let Uses = [EXEC];
+ let Defs = [EXEC, SCC];
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -162,16 +193,27 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
let Constraints = "$src = $vdst";
}
+let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
+def V_ADD_U64_PSEUDO : VPseudoInstSI <
+ (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
+>;
+
+def V_SUB_U64_PSEUDO : VPseudoInstSI <
+ (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
+>;
+} // End usesCustomInserter = 1, Defs = [VCC, EXEC]
let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
- [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;
def S_SUB_U64_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
- [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;
def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
@@ -181,6 +223,23 @@ def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
(outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
+
+def S_ADD_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_SUB_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_UADDO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
+def S_USUBO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -199,6 +258,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
let hasSideEffects = base_inst.hasSideEffects;
let UseNamedOperandTable = base_inst.UseNamedOperandTable;
let CodeSize = base_inst.CodeSize;
+ let SchedRW = base_inst.SchedRW;
}
let WaveSizePredicate = isWave64 in {
@@ -214,13 +274,14 @@ def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}
+
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_wave_barrier)]> {
let SchedRW = [];
let hasNoSchedulingInfo = 1;
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
@@ -318,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
@@ -386,7 +450,7 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
def : GCNPat <
(int_amdgcn_init_exec timm:$src),
- (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+ (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
let WaveSizePredicate = isWave32;
}
@@ -413,8 +477,8 @@ def SI_RETURN : SPseudoInstSI <
// Return for returning function calls without output register.
//
-// This version is only needed so we can fill in the output regiter in
-// the custom inserter.
+// This version is only needed so we can fill in the output register
+// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0, unknown:$callee),
[(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
@@ -426,6 +490,11 @@ def SI_CALL_ISEL : SPseudoInstSI <
let isConvergent = 1;
}
+def : GCNPat<
+ (AMDGPUcall i64:$src0, (i64 0)),
+ (SI_CALL_ISEL $src0, (i64 0))
+>;
+
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
@@ -480,6 +549,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
+// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect
+// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
(outs VGPR_32:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset)> {
@@ -493,21 +564,81 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
let usesCustomInserter = 1;
}
-// TODO: We can support indirect SGPR access.
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
} // End Uses = [EXEC], Defs = [M0, EXEC]
+
+// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
+// expecting to be executed with gpr indexing mode enabled)
+// instruction in which the vector operand appears only twice, once as
+// def and once as use. Using this pseudo avoids problems with the Two
+// Address instructions pass.
+class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+ RegisterOperand val_ty> : PseudoInstSI <
+ (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
+ let Constraints = "$vsrc = $vdst";
+ let Uses = [M0];
+}
+
+class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+ INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+ let VALU = 1;
+ let VOP1 = 1;
+ let Uses = [M0, EXEC];
+}
+
+class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+ RegisterOperand val_ty> :
+ INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+ let SALU = 1;
+ let SOP1 = 1;
+ let Uses = [M0];
+}
+
+class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
+
+
+def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+
+
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : PseudoInstSI <
@@ -535,6 +666,7 @@ defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
+defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -574,6 +706,7 @@ defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
+defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -639,12 +772,6 @@ def : GCNPat<
>;
def : Pat <
- // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
- (AMDGPUkill (i32 -1082130432)),
- (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
-def : Pat <
(int_amdgcn_kill i1:$src),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;
@@ -655,11 +782,6 @@ def : Pat <
>;
def : Pat <
- (AMDGPUkill i32:$src),
- (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, 0, 3) // 3 means SETOGE
->;
-
-def : Pat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
@@ -693,14 +815,14 @@ def : RsqPat<V_RSQ_F64_e32, f64>;
def : GCNPat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_FRACT_F32_e64 $mods, $x)
>;
// Convert (x + (-floor(x))) to fract(x)
def : GCNPat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_FRACT_F64_e64 $mods, $x)
>;
} // End OtherPredicates = [UnsafeFPMath]
@@ -709,27 +831,27 @@ def : GCNPat <
// f16_to_fp patterns
def : GCNPat <
(f32 (f16_to_fp i32:$src0)),
- (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
- (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
- (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;
def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
- (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
- (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0)
>;
def : GCNPat <
@@ -740,7 +862,7 @@ def : GCNPat <
// fp_to_fp16 patterns
def : GCNPat <
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
- (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0)
>;
def : GCNPat <
@@ -767,20 +889,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-multiclass FMADPat <ValueType vt, Instruction inst> {
- def : GCNPat <
- (vt (fmad (VOP3NoMods vt:$src0),
- (VOP3NoMods vt:$src1),
- (VOP3NoMods vt:$src2))),
+// TODO: Check only no src2 mods?
+class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
+ : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
+ (vt (VOP3NoMods vt:$src1)),
+ (vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
- >;
+>;
+
+
+// Prefer mac form when there are no modifiers.
+let AddedComplexity = 9 in {
+def : FMADPat <f32, V_MAC_F32_e64, fmad>;
+def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+
+let SubtargetPredicate = Has16BitInsts in {
+def : FMADPat <f16, V_MAC_F16_e64, fmad>;
+def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}
-defm : FMADPat <f16, V_MAC_F16_e64>;
-defm : FMADPat <f32, V_MAC_F32_e64>;
+}
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -789,24 +920,28 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-// FIXME: This should select to V_MAC_F32
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
-def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+let SubtargetPredicate = HasMadMacF32Insts in
+def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}
-multiclass SelectPat <ValueType vt> {
- def : GCNPat <
- (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
- (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
- (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
- >;
-}
+class VOPSelectModsPat <ValueType vt> : GCNPat <
+ (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
+ (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
+ FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
+
+class VOPSelectPat <ValueType vt> : GCNPat <
+ (vt (select i1:$src0, vt:$src1, vt:$src2)),
+ (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
-defm : SelectPat <i16>;
-defm : SelectPat <i32>;
-defm : SelectPat <f16>;
-defm : SelectPat <f32>;
+def : VOPSelectModsPat <i32>;
+def : VOPSelectModsPat <f32>;
+def : VOPSelectPat <f16>;
+def : VOPSelectPat <i16>;
let AddedComplexity = 1 in {
def : GCNPat <
@@ -1039,6 +1174,8 @@ def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
+def : BitConvert <v4f32, v2i64, VReg_128>;
+def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SGPR_160>;
@@ -1049,14 +1186,46 @@ def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
+def : BitConvert <v4i64, v4f64, VReg_256>;
+def : BitConvert <v4f64, v4i64, VReg_256>;
+def : BitConvert <v4i64, v8i32, VReg_256>;
+def : BitConvert <v4i64, v8f32, VReg_256>;
+def : BitConvert <v4f64, v8i32, VReg_256>;
+def : BitConvert <v4f64, v8f32, VReg_256>;
+def : BitConvert <v8i32, v4i64, VReg_256>;
+def : BitConvert <v8f32, v4i64, VReg_256>;
+def : BitConvert <v8i32, v4f64, VReg_256>;
+def : BitConvert <v8f32, v4f64, VReg_256>;
+
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
+def : BitConvert <v8i64, v8f64, VReg_512>;
+def : BitConvert <v8f64, v8i64, VReg_512>;
+def : BitConvert <v8i64, v16i32, VReg_512>;
+def : BitConvert <v8f64, v16i32, VReg_512>;
+def : BitConvert <v16i32, v8i64, VReg_512>;
+def : BitConvert <v16i32, v8f64, VReg_512>;
+def : BitConvert <v8i64, v16f32, VReg_512>;
+def : BitConvert <v8f64, v16f32, VReg_512>;
+def : BitConvert <v16f32, v8i64, VReg_512>;
+def : BitConvert <v16f32, v8f64, VReg_512>;
// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
+def : BitConvert <v16i64, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v16i64, VReg_1024>;
+def : BitConvert <v16i64, v32i32, VReg_1024>;
+def : BitConvert <v32i32, v16i64, VReg_1024>;
+def : BitConvert <v16f64, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v16f64, VReg_1024>;
+def : BitConvert <v16i64, v32f32, VReg_1024>;
+def : BitConvert <v32i32, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v32i32, VReg_1024>;
+def : BitConvert <v32f32, v16i64, VReg_1024>;
+
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1155,7 +1324,7 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
-// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled
+// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled
// def : GCNPat <
// (fneg (f64 SReg_64:$src)),
// (REG_SEQUENCE SReg_64,
@@ -1176,6 +1345,17 @@ def : GCNPat <
// sub1)
// >;
+// FIXME: Use S_BITSET0_B32/B64?
+// def : GCNPat <
+// (fabs (f64 SReg_64:$src)),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (i32 (S_MOV_B32 (i32 0x7fffffff)))),
+// sub1)
+// >;
+
} // End let AddedComplexity = 1
def : GCNPat <
@@ -1372,11 +1552,12 @@ class Ext32Pat <SDNode ext> : GCNPat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// The multiplication scales from [0,1] to the unsigned integer range
+// The multiplication scales from [0,1) to the unsigned integer range,
+// rounding down a bit to avoid unwanted overflow.
def : GCNPat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
- (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
+ (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
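
A rough host-side sketch of what this urecip expansion computes, with plain
division standing in for the approximate V_RCP_IFLAG_F32; the function name
and test values are illustrative only, assuming nothing beyond the pattern
above:

#include <cstdint>
#include <cstdio>

// Approximate 2^32 / d, as in the AMDGPUurecip pattern above.
uint32_t urecip(uint32_t d) {
  float r = 1.0f / static_cast<float>(d);           // V_RCP_IFLAG_F32(V_CVT_F32_U32(d))
  // 4294966784.0f is 2^32 - 512, the largest float below 2^32, so the scaled
  // value never overflows the u32 conversion ("rounding down a bit").
  return static_cast<uint32_t>(r * 4294966784.0f);  // V_CVT_U32_F32(V_MUL_F32(...))
}

int main() {
  std::printf("urecip(3) = %u, 2^32/3 = %u\n", urecip(3), 0xFFFFFFFFu / 3u);
  return 0;
}
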
@@ -1421,11 +1602,13 @@ defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
+defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
+defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
//===----------------------------------------------------------------------===//
// SAD Patterns
@@ -1695,102 +1878,187 @@ def : GCNPat <
def : GCNPat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 $a, $a, (i32 24)),
- (V_ALIGNBIT_B32 $a, $a, (i32 8)))
+ (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+ (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
-let OtherPredicates = [NoFP16Denormals] in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+ (i64 (bswap i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 24)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 8))),
+ sub0,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 24)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 8))),
+ sub1)
+>;
+
+// FIXME: The AddedComplexity should not be needed, but in GlobalISel
+// the BFI pattern ends up taking precedence without it.
+let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
+// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
+//
+// My reading of the manual suggests we should be using src0 for the
+// register value, but this is what seems to work.
+def : GCNPat <
+ (i32 (bswap i32:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+ (i64 (bswap i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+ (S_MOV_B32 (i32 0x00010203))),
+ sub0,
+ (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+ (S_MOV_B32 (i32 0x00010203))),
+ sub1)
>;
-def : GCNPat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
+// The 12s emit 0s.
+def : GCNPat <
+ (i16 (bswap i16:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
-}
-let OtherPredicates = [FP16Denormals] in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+def : GCNPat <
+ (i32 (zext (bswap i16:$a))),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
-let SubtargetPredicate = HasVOP3PInsts in {
-def : GCNPat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
+def : GCNPat <
+ (v2i16 (bswap v2i16:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;
+
}
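
A rough host-side model of the V_PERM_B32 byte select these bswap patterns
rely on. The selector semantics are an assumption based on the comments above:
selector byte values 0 through 7 pick a byte of the {src0, src1} pair with
src1 as the low dword, and 12 produces 0x00; other selector encodings are not
modeled. The checks mirror the i32, i16 and v2i16 cases; the i64 pattern
simply applies the i32 selector to each half and swaps the halves.

#include <cassert>
#include <cstdint>

// Minimal model of V_PERM_B32 for the selector values used above.
uint32_t v_perm_b32(uint32_t src0, uint32_t src1, uint32_t sel) {
  uint64_t data = (static_cast<uint64_t>(src0) << 32) | src1;
  uint32_t dst = 0;
  for (int i = 0; i < 4; ++i) {
    uint32_t s = (sel >> (8 * i)) & 0xff;
    uint8_t byte = (s == 12) ? 0 : static_cast<uint8_t>(data >> (8 * s));
    dst |= static_cast<uint32_t>(byte) << (8 * i);
  }
  return dst;
}

int main() {
  // i32 bswap: selector 0x00010203 reverses the four bytes of src1.
  assert(v_perm_b32(0, 0x11223344u, 0x00010203u) == 0x44332211u);
  // i16 bswap zero-extended: 0x0c0c0001 swaps the low two bytes, zeros the rest.
  assert(v_perm_b32(0, 0x00001122u, 0x0c0c0001u) == 0x00002211u);
  // v2i16 bswap: 0x02030001 swaps bytes within each 16-bit half.
  assert(v_perm_b32(0, 0x11223344u, 0x02030001u) == 0x22114433u);
  return 0;
}
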
-}
-let OtherPredicates = [NoFP32Denormals] in {
+
+// Prefer selecting to max when legal, but using mul is always valid.
+let AddedComplexity = -5 in {
def : GCNPat<
- (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;
def : GCNPat<
- (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0)
+ (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
+>;
+
+def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
-}
-let OtherPredicates = [FP32Denormals] in {
def : GCNPat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
- (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;
-}
-let OtherPredicates = [NoFP64Denormals] in {
def : GCNPat<
- (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
+ (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;
-}
-let OtherPredicates = [FP64Denormals] in {
+// TODO: Handle fneg like other types.
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
+ (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
>;
+} // End AddedComplexity = -5
+
+multiclass SelectCanonicalizeAsMax<
+ list<Predicate> f32_preds = [],
+ list<Predicate> f64_preds = [],
+ list<Predicate> f16_preds = []> {
+ def : GCNPat<
+ (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+ (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = f32_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = f64_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
+ // FIXME: Should have 16-bit inst subtarget predicate
+ let OtherPredicates = f16_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
+ // FIXME: Should have VOP3P subtarget predicate
+ let OtherPredicates = f16_preds;
+ }
}
+// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
+// mode, and would never flush. For f64, it's faster to implement
+// this with a max. For f16/f32 it's a wash, but prefer max when
+// valid.
+//
+// FIXME: Lowering f32/f16 with max is worse since we can use a
+// smaller encoding if the input is fneg'd. It also adds an extra
+// register use.
+let SubtargetPredicate = HasMinMaxDenormModes in {
+ defm : SelectCanonicalizeAsMax<[], [], []>;
+} // End SubtargetPredicate = HasMinMaxDenormModes
+
+let SubtargetPredicate = NotHasMinMaxDenormModes in {
+ // Use the max lowering if we don't need to flush.
+
+  // FIXME: We don't use this for f32 as a workaround for the
+ // library being compiled with the default ieee mode, but
+ // potentially being called from flushing kernels. Really we should
+ // not be mixing code expecting different default FP modes, but mul
+ // works in any FP environment.
+ defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
+} // End SubtargetPredicate = NotHasMinMaxDenormModes
+
+
let OtherPredicates = [HasDLInsts] in {
def : GCNPat <
- (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f32 (VOP3NoMods f32:$src2))),
(V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
+ SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
def : GCNPat <
- (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
->;
-
-// Allow integer inputs
-class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
- (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
- (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
+ SRCMODS.NONE, $src2)
>;
-def : ExpPattern<AMDGPUexport, i32, EXP>;
-def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-
// COPY is a workaround for a tablegen bug with multiple outputs
// arising from S_LSHL_B32's implicit scc def.
def : GCNPat <
@@ -1873,19 +2141,20 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
- timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+ (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
+ (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
def : GCNPat <
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
//===----------------------------------------------------------------------===//
@@ -1901,6 +2170,11 @@ let SubtargetPredicate = isGFX6 in {
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
// Convert floor(x) to (x - fract(x))
+
+// Don't bother handling this for GlobalISel, it's handled during
+// lowering.
+//
+// FIXME: DAG should also custom lower this.
def : GCNPat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
(V_ADD_F64
@@ -1910,13 +2184,11 @@ def : GCNPat <
(V_CNDMASK_B64_PSEUDO
(V_MIN_F64
SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
+ (V_FRACT_F64_e64 $mods, $x),
SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
+ (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
$x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
- DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
>;
} // End SubtargetPredicates = isGFX6
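
A rough host-side model of this GFX6 f64 floor lowering, assuming only what
the comment and pattern above state: fract(x) is clamped to the largest double
below 1.0 (0x3fefffffffffffff), NaN inputs pass through, and
floor(x) = x - fract(x). std::floor is only a stand-in for the hardware
V_FRACT_F64 here.

#include <cmath>
#include <cstdio>

// floor(x) modeled as x - fract(x), mirroring the pattern above.
double gfx6_floor_f64(double x) {
  double fract = x - std::floor(x);                    // stand-in for V_FRACT_F64_e64
  // V_MIN_F64 with 0x3fefffffffffffff, the largest double below 1.0.
  fract = std::fmin(fract, std::nextafter(1.0, 0.0));
  if (std::isnan(x))                                   // V_CMP_CLASS_F64 (NaN) + V_CNDMASK
    fract = x;
  return x - fract;                                    // V_ADD_F64 with fract negated
}

int main() {
  std::printf("%g %g %g\n", gfx6_floor_f64(2.75), gfx6_floor_f64(-1.25),
              gfx6_floor_f64(-3.0));
  return 0;
}
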
@@ -2061,13 +2333,164 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
+
+class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+
+class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
+
+def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+foreach N = 0-3 in {
+def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0);
+ let hasSideEffects = 0;
+}
+}
+
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$oldval);
- let InOperandList = (ins ptype1:$addr, type0:$cmpval_nnenwval);
+ let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+let Namespace = "AMDGPU" in {
+def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+}
+
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 1;
}
+
+def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+
+def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
+ type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
+// a workaround for the intrinsic being defined as readnone, but
+// really needs a memory operand.
+def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+
+ // FIXME: Use separate opcode for atomics.
+ let mayStore = 1;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}