aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/NVPTX/NVPTXInstrInfo.td')
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td2544
1 files changed, 1290 insertions, 1254 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index cd17a9de541a..633a99d0fc1b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -139,6 +139,9 @@ def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
+def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
+def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
+def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
@@ -159,6 +162,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
+def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def True : Predicate<"true">;
def False : Predicate<"false">;
@@ -168,15 +172,14 @@ class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
// Explicit records for arch-accelerated SM versions
def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
+def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">;
+def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">;
+def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">;
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
"&& Subtarget->getPTXVersion() >= 64)">;
-def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">;
-def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">;
-def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">;
-
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">;
@@ -208,39 +211,45 @@ class ValueToRegClass<ValueType T> {
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//
+// Utility class to wrap up information about a register and DAG type for more
+// convenient iteration and parameterization
+class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
+ ValueType Ty = ty;
+ NVPTXRegClass RC = rc;
+ Operand Imm = imm;
+ int Size = ty.Size;
+}
+
+def I16RT : RegTyInfo<i16, Int16Regs, i16imm>;
+def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
+def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
+
// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
-multiclass I3<string OpcStr, SDNode OpNode> {
- def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
- def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
- def i16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
- def i16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
+ foreach t = [I16RT, I32RT, I64RT] in {
+ defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";
+
+ def t.Ty # rr :
+ NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
+ asmstr,
+ [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
+ def t.Ty # ri :
+ NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
+ asmstr,
+ [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
+ if !not(commutative) then
+ def t.Ty # ir :
+ NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
+ asmstr,
+ [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
+ }
}
class I16x2<string OpcStr, SDNode OpNode> :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, "16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
+ [(set v2i16:$dst, (OpNode v2i16:$a, v2i16:$b))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
// Template for instructions which take 3 int args. The instructions are
@@ -250,117 +259,101 @@ multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
def i32rr :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
+ [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
def i32ri :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
!strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
+ [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
def i64rr :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
!strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>,
+ [(set i64:$dst, (OpNode i64:$a, i64:$b))]>,
Requires<[hasPTX<43>]>;
def i64ri :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
!strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>,
+ [(set i64:$dst, (OpNode i64:$a, imm:$b))]>,
Requires<[hasPTX<43>]>;
}
}
-// Template for instructions which take three fp64 or fp32 args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+// Template for minimum/maximum instructions.
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 functions.
-//
-// This multiclass should be used for nodes that cannot be folded into FMAs.
-// For nodes that can be folded into FMAs (i.e. adds and muls), use
-// F3_fma_component.
-multiclass F3<string OpcStr, SDNode OpNode> {
+multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
+ if !not(NaN) then {
def f64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ [(set f64:$dst, (OpNode f64:$a, f64:$b))]>;
def f64ri :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>;
+ }
def f32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
Requires<[doF32FTZ]>;
def f32ri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[doF32FTZ]>;
def f32rr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ [(set f32:$dst, (OpNode f32:$a, f32:$b))]>;
def f32ri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>;
def f16rr_ftz :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
+ [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, doF32FTZ]>;
def f16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
- Requires<[useFP16Math]>;
+ [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
def f16x2rr_ftz :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
- Requires<[useFP16Math, doF32FTZ]>;
+ [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
def f16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
- Requires<[useFP16Math]>;
- def bf16rr_ftz :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
- Requires<[hasBF16Math, doF32FTZ]>;
+ [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
def bf16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
- Requires<[hasBF16Math]>;
-
- def bf16x2rr_ftz :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
- Requires<[hasBF16Math, doF32FTZ]>;
+ [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
def bf16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
- Requires<[hasBF16Math]>;
+ [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
// Template for instructions which take three FP args. The
@@ -377,173 +370,161 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
Requires<[allowFMA]>;
def f64ri :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
Requires<[allowFMA]>;
def f32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
Requires<[allowFMA, doF32FTZ]>;
def f32ri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[allowFMA, doF32FTZ]>;
def f32rr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
Requires<[allowFMA]>;
def f32ri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[allowFMA]>;
def f16rr_ftz :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
+ [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, allowFMA, doF32FTZ]>;
def f16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
+ [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, allowFMA]>;
def f16x2rr_ftz :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
- [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
+ [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, allowFMA, doF32FTZ]>;
def f16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
+ [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, allowFMA]>;
- def bf16rr_ftz :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
- Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
def bf16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
+ [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
Requires<[hasBF16Math, allowFMA]>;
- def bf16x2rr_ftz :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
- [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
- Requires<[hasBF16Math, allowFMA, doF32FTZ]>;
def bf16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
+ [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
Requires<[hasBF16Math, allowFMA]>;
// These have strange names so we don't perturb existing mir tests.
def _rnf64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
Requires<[noFMA]>;
def _rnf64ri :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
Requires<[noFMA]>;
def _rnf32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, Float32Regs:$b))]>,
Requires<[noFMA, doF32FTZ]>;
def _rnf32ri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[noFMA, doF32FTZ]>;
def _rnf32rr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
Requires<[noFMA]>;
def _rnf32ri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[noFMA]>;
def _rnf16rr_ftz :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
+ [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, noFMA, doF32FTZ]>;
def _rnf16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
+ [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, noFMA]>;
def _rnf16x2rr_ftz :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
+ [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, noFMA, doF32FTZ]>;
def _rnf16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
+ [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, noFMA]>;
def _rnbf16rr_ftz :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
+ [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
Requires<[hasBF16Math, noFMA, doF32FTZ]>;
def _rnbf16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
+ [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
Requires<[hasBF16Math, noFMA]>;
def _rnbf16x2rr_ftz :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
+ [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
Requires<[hasBF16Math, noFMA, doF32FTZ]>;
def _rnbf16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
+ [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
Requires<[hasBF16Math, noFMA]>;
}
@@ -553,44 +534,56 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
multiclass F2<string OpcStr, SDNode OpNode> {
def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
!strconcat(OpcStr, ".f64 \t$dst, $a;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ [(set f64:$dst, (OpNode f64:$a))]>;
def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ [(set f32:$dst, (OpNode f32:$a))]>,
Requires<[doF32FTZ]>;
def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
!strconcat(OpcStr, ".f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+ [(set f32:$dst, (OpNode f32:$a))]>;
}
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
def bf16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
!strconcat(OpcStr, ".bf16 \t$dst, $a;"),
- [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>,
+ [(set bf16:$dst, (OpNode bf16:$a))]>,
Requires<[hasSM<80>, hasPTX<70>]>;
def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
!strconcat(OpcStr, ".bf16x2 \t$dst, $a;"),
- [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>,
+ [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
Requires<[hasSM<80>, hasPTX<70>]>;
def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
!strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
+ [(set f16:$dst, (OpNode f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
+ [(set v2f16:$dst, (OpNode v2f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
def f16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
!strconcat(OpcStr, ".f16 \t$dst, $a;"),
- [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>,
+ [(set f16:$dst, (OpNode f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>]>;
def f16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
!strconcat(OpcStr, ".f16x2 \t$dst, $a;"),
- [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>,
+ [(set v2f16:$dst, (OpNode v2f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>]>;
}
+// Variant where only .ftz.bf16 is supported.
+multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> {
+ def bf16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
+ OpcStr # ".ftz.bf16 \t$dst, $a;",
+ [(set bf16:$dst, (OpNode bf16:$a))]>,
+ Requires<[hasSM<90>, hasPTX<78>]>;
+ def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ OpcStr # ".ftz.bf16x2 \t$dst, $a;",
+ [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
+ Requires<[hasSM<90>, hasPTX<78>]>;
+}
+
//===----------------------------------------------------------------------===//
// NVPTX Instructions.
//===----------------------------------------------------------------------===//
@@ -725,8 +718,73 @@ let hasSideEffects = false in {
defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
+
+ // FP8 conversions.
+ multiclass CVT_TO_F8X2<string F8Name> {
+ def _f32 :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
+ !strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+ F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>,
+ Requires<[hasPTX<81>, hasSM<89>]>;
+ def _f16x2 :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+ F8Name, "x2.f16x2 \t$dst, $src;"), []>,
+ Requires<[hasPTX<81>, hasSM<89>]>;
+ }
+
+ defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
+ defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">;
+
+ class CVT_f16x2_fp8<string F8Name> :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:relu}.f16x2.",
+ F8Name, "x2 \t$dst, $src;"), []>,
+ Requires<[hasPTX<81>, hasSM<89>]>;
+
+ def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">;
+ def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
+
+ // Float to TF32 conversions
+ multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> {
+ defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier));
+
+ def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src),
+ "cvt." # Modifier # ".tf32.f32 \t$dst, $src;",
+ [(set i32:$dst, (Intr f32:$src))]>,
+ Requires<Preds>;
+ }
+
+ defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">;
+ defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">;
+ defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">;
+ defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">;
+ defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>;
+ defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>;
+
+ defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>;
+ defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>;
+ defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
+ defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
}
+def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
+ return N->hasOneUse();
+}]>;
+
+def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse f32:$lo)),
+ (bf16 (fpround_oneuse f32:$hi)))),
+ (CVT_bf16x2_f32 $hi, $lo, CvtRN)>,
+ Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>;
+
+def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)),
+ (f16 (fpround_oneuse f32:$hi)))),
+ (CVT_f16x2_f32 $hi, $lo, CvtRN)>,
+ Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>;
+
//-----------------------------------
// Selection instructions (selp)
//-----------------------------------
@@ -757,22 +815,22 @@ let hasSideEffects = false in {
NVPTXInst<(outs RC:$dst),
(ins RC:$a, RC:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
+ [(set T:$dst, (select i1:$p, T:$a, T:$b))]>;
def ri :
NVPTXInst<(outs RC:$dst),
(ins RC:$a, ImmCls:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
+ [(set T:$dst, (select i1:$p, T:$a, (T ImmNode:$b)))]>;
def ir :
NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, RC:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
+ [(set T:$dst, (select i1:$p, ImmNode:$a, T:$b))]>;
def ii :
NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
- [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+ [(set T:$dst, (select i1:$p, ImmNode:$a, ImmNode:$b))]>;
}
}
@@ -798,8 +856,8 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
-def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
- (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
+def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
+ (SELP_b32rr $a, $b, $p)>;
}
//-----------------------------------
@@ -827,10 +885,10 @@ def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
multiclass ADD_SUB_i1<SDNode OpNode> {
def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
"xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
"xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+ [(set i1:$dst, (OpNode i1:$a, (imm):$b))]>;
}
// int1 addition and subtraction are both just xor.
@@ -839,8 +897,8 @@ defm SUB_i1 : ADD_SUB_i1<sub>;
// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
-defm ADD : I3<"add.s", add>;
-defm SUB : I3<"sub.s", sub>;
+defm ADD : I3<"add.s", add, /*commutative=*/ true>;
+defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;
def ADD16x2 : I16x2<"add.s", add>;
@@ -852,18 +910,18 @@ defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
-defm MULT : I3<"mul.lo.s", mul>;
+defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;
-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
+defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
+defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;
-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
+defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
+defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;
// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
-defm SREM : I3<"rem.s", srem>;
-defm UREM : I3<"rem.u", urem>;
+defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
+defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;
// Integer absolute value. NumBits should be one minus the bit width of RC.
// This idiom implements the algorithm at
@@ -871,17 +929,17 @@ defm UREM : I3<"rem.u", urem>;
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
!strconcat("abs", SizeName, " \t$dst, $a;"),
- [(set (T RC:$dst), (abs (T RC:$a)))]>;
+ [(set T:$dst, (abs T:$a))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
// Integer min/max.
-defm SMAX : I3<"max.s", smax>;
-defm UMAX : I3<"max.u", umax>;
-defm SMIN : I3<"min.s", smin>;
-defm UMIN : I3<"min.u", umin>;
+defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
+defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
+defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
+defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;
def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
@@ -938,29 +996,29 @@ def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
// Matchers for signed, unsigned mul.wide ISD nodes.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
- (MULWIDES32 i16:$a, i16:$b)>,
+ (MULWIDES32 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
- (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
+ (MULWIDES32Imm $a, imm:$b)>,
Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ (MULWIDEU32 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
- (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
+ (MULWIDEU32Imm $a, imm:$b)>,
Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ (MULWIDES64 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)),
- (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
+ (MULWIDES64Imm $a, imm:$b)>,
Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ (MULWIDEU64 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)),
- (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
+ (MULWIDEU64Imm $a, imm:$b)>,
Requires<[doMulWide]>;
// Predicates used for converting some patterns to mul.wide.
@@ -1009,132 +1067,98 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
}]>;
// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)),
- (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
+ (MULWIDES64Imm $a, (SHL2MUL32 $b))>,
Requires<[doMulWide]>;
-def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)),
- (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
+ (MULWIDEU64Imm $a, (SHL2MUL32 $b))>,
Requires<[doMulWide]>;
-def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)),
- (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
+ (MULWIDES32Imm $a, (SHL2MUL16 $b))>,
Requires<[doMulWide]>;
-def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)),
- (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
+ (MULWIDEU32Imm $a, (SHL2MUL16 $b))>,
Requires<[doMulWide]>;
// Convert "sign/zero-extend then multiply" to mul.wide.
-def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+def : Pat<(mul (sext i32:$a), (sext i32:$b)),
+ (MULWIDES64 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
+ (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>,
Requires<[doMulWide]>;
-def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+def : Pat<(mul (zext i32:$a), (zext i32:$b)),
+ (MULWIDEU64 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
+ (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>,
Requires<[doMulWide]>;
-def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+def : Pat<(mul (sext i16:$a), (sext i16:$b)),
+ (MULWIDES32 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
+ (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>,
Requires<[doMulWide]>;
-def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+def : Pat<(mul (zext i16:$a), (zext i16:$b)),
+ (MULWIDEU32 $a, $b)>,
Requires<[doMulWide]>;
-def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
+ (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>,
Requires<[doMulWide]>;
//
// Integer multiply-add
//
-def SDTIMAD :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
- SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
-def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
-
-def MAD16rrr :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
-def MAD16rri :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
-def MAD16rir :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
-def MAD16rii :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD32rrr :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
-def MAD32rri :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>;
-def MAD32rir :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>;
-def MAD32rii :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>;
-
-def MAD64rrr :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
-def MAD64rri :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
-def MAD64rir :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
-def MAD64rii :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{
+ return N->hasOneUse();
+}]>;
+
+multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> {
+ def rrr:
+ NVPTXInst<(outs Reg:$dst),
+ (ins Reg:$a, Reg:$b, Reg:$c),
+ Ptx # " \t$dst, $a, $b, $c;",
+ [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>;
+
+ def rir:
+ NVPTXInst<(outs Reg:$dst),
+ (ins Reg:$a, Imm:$b, Reg:$c),
+ Ptx # " \t$dst, $a, $b, $c;",
+ [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>;
+ def rri:
+ NVPTXInst<(outs Reg:$dst),
+ (ins Reg:$a, Reg:$b, Imm:$c),
+ Ptx # " \t$dst, $a, $b, $c;",
+ [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>;
+ def rii:
+ NVPTXInst<(outs Reg:$dst),
+ (ins Reg:$a, Imm:$b, Imm:$c),
+ Ptx # " \t$dst, $a, $b, $c;",
+ [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>;
+}
+
+let Predicates = [hasOptEnabled] in {
+defm MAD16 : MAD<"mad.lo.s16", i16, Int16Regs, i16imm>;
+defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>;
+defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
+}
def INEG16 :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"neg.s16 \t$dst, $src;",
- [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+ [(set i16:$dst, (ineg i16:$src))]>;
def INEG32 :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
"neg.s32 \t$dst, $src;",
- [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>;
+ [(set i32:$dst, (ineg i32:$src))]>;
def INEG64 :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
"neg.s64 \t$dst, $src;",
- [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+ [(set i64:$dst, (ineg i64:$src))]>;
//-----------------------------------
// Floating Point Arithmetic
@@ -1163,26 +1187,14 @@ def NegDoubleConst : SDNodeXForm<fpimm, [{
SDLoc(N), MVT::f64);
}]>;
-// Loads FP16 constant into a register.
-//
-// ptxas does not have hex representation for fp16, so we can't use
-// fp16 immediate values in .f16 instructions. Instead we have to load
-// the constant into a register using mov.b16.
-def LOAD_CONST_F16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
- "mov.b16 \t$dst, $a;", []>;
-def LOAD_CONST_BF16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
- "mov.b16 \t$dst, $a;", []>;
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;
-defm FMIN : F3<"min", fminnum>;
-defm FMAX : F3<"max", fmaxnum>;
-// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
-defm FMINNAN : F3<"min.NaN", fminimum>;
-defm FMAXNAN : F3<"max.NaN", fmaximum>;
+defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
+defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
+defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
+defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
@@ -1191,13 +1203,15 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
+defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>;
+
//
// F16 NEG
//
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
NVPTXInst<(outs RC:$dst), (ins RC:$src),
!strconcat(OpcStr, " \t$dst, $src;"),
- [(set RC:$dst, (fneg (T RC:$src)))]>,
+ [(set T:$dst, (fneg T:$src))]>,
Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
@@ -1211,7 +1225,7 @@ def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
NVPTXInst<(outs RC:$dst), (ins RC:$src),
!strconcat(OpcStr, " \t$dst, $src;"),
- [(set RC:$dst, (fneg (T RC:$src)))]>,
+ [(set T:$dst, (fneg T:$src))]>,
Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
@@ -1225,22 +1239,22 @@ def FDIV641r :
NVPTXInst<(outs Float64Regs:$dst),
(ins f64imm:$a, Float64Regs:$b),
"rcp.rn.f64 \t$dst, $b;",
- [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+ [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>;
def FDIV64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
"div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+ [(set f64:$dst, (fdiv f64:$a, f64:$b))]>;
def FDIV64ri :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
"div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
+ [(set f64:$dst, (fdiv f64:$a, fpimm:$b))]>;
// fdiv will be converted to rcp
// fneg (fdiv 1.0, X) => fneg (rcp.rn X)
-def : Pat<(fdiv DoubleConstNeg1:$a, Float64Regs:$b),
- (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>;
+def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
+ (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;
//
// F32 Approximate reciprocal
@@ -1249,13 +1263,13 @@ def FDIV321r_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV321r :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
@@ -1264,25 +1278,25 @@ def FDIV32approxrr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxrr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
Requires<[do_DIVF32_APPROX]>;
def FDIV32approxri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
@@ -1293,13 +1307,13 @@ def FDIV321r_approx_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV321r_approx :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
@@ -1308,25 +1322,25 @@ def FDIV32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv Float32Regs:$a, f32:$b))]>,
Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32ri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32rr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
Requires<[do_DIVF32_FULL]>;
def FDIV32ri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
@@ -1335,13 +1349,13 @@ def FDIV321r_prec_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.rn.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
Requires<[doF32FTZ]>;
def FDIV321r_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins f32imm:$a, Float32Regs:$b),
"rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
+ [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>;
//
// F32 Accurate division
//
@@ -1349,62 +1363,69 @@ def FDIV32rr_prec_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
Requires<[doF32FTZ]>;
def FDIV32rr_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
"div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
+ [(set f32:$dst, (fdiv f32:$a, f32:$b))]>;
def FDIV32ri_prec :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
"div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
+ [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>;
//
// FMA
//
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
- def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, ImmCls:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, ImmCls:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
+ defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;";
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ asmstr,
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, ImmCls:$c),
+ asmstr,
+ [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, RC:$c),
+ asmstr,
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, ImmCls:$c),
+ asmstr,
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def iir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, RC:$c),
+ asmstr,
+ [(set RC:$dst, (fma fpimm:$a, fpimm:$b, RC:$c))]>,
+ Requires<[Pred]>;
+
}
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
+ [(set T:$dst, (fma T:$a, T:$b, T:$c))]>,
Requires<[useFP16Math, Pred]>;
}
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
+ [(set T:$dst, (fma T:$a, T:$b, T:$c))]>,
Requires<[hasBF16Math, Pred]>;
}
@@ -1412,9 +1433,7 @@ defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
-defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
-defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
@@ -1423,11 +1442,11 @@ defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
+ [(set f32:$dst, (fsin f32:$src))]>,
Requires<[allowUnsafeFPMath]>;
def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"cos.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
+ [(set f32:$dst, (fcos f32:$src))]>,
Requires<[allowUnsafeFPMath]>;
// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
@@ -1435,84 +1454,84 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
// semantics of LLVM's frem.
// frem - f32 FTZ
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
- (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
- Float32Regs:$y))>,
+def : Pat<(frem f32:$x, f32:$y),
+ (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
+ (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
+ $y))>,
Requires<[doF32FTZ, allowUnsafeFPMath]>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
- (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+def : Pat<(frem f32:$x, fpimm:$y),
+ (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
+ (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
fpimm:$y))>,
Requires<[doF32FTZ, allowUnsafeFPMath]>;
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (SELP_f32rr Float32Regs:$x,
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
- (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
- Float32Regs:$y)),
- (TESTINF_f32r Float32Regs:$y))>,
+def : Pat<(frem f32:$x, f32:$y),
+ (SELP_f32rr $x,
+ (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
+ (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
+ $y)),
+ (TESTINF_f32r $y))>,
Requires<[doF32FTZ, noUnsafeFPMath]>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (SELP_f32rr Float32Regs:$x,
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
- (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+def : Pat<(frem f32:$x, fpimm:$y),
+ (SELP_f32rr $x,
+ (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
+ (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
fpimm:$y)),
(TESTINF_f32i fpimm:$y))>,
Requires<[doF32FTZ, noUnsafeFPMath]>;
// frem - f32
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
- (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
- Float32Regs:$y))>,
+def : Pat<(frem f32:$x, f32:$y),
+ (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
+ (FDIV32rr_prec $x, $y), CvtRZI),
+ $y))>,
Requires<[allowUnsafeFPMath]>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
- (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+def : Pat<(frem f32:$x, fpimm:$y),
+ (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
+ (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
fpimm:$y))>,
Requires<[allowUnsafeFPMath]>;
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (SELP_f32rr Float32Regs:$x,
- (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
- (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
- Float32Regs:$y)),
+def : Pat<(frem f32:$x, f32:$y),
+ (SELP_f32rr $x,
+ (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
+ (FDIV32rr_prec $x, $y), CvtRZI),
+ $y)),
(TESTINF_f32r Float32Regs:$y))>,
Requires<[noUnsafeFPMath]>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (SELP_f32rr Float32Regs:$x,
- (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
- (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+def : Pat<(frem f32:$x, fpimm:$y),
+ (SELP_f32rr $x,
+ (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
+ (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
fpimm:$y)),
(TESTINF_f32i fpimm:$y))>,
Requires<[noUnsafeFPMath]>;
// frem - f64
-def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
- (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
- (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
- Float64Regs:$y))>,
+def : Pat<(frem f64:$x, f64:$y),
+ (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
+ (FDIV64rr $x, $y), CvtRZI),
+ $y))>,
Requires<[allowUnsafeFPMath]>;
-def : Pat<(frem Float64Regs:$x, fpimm:$y),
- (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
- (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+def : Pat<(frem f64:$x, fpimm:$y),
+ (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
+ (FDIV64ri $x, fpimm:$y), CvtRZI),
fpimm:$y))>,
Requires<[allowUnsafeFPMath]>;
-def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
- (SELP_f64rr Float64Regs:$x,
- (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
- (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
- Float64Regs:$y)),
+def : Pat<(frem f64:$x, f64:$y),
+ (SELP_f64rr $x,
+ (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
+ (FDIV64rr $x, $y), CvtRZI),
+ $y)),
(TESTINF_f64r Float64Regs:$y))>,
Requires<[noUnsafeFPMath]>;
-def : Pat<(frem Float64Regs:$x, fpimm:$y),
- (SELP_f64rr Float64Regs:$x,
- (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
- (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+def : Pat<(frem f64:$x, fpimm:$y),
+ (SELP_f64rr $x,
+ (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
+ (FDIV64ri $x, fpimm:$y), CvtRZI),
fpimm:$y)),
- (TESTINF_f64r Float64Regs:$y))>,
+ (TESTINF_f64r $y))>,
Requires<[noUnsafeFPMath]>;
//-----------------------------------
@@ -1525,35 +1544,35 @@ multiclass BITWISE<string OpcStr, SDNode OpNode> {
def b1rr :
NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
!strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
def b1ri :
NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
!strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ [(set i1:$dst, (OpNode i1:$a, imm:$b))]>;
def b16rr :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ [(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
def b16ri :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
!strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ [(set i16:$dst, (OpNode i16:$a, imm:$b))]>;
def b32rr :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
+ [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
def b32ri :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>;
+ [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
def b64rr :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
!strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ [(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
def b64ri :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
!strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ [(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
}
defm OR : BITWISE<"or", or>;
@@ -1561,46 +1580,46 @@ defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
// PTX does not support mul on predicates, convert to and instructions
-def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
+def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>;
+def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
// These transformations were once reliably performed by instcombine, but thanks
// to poison semantics they are no longer safe for LLVM IR, perform them here
// instead.
-def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
+def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
- def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
- (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
- def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
- (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
- def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
- (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+ def: Pat<(or vt:$a, vt:$b),
+ (ORb32rr $a, $b)>;
+ def: Pat<(xor vt:$a, vt:$b),
+ (XORb32rr $a, $b)>;
+ def: Pat<(and vt:$a, vt:$b),
+ (ANDb32rr $a, $b)>;
// The constants get legalized into a bitcast from i32, so that's what we need
// to match here.
- def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
- (ORb32ri Int32Regs:$a, imm:$b)>;
- def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
- (XORb32ri Int32Regs:$a, imm:$b)>;
- def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
- (ANDb32ri Int32Regs:$a, imm:$b)>;
+ def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))),
+ (ORb32ri $a, imm:$b)>;
+ def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))),
+ (XORb32ri $a, imm:$b)>;
+ def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))),
+ (ANDb32ri $a, imm:$b)>;
}
def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
"not.pred \t$dst, $src;",
- [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+ [(set i1:$dst, (not i1:$src))]>;
def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"not.b16 \t$dst, $src;",
- [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+ [(set i16:$dst, (not i16:$src))]>;
def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
"not.b32 \t$dst, $src;",
- [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>;
+ [(set i32:$dst, (not i32:$src))]>;
def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
"not.b64 \t$dst, $src;",
- [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+ [(set i64:$dst, (not i64:$src))]>;
// Template for left/right shifts. Takes three operands,
// [dest (reg), src (reg), shift (reg or imm)].
@@ -1611,31 +1630,31 @@ multiclass SHIFT<string OpcStr, SDNode OpNode> {
def i64rr :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>;
+ [(set i64:$dst, (OpNode i64:$a, i32:$b))]>;
def i64ri :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
!strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+ [(set i64:$dst, (OpNode i64:$a, (i32 imm:$b)))]>;
def i32rr :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>;
+ [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
def i32ri :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
!strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>;
+ [(set i32:$dst, (OpNode i32:$a, (i32 imm:$b)))]>;
def i32ii :
NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
!strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+ [(set i32:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
def i16rr :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>;
+ [(set i16:$dst, (OpNode i16:$a, i32:$b))]>;
def i16ri :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
!strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
+ [(set i16:$dst, (OpNode i16:$a, (i32 imm:$b)))]>;
}
defm SHL : SHIFT<"shl.b", shl>;
@@ -1646,173 +1665,12 @@ defm SRL : SHIFT<"shr.u", srl>;
def BREV32 :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
"brev.b32 \t$dst, $a;",
- [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>;
+ [(set i32:$dst, (bitreverse i32:$a))]>;
def BREV64 :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
"brev.b64 \t$dst, $a;",
- [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
+ [(set i64:$dst, (bitreverse i64:$a))]>;
-//
-// Rotate: Use ptx shf instruction if available.
-//
-
-// 32 bit r2 = rotl r1, n
-// =>
-// r2 = shf.l r1, r1, n
-def ROTL32imm_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTL32reg_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-// 32 bit r2 = rotr r1, n
-// =>
-// r2 = shf.r r1, r1, n
-def ROTR32imm_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTR32reg_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
-def ROT32imm_sw :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- "shl.b32 \t%lhs, $src, $amt1;\n\t"
- "shr.b32 \t%rhs, $src, $amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- []>;
-
-def SUB_FRM_32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
- Requires<[noHWROT32]>;
-def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
- Requires<[noHWROT32]>;
-
-// 32-bit software rotate left by register.
-def ROTL32reg_sw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- ".reg .b32 %amt2;\n\t"
- "shl.b32 \t%lhs, $src, $amt;\n\t"
- "sub.s32 \t%amt2, 32, $amt;\n\t"
- "shr.b32 \t%rhs, $src, %amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
- Requires<[noHWROT32]>;
-
-// 32-bit software rotate right by register.
-def ROTR32reg_sw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- ".reg .b32 %amt2;\n\t"
- "shr.b32 \t%lhs, $src, $amt;\n\t"
- "sub.s32 \t%amt2, 32, $amt;\n\t"
- "shl.b32 \t%rhs, $src, %amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
- Requires<[noHWROT32]>;
-
-// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
-def ROT64imm_sw :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- "shl.b64 \t%lhs, $src, $amt1;\n\t"
- "shr.b64 \t%rhs, $src, $amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- []>;
-
-def SUB_FRM_64 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
- (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
-def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
- (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
-
-// 64-bit software rotate left by register.
-def ROTL64reg_sw :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- ".reg .u32 %amt2;\n\t"
- "and.b32 \t%amt2, $amt, 63;\n\t"
- "shl.b64 \t%lhs, $src, %amt2;\n\t"
- "sub.u32 \t%amt2, 64, %amt2;\n\t"
- "shr.b64 \t%rhs, $src, %amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
-
-def ROTR64reg_sw :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- ".reg .u32 %amt2;\n\t"
- "and.b32 \t%amt2, $amt, 63;\n\t"
- "shr.b64 \t%lhs, $src, %amt2;\n\t"
- "sub.u32 \t%amt2, 64, %amt2;\n\t"
- "shl.b64 \t%rhs, $src, %amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
-
-//
-// Funnnel shift in clamp mode
-//
-
-// Create SDNodes so they can be used in the DAG code, e.g.
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-def FUNSHFLCLAMP :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
-
-def FUNSHFRCLAMP :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
//
// BFE - bit-field extract
@@ -1844,17 +1702,17 @@ multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
!strconcat(Instr, " \t$d, $a, $b, $c;"),
- [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
+ [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
def rri
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, i32imm:$c),
!strconcat(Instr, " \t$d, $a, $b, $c;"),
- [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
+ [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
def rii
: NVPTXInst<(outs RC:$d),
(ins RC:$a, i32imm:$b, i32imm:$c),
!strconcat(Instr, " \t$d, $a, $b, $c;"),
- [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
+ [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
}
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
@@ -1862,32 +1720,36 @@ multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
: NVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
!strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
- [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
+ [(set T:$f, (bfi T:$a, T:$b, i32:$c, i32:$d))]>;
def rrri
: NVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
!strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
- [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
+ [(set T:$f, (bfi T:$a, T:$b, i32:$c, imm:$d))]>;
def rrii
: NVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
!strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
- [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
+ [(set T:$f, (bfi T:$a, T:$b, imm:$c, imm:$d))]>;
def irrr
: NVPTXInst<(outs RC:$f),
(ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
!strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
- [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
+ [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, i32:$d))]>;
def irri
: NVPTXInst<(outs RC:$f),
(ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
!strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
- [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
+ [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, imm:$d))]>;
def irii
: NVPTXInst<(outs RC:$f),
(ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
!strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
- [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
+ [(set T:$f, (bfi (T imm:$a), T:$b, imm:$c, imm:$d))]>;
+}
+
+def Hexu32imm : Operand<i32> {
+ let PrintMethod = "printHexu32imm";
}
multiclass PRMT<ValueType T, RegisterClass RC> {
@@ -1895,17 +1757,17 @@ multiclass PRMT<ValueType T, RegisterClass RC> {
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
!strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
- [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
+ [(set T:$d, (prmt T:$a, T:$b, i32:$c, imm:$mode))]>;
def rri
: NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
+ (ins RC:$a, Int32Regs:$b, Hexu32imm:$c, PrmtMode:$mode),
!strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
- [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
+ [(set T:$d, (prmt T:$a, T:$b, imm:$c, imm:$mode))]>;
def rii
: NVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
+ (ins RC:$a, i32imm:$b, Hexu32imm:$c, PrmtMode:$mode),
!strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
- [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
+ [(set T:$d, (prmt T:$a, imm:$b, imm:$c, imm:$mode))]>;
}
let hasSideEffects = false in {
@@ -1926,35 +1788,35 @@ let hasSideEffects = false in {
// byte extraction + signed/unsigned extension to i32.
-def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
- (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
-def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
- (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
-def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
- (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
-def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
- (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;
+def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
+ (BFE_S32rri $s, $o, 8)>;
+def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
+ (BFE_S32rii $s, imm:$o, 8)>;
+def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
+ (BFE_U32rri $s, $o, 8)>;
+def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
+ (BFE_U32rii $s, imm:$o, 8)>;
// byte extraction + signed extension to i16
-def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
- (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
+ (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
// Byte extraction via shift/trunc/sext
-def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
- (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
-def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
- (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
-def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
- (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
-def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
- (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
-def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
- (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
-def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
- (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
-def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
- (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
+ (CVT_s8_s32 $s, CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)),
+ (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
+def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8),
+ (BFE_S32rii $s, imm:$o, 8)>;
+def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
+ (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
+def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8),
+ (BFE_S64rii $s, imm:$o, 8)>;
+def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
+ (CVT_s8_s64 $s, CvtNONE)>;
+def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
+ (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
//-----------------------------------
// Comparison instructions (setp, set)
@@ -2053,10 +1915,10 @@ defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
// Data Movement (Load / Store, Move)
//-----------------------------------
-def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
- [SDNPWantRoot]>;
-def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
- [SDNPWantRoot]>;
+let WantsRoot = true in {
+ def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex]>;
+ def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex]>;
+}
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
def MEMri : Operand<i32> {
@@ -2072,7 +1934,7 @@ def imem : Operand<iPTR> {
let PrintMethod = "printOperand";
}
-def imemAny : Operand<iPTRAny> {
+def imemAny : Operand<pAny> {
let PrintMethod = "printOperand";
}
@@ -2084,16 +1946,20 @@ def MmaCode : Operand<i32> {
let PrintMethod = "printMmaCode";
}
+def Offseti32imm : Operand<i32> {
+ let PrintMethod = "printOffseti32imm";
+}
+
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
// Load a memory address into a u32 or u64 register.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
"mov.u32 \t$dst, $a;",
- [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+ [(set i32:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
"mov.u64 \t$dst, $a;",
- [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+ [(set i64:$dst, (Wrapper tglobaladdr:$a))]>;
// Get pointer to local stack.
let hasSideEffects = false in {
@@ -2105,7 +1971,7 @@ let hasSideEffects = false in {
// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
-let IsSimpleMove=1, hasSideEffects=0 in {
+let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in {
def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
"mov.pred \t$dst, $sss;", []>;
def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
@@ -2117,48 +1983,37 @@ let IsSimpleMove=1, hasSideEffects=0 in {
def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
"mov.b128 \t$dst, $sss;", []>;
- def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
- "mov.b16 \t$dst, $sss;", []>;
- def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
- "mov.b32 \t$dst, $sss;", []>;
- def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
- "mov.b64 \t$dst, $sss;", []>;
-
- def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- // We have to use .b16 here as there's no mov.f16.
- "mov.b16 \t$dst, $src;", []>;
def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"mov.f32 \t$dst, $src;", []>;
def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
"mov.f64 \t$dst, $src;", []>;
-}
-def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
- "mov.pred \t$dst, $src;",
- [(set Int1Regs:$dst, imm:$src)]>;
-def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
- "mov.u16 \t$dst, $src;",
- [(set Int16Regs:$dst, imm:$src)]>;
-def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
- "mov.u32 \t$dst, $src;",
- [(set (i32 Int32Regs:$dst), imm:$src)]>;
-def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
- "mov.u64 \t$dst, $src;",
- [(set Int64Regs:$dst, imm:$src)]>;
-
-def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
- "mov.b16 \t$dst, $src;", []>;
-def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
- "mov.b32 \t$dst, $src;", []>;
-def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
- "mov.b64 \t$dst, $src;", []>;
-
-def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
- "mov.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, fpimm:$src)]>;
-def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
- "mov.f64 \t$dst, $src;",
- [(set Float64Regs:$dst, fpimm:$src)]>;
+ def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+ "mov.pred \t$dst, $src;",
+ [(set i1:$dst, imm:$src)]>;
+ def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+ "mov.b16 \t$dst, $src;",
+ [(set i16:$dst, imm:$src)]>;
+ def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+ "mov.b32 \t$dst, $src;",
+ [(set i32:$dst, imm:$src)]>;
+ def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+ "mov.b64 \t$dst, $src;",
+ [(set i64:$dst, imm:$src)]>;
+
+ def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src),
+ "mov.b16 \t$dst, $src;",
+ [(set f16:$dst, fpimm:$src)]>;
+ def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src),
+ "mov.b16 \t$dst, $src;",
+ [(set bf16:$dst, fpimm:$src)]>;
+ def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+ "mov.f32 \t$dst, $src;",
+ [(set f32:$dst, fpimm:$src)]>;
+ def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+ "mov.f64 \t$dst, $src;",
+ [(set f64:$dst, fpimm:$src)]>;
+}
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
@@ -2166,10 +2021,10 @@ def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
//---- Copy Frame Index ----
def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
"add.u32 \t$dst, ${addr:add};",
- [(set Int32Regs:$dst, ADDRri:$addr)]>;
+ [(set i32:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
"add.u64 \t$dst, ${addr:add};",
- [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+ [(set i64:$dst, ADDRri64:$addr)]>;
//-----------------------------------
// Comparison and Selection
@@ -2196,47 +2051,47 @@ multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
Instruction set_64ir> {
// i16 -> pred
def : Pat<(i1 (OpNode i16:$a, i16:$b)),
- (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
- (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
- (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+ (setp_16rr $a, $b, Mode)>;
+ def : Pat<(i1 (OpNode i16:$a, imm:$b)),
+ (setp_16ri $a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, i16:$b)),
+ (setp_16ir imm:$a, $b, Mode)>;
// i32 -> pred
def : Pat<(i1 (OpNode i32:$a, i32:$b)),
- (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)),
- (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))),
- (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+ (setp_32rr $a, $b, Mode)>;
+ def : Pat<(i1 (OpNode i32:$a, imm:$b)),
+ (setp_32ri $a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, i32:$b)),
+ (setp_32ir imm:$a, $b, Mode)>;
// i64 -> pred
- def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
- (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
- (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
- (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode i64:$a, i64:$b)),
+ (setp_64rr $a, $b, Mode)>;
+ def : Pat<(i1 (OpNode i64:$a, imm:$b)),
+ (setp_64ri $a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, i64:$b)),
+ (setp_64ir imm:$a, $b, Mode)>;
// i16 -> i32
def : Pat<(i32 (OpNode i16:$a, i16:$b)),
- (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
- (set_16ri Int16Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
- (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+ (set_16rr $a, $b, Mode)>;
+ def : Pat<(i32 (OpNode i16:$a, imm:$b)),
+ (set_16ri $a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, i16:$b)),
+ (set_16ir imm:$a, $b, Mode)>;
// i32 -> i32
def : Pat<(i32 (OpNode i32:$a, i32:$b)),
- (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)),
- (set_32ri Int32Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))),
- (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+ (set_32rr $a, $b, Mode)>;
+ def : Pat<(i32 (OpNode i32:$a, imm:$b)),
+ (set_32ri $a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, i32:$b)),
+ (set_32ir imm:$a, $b, Mode)>;
// i64 -> i32
- def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
- (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
- (set_64ri Int64Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
- (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode i64:$a, Int64Regs:$b)),
+ (set_64rr $a, $b, Mode)>;
+ def : Pat<(i32 (OpNode i64:$a, imm:$b)),
+ (set_64ri $a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, i64:$b)),
+ (set_64ir imm:$a, $b, Mode)>;
}
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
@@ -2276,17 +2131,6 @@ defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
-// i1 compares
-def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
- (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
- (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
-
-def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
- (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
- (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-
// comparisons of i8 extracted with BFE as i32
// It's faster to do comparison directly on i32 extracted by BFE,
// instead of the long conversion and sign extending.
@@ -2353,143 +2197,95 @@ def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
(SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>;
// i1 compare -> i32
-def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
- (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
- (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (setne i1:$a, i1:$b)),
+ (SELP_u32ii -1, 0, (XORb1rr $a, $b))>;
+def : Pat<(i32 (setne i1:$a, i1:$b)),
+ (SELP_u32ii 0, -1, (XORb1rr $a, $b))>;
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
// f16 -> pred
- def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
- (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
- (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
- (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
- Requires<[useFP16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
- (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
- (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
+ def : Pat<(i1 (OpNode f16:$a, f16:$b)),
+ (SETP_f16rr $a, $b, ModeFTZ)>,
Requires<[useFP16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
- (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
+ def : Pat<(i1 (OpNode f16:$a, f16:$b)),
+ (SETP_f16rr $a, $b, Mode)>,
Requires<[useFP16Math]>;
// bf16 -> pred
- def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
- (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
- Requires<[hasBF16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
- (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
- Requires<[hasBF16Math]>;
- def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
- (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
+ def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
+ (SETP_bf16rr $a, $b, ModeFTZ)>,
Requires<[hasBF16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
- (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
- Requires<[hasBF16Math]>;
- def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
- (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
- Requires<[hasBF16Math,doF32FTZ]>;
- def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
- (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
+ def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
+ (SETP_bf16rr $a, $b, Mode)>,
Requires<[hasBF16Math]>;
// f32 -> pred
- def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+ def : Pat<(i1 (OpNode f32:$a, f32:$b)),
+ (SETP_f32rr $a, $b, ModeFTZ)>,
Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode f32:$a, f32:$b)),
+ (SETP_f32rr $a, $b, Mode)>;
def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
- (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+ (SETP_f32ri $a, fpimm:$b, ModeFTZ)>,
Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
- (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
- (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+ def : Pat<(i1 (OpNode f32:$a, fpimm:$b)),
+ (SETP_f32ri $a, fpimm:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
+ (SETP_f32ir fpimm:$a, $b, ModeFTZ)>,
Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
- (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
+ (SETP_f32ir fpimm:$a, $b, Mode)>;
// f64 -> pred
- def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
- (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
- (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
- (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode f64:$a, f64:$b)),
+ (SETP_f64rr $a, $b, Mode)>;
+ def : Pat<(i1 (OpNode f64:$a, fpimm:$b)),
+ (SETP_f64ri $a, fpimm:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, f64:$b)),
+ (SETP_f64ir fpimm:$a, $b, Mode)>;
// f16 -> i32
- def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
- (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
- (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
- (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+ def : Pat<(i32 (OpNode f16:$a, f16:$b)),
+ (SET_f16rr $a, $b, ModeFTZ)>,
Requires<[useFP16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)),
- (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
- Requires<[useFP16Math]>;
- def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
- (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
- Requires<[useFP16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))),
- (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
+ def : Pat<(i32 (OpNode f16:$a, f16:$b)),
+ (SET_f16rr $a, $b, Mode)>,
Requires<[useFP16Math]>;
// bf16 -> i32
- def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
- (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
- Requires<[hasBF16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
- (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
- Requires<[hasBF16Math]>;
- def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
- (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
- Requires<[hasBF16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)),
- (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
- Requires<[hasBF16Math]>;
- def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
- (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
+ def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
+ (SET_bf16rr $a, $b, ModeFTZ)>,
Requires<[hasBF16Math, doF32FTZ]>;
- def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))),
- (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
+ def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
+ (SET_bf16rr $a, $b, Mode)>,
Requires<[hasBF16Math]>;
// f32 -> i32
- def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+ def : Pat<(i32 (OpNode f32:$a, f32:$b)),
+ (SET_f32rr $a, $b, ModeFTZ)>,
Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
- (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+ def : Pat<(i32 (OpNode f32:$a, f32:$b)),
+ (SET_f32rr $a, $b, Mode)>;
+ def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
+ (SET_f32ri $a, fpimm:$b, ModeFTZ)>,
Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
- (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
- (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+ def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
+ (SET_f32ri $a, fpimm:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
+ (SET_f32ir fpimm:$a, $b, ModeFTZ)>,
Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
- (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
+ (SET_f32ir fpimm:$a, $b, Mode)>;
// f64 -> i32
- def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
- (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
- (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
- (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode f64:$a, f64:$b)),
+ (SET_f64rr $a, $b, Mode)>;
+ def : Pat<(i32 (OpNode f64:$a, fpimm:$b)),
+ (SET_f64ri $a, fpimm:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, f64:$b)),
+ (SET_f64ir fpimm:$a, $b, Mode)>;
}
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
@@ -2632,21 +2428,21 @@ def ProxyReg :
let mayLoad = true in {
class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
+ NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
+ !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
[]>;
class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
!strconcat("ld.param.v2", opstr,
- " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
+ " \t{{$dst, $dst2}}, [retval0$b];"), []>;
class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
regclass:$dst4),
- (ins i32imm:$b),
+ (ins Offseti32imm:$b),
!strconcat("ld.param.v4", opstr,
- " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+ " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
[]>;
}
@@ -2662,8 +2458,8 @@ let mayStore = true in {
if !or(support_imm, !isa<NVPTXRegClass>(op)) then
def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
: NVPTXInst<(outs),
- (ins op:$val, i32imm:$a, i32imm:$b),
- "st.param" # opstr # " \t[param$a+$b], $val;",
+ (ins op:$val, i32imm:$a, Offseti32imm:$b),
+ "st.param" # opstr # " \t[param$a$b], $val;",
[]>;
}
@@ -2674,8 +2470,8 @@ let mayStore = true in {
# !if(!isa<NVPTXRegClass>(op2), "r", "i")
: NVPTXInst<(outs),
(ins op1:$val1, op2:$val2,
- i32imm:$a, i32imm:$b),
- "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};",
+ i32imm:$a, Offseti32imm:$b),
+ "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
[]>;
}
@@ -2691,29 +2487,29 @@ let mayStore = true in {
: NVPTXInst<(outs),
(ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
- i32imm:$a, i32imm:$b),
+ i32imm:$a, Offseti32imm:$b),
"st.param.v4" # opstr #
- " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};",
+ " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
[]>;
}
class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
- !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
+ NVPTXInst<(outs), (ins regclass:$val, Offseti32imm:$a),
+ !strconcat("st.param", opstr, " \t[func_retval0$a], $val;"),
[]>;
class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, Offseti32imm:$a),
!strconcat("st.param.v2", opstr,
- " \t[func_retval0+$a], {{$val, $val2}};"),
+ " \t[func_retval0$a], {{$val, $val2}};"),
[]>;
class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
NVPTXInst<(outs),
(ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a),
+ regclass:$val4, Offseti32imm:$a),
!strconcat("st.param.v4", opstr,
- " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+ " \t[func_retval0$a], {{$val, $val2, $val3, $val4}};"),
[]>;
}
@@ -2867,7 +2663,7 @@ def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
[(CallVoid i32:$addr)]>;
def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
- [(CallVoid Int64Regs:$addr)]>;
+ [(CallVoid i64:$addr)]>;
def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
[(Prototype (i32 imm:$val))]>;
@@ -2900,7 +2696,7 @@ def DeclareScalarRegInst :
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
!strconcat("mov", asmstr, " \t$dst, $src;"),
- [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;
+ [(set T:$dst, (MoveParam T:$src))]>;
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
string asmstr> :
@@ -2935,7 +2731,7 @@ def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
!strconcat("mov.", SzStr, " \t$dst, $src;"),
- [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;
+ [(set T:$dst, (ProxyReg T:$src))]>;
def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
@@ -2945,11 +2741,11 @@ def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>;
foreach vt = [f16, bf16] in {
- def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
+ def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 $src)>;
}
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
- def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
+ def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 $src)>;
}
//
@@ -2958,40 +2754,40 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3006,40 +2802,43 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr,
+ Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr,
+ Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr,
+ Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr$offset], $src;", []>;
}
let mayStore=1, hasSideEffects=0 in {
@@ -3057,76 +2856,76 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
defm LDV_i8 : LD_VEC<Int16Regs>;
@@ -3140,85 +2939,88 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ imem:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int32Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int64Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int32Regs:$addr, Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int64Regs:$addr, Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ imem:$addr, Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "$fromWidth \t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
let mayStore=1, hasSideEffects=0 in {
@@ -3237,7 +3039,7 @@ class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
!strconcat("mov.b", SzStr, " \t$d, $a;"),
- [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>;
+ [(set TOut:$d, (bitconvert TIn:$a))]>;
def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
@@ -3246,310 +3048,307 @@ def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
- (BITCONVERT_32_F2I Float32Regs:$a)>;
-def: Pat<(f32 (bitconvert (vt Int32Regs:$a))),
- (BITCONVERT_32_I2F Int32Regs:$a)>;
+ (BITCONVERT_32_F2I $a)>;
+def: Pat<(f32 (bitconvert vt:$a)),
+ (BITCONVERT_32_I2F $a)>;
}
foreach vt = [f16, bf16] in {
-def: Pat<(vt (bitconvert (i16 UInt16Const:$a))),
- (IMOVB16ri UInt16Const:$a)>;
-def: Pat<(vt (bitconvert (i16 Int16Regs:$a))),
- (ProxyRegI16 Int16Regs:$a)>;
-def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
- (ProxyRegI16 Int16Regs:$a)>;
+ def: Pat<(vt (bitconvert i16:$a)),
+ (vt Int16Regs:$a)>;
+ def: Pat<(i16 (bitconvert vt:$a)),
+ (i16 Int16Regs:$a)>;
}
foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
- def: Pat<(ta (bitconvert (i32 UInt32Const:$a))),
- (IMOVB32ri UInt32Const:$a)>;
foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
if !ne(ta, tb) then {
- def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
- (ProxyRegI32 Int32Regs:$a)>;
+ def: Pat<(ta (bitconvert tb:$a)),
+ (ta Int32Regs:$a)>;
}
}
}
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns. Therefore, we
-// use an integer selp to select either 1 or 0 and then cvt to floating-point.
+// use an integer selp to select either 1 (or -1 in case of signed) or 0
+// and then cvt to floating-point.
// sint -> f16
-def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
- (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f16 (sint_to_fp i1:$a)),
+ (CVT_f16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
- (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
- (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
- (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
+ (CVT_f16_s16 $a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp i32:$a)),
+ (CVT_f16_s32 $a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp i64:$a)),
+ (CVT_f16_s64 $a, CvtRN)>;
// uint -> f16
-def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
- (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f16 (uint_to_fp i1:$a)),
+ (CVT_f16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
- (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
- (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
- (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
+ (CVT_f16_u16 $a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp i32:$a)),
+ (CVT_f16_u32 $a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp i64:$a)),
+ (CVT_f16_u64 $a, CvtRN)>;
// sint -> bf16
-def : Pat<(bf16 (sint_to_fp Int1Regs:$a)),
- (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp Int16Regs:$a)),
- (CVT_bf16_s16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp Int32Regs:$a)),
- (CVT_bf16_s32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp Int64Regs:$a)),
- (CVT_bf16_s64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i1:$a)),
+ (CVT_bf16_s32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i16:$a)),
+ (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i32:$a)),
+ (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i64:$a)),
+ (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
// uint -> bf16
-def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
- (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
- (CVT_bf16_u16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
- (CVT_bf16_u32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
- (CVT_bf16_u64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i1:$a)),
+ (CVT_bf16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i16:$a)),
+ (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i32:$a)),
+ (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i64:$a)),
+ (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
// sint -> f32
-def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
- (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
- (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
- (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
- (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i1:$a)),
+ (CVT_f32_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp i16:$a)),
+ (CVT_f32_s16 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i32:$a)),
+ (CVT_f32_s32 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i64:$a)),
+ (CVT_f32_s64 $a, CvtRN)>;
// uint -> f32
-def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
- (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
- (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
- (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
- (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp i1:$a)),
+ (CVT_f32_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp i16:$a)),
+ (CVT_f32_u16 $a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp i32:$a)),
+ (CVT_f32_u32 $a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp i64:$a)),
+ (CVT_f32_u64 $a, CvtRN)>;
// sint -> f64
-def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
- (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
- (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
- (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
- (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp i1:$a)),
+ (CVT_f64_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp i16:$a)),
+ (CVT_f64_s16 $a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp i32:$a)),
+ (CVT_f64_s32 $a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp i64:$a)),
+ (CVT_f64_s64 $a, CvtRN)>;
// uint -> f64
-def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
- (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
- (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
- (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
- (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp i1:$a)),
+ (CVT_f64_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp i16:$a)),
+ (CVT_f64_u16 $a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp i32:$a)),
+ (CVT_f64_u32 $a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp i64:$a)),
+ (CVT_f64_u64 $a, CvtRN)>;
// f16 -> sint
-def : Pat<(i1 (fp_to_sint (f16 Int16Regs:$a))),
- (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))),
- (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>;
-def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))),
- (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>;
-def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))),
- (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint f16:$a)),
+ (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint f16:$a)),
+ (CVT_s16_f16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint f16:$a)),
+ (CVT_s32_f16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint f16:$a)),
+ (CVT_s64_f16 $a, CvtRZI)>;
// f16 -> uint
-def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))),
- (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))),
- (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))),
- (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))),
- (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint f16:$a)),
+ (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint f16:$a)),
+ (CVT_u16_f16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint f16:$a)),
+ (CVT_u32_f16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint f16:$a)),
+ (CVT_u64_f16 $a, CvtRZI)>;
// bf16 -> sint
-def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))),
- (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))),
- (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
-def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))),
- (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>;
-def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))),
- (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint bf16:$a)),
+ (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint bf16:$a)),
+ (CVT_s16_bf16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint bf16:$a)),
+ (CVT_s32_bf16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint bf16:$a)),
+ (CVT_s64_bf16 $a, CvtRZI)>;
// bf16 -> uint
-def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))),
- (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))),
- (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))),
- (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))),
- (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint bf16:$a)),
+ (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint bf16:$a)),
+ (CVT_u16_bf16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint bf16:$a)),
+ (CVT_u32_bf16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint bf16:$a)),
+ (CVT_u64_bf16 $a, CvtRZI)>;
// f32 -> sint
-def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
- (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
- (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
- (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
- (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
- (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
- (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint f32:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint f32:$a)),
+ (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint f32:$a)),
+ (CVT_s16_f32 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint f32:$a)),
+ (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint f32:$a)),
+ (CVT_s32_f32 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint f32:$a)),
+ (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint f32:$a)),
+ (CVT_s64_f32 $a, CvtRZI)>;
// f32 -> uint
-def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
- (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
- (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
- (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
- (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
- (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
- (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint f32:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint f32:$a)),
+ (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint f32:$a)),
+ (CVT_u16_f32 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint f32:$a)),
+ (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint f32:$a)),
+ (CVT_u32_f32 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint f32:$a)),
+ (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint f32:$a)),
+ (CVT_u64_f32 $a, CvtRZI)>;
// f64 -> sint
-def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
- (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
- (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
- (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint f64:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint f64:$a)),
+ (CVT_s16_f64 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint f64:$a)),
+ (CVT_s32_f64 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint f64:$a)),
+ (CVT_s64_f64 $a, CvtRZI)>;
// f64 -> uint
-def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
- (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
- (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
- (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint f64:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint f64:$a)),
+ (CVT_u16_f64 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint f64:$a)),
+ (CVT_u32_f64 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint f64:$a)),
+ (CVT_u64_f64 $a, CvtRZI)>;
// sext i1
-def : Pat<(i16 (sext Int1Regs:$a)),
- (SELP_s16ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (sext Int1Regs:$a)),
- (SELP_s32ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (sext Int1Regs:$a)),
- (SELP_s64ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i16 (sext i1:$a)),
+ (SELP_s16ii -1, 0, $a)>;
+def : Pat<(i32 (sext i1:$a)),
+ (SELP_s32ii -1, 0, $a)>;
+def : Pat<(i64 (sext i1:$a)),
+ (SELP_s64ii -1, 0, $a)>;
// zext i1
-def : Pat<(i16 (zext Int1Regs:$a)),
- (SELP_u16ii 1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (zext Int1Regs:$a)),
- (SELP_u32ii 1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (zext Int1Regs:$a)),
- (SELP_u64ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i16 (zext i1:$a)),
+ (SELP_u16ii 1, 0, $a)>;
+def : Pat<(i32 (zext i1:$a)),
+ (SELP_u32ii 1, 0, $a)>;
+def : Pat<(i64 (zext i1:$a)),
+ (SELP_u64ii 1, 0, $a)>;
// anyext i1
-def : Pat<(i16 (anyext Int1Regs:$a)),
- (SELP_u16ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (anyext Int1Regs:$a)),
- (SELP_u32ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (anyext Int1Regs:$a)),
- (SELP_u64ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i16 (anyext i1:$a)),
+ (SELP_u16ii -1, 0, $a)>;
+def : Pat<(i32 (anyext i1:$a)),
+ (SELP_u32ii -1, 0, $a)>;
+def : Pat<(i64 (anyext i1:$a)),
+ (SELP_u64ii -1, 0, $a)>;
// sext i16
-def : Pat<(i32 (sext Int16Regs:$a)),
- (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (sext Int16Regs:$a)),
- (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i32 (sext i16:$a)),
+ (CVT_s32_s16 $a, CvtNONE)>;
+def : Pat<(i64 (sext i16:$a)),
+ (CVT_s64_s16 $a, CvtNONE)>;
// zext i16
-def : Pat<(i32 (zext Int16Regs:$a)),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (zext Int16Regs:$a)),
- (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i32 (zext i16:$a)),
+ (CVT_u32_u16 $a, CvtNONE)>;
+def : Pat<(i64 (zext i16:$a)),
+ (CVT_u64_u16 $a, CvtNONE)>;
// anyext i16
-def : Pat<(i32 (anyext Int16Regs:$a)),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (anyext Int16Regs:$a)),
- (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i32 (anyext i16:$a)),
+ (CVT_u32_u16 $a, CvtNONE)>;
+def : Pat<(i64 (anyext i16:$a)),
+ (CVT_u64_u16 $a, CvtNONE)>;
// sext i32
-def : Pat<(i64 (sext Int32Regs:$a)),
- (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext i32:$a)),
+ (CVT_s64_s32 $a, CvtNONE)>;
// zext i32
-def : Pat<(i64 (zext Int32Regs:$a)),
- (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext i32:$a)),
+ (CVT_u64_u32 $a, CvtNONE)>;
// anyext i32
-def : Pat<(i64 (anyext Int32Regs:$a)),
- (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext i32:$a)),
+ (CVT_u64_u32 $a, CvtNONE)>;
// truncate i64
-def : Pat<(i32 (trunc Int64Regs:$a)),
- (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
-def : Pat<(i16 (trunc Int64Regs:$a)),
- (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
-def : Pat<(i1 (trunc Int64Regs:$a)),
- (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+def : Pat<(i32 (trunc i64:$a)),
+ (CVT_u32_u64 $a, CvtNONE)>;
+def : Pat<(i16 (trunc i64:$a)),
+ (CVT_u16_u64 $a, CvtNONE)>;
+def : Pat<(i1 (trunc i64:$a)),
+ (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>;
// truncate i32
-def : Pat<(i16 (trunc Int32Regs:$a)),
- (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
-def : Pat<(i1 (trunc Int32Regs:$a)),
- (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+def : Pat<(i16 (trunc i32:$a)),
+ (CVT_u16_u32 $a, CvtNONE)>;
+def : Pat<(i1 (trunc i32:$a)),
+ (SETP_b32ri (ANDb32ri $a, 1), 1, CmpEQ)>;
// truncate i16
-def : Pat<(i1 (trunc Int16Regs:$a)),
- (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+def : Pat<(i1 (trunc i16:$a)),
+ (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>;
// sext_inreg
-def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
-def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
-def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>;
+def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 $a)>;
+def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 $a)>;
+def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 $a)>;
+def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 $a)>;
+def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 $a)>;
// Select instructions with 32-bit predicates
-def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b),
- (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b),
- (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b),
- (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select (i32 Int32Regs:$pred), (f16 Int16Regs:$a), (f16 Int16Regs:$b)),
- (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select (i32 Int32Regs:$pred), (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)),
- (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b),
- (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select (i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b),
- (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, i16:$a, i16:$b),
+ (SELP_b16rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, i32:$a, i32:$b),
+ (SELP_b32rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, i64:$a, i64:$b),
+ (SELP_b64rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, f16:$a, f16:$b),
+ (SELP_f16rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, bf16:$a, bf16:$b),
+ (SELP_bf16rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, f32:$a, f32:$b),
+ (SELP_f32rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
+def : Pat<(select i32:$pred, f64:$a, f64:$b),
+ (SELP_f64rr $a, $b,
+ (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
let hasSideEffects = false in {
@@ -3610,33 +3409,78 @@ let hasSideEffects = false in {
// Using partial vectorized move produces better SASS code for extraction of
// upper/lower parts of an integer.
-def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
- (I32toI16H Int32Regs:$s)>;
-def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
- (I32toI16H Int32Regs:$s)>;
-def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
- (I64toI32H Int64Regs:$s)>;
-def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
- (I64toI32H Int64Regs:$s)>;
-
-def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
- (CVT_INREG_s32_s16 Int32Regs:$src)>;
+def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
+ (I32toI16H $s)>;
+def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
+ (I32toI16H $s)>;
+def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
+ (I64toI32H $s)>;
+def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
+ (I64toI32H $s)>;
+
+def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
+ (CVT_INREG_s32_s16 $src)>;
foreach vt = [v2f16, v2bf16, v2i16] in {
-def : Pat<(extractelt (vt Int32Regs:$src), 0),
- (I32toI16L Int32Regs:$src)>;
-def : Pat<(extractelt (vt Int32Regs:$src), 1),
- (I32toI16H Int32Regs:$src)>;
+def : Pat<(extractelt vt:$src, 0),
+ (I32toI16L $src)>;
+def : Pat<(extractelt vt:$src, 1),
+ (I32toI16H $src)>;
}
-def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
- (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
-def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
- (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
-def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
- (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
+ (V2I16toI32 $a, $b)>;
+def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
+ (V2I16toI32 $a, $b)>;
+def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
+ (V2I16toI32 $a, $b)>;
-def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def: Pat<(v2i16 (scalar_to_vector i16:$a)),
+ (CVT_u32_u16 $a, CvtNONE)>;
+
+//
+// Funnel-Shift
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def fshl_clamp : SDNode<"NVPTXISD::FSHL_CLAMP", SDTIntShiftDOp, []>;
+def fshr_clamp : SDNode<"NVPTXISD::FSHR_CLAMP", SDTIntShiftDOp, []>;
+
+// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
+// no side effects.
+let hasSideEffects = false in {
+ multiclass ShfInst<string mode, SDNode op> {
+ def _i
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
+ [(set i32:$dst,
+ (op i32:$hi, i32:$lo, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+ def _r
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
+ [(set i32:$dst,
+ (op i32:$hi, i32:$lo, i32:$amt))]>,
+ Requires<[hasHWROT32]>;
+ }
+
+ defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>;
+ defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>;
+ defm SHF_L_WRAP : ShfInst<"l.wrap", fshl>;
+ defm SHF_R_WRAP : ShfInst<"r.wrap", fshr>;
+}
+
+def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, i32:$amt)),
+ (SHF_L_CLAMP_r $lo, $hi, $amt)>;
+def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
+ (SHF_L_CLAMP_i $lo, $hi, imm:$amt)>;
+def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
+ (SHF_R_CLAMP_r $lo, $hi, $amt)>;
+def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
+ (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
// Count leading zeros
let hasSideEffects = false in {
@@ -3647,14 +3491,14 @@ let hasSideEffects = false in {
}
// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;
+def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
// The return type of the ctlz ISD node is the same as its input, but the PTX
// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
-def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>;
+def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
+def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary. We also need to subtract 16 because
@@ -3670,11 +3514,11 @@ def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>;
// and then ctlz that value. This way we don't have to subtract 16 from the
// result. Unfortunately today we don't have a way to generate
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(i16 (ctlz Int16Regs:$a)),
+def : Pat<(i16 (ctlz i16:$a)),
(SUBi16ri (CVT_u16_u32
- (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
- (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
+ (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
+ (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
// Population count
let hasSideEffects = false in {
@@ -3685,67 +3529,67 @@ let hasSideEffects = false in {
}
// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;
+def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
// to match the LLVM semantics. Just as with ctlz.i64, we provide a second
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
-def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>;
+def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
+def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop Int16Regs:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
- (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
+def : Pat<(ctpop i16:$a),
+ (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
+def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
+ (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
// fpround f32 -> f16
-def : Pat<(f16 (fpround Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+def : Pat<(f16 (fpround f32:$a)),
+ (CVT_f16_f32 $a, CvtRN)>;
// fpround f32 -> bf16
-def : Pat<(bf16 (fpround Float32Regs:$a)),
- (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
+def : Pat<(bf16 (fpround f32:$a)),
+ (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
// fpround f64 -> f16
-def : Pat<(f16 (fpround Float64Regs:$a)),
- (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(f16 (fpround f64:$a)),
+ (CVT_f16_f64 $a, CvtRN)>;
// fpround f64 -> bf16
-def : Pat<(bf16 (fpround Float64Regs:$a)),
- (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (fpround f64:$a)),
+ (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
// fpround f64 -> f32
-def : Pat<(f32 (fpround Float64Regs:$a)),
- (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpround Float64Regs:$a)),
- (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(f32 (fpround f64:$a)),
+ (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpround f64:$a)),
+ (CVT_f32_f64 $a, CvtRN)>;
// fpextend f16 -> f32
-def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
- (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpextend (f16 Int16Regs:$a))),
- (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(f32 (fpextend f16:$a)),
+ (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend f16:$a)),
+ (CVT_f32_f16 $a, CvtNONE)>;
// fpextend bf16 -> f32
-def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
- (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))),
- (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
+def : Pat<(f32 (fpextend bf16:$a)),
+ (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend bf16:$a)),
+ (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// fpextend f16 -> f64
-def : Pat<(f64 (fpextend (f16 Int16Regs:$a))),
- (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(f64 (fpextend f16:$a)),
+ (CVT_f64_f16 $a, CvtNONE)>;
// fpextend bf16 -> f64
-def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))),
- (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(f64 (fpextend bf16:$a)),
+ (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
// fpextend f32 -> f64
-def : Pat<(f64 (fpextend Float32Regs:$a)),
- (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f64 (fpextend Float32Regs:$a)),
- (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+def : Pat<(f64 (fpextend f32:$a)),
+ (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend f32:$a)),
+ (CVT_f64_f32 $a, CvtNONE)>;
def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
@@ -3753,16 +3597,16 @@ def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
// fceil, ffloor, froundeven, ftrunc.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
- def : Pat<(OpNode (f16 Int16Regs:$a)),
- (CVT_f16_f16 Int16Regs:$a, Mode)>;
- def : Pat<(OpNode (bf16 Int16Regs:$a)),
- (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
- def : Pat<(OpNode Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
- def : Pat<(OpNode Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
- def : Pat<(OpNode Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, Mode)>;
+ def : Pat<(OpNode f16:$a),
+ (CVT_f16_f16 $a, Mode)>;
+ def : Pat<(OpNode bf16:$a),
+ (CVT_bf16_bf16 $a, Mode)>;
+ def : Pat<(OpNode f32:$a),
+ (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(OpNode f32:$a),
+ (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(OpNode f64:$a),
+ (CVT_f64_f64 $a, Mode)>;
}
defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
@@ -3788,7 +3632,7 @@ let isTerminator=1 in {
let isBranch=1 in
def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
"@$a bra \t$target;",
- [(brcond Int1Regs:$a, bb:$target)]>;
+ [(brcond i1:$a, bb:$target)]>;
let isBranch=1 in
def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
"@!$a bra \t$target;", []>;
@@ -3798,16 +3642,16 @@ let isTerminator=1 in {
"bra.uni \t$target;", [(br bb:$target)]>;
}
-def : Pat<(brcond (i32 Int32Regs:$a), bb:$target),
- (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+def : Pat<(brcond i32:$a, bb:$target),
+ (CBranch (SETP_u32ri $a, 0, CmpNE), bb:$target)>;
// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block. The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1). Since PTX
// supports '@!pred bra target', we should use it.
-def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
- (CBranchOther Int1Regs:$a, bb:$target)>;
+def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target),
+ (CBranchOther $a, bb:$target)>;
// Call
def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
@@ -3845,9 +3689,12 @@ def Callseq_End :
[(callseq_end timm:$amt1, timm:$amt2)]>;
// trap instruction
+def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>;
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
// This won't be necessary in a future version of ptxas.
-def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>;
+def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>;
+// brkpt instruction
+def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>;
// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
@@ -3873,7 +3720,7 @@ def DYNAMIC_STACKALLOC32 :
(ins Int32Regs:$size, i32imm:$align),
"alloca.u32 \t$ptr, $size, $align;\n\t"
"cvta.local.u32 \t$ptr, $ptr;",
- [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
+ [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>,
Requires<[hasPTX<73>, hasSM<52>]>;
def DYNAMIC_STACKALLOC64 :
@@ -3881,7 +3728,112 @@ def DYNAMIC_STACKALLOC64 :
(ins Int64Regs:$size, i32imm:$align),
"alloca.u64 \t$ptr, $size, $align;\n\t"
"cvta.local.u64 \t$ptr, $ptr;",
- [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
+ [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
+
+
+//
+// BRX
+//
+
+def SDTBrxStartProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>;
+
+def brx_start :
+ SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def brx_item :
+ SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def brx_end :
+ SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile,
+ [SDNPHasChain, SDNPInGlue, SDNPSideEffect]>;
+
+let isTerminator = 1, isBranch = 1, isIndirectBranch = 1, isNotDuplicable = 1 in {
+
+ def BRX_START :
+ NVPTXInst<(outs), (ins i32imm:$id),
+ "$$L_brx_$id: .branchtargets",
+ [(brx_start (i32 imm:$id))]>;
+
+ def BRX_ITEM :
+ NVPTXInst<(outs), (ins brtarget:$target),
+ "\t$target,",
+ [(brx_item bb:$target)]>;
+
+ def BRX_END :
+ NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id),
+ "\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;",
+ [(brx_end bb:$target, i32:$val, (i32 imm:$id))]> {
+ let isBarrier = 1;
+ }
+}
+
+
+foreach a_type = ["s", "u"] in {
+ foreach b_type = ["s", "u"] in {
+
+ def DOT4_ # a_type # b_type :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
+ [(set i32:$dst,
+ (!cast<Intrinsic>("int_nvvm_idp4a_" # a_type # "_" # b_type)
+ i32:$a, i32:$b, i32:$c))]>,
+ Requires<[hasDotInstructions]>;
+
+ foreach is_hi = [0, -1] in {
+ defvar lohi_suffix = !if(is_hi, "hi", "lo");
+
+ def DOT2_ # lohi_suffix # _ # a_type # b_type :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "dp2a." # lohi_suffix # "." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
+ [(set i32:$dst,
+ (!cast<Intrinsic>("int_nvvm_idp2a_" # a_type # "_" # b_type)
+ i32:$a, i32:$b, is_hi, i32:$c))]>,
+ Requires<[hasDotInstructions]>;
+ }
+ }
+}
+
+//
+// Stack Manipulation
+//
+
+def SDTStackRestore : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def stackrestore :
+ SDNode<"NVPTXISD::STACKRESTORE", SDTStackRestore,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def stacksave :
+ SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def STACKRESTORE_32 :
+ NVPTXInst<(outs), (ins Int32Regs:$ptr),
+ "stackrestore.u32 \t$ptr;",
+ [(stackrestore i32:$ptr)]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
+
+def STACKSAVE_32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins),
+ "stacksave.u32 \t$dst;",
+ [(set i32:$dst, (i32 stacksave))]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
+
+def STACKRESTORE_64 :
+ NVPTXInst<(outs), (ins Int64Regs:$ptr),
+ "stackrestore.u64 \t$ptr;",
+ [(stackrestore i64:$ptr)]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
+
+def STACKSAVE_64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins),
+ "stacksave.u64 \t$dst;",
+ [(set i64:$dst, (i64 stacksave))]>,
Requires<[hasPTX<73>, hasSM<52>]>;
include "NVPTXIntrinsics.td"
@@ -3897,14 +3849,98 @@ include "NVPTXIntrinsics.td"
def : Pat <
(i32 (bswap i32:$a)),
- (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
+ (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;
def : Pat <
(v2i16 (bswap v2i16:$a)),
- (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
+ (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
- (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
- (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
+ (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
+ (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX Fence instructions
+////////////////////////////////////////////////////////////////////////////////
+
+def atomic_thread_fence_seq_cst_sys :
+ NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_sys :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def atomic_thread_fence_seq_cst_gpu :
+ NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_gpu :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def atomic_thread_fence_seq_cst_cluster :
+ NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def atomic_thread_fence_acq_rel_cluster :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+
+def atomic_thread_fence_seq_cst_cta :
+ NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_cta :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def fpimm_any_zero : FPImmLeaf<fAny, [{
+ return Imm.isZero();
+}]>;
+
+def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>;
+def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>;
+
+// Perform substitution if fma only has one use, and also if instruction has
+// nnan instruction flag or if the TM has NoNaNsFPMath
+def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c),
+ (fma node:$a, node:$b, node:$c), [{
+ return N->hasOneUse() &&
+ (N->getFlags().hasNoNaNs() || TM.Options.NoNaNsFPMath);
+}]>;
+// fmaxnum will differentiate between signed and unsigned zeros soon, so this
+// PatFrag is for a fmaxnum node with nsz
+def NVPTX_fmaxnum_nsz : PatFrag<(ops node:$a, node:$b),
+ (fmaxnum node:$a, node:$b), [{
+ return N->getFlags().hasNoSignedZeros() || TM.Options.NoSignedZerosFPMath;
+}]>;
+
+class NVPTXInst_rrr<RegisterClass RC, string Instruction, list<Predicate> Preds>
+ : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(Instruction, "\t$dst, $a, $b, $c;"), []>,
+ Requires<Preds>;
+
+def FMARELU_F16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_F16_FTZ : NVPTXInst_rrr<Int16Regs, "fma.rn.ftz.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_BF16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.bf16", [hasBF16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_F16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_F16X2_FTZ : NVPTXInst_rrr<Int32Regs, "fma.rn.ftz.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>;
+def FMARELU_BF16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.bf16x2", [hasBF16Math, hasPTX<70>, hasSM<80>]>;
+
+// FTZ
+def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
+ (FMARELU_F16_FTZ $a, $b, $c)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
+ (FMARELU_F16X2_FTZ $a, $b, $c)>,
+ Requires<[doF32FTZ]>;
+
+// NO FTZ
+def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
+ (FMARELU_F16 $a, $b, $c)>;
+def : Pat<(bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan bf16:$a, bf16:$b, bf16:$c), fpimm_any_zero)),
+ (FMARELU_BF16 $a, $b, $c)>;
+def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
+ (FMARELU_F16X2 $a, $b, $c)>;
+def : Pat<(v2bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2bf16:$a, v2bf16:$b, v2bf16:$c), fpimm_positive_zero_v2bf16)),
+ (FMARELU_BF16X2 $a, $b, $c)>;