diff options
Diffstat (limited to 'llvm/lib/Target/NVPTX/NVPTXInstrInfo.td')
| -rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2544 |
1 files changed, 1290 insertions, 1254 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index cd17a9de541a..633a99d0fc1b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,6 +139,9 @@ def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; +def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; +def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; +def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; @@ -159,6 +162,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; +def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def True : Predicate<"true">; def False : Predicate<"false">; @@ -168,15 +172,14 @@ class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>; // Explicit records for arch-accelerated SM versions def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">; +def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">; +def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">; +def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">; // non-sync shfl instructions are not available on sm_70+ in PTX6.4+ def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" "&& Subtarget->getPTXVersion() >= 64)">; -def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">; -def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">; -def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">; - 
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">; @@ -208,39 +211,45 @@ class ValueToRegClass<ValueType T> { // Some Common Instruction Class Templates //===----------------------------------------------------------------------===// +// Utility class to wrap up information about a register and DAG type for more +// convenient iteration and parameterization +class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> { + ValueType Ty = ty; + NVPTXRegClass RC = rc; + Operand Imm = imm; + int Size = ty.Size; +} + +def I16RT : RegTyInfo<i16, Int16Regs, i16imm>; +def I32RT : RegTyInfo<i32, Int32Regs, i32imm>; +def I64RT : RegTyInfo<i64, Int64Regs, i64imm>; + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). -multiclass I3<string OpcStr, SDNode OpNode> { - def i64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; - def i64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>; - def i16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; - def i16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - 
[(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; +multiclass I3<string OpcStr, SDNode OpNode, bit commutative> { + foreach t = [I16RT, I32RT, I64RT] in { + defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;"; + + def t.Ty # rr : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>; + def t.Ty # ri : + NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b), + asmstr, + [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>; + if !not(commutative) then + def t.Ty # ir : + NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b), + asmstr, + [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>; + } } class I16x2<string OpcStr, SDNode OpNode> : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>, + [(set v2i16:$dst, (OpNode v2i16:$a, v2i16:$b))]>, Requires<[hasPTX<80>, hasSM<90>]>; // Template for instructions which take 3 int args. 
The instructions are @@ -250,117 +259,101 @@ multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> { def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; + [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>; + [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>, + [(set i64:$dst, (OpNode i64:$a, i64:$b))]>, Requires<[hasPTX<43>]>; def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>, + [(set i64:$dst, (OpNode i64:$a, imm:$b))]>, Requires<[hasPTX<43>]>; } } -// Template for instructions which take three fp64 or fp32 args. The -// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64"). +// Template for minimum/maximum instructions. // // Also defines ftz (flush subnormal inputs and results to sign-preserving // zero) variants for fp32 functions. -// -// This multiclass should be used for nodes that cannot be folded into FMAs. -// For nodes that can be folded into FMAs (i.e. adds and muls), use -// F3_fma_component. 
-multiclass F3<string OpcStr, SDNode OpNode> { +multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> { + if !not(NaN) then { def f64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b), !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; + [(set f64:$dst, (OpNode f64:$a, f64:$b))]>; def f64ri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b), !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; + [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>; + } def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (OpNode f32:$a, f32:$b))]>, Requires<[doF32FTZ]>; def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>, Requires<[doF32FTZ]>; def f32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + [(set f32:$dst, (OpNode f32:$a, f32:$b))]>; def f32ri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; + [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>; def f16rr_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, + [(set f16:$dst, (OpNode f16:$a, f16:$b))]>, Requires<[useFP16Math, doF32FTZ]>; def f16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, 
Int16Regs:$b), !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, - Requires<[useFP16Math]>; + [(set f16:$dst, (OpNode f16:$a, f16:$b))]>, + Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>; def f16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, - Requires<[useFP16Math, doF32FTZ]>; + [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>, + Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>; def f16x2rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, - Requires<[useFP16Math]>; - def bf16rr_ftz : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, - Requires<[hasBF16Math, doF32FTZ]>; + [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>, + Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>; def bf16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, - Requires<[hasBF16Math]>; - - def bf16x2rr_ftz : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, - Requires<[hasBF16Math, doF32FTZ]>; + [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>, + Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>; def bf16x2rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), 
(v2bf16 Int32Regs:$b)))]>, - Requires<[hasBF16Math]>; + [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>, + Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>; } // Template for instructions which take three FP args. The @@ -377,173 +370,161 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> { NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b), !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + [(set f64:$dst, (OpNode f64:$a, f64:$b))]>, Requires<[allowFMA]>; def f64ri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b), !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>, Requires<[allowFMA]>; def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (OpNode f32:$a, f32:$b))]>, Requires<[allowFMA, doF32FTZ]>; def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>, Requires<[allowFMA, doF32FTZ]>; def f32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (OpNode f32:$a, f32:$b))]>, Requires<[allowFMA]>; def f32ri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>, Requires<[allowFMA]>; def f16rr_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), 
!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, + [(set f16:$dst, (OpNode f16:$a, f16:$b))]>, Requires<[useFP16Math, allowFMA, doF32FTZ]>; def f16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, + [(set f16:$dst, (OpNode f16:$a, f16:$b))]>, Requires<[useFP16Math, allowFMA]>; def f16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), - [(set (v2f16 Int32Regs:$dst), (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, + [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>, Requires<[useFP16Math, allowFMA, doF32FTZ]>; def f16x2rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, + [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>, Requires<[useFP16Math, allowFMA]>; - def bf16rr_ftz : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, - Requires<[hasBF16Math, allowFMA, doF32FTZ]>; def bf16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, + [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>, Requires<[hasBF16Math, allowFMA]>; - def bf16x2rr_ftz : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"), - [(set (v2bf16 Int32Regs:$dst), (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, - Requires<[hasBF16Math, allowFMA, doF32FTZ]>; def bf16x2rr : NVPTXInst<(outs Int32Regs:$dst), (ins 
Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, + [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>, Requires<[hasBF16Math, allowFMA]>; // These have strange names so we don't perturb existing mir tests. def _rnf64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b), !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + [(set f64:$dst, (OpNode f64:$a, f64:$b))]>, Requires<[noFMA]>; def _rnf64ri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b), !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>, Requires<[noFMA]>; def _rnf32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (OpNode f32:$a, Float32Regs:$b))]>, Requires<[noFMA, doF32FTZ]>; def _rnf32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>, Requires<[noFMA, doF32FTZ]>; def _rnf32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (OpNode f32:$a, f32:$b))]>, Requires<[noFMA]>; def _rnf32ri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>, Requires<[noFMA]>; def _rnf16rr_ftz : NVPTXInst<(outs 
Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, + [(set f16:$dst, (OpNode f16:$a, f16:$b))]>, Requires<[useFP16Math, noFMA, doF32FTZ]>; def _rnf16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>, + [(set f16:$dst, (OpNode f16:$a, f16:$b))]>, Requires<[useFP16Math, noFMA]>; def _rnf16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, + [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>, Requires<[useFP16Math, noFMA, doF32FTZ]>; def _rnf16x2rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>, + [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>, Requires<[useFP16Math, noFMA]>; def _rnbf16rr_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, + [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>, Requires<[hasBF16Math, noFMA, doF32FTZ]>; def _rnbf16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>, + [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>, Requires<[hasBF16Math, noFMA]>; def _rnbf16x2rr_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, + [(set 
v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>, Requires<[hasBF16Math, noFMA, doF32FTZ]>; def _rnbf16x2rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>, + [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>, Requires<[hasBF16Math, noFMA]>; } @@ -553,44 +534,56 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> { multiclass F2<string OpcStr, SDNode OpNode> { def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), !strconcat(OpcStr, ".f64 \t$dst, $a;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; + [(set f64:$dst, (OpNode f64:$a))]>; def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, + [(set f32:$dst, (OpNode f32:$a))]>, Requires<[doF32FTZ]>; def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), !strconcat(OpcStr, ".f32 \t$dst, $a;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; + [(set f32:$dst, (OpNode f32:$a))]>; } multiclass F2_Support_Half<string OpcStr, SDNode OpNode> { def bf16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), !strconcat(OpcStr, ".bf16 \t$dst, $a;"), - [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>, + [(set bf16:$dst, (OpNode bf16:$a))]>, Requires<[hasSM<80>, hasPTX<70>]>; def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"), - [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>, + [(set v2bf16:$dst, (OpNode v2bf16:$a))]>, Requires<[hasSM<80>, hasPTX<70>]>; def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), !strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>, + [(set f16:$dst, (OpNode f16:$a))]>, Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>; def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins 
Int32Regs:$a), !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>, + [(set v2f16:$dst, (OpNode v2f16:$a))]>, Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>; def f16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), !strconcat(OpcStr, ".f16 \t$dst, $a;"), - [(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a)))]>, + [(set f16:$dst, (OpNode f16:$a))]>, Requires<[hasSM<53>, hasPTX<65>]>; def f16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), !strconcat(OpcStr, ".f16x2 \t$dst, $a;"), - [(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a)))]>, + [(set v2f16:$dst, (OpNode v2f16:$a))]>, Requires<[hasSM<53>, hasPTX<65>]>; } +// Variant where only .ftz.bf16 is supported. +multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> { + def bf16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), + OpcStr # ".ftz.bf16 \t$dst, $a;", + [(set bf16:$dst, (OpNode bf16:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), + OpcStr # ".ftz.bf16x2 \t$dst, $a;", + [(set v2bf16:$dst, (OpNode v2bf16:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; +} + //===----------------------------------------------------------------------===// // NVPTX Instructions. //===----------------------------------------------------------------------===// @@ -725,8 +718,73 @@ let hasSideEffects = false in { defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; + + // FP8 conversions. 
+ multiclass CVT_TO_F8X2<string F8Name> { + def _f32 : + NVPTXInst<(outs Int16Regs:$dst), + (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), + !strconcat("cvt${mode:base}.satfinite${mode:relu}.", + F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; + def _f16x2 : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}.satfinite${mode:relu}.", + F8Name, "x2.f16x2 \t$dst, $src;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; + } + + defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">; + defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">; + + class CVT_f16x2_fp8<string F8Name> : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:relu}.f16x2.", + F8Name, "x2 \t$dst, $src;"), []>, + Requires<[hasPTX<81>, hasSM<89>]>; + + def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">; + def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">; + + // Float to TF32 conversions + multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> { + defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier)); + + def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src), + "cvt." 
# Modifier # ".tf32.f32 \t$dst, $src;", + [(set i32:$dst, (Intr f32:$src))]>, + Requires<Preds>; + } + + defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">; + defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">; + defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">; + defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; + defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; + defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; + + defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>; } +def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{ + return N->hasOneUse(); +}]>; + +def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse f32:$lo)), + (bf16 (fpround_oneuse f32:$hi)))), + (CVT_bf16x2_f32 $hi, $lo, CvtRN)>, + Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>; + +def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)), + (f16 (fpround_oneuse f32:$hi)))), + (CVT_f16x2_f32 $hi, $lo, CvtRN)>, + Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>; + //----------------------------------- // Selection instructions (selp) //----------------------------------- @@ -757,22 +815,22 @@ let hasSideEffects = false in { NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, Int1Regs:$p), !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>; + [(set T:$dst, (select i1:$p, T:$a, T:$b))]>; def ri : NVPTXInst<(outs RC:$dst), (ins RC:$a, ImmCls:$b, Int1Regs:$p), !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>; + [(set T:$dst, (select i1:$p, T:$a, (T ImmNode:$b)))]>; def ir : NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, RC:$b, 
Int1Regs:$p), !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>; + [(set T:$dst, (select i1:$p, ImmNode:$a, T:$b))]>; def ii : NVPTXInst<(outs RC:$dst), (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; + [(set T:$dst, (select i1:$p, ImmNode:$a, ImmNode:$b))]>; } } @@ -798,8 +856,8 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>; // defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>; foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { -def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))), - (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>; +def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), + (SELP_b32rr $a, $b, $p)>; } //----------------------------------- @@ -827,10 +885,10 @@ def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a), multiclass ADD_SUB_i1<SDNode OpNode> { def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), "xor.pred \t$dst, $a, $b;", - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + [(set i1:$dst, (OpNode i1:$a, i1:$b))]>; def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), "xor.pred \t$dst, $a, $b;", - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; + [(set i1:$dst, (OpNode i1:$a, (imm):$b))]>; } // int1 addition and subtraction are both just xor. @@ -839,8 +897,8 @@ defm SUB_i1 : ADD_SUB_i1<sub>; // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we // also use these for unsigned arithmetic. 
-defm ADD : I3<"add.s", add>; -defm SUB : I3<"sub.s", sub>; +defm ADD : I3<"add.s", add, /*commutative=*/ true>; +defm SUB : I3<"sub.s", sub, /*commutative=*/ false>; def ADD16x2 : I16x2<"add.s", add>; @@ -852,18 +910,18 @@ defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>; defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>; defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>; -defm MULT : I3<"mul.lo.s", mul>; +defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>; -defm MULTHS : I3<"mul.hi.s", mulhs>; -defm MULTHU : I3<"mul.hi.u", mulhu>; +defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>; +defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>; -defm SDIV : I3<"div.s", sdiv>; -defm UDIV : I3<"div.u", udiv>; +defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>; +defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>; // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM // will lower it. -defm SREM : I3<"rem.s", srem>; -defm UREM : I3<"rem.u", urem>; +defm SREM : I3<"rem.s", srem, /*commutative=*/ false>; +defm UREM : I3<"rem.u", urem, /*commutative=*/ false>; // Integer absolute value. NumBits should be one minus the bit width of RC. // This idiom implements the algorithm at @@ -871,17 +929,17 @@ defm UREM : I3<"rem.u", urem>; multiclass ABS<ValueType T, RegisterClass RC, string SizeName> { def : NVPTXInst<(outs RC:$dst), (ins RC:$a), !strconcat("abs", SizeName, " \t$dst, $a;"), - [(set (T RC:$dst), (abs (T RC:$a)))]>; + [(set T:$dst, (abs T:$a))]>; } defm ABS_16 : ABS<i16, Int16Regs, ".s16">; defm ABS_32 : ABS<i32, Int32Regs, ".s32">; defm ABS_64 : ABS<i64, Int64Regs, ".s64">; // Integer min/max. 
-defm SMAX : I3<"max.s", smax>; -defm UMAX : I3<"max.u", umax>; -defm SMIN : I3<"min.s", smin>; -defm UMIN : I3<"min.u", umin>; +defm SMAX : I3<"max.s", smax, /*commutative=*/ true>; +defm UMAX : I3<"max.u", umax, /*commutative=*/ true>; +defm SMIN : I3<"min.s", smin, /*commutative=*/ true>; +defm UMIN : I3<"min.u", umin, /*commutative=*/ true>; def SMAX16x2 : I16x2<"max.s", smax>; def UMAX16x2 : I16x2<"max.u", umax>; @@ -938,29 +996,29 @@ def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; // Matchers for signed, unsigned mul.wide ISD nodes. def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), - (MULWIDES32 i16:$a, i16:$b)>, + (MULWIDES32 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), - (MULWIDES32Imm Int16Regs:$a, imm:$b)>, +def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), + (MULWIDES32Imm $a, imm:$b)>, Requires<[doMulWide]>; def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + (MULWIDEU32 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), - (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, +def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), + (MULWIDEU32Imm $a, imm:$b)>, Requires<[doMulWide]>; def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), - (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + (MULWIDES64 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_signed (i32 Int32Regs:$a), imm:$b)), - (MULWIDES64Imm Int32Regs:$a, imm:$b)>, +def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), + (MULWIDES64Imm $a, imm:$b)>, Requires<[doMulWide]>; def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + (MULWIDEU64 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned (i32 Int32Regs:$a), imm:$b)), - (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, +def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), + (MULWIDEU64Imm $a, imm:$b)>, Requires<[doMulWide]>; // Predicates used for converting some 
patterns to mul.wide. @@ -1009,132 +1067,98 @@ def SHL2MUL16 : SDNodeXForm<imm, [{ }]>; // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. -def : Pat<(shl (sext Int32Regs:$a), (i32 IntConst_0_30:$b)), - (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, +def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)), + (MULWIDES64Imm $a, (SHL2MUL32 $b))>, Requires<[doMulWide]>; -def : Pat<(shl (zext Int32Regs:$a), (i32 IntConst_0_30:$b)), - (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, +def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)), + (MULWIDEU64Imm $a, (SHL2MUL32 $b))>, Requires<[doMulWide]>; -def : Pat<(shl (sext Int16Regs:$a), (i16 IntConst_0_14:$b)), - (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, +def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)), + (MULWIDES32Imm $a, (SHL2MUL16 $b))>, Requires<[doMulWide]>; -def : Pat<(shl (zext Int16Regs:$a), (i16 IntConst_0_14:$b)), - (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>, +def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)), + (MULWIDEU32Imm $a, (SHL2MUL16 $b))>, Requires<[doMulWide]>; // Convert "sign/zero-extend then multiply" to mul.wide. 
-def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), - (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, +def : Pat<(mul (sext i32:$a), (sext i32:$b)), + (MULWIDES64 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, +def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)), + (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>, Requires<[doMulWide]>; -def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, +def : Pat<(mul (zext i32:$a), (zext i32:$b)), + (MULWIDEU64 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, +def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)), + (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>, Requires<[doMulWide]>; -def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, +def : Pat<(mul (sext i16:$a), (sext i16:$b)), + (MULWIDES32 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, +def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)), + (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>, Requires<[doMulWide]>; -def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, +def : Pat<(mul (zext i16:$a), (zext i16:$b)), + (MULWIDEU32 $a, $b)>, Requires<[doMulWide]>; -def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, +def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), + (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>, Requires<[doMulWide]>; // // Integer multiply-add // -def SDTIMAD : - SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; -def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; - -def MAD16rrr : - NVPTXInst<(outs 
Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; -def MAD16rri : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; -def MAD16rir : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; -def MAD16rii : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>; - -def MAD32rrr : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>; -def MAD32rri : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), (i32 Int32Regs:$b), imm:$c))]>; -def MAD32rir : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, (i32 Int32Regs:$c)))]>; -def MAD32rii : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set (i32 Int32Regs:$dst), (imad (i32 Int32Regs:$a), imm:$b, imm:$c))]>; - -def MAD64rrr : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; -def MAD64rri : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), - "mad.lo.s64 
\t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; -def MAD64rir : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; -def MAD64rii : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>; +def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{ + return N->hasOneUse(); +}]>; + +multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> { + def rrr: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Reg:$b, Reg:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>; + + def rir: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Imm:$b, Reg:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>; + def rri: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Reg:$b, Imm:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>; + def rii: + NVPTXInst<(outs Reg:$dst), + (ins Reg:$a, Imm:$b, Imm:$c), + Ptx # " \t$dst, $a, $b, $c;", + [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>; +} + +let Predicates = [hasOptEnabled] in { +defm MAD16 : MAD<"mad.lo.s16", i16, Int16Regs, i16imm>; +defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>; +defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>; +} def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "neg.s16 \t$dst, $src;", - [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; + [(set i16:$dst, (ineg i16:$src))]>; def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), "neg.s32 \t$dst, $src;", - [(set (i32 Int32Regs:$dst), (ineg (i32 Int32Regs:$src)))]>; + [(set i32:$dst, (ineg i32:$src))]>; def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 
"neg.s64 \t$dst, $src;", - [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; + [(set i64:$dst, (ineg i64:$src))]>; //----------------------------------- // Floating Point Arithmetic @@ -1163,26 +1187,14 @@ def NegDoubleConst : SDNodeXForm<fpimm, [{ SDLoc(N), MVT::f64); }]>; -// Loads FP16 constant into a register. -// -// ptxas does not have hex representation for fp16, so we can't use -// fp16 immediate values in .f16 instructions. Instead we have to load -// the constant into a register using mov.b16. -def LOAD_CONST_F16 : - NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a), - "mov.b16 \t$dst, $a;", []>; -def LOAD_CONST_BF16 : - NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a), - "mov.b16 \t$dst, $a;", []>; defm FADD : F3_fma_component<"add", fadd>; defm FSUB : F3_fma_component<"sub", fsub>; defm FMUL : F3_fma_component<"mul", fmul>; -defm FMIN : F3<"min", fminnum>; -defm FMAX : F3<"max", fmaxnum>; -// Note: min.NaN.f64 and max.NaN.f64 do not actually exist. -defm FMINNAN : F3<"min.NaN", fminimum>; -defm FMAXNAN : F3<"max.NaN", fmaximum>; +defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>; +defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>; +defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>; +defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>; defm FABS : F2<"abs", fabs>; defm FNEG : F2<"neg", fneg>; @@ -1191,13 +1203,15 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>; defm FSQRT : F2<"sqrt.rn", fsqrt>; +defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>; + // // F16 NEG // class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> : NVPTXInst<(outs RC:$dst), (ins RC:$src), !strconcat(OpcStr, " \t$dst, $src;"), - [(set RC:$dst, (fneg (T RC:$src)))]>, + [(set T:$dst, (fneg T:$src))]>, Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>; def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>; def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>; @@ -1211,7 
+1225,7 @@ def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>; class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> : NVPTXInst<(outs RC:$dst), (ins RC:$src), !strconcat(OpcStr, " \t$dst, $src;"), - [(set RC:$dst, (fneg (T RC:$src)))]>, + [(set T:$dst, (fneg T:$src))]>, Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>; def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>; def BFNEG16 : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>; @@ -1225,22 +1239,22 @@ def FDIV641r : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$a, Float64Regs:$b), "rcp.rn.f64 \t$dst, $b;", - [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>; + [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>; def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b), "div.rn.f64 \t$dst, $a, $b;", - [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>; + [(set f64:$dst, (fdiv f64:$a, f64:$b))]>; def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b), "div.rn.f64 \t$dst, $a, $b;", - [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; + [(set f64:$dst, (fdiv f64:$a, fpimm:$b))]>; // fdiv will be converted to rcp // fneg (fdiv 1.0, X) => fneg (rcp.rn X) -def : Pat<(fdiv DoubleConstNeg1:$a, Float64Regs:$b), - (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>; +def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b), + (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>; // // F32 Approximate reciprocal @@ -1249,13 +1263,13 @@ def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.approx.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, Requires<[do_DIVF32_APPROX, doF32FTZ]>; def FDIV321r : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.approx.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv 
FloatConst1:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, Requires<[do_DIVF32_APPROX]>; // // F32 Approximate division @@ -1264,25 +1278,25 @@ def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, Requires<[do_DIVF32_APPROX, doF32FTZ]>; def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, Requires<[do_DIVF32_APPROX, doF32FTZ]>; def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.approx.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, Requires<[do_DIVF32_APPROX]>; def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.approx.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, Requires<[do_DIVF32_APPROX]>; // // F32 Semi-accurate reciprocal @@ -1293,13 +1307,13 @@ def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.approx.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, Requires<[do_DIVF32_FULL, doF32FTZ]>; def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.approx.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, Requires<[do_DIVF32_FULL]>; // // F32 Semi-accurate division @@ -1308,25 +1322,25 @@ def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins 
Float32Regs:$a, Float32Regs:$b), "div.full.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv Float32Regs:$a, f32:$b))]>, Requires<[do_DIVF32_FULL, doF32FTZ]>; def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.full.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, Requires<[do_DIVF32_FULL, doF32FTZ]>; def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.full.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, Requires<[do_DIVF32_FULL]>; def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.full.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, Requires<[do_DIVF32_FULL]>; // // F32 Accurate reciprocal @@ -1335,13 +1349,13 @@ def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.rn.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>, Requires<[doF32FTZ]>; def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$a, Float32Regs:$b), "rcp.rn.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>; + [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>; // // F32 Accurate division // @@ -1349,62 +1363,69 @@ def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.rn.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + [(set f32:$dst, (fdiv f32:$a, f32:$b))]>, Requires<[doF32FTZ]>; def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.rn.ftz.f32 \t$dst, $a, 
$b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>, Requires<[doF32FTZ]>; def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b), "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>; + [(set f32:$dst, (fdiv f32:$a, f32:$b))]>; def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b), "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>; + [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>; // // FMA // multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> { - def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, - Requires<[Pred]>; - def rri : NVPTXInst<(outs RC:$dst), - (ins RC:$a, RC:$b, ImmCls:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, - Requires<[Pred]>; - def rir : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, RC:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, - Requires<[Pred]>; - def rii : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, ImmCls:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, - Requires<[Pred]>; + defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;"; + def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), + asmstr, + [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, + Requires<[Pred]>; + def rri : NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, ImmCls:$c), + asmstr, + [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, + Requires<[Pred]>; + def rir : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, RC:$c), + asmstr, + [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, + Requires<[Pred]>; + def rii : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, 
ImmCls:$c), + asmstr, + [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, + Requires<[Pred]>; + def iir : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, RC:$c), + asmstr, + [(set RC:$dst, (fma fpimm:$a, fpimm:$b, RC:$c))]>, + Requires<[Pred]>; + } multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> { def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>, + [(set T:$dst, (fma T:$a, T:$b, T:$c))]>, Requires<[useFP16Math, Pred]>; } multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> { def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>, + [(set T:$dst, (fma T:$a, T:$b, T:$c))]>, Requires<[hasBF16Math, Pred]>; } @@ -1412,9 +1433,7 @@ defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>; defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>; defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>; defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>; -defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>; defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>; -defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>; defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>; defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>; @@ -1423,11 +1442,11 @@ defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>; // sin/cos def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "sin.approx.f32 \t$dst, $src;", - [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>, + [(set f32:$dst, (fsin f32:$src))]>, Requires<[allowUnsafeFPMath]>; def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins 
Float32Regs:$src), "cos.approx.f32 \t$dst, $src;", - [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>, + [(set f32:$dst, (fcos f32:$src))]>, Requires<[allowUnsafeFPMath]>; // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), @@ -1435,84 +1454,84 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), // semantics of LLVM's frem. // frem - f32 FTZ -def : Pat<(frem Float32Regs:$x, Float32Regs:$y), - (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 - (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ), - Float32Regs:$y))>, +def : Pat<(frem f32:$x, f32:$y), + (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32 + (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ), + $y))>, Requires<[doF32FTZ, allowUnsafeFPMath]>; -def : Pat<(frem Float32Regs:$x, fpimm:$y), - (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 - (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ), +def : Pat<(frem f32:$x, fpimm:$y), + (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32 + (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ), fpimm:$y))>, Requires<[doF32FTZ, allowUnsafeFPMath]>; -def : Pat<(frem Float32Regs:$x, Float32Regs:$y), - (SELP_f32rr Float32Regs:$x, - (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 - (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ), - Float32Regs:$y)), - (TESTINF_f32r Float32Regs:$y))>, +def : Pat<(frem f32:$x, f32:$y), + (SELP_f32rr $x, + (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32 + (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ), + $y)), + (TESTINF_f32r $y))>, Requires<[doF32FTZ, noUnsafeFPMath]>; -def : Pat<(frem Float32Regs:$x, fpimm:$y), - (SELP_f32rr Float32Regs:$x, - (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 - (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ), +def : Pat<(frem f32:$x, fpimm:$y), + (SELP_f32rr $x, + (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32 + (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ), fpimm:$y)), (TESTINF_f32i fpimm:$y))>, Requires<[doF32FTZ, 
noUnsafeFPMath]>; // frem - f32 -def : Pat<(frem Float32Regs:$x, Float32Regs:$y), - (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 - (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI), - Float32Regs:$y))>, +def : Pat<(frem f32:$x, f32:$y), + (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32 + (FDIV32rr_prec $x, $y), CvtRZI), + $y))>, Requires<[allowUnsafeFPMath]>; -def : Pat<(frem Float32Regs:$x, fpimm:$y), - (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 - (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI), +def : Pat<(frem f32:$x, fpimm:$y), + (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32 + (FDIV32ri_prec $x, fpimm:$y), CvtRZI), fpimm:$y))>, Requires<[allowUnsafeFPMath]>; -def : Pat<(frem Float32Regs:$x, Float32Regs:$y), - (SELP_f32rr Float32Regs:$x, - (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 - (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI), - Float32Regs:$y)), +def : Pat<(frem f32:$x, f32:$y), + (SELP_f32rr $x, + (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32 + (FDIV32rr_prec $x, $y), CvtRZI), + $y)), (TESTINF_f32r Float32Regs:$y))>, Requires<[noUnsafeFPMath]>; -def : Pat<(frem Float32Regs:$x, fpimm:$y), - (SELP_f32rr Float32Regs:$x, - (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 - (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI), +def : Pat<(frem f32:$x, fpimm:$y), + (SELP_f32rr $x, + (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32 + (FDIV32ri_prec $x, fpimm:$y), CvtRZI), fpimm:$y)), (TESTINF_f32i fpimm:$y))>, Requires<[noUnsafeFPMath]>; // frem - f64 -def : Pat<(frem Float64Regs:$x, Float64Regs:$y), - (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 - (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI), - Float64Regs:$y))>, +def : Pat<(frem f64:$x, f64:$y), + (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64 + (FDIV64rr $x, $y), CvtRZI), + $y))>, Requires<[allowUnsafeFPMath]>; -def : Pat<(frem Float64Regs:$x, fpimm:$y), - (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 - (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI), +def : Pat<(frem f64:$x, fpimm:$y), + (FSUBf64rr 
$x, (FMULf64ri (CVT_f64_f64 + (FDIV64ri $x, fpimm:$y), CvtRZI), fpimm:$y))>, Requires<[allowUnsafeFPMath]>; -def : Pat<(frem Float64Regs:$x, Float64Regs:$y), - (SELP_f64rr Float64Regs:$x, - (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 - (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI), - Float64Regs:$y)), +def : Pat<(frem f64:$x, f64:$y), + (SELP_f64rr $x, + (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64 + (FDIV64rr $x, $y), CvtRZI), + $y)), (TESTINF_f64r Float64Regs:$y))>, Requires<[noUnsafeFPMath]>; -def : Pat<(frem Float64Regs:$x, fpimm:$y), - (SELP_f64rr Float64Regs:$x, - (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 - (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI), +def : Pat<(frem f64:$x, fpimm:$y), + (SELP_f64rr $x, + (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64 + (FDIV64ri $x, fpimm:$y), CvtRZI), fpimm:$y)), - (TESTINF_f64r Float64Regs:$y))>, + (TESTINF_f64r $y))>, Requires<[noUnsafeFPMath]>; //----------------------------------- @@ -1525,35 +1544,35 @@ multiclass BITWISE<string OpcStr, SDNode OpNode> { def b1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + [(set i1:$dst, (OpNode i1:$a, i1:$b))]>; def b1ri : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; + [(set i1:$dst, (OpNode i1:$a, imm:$b))]>; def b16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + [(set i16:$dst, (OpNode i16:$a, i16:$b))]>; def b16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + [(set i16:$dst, (OpNode i16:$a, imm:$b))]>; def b32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, 
Int32Regs:$b), !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; + [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; def b32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), imm:$b))]>; + [(set i32:$dst, (OpNode i32:$a, imm:$b))]>; def b64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + [(set i64:$dst, (OpNode i64:$a, i64:$b))]>; def b64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + [(set i64:$dst, (OpNode i64:$a, imm:$b))]>; } defm OR : BITWISE<"or", or>; @@ -1561,46 +1580,46 @@ defm AND : BITWISE<"and", and>; defm XOR : BITWISE<"xor", xor>; // PTX does not support mul on predicates, convert to and instructions -def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>; -def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>; +def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>; +def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>; // These transformations were once reliably performed by instcombine, but thanks // to poison semantics they are no longer safe for LLVM IR, perform them here // instead. -def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>; -def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>; +def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>; +def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>; // Lower logical v2i16/v4i8 ops as bitwise ops on b32. 
foreach vt = [v2i16, v4i8] in { - def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)), - (ORb32rr Int32Regs:$a, Int32Regs:$b)>; - def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)), - (XORb32rr Int32Regs:$a, Int32Regs:$b)>; - def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)), - (ANDb32rr Int32Regs:$a, Int32Regs:$b)>; + def: Pat<(or vt:$a, vt:$b), + (ORb32rr $a, $b)>; + def: Pat<(xor vt:$a, vt:$b), + (XORb32rr $a, $b)>; + def: Pat<(and vt:$a, vt:$b), + (ANDb32rr $a, $b)>; // The constants get legalized into a bitcast from i32, so that's what we need // to match here. - def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), - (ORb32ri Int32Regs:$a, imm:$b)>; - def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), - (XORb32ri Int32Regs:$a, imm:$b)>; - def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))), - (ANDb32ri Int32Regs:$a, imm:$b)>; + def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))), + (ORb32ri $a, imm:$b)>; + def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))), + (XORb32ri $a, imm:$b)>; + def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))), + (ANDb32ri $a, imm:$b)>; } def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), "not.pred \t$dst, $src;", - [(set Int1Regs:$dst, (not Int1Regs:$src))]>; + [(set i1:$dst, (not i1:$src))]>; def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "not.b16 \t$dst, $src;", - [(set Int16Regs:$dst, (not Int16Regs:$src))]>; + [(set i16:$dst, (not i16:$src))]>; def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), "not.b32 \t$dst, $src;", - [(set (i32 Int32Regs:$dst), (not (i32 Int32Regs:$src)))]>; + [(set i32:$dst, (not i32:$src))]>; def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), "not.b64 \t$dst, $src;", - [(set Int64Regs:$dst, (not Int64Regs:$src))]>; + [(set i64:$dst, (not i64:$src))]>; // Template for left/right shifts. Takes three operands, // [dest (reg), src (reg), shift (reg or imm)]. 
@@ -1611,31 +1630,31 @@ multiclass SHIFT<string OpcStr, SDNode OpNode> { def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 Int32Regs:$b)))]>; + [(set i64:$dst, (OpNode i64:$a, i32:$b))]>; def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; + [(set i64:$dst, (OpNode i64:$a, (i32 imm:$b)))]>; def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 Int32Regs:$b)))]>; + [(set i32:$dst, (OpNode i32:$a, i32:$b))]>; def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 Int32Regs:$a), (i32 imm:$b)))]>; + [(set i32:$dst, (OpNode i32:$a, (i32 imm:$b)))]>; def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; + [(set i32:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 Int32Regs:$b)))]>; + [(set i16:$dst, (OpNode i16:$a, i32:$b))]>; def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; + [(set i16:$dst, (OpNode i16:$a, (i32 imm:$b)))]>; } defm SHL : SHIFT<"shl.b", shl>; @@ -1646,173 +1665,12 @@ defm SRL : SHIFT<"shr.u", srl>; def BREV32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), "brev.b32 \t$dst, $a;", - [(set Int32Regs:$dst, (bitreverse (i32 Int32Regs:$a)))]>; + [(set i32:$dst, 
(bitreverse i32:$a))]>; def BREV64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a), "brev.b64 \t$dst, $a;", - [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; + [(set i64:$dst, (bitreverse i64:$a))]>; -// -// Rotate: Use ptx shf instruction if available. -// - -// 32 bit r2 = rotl r1, n -// => -// r2 = shf.l r1, r1, n -def ROTL32imm_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTL32reg_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[hasHWROT32]>; - -// 32 bit r2 = rotr r1, n -// => -// r2 = shf.r r1, r1, n -def ROTR32imm_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTR32reg_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[hasHWROT32]>; - -// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 
-def ROT32imm_sw : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - "shl.b32 \t%lhs, $src, $amt1;\n\t" - "shr.b32 \t%rhs, $src, $amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - []>; - -def SUB_FRM_32 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, - Requires<[noHWROT32]>; -def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, - Requires<[noHWROT32]>; - -// 32-bit software rotate left by register. -def ROTL32reg_sw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - ".reg .b32 %amt2;\n\t" - "shl.b32 \t%lhs, $src, $amt;\n\t" - "sub.s32 \t%amt2, 32, $amt;\n\t" - "shr.b32 \t%rhs, $src, %amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[noHWROT32]>; - -// 32-bit software rotate right by register. -def ROTR32reg_sw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - ".reg .b32 %amt2;\n\t" - "shr.b32 \t%lhs, $src, $amt;\n\t" - "sub.s32 \t%amt2, 32, $amt;\n\t" - "shl.b32 \t%rhs, $src, %amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[noHWROT32]>; - -// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. 
-def ROT64imm_sw : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - "shl.b64 \t%lhs, $src, $amt1;\n\t" - "shr.b64 \t%rhs, $src, $amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - []>; - -def SUB_FRM_64 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; -def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; - -// 64-bit software rotate left by register. -def ROTL64reg_sw : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - ".reg .u32 %amt2;\n\t" - "and.b32 \t%amt2, $amt, 63;\n\t" - "shl.b64 \t%lhs, $src, %amt2;\n\t" - "sub.u32 \t%amt2, 64, %amt2;\n\t" - "shr.b64 \t%rhs, $src, %amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>; - -def ROTR64reg_sw : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - ".reg .u32 %amt2;\n\t" - "and.b32 \t%amt2, $amt, 63;\n\t" - "shr.b64 \t%lhs, $src, %amt2;\n\t" - "sub.u32 \t%amt2, 64, %amt2;\n\t" - "shl.b64 \t%rhs, $src, %amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>; - -// -// Funnnel shift in clamp mode -// - -// Create SDNodes so they can be used in the DAG code, e.g. 
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) -def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; -def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; - -def FUNSHFLCLAMP : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; - -def FUNSHFRCLAMP : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; // // BFE - bit-field extract @@ -1844,17 +1702,17 @@ multiclass BFE<string Instr, ValueType T, RegisterClass RC> { : NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, Int32Regs:$c), !strconcat(Instr, " \t$d, $a, $b, $c;"), - [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>; + [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>; def rri : NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, i32imm:$c), !strconcat(Instr, " \t$d, $a, $b, $c;"), - [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>; + [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>; def rii : NVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c), !strconcat(Instr, " \t$d, $a, $b, $c;"), - [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>; + [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>; } multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> { @@ -1862,32 +1720,36 @@ multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> { : NVPTXInst<(outs RC:$f), (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d), !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), - [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>; + [(set T:$f, (bfi T:$a, T:$b, i32:$c, i32:$d))]>; 
def rrri : NVPTXInst<(outs RC:$f), (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d), !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), - [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>; + [(set T:$f, (bfi T:$a, T:$b, i32:$c, imm:$d))]>; def rrii : NVPTXInst<(outs RC:$f), (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d), !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), - [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>; + [(set T:$f, (bfi T:$a, T:$b, imm:$c, imm:$d))]>; def irrr : NVPTXInst<(outs RC:$f), (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d), !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), - [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>; + [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, i32:$d))]>; def irri : NVPTXInst<(outs RC:$f), (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d), !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), - [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>; + [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, imm:$d))]>; def irii : NVPTXInst<(outs RC:$f), (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d), !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), - [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>; + [(set T:$f, (bfi (T imm:$a), T:$b, imm:$c, imm:$d))]>; +} + +def Hexu32imm : Operand<i32> { + let PrintMethod = "printHexu32imm"; } multiclass PRMT<ValueType T, RegisterClass RC> { @@ -1895,17 +1757,17 @@ multiclass PRMT<ValueType T, RegisterClass RC> { : NVPTXInst<(outs RC:$d), (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode), !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), - [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>; + [(set T:$d, (prmt T:$a, T:$b, i32:$c, imm:$mode))]>; def rri : NVPTXInst<(outs RC:$d), - (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode), + (ins RC:$a, Int32Regs:$b, Hexu32imm:$c, PrmtMode:$mode), !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), - 
[(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>; + [(set T:$d, (prmt T:$a, T:$b, imm:$c, imm:$mode))]>; def rii : NVPTXInst<(outs RC:$d), - (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode), + (ins RC:$a, i32imm:$b, Hexu32imm:$c, PrmtMode:$mode), !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"), - [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>; + [(set T:$d, (prmt T:$a, imm:$b, imm:$c, imm:$mode))]>; } let hasSideEffects = false in { @@ -1926,35 +1788,35 @@ let hasSideEffects = false in { // byte extraction + signed/unsigned extension to i32. -def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)), - (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>; -def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)), - (BFE_S32rii Int32Regs:$s, imm:$o, 8)>; -def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)), - (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>; -def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)), - (BFE_U32rii Int32Regs:$s, imm:$o, 8)>; +def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)), + (BFE_S32rri $s, $o, 8)>; +def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)), + (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)), + (BFE_U32rri $s, $o, 8)>; +def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)), + (BFE_U32rii $s, imm:$o, 8)>; // byte extraction + signed extension to i16 -def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)), - (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)), + (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; // Byte extraction via shift/trunc/sext -def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)), - (CVT_s8_s32 Int32Regs:$s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)), - (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 
imm:$o, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8), - (BFE_S32rii Int32Regs:$s, imm:$o, 8)>; -def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))), - (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8), - (BFE_S64rii Int64Regs:$s, imm:$o, 8)>; -def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)), - (CVT_s8_s64 Int64Regs:$s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)), - (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), + (CVT_s8_s32 $s, CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), + (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; +def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), + (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))), + (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>; +def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), + (BFE_S64rii $s, imm:$o, 8)>; +def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), + (CVT_s8_s64 $s, CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), + (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; //----------------------------------- // Comparison instructions (setp, set) @@ -2053,10 +1915,10 @@ defm SET_f64 : SET<"f64", Float64Regs, f64imm>; // Data Movement (Load / Store, Move) //----------------------------------- -def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], - [SDNPWantRoot]>; -def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex], - [SDNPWantRoot]>; +let WantsRoot = true in { + def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex]>; + def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex]>; +} def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>; def MEMri : Operand<i32> { @@ -2072,7 +1934,7 @@ def imem : Operand<iPTR> { let 
PrintMethod = "printOperand"; } -def imemAny : Operand<iPTRAny> { +def imemAny : Operand<pAny> { let PrintMethod = "printOperand"; } @@ -2084,16 +1946,20 @@ def MmaCode : Operand<i32> { let PrintMethod = "printMmaCode"; } +def Offseti32imm : Operand<i32> { + let PrintMethod = "printOffseti32imm"; +} + def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; // Load a memory address into a u32 or u64 register. def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), "mov.u32 \t$dst, $a;", - [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; + [(set i32:$dst, (Wrapper tglobaladdr:$a))]>; def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), "mov.u64 \t$dst, $a;", - [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; + [(set i64:$dst, (Wrapper tglobaladdr:$a))]>; // Get pointer to local stack. let hasSideEffects = false in { @@ -2105,7 +1971,7 @@ let hasSideEffects = false in { // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp -let IsSimpleMove=1, hasSideEffects=0 in { +let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in { def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), "mov.pred \t$dst, $sss;", []>; def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), @@ -2117,48 +1983,37 @@ let IsSimpleMove=1, hasSideEffects=0 in { def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss), "mov.b128 \t$dst, $sss;", []>; - def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), - "mov.b16 \t$dst, $sss;", []>; - def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), - "mov.b32 \t$dst, $sss;", []>; - def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), - "mov.b64 \t$dst, $sss;", []>; - - def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - // We have to use .b16 here as there's no mov.f16. 
- "mov.b16 \t$dst, $src;", []>; def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "mov.f32 \t$dst, $src;", []>; def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), "mov.f64 \t$dst, $src;", []>; -} -def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), - "mov.pred \t$dst, $src;", - [(set Int1Regs:$dst, imm:$src)]>; -def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.u16 \t$dst, $src;", - [(set Int16Regs:$dst, imm:$src)]>; -def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.u32 \t$dst, $src;", - [(set (i32 Int32Regs:$dst), imm:$src)]>; -def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.u64 \t$dst, $src;", - [(set Int64Regs:$dst, imm:$src)]>; - -def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.b16 \t$dst, $src;", []>; -def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.b32 \t$dst, $src;", []>; -def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.b64 \t$dst, $src;", []>; - -def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), - "mov.f32 \t$dst, $src;", - [(set Float32Regs:$dst, fpimm:$src)]>; -def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), - "mov.f64 \t$dst, $src;", - [(set Float64Regs:$dst, fpimm:$src)]>; + def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), + "mov.pred \t$dst, $src;", + [(set i1:$dst, imm:$src)]>; + def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), + "mov.b16 \t$dst, $src;", + [(set i16:$dst, imm:$src)]>; + def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), + "mov.b32 \t$dst, $src;", + [(set i32:$dst, imm:$src)]>; + def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), + "mov.b64 \t$dst, $src;", + [(set i64:$dst, imm:$src)]>; + + def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src), + "mov.b16 \t$dst, $src;", + [(set f16:$dst, fpimm:$src)]>; + def 
BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src), + "mov.b16 \t$dst, $src;", + [(set bf16:$dst, fpimm:$src)]>; + def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), + "mov.f32 \t$dst, $src;", + [(set f32:$dst, fpimm:$src)]>; + def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), + "mov.f64 \t$dst, $src;", + [(set f64:$dst, fpimm:$src)]>; +} def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; @@ -2166,10 +2021,10 @@ def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; //---- Copy Frame Index ---- def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), "add.u32 \t$dst, ${addr:add};", - [(set Int32Regs:$dst, ADDRri:$addr)]>; + [(set i32:$dst, ADDRri:$addr)]>; def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), "add.u64 \t$dst, ${addr:add};", - [(set Int64Regs:$dst, ADDRri64:$addr)]>; + [(set i64:$dst, ADDRri64:$addr)]>; //----------------------------------- // Comparison and Selection @@ -2196,47 +2051,47 @@ multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode, Instruction set_64ir> { // i16 -> pred def : Pat<(i1 (OpNode i16:$a, i16:$b)), - (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), - (setp_16ri Int16Regs:$a, imm:$b, Mode)>; - def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)), - (setp_16ir imm:$a, Int16Regs:$b, Mode)>; + (setp_16rr $a, $b, Mode)>; + def : Pat<(i1 (OpNode i16:$a, imm:$b)), + (setp_16ri $a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, i16:$b)), + (setp_16ir imm:$a, $b, Mode)>; // i32 -> pred def : Pat<(i1 (OpNode i32:$a, i32:$b)), - (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; - def : Pat<(i1 (OpNode (i32 Int32Regs:$a), imm:$b)), - (setp_32ri Int32Regs:$a, imm:$b, Mode)>; - def : Pat<(i1 (OpNode imm:$a, (i32 Int32Regs:$b))), - (setp_32ir imm:$a, Int32Regs:$b, Mode)>; + (setp_32rr $a, $b, Mode)>; + def : 
Pat<(i1 (OpNode i32:$a, imm:$b)), + (setp_32ri $a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, i32:$b)), + (setp_32ir imm:$a, $b, Mode)>; // i64 -> pred - def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), - (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), - (setp_64ri Int64Regs:$a, imm:$b, Mode)>; - def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), - (setp_64ir imm:$a, Int64Regs:$b, Mode)>; + def : Pat<(i1 (OpNode i64:$a, i64:$b)), + (setp_64rr $a, $b, Mode)>; + def : Pat<(i1 (OpNode i64:$a, imm:$b)), + (setp_64ri $a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, i64:$b)), + (setp_64ir imm:$a, $b, Mode)>; // i16 -> i32 def : Pat<(i32 (OpNode i16:$a, i16:$b)), - (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)), - (set_16ri Int16Regs:$a, imm:$b, Mode)>; - def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), - (set_16ir imm:$a, Int16Regs:$b, Mode)>; + (set_16rr $a, $b, Mode)>; + def : Pat<(i32 (OpNode i16:$a, imm:$b)), + (set_16ri $a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, i16:$b)), + (set_16ir imm:$a, $b, Mode)>; // i32 -> i32 def : Pat<(i32 (OpNode i32:$a, i32:$b)), - (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; - def : Pat<(i32 (OpNode (i32 Int32Regs:$a), imm:$b)), - (set_32ri Int32Regs:$a, imm:$b, Mode)>; - def : Pat<(i32 (OpNode imm:$a, (i32 Int32Regs:$b))), - (set_32ir imm:$a, Int32Regs:$b, Mode)>; + (set_32rr $a, $b, Mode)>; + def : Pat<(i32 (OpNode i32:$a, imm:$b)), + (set_32ri $a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, i32:$b)), + (set_32ir imm:$a, $b, Mode)>; // i64 -> i32 - def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), - (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), - (set_64ri Int64Regs:$a, imm:$b, Mode)>; - def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), - (set_64ir imm:$a, Int64Regs:$b, Mode)>; + def : Pat<(i32 (OpNode i64:$a, Int64Regs:$b)), + (set_64rr $a, $b, Mode)>; + def : Pat<(i32 
(OpNode i64:$a, imm:$b)), + (set_64ri $a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, i64:$b)), + (set_64ir imm:$a, $b, Mode)>; } multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> @@ -2276,17 +2131,6 @@ defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>; defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>; defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>; -// i1 compares -def : Pat<(setne Int1Regs:$a, Int1Regs:$b), - (XORb1rr Int1Regs:$a, Int1Regs:$b)>; -def : Pat<(setune Int1Regs:$a, Int1Regs:$b), - (XORb1rr Int1Regs:$a, Int1Regs:$b)>; - -def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), - (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; -def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), - (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; - // comparisons of i8 extracted with BFE as i32 // It's faster to do comparison directly on i32 extracted by BFE, // instead of the long conversion and sign extending. @@ -2353,143 +2197,95 @@ def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>; // i1 compare -> i32 -def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), - (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; -def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), - (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; +def : Pat<(i32 (setne i1:$a, i1:$b)), + (SELP_u32ii -1, 0, (XORb1rr $a, $b))>; +def : Pat<(i32 (setne i1:$a, i1:$b)), + (SELP_u32ii 0, -1, (XORb1rr $a, $b))>; multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { // f16 -> pred - def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), - (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), - (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)), - (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, - 
Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode (f16 Int16Regs:$a), fpimm:$b)), - (SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))), - (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, + def : Pat<(i1 (OpNode f16:$a, f16:$b)), + (SETP_f16rr $a, $b, ModeFTZ)>, Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode fpimm:$a, (f16 Int16Regs:$b))), - (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, + def : Pat<(i1 (OpNode f16:$a, f16:$b)), + (SETP_f16rr $a, $b, Mode)>, Requires<[useFP16Math]>; // bf16 -> pred - def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), - (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, - Requires<[hasBF16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), - (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, - Requires<[hasBF16Math]>; - def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), - (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, + def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), + (SETP_bf16rr $a, $b, ModeFTZ)>, Requires<[hasBF16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), - (SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, - Requires<[hasBF16Math]>; - def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), - (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, - Requires<[hasBF16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), - (SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, + def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), + (SETP_bf16rr $a, $b, Mode)>, Requires<[hasBF16Math]>; // f32 -> pred - def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, + def : Pat<(i1 (OpNode f32:$a, f32:$b)), + (SETP_f32rr $a, $b, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(i1 (OpNode Float32Regs:$a, 
Float32Regs:$b)), - (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; + def : Pat<(i1 (OpNode f32:$a, f32:$b)), + (SETP_f32rr $a, $b, Mode)>; def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), - (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, + (SETP_f32ri $a, fpimm:$b, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), - (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), - (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, + def : Pat<(i1 (OpNode f32:$a, fpimm:$b)), + (SETP_f32ri $a, fpimm:$b, Mode)>; + def : Pat<(i1 (OpNode fpimm:$a, f32:$b)), + (SETP_f32ir fpimm:$a, $b, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), - (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>; + def : Pat<(i1 (OpNode fpimm:$a, f32:$b)), + (SETP_f32ir fpimm:$a, $b, Mode)>; // f64 -> pred - def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)), - (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)), - (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)), - (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>; + def : Pat<(i1 (OpNode f64:$a, f64:$b)), + (SETP_f64rr $a, $b, Mode)>; + def : Pat<(i1 (OpNode f64:$a, fpimm:$b)), + (SETP_f64ri $a, fpimm:$b, Mode)>; + def : Pat<(i1 (OpNode fpimm:$a, f64:$b)), + (SETP_f64ir fpimm:$a, $b, Mode)>; // f16 -> i32 - def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), - (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b))), - (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i32 (OpNode (f16 Int16Regs:$a), fpimm:$b)), - (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, + def : Pat<(i32 (OpNode f16:$a, f16:$b)), + (SET_f16rr $a, $b, ModeFTZ)>, Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 
(OpNode (f16 Int16Regs:$a), fpimm:$b)), - (SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))), - (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, (f16 Int16Regs:$b))), - (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>, + def : Pat<(i32 (OpNode f16:$a, f16:$b)), + (SET_f16rr $a, $b, Mode)>, Requires<[useFP16Math]>; // bf16 -> i32 - def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), - (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>, - Requires<[hasBF16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), - (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>, - Requires<[hasBF16Math]>; - def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), - (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>, - Requires<[hasBF16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode (bf16 Int16Regs:$a), fpimm:$b)), - (SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>, - Requires<[hasBF16Math]>; - def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), - (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>, + def : Pat<(i32 (OpNode bf16:$a, bf16:$b)), + (SET_bf16rr $a, $b, ModeFTZ)>, Requires<[hasBF16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, (bf16 Int16Regs:$b))), - (SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>, + def : Pat<(i32 (OpNode bf16:$a, bf16:$b)), + (SET_bf16rr $a, $b, Mode)>, Requires<[hasBF16Math]>; // f32 -> i32 - def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, + def : Pat<(i32 (OpNode f32:$a, f32:$b)), + (SET_f32rr $a, $b, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), - (SET_f32ri 
Float32Regs:$a, fpimm:$b, ModeFTZ)>, + def : Pat<(i32 (OpNode f32:$a, f32:$b)), + (SET_f32rr $a, $b, Mode)>; + def : Pat<(i32 (OpNode f32:$a, fpimm:$b)), + (SET_f32ri $a, fpimm:$b, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), - (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), - (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, + def : Pat<(i32 (OpNode f32:$a, fpimm:$b)), + (SET_f32ri $a, fpimm:$b, Mode)>; + def : Pat<(i32 (OpNode fpimm:$a, f32:$b)), + (SET_f32ir fpimm:$a, $b, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), - (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>; + def : Pat<(i32 (OpNode fpimm:$a, f32:$b)), + (SET_f32ir fpimm:$a, $b, Mode)>; // f64 -> i32 - def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)), - (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)), - (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)), - (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>; + def : Pat<(i32 (OpNode f64:$a, f64:$b)), + (SET_f64rr $a, $b, Mode)>; + def : Pat<(i32 (OpNode f64:$a, fpimm:$b)), + (SET_f64ri $a, fpimm:$b, Mode)>; + def : Pat<(i32 (OpNode fpimm:$a, f64:$b)), + (SET_f64ir fpimm:$a, $b, Mode)>; } defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>; @@ -2632,21 +2428,21 @@ def ProxyReg : let mayLoad = true in { class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"), + NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), + !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"), []>; class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), + NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b), !strconcat("ld.param.v2", opstr, - " 
\t{{$dst, $dst2}}, [retval0+$b];"), []>; + " \t{{$dst, $dst2}}, [retval0$b];"), []>; class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins i32imm:$b), + (ins Offseti32imm:$b), !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), + " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"), []>; } @@ -2662,8 +2458,8 @@ let mayStore = true in { if !or(support_imm, !isa<NVPTXRegClass>(op)) then def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") : NVPTXInst<(outs), - (ins op:$val, i32imm:$a, i32imm:$b), - "st.param" # opstr # " \t[param$a+$b], $val;", + (ins op:$val, i32imm:$a, Offseti32imm:$b), + "st.param" # opstr # " \t[param$a$b], $val;", []>; } @@ -2674,8 +2470,8 @@ let mayStore = true in { # !if(!isa<NVPTXRegClass>(op2), "r", "i") : NVPTXInst<(outs), (ins op1:$val1, op2:$val2, - i32imm:$a, i32imm:$b), - "st.param.v2" # opstr # " \t[param$a+$b], {{$val1, $val2}};", + i32imm:$a, Offseti32imm:$b), + "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};", []>; } @@ -2691,29 +2487,29 @@ let mayStore = true in { : NVPTXInst<(outs), (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, - i32imm:$a, i32imm:$b), + i32imm:$a, Offseti32imm:$b), "st.param.v4" # opstr # - " \t[param$a+$b], {{$val1, $val2, $val3, $val4}};", + " \t[param$a$b], {{$val1, $val2, $val3, $val4}};", []>; } class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), - !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"), + NVPTXInst<(outs), (ins regclass:$val, Offseti32imm:$a), + !strconcat("st.param", opstr, " \t[func_retval0$a], $val;"), []>; class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, Offseti32imm:$a), !strconcat("st.param.v2", opstr, - " \t[func_retval0+$a], {{$val, $val2}};"), 
+ " \t[func_retval0$a], {{$val, $val2}};"), []>; class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3, - regclass:$val4, i32imm:$a), + regclass:$val4, Offseti32imm:$a), !strconcat("st.param.v4", opstr, - " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), + " \t[func_retval0$a], {{$val, $val2, $val3, $val4}};"), []>; } @@ -2867,7 +2663,7 @@ def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", [(CallVoid i32:$addr)]>; def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", - [(CallVoid Int64Regs:$addr)]>; + [(CallVoid i64:$addr)]>; def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", [(Prototype (i32 imm:$val))]>; @@ -2900,7 +2696,7 @@ def DeclareScalarRegInst : class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> : NVPTXInst<(outs regclass:$dst), (ins regclass:$src), !strconcat("mov", asmstr, " \t$dst, $src;"), - [(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>; + [(set T:$dst, (MoveParam T:$src))]>; class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt, string asmstr> : @@ -2935,7 +2731,7 @@ def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>; class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst), (ins regclass:$src), !strconcat("mov.", SzStr, " \t$dst, $src;"), - [(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>; + [(set T:$dst, (ProxyReg T:$src))]>; def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>; def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>; @@ -2945,11 +2741,11 @@ def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>; def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>; foreach vt = [f16, bf16] in { - def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>; + def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 
$src)>; } foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { - def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>; + def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 $src)>; } // @@ -2958,40 +2754,40 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { multiclass LD<NVPTXRegClass regclass> { def _avar : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _areg_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; def _ari : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset), + 
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr$offset];", []>; def _ari_64 : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr$offset];", []>; def _asi : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { @@ -3006,40 +2802,43 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST<NVPTXRegClass regclass> { def _avar : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, 
Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _areg_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; def _ari : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr+$offset], $src;", []>; + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, + Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr$offset], $src;", []>; def _ari_64 : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr+$offset], $src;", []>; + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, + Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr$offset], $src;", []>; def _asi : NVPTXInst< (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " 
\t[$addr+$offset], $src;", []>; + (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, + Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr$offset], $src;", []>; } let mayStore=1, hasSideEffects=0 in { @@ -3057,76 +2856,76 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC<NVPTXRegClass regclass> { def _v2_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v2_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, 
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr$offset];", []>; def _v2_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr$offset];", []>; def _v2_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr$offset];", []>; def _v4_avar : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, 
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_areg_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; def _v4_ari : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>; def _v4_ari_64 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - 
(ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>; def _v4_asi : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>; } let mayLoad=1, hasSideEffects=0 in { defm LDV_i8 : LD_VEC<Int16Regs>; @@ -3140,85 +2939,88 @@ let mayLoad=1, hasSideEffects=0 in { multiclass ST_VEC<NVPTXRegClass regclass> { def _v2_avar : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + imem:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, 
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int32Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_areg_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int64Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v2_ari : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, - i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2}};", []>; + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + Int32Regs:$addr, Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr$offset], {{$src1, $src2}};", []>; def _v2_ari_64 : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, - i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2}};", []>; + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 
i32imm:$fromWidth, + Int64Regs:$addr, Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr$offset], {{$src1, $src2}};", []>; def _v2_asi : NVPTXInst< (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, - i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2}};", []>; + (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, + LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + imem:$addr, Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr$offset], {{$src1, $src2}};", []>; def _v4_avar : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_areg_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, 
LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_ari_64 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>; def _v4_asi : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" - "$fromWidth \t[$addr+$offset], {{$src1, 
$src2, $src3, $src4}};", []>; + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "$fromWidth \t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>; } let mayStore=1, hasSideEffects=0 in { @@ -3237,7 +3039,7 @@ class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut, NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> : NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), !strconcat("mov.b", SzStr, " \t$d, $a;"), - [(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>; + [(set TOut:$d, (bitconvert TIn:$a))]>; def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>; def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>; @@ -3246,310 +3048,307 @@ def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>; foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { def: Pat<(vt (bitconvert (f32 Float32Regs:$a))), - (BITCONVERT_32_F2I Float32Regs:$a)>; -def: Pat<(f32 (bitconvert (vt Int32Regs:$a))), - (BITCONVERT_32_I2F Int32Regs:$a)>; + (BITCONVERT_32_F2I $a)>; +def: Pat<(f32 (bitconvert vt:$a)), + (BITCONVERT_32_I2F $a)>; } foreach vt = [f16, bf16] in { -def: Pat<(vt (bitconvert (i16 UInt16Const:$a))), - (IMOVB16ri UInt16Const:$a)>; -def: Pat<(vt (bitconvert (i16 Int16Regs:$a))), - (ProxyRegI16 Int16Regs:$a)>; -def: Pat<(i16 (bitconvert (vt Int16Regs:$a))), - (ProxyRegI16 Int16Regs:$a)>; + def: Pat<(vt (bitconvert i16:$a)), + (vt Int16Regs:$a)>; + def: Pat<(i16 (bitconvert vt:$a)), + (i16 Int16Regs:$a)>; } foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in { - def: Pat<(ta (bitconvert (i32 UInt32Const:$a))), - (IMOVB32ri UInt32Const:$a)>; foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in { if !ne(ta, tb) then { - def: Pat<(ta (bitconvert (tb Int32Regs:$a))), - (ProxyRegI32 Int32Regs:$a)>; + def: Pat<(ta (bitconvert tb:$a)), + (ta Int32Regs:$a)>; } } } // NOTE: pred->fp are currently sub-optimal due to an issue in 
TableGen where // we cannot specify floating-point literals in isel patterns. Therefore, we -// use an integer selp to select either 1 or 0 and then cvt to floating-point. +// use an integer selp to select either 1 (or -1 in case of signed) or 0 +// and then cvt to floating-point. // sint -> f16 -def : Pat<(f16 (sint_to_fp Int1Regs:$a)), - (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f16 (sint_to_fp i1:$a)), + (CVT_f16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>; def : Pat<(f16 (sint_to_fp Int16Regs:$a)), - (CVT_f16_s16 Int16Regs:$a, CvtRN)>; -def : Pat<(f16 (sint_to_fp Int32Regs:$a)), - (CVT_f16_s32 Int32Regs:$a, CvtRN)>; -def : Pat<(f16 (sint_to_fp Int64Regs:$a)), - (CVT_f16_s64 Int64Regs:$a, CvtRN)>; + (CVT_f16_s16 $a, CvtRN)>; +def : Pat<(f16 (sint_to_fp i32:$a)), + (CVT_f16_s32 $a, CvtRN)>; +def : Pat<(f16 (sint_to_fp i64:$a)), + (CVT_f16_s64 $a, CvtRN)>; // uint -> f16 -def : Pat<(f16 (uint_to_fp Int1Regs:$a)), - (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f16 (uint_to_fp i1:$a)), + (CVT_f16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>; def : Pat<(f16 (uint_to_fp Int16Regs:$a)), - (CVT_f16_u16 Int16Regs:$a, CvtRN)>; -def : Pat<(f16 (uint_to_fp Int32Regs:$a)), - (CVT_f16_u32 Int32Regs:$a, CvtRN)>; -def : Pat<(f16 (uint_to_fp Int64Regs:$a)), - (CVT_f16_u64 Int64Regs:$a, CvtRN)>; + (CVT_f16_u16 $a, CvtRN)>; +def : Pat<(f16 (uint_to_fp i32:$a)), + (CVT_f16_u32 $a, CvtRN)>; +def : Pat<(f16 (uint_to_fp i64:$a)), + (CVT_f16_u64 $a, CvtRN)>; // sint -> bf16 -def : Pat<(bf16 (sint_to_fp Int1Regs:$a)), - (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp Int16Regs:$a)), - (CVT_bf16_s16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp Int32Regs:$a)), - (CVT_bf16_s32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp Int64Regs:$a)), - (CVT_bf16_s64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 
+def : Pat<(bf16 (sint_to_fp i1:$a)),  // signed i1: true is -1, so select -1/0 like the f16/f32/f64 variants
+          (CVT_bf16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i16:$a)),
+          (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i32:$a)),
+          (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i64:$a)),
+          (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 // uint -> bf16
-def : Pat<(bf16 (uint_to_fp Int1Regs:$a)),
-          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp Int16Regs:$a)),
-          (CVT_bf16_u16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp Int32Regs:$a)),
-          (CVT_bf16_u32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp Int64Regs:$a)),
-          (CVT_bf16_u64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i1:$a)),
+          (CVT_bf16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i16:$a)),
+          (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i32:$a)),
+          (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i64:$a)),
+          (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 // sint -> f32
-def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
-          (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
-          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
-          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
-          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i1:$a)),
+          (CVT_f32_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp i16:$a)),
+          (CVT_f32_s16 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i32:$a)),
+          (CVT_f32_s32 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i64:$a)),
+          (CVT_f32_s64 
$a, CvtRN)>; // uint -> f32 -def : Pat<(f32 (uint_to_fp Int1Regs:$a)), - (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f32 (uint_to_fp Int16Regs:$a)), - (CVT_f32_u16 Int16Regs:$a, CvtRN)>; -def : Pat<(f32 (uint_to_fp Int32Regs:$a)), - (CVT_f32_u32 Int32Regs:$a, CvtRN)>; -def : Pat<(f32 (uint_to_fp Int64Regs:$a)), - (CVT_f32_u64 Int64Regs:$a, CvtRN)>; +def : Pat<(f32 (uint_to_fp i1:$a)), + (CVT_f32_u32 (SELP_u32ii 1, 0, $a), CvtRN)>; +def : Pat<(f32 (uint_to_fp i16:$a)), + (CVT_f32_u16 $a, CvtRN)>; +def : Pat<(f32 (uint_to_fp i32:$a)), + (CVT_f32_u32 $a, CvtRN)>; +def : Pat<(f32 (uint_to_fp i64:$a)), + (CVT_f32_u64 $a, CvtRN)>; // sint -> f64 -def : Pat<(f64 (sint_to_fp Int1Regs:$a)), - (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f64 (sint_to_fp Int16Regs:$a)), - (CVT_f64_s16 Int16Regs:$a, CvtRN)>; -def : Pat<(f64 (sint_to_fp Int32Regs:$a)), - (CVT_f64_s32 Int32Regs:$a, CvtRN)>; -def : Pat<(f64 (sint_to_fp Int64Regs:$a)), - (CVT_f64_s64 Int64Regs:$a, CvtRN)>; +def : Pat<(f64 (sint_to_fp i1:$a)), + (CVT_f64_s32 (SELP_s32ii -1, 0, $a), CvtRN)>; +def : Pat<(f64 (sint_to_fp i16:$a)), + (CVT_f64_s16 $a, CvtRN)>; +def : Pat<(f64 (sint_to_fp i32:$a)), + (CVT_f64_s32 $a, CvtRN)>; +def : Pat<(f64 (sint_to_fp i64:$a)), + (CVT_f64_s64 $a, CvtRN)>; // uint -> f64 -def : Pat<(f64 (uint_to_fp Int1Regs:$a)), - (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f64 (uint_to_fp Int16Regs:$a)), - (CVT_f64_u16 Int16Regs:$a, CvtRN)>; -def : Pat<(f64 (uint_to_fp Int32Regs:$a)), - (CVT_f64_u32 Int32Regs:$a, CvtRN)>; -def : Pat<(f64 (uint_to_fp Int64Regs:$a)), - (CVT_f64_u64 Int64Regs:$a, CvtRN)>; +def : Pat<(f64 (uint_to_fp i1:$a)), + (CVT_f64_u32 (SELP_u32ii 1, 0, $a), CvtRN)>; +def : Pat<(f64 (uint_to_fp i16:$a)), + (CVT_f64_u16 $a, CvtRN)>; +def : Pat<(f64 (uint_to_fp i32:$a)), + (CVT_f64_u32 $a, CvtRN)>; +def : Pat<(f64 (uint_to_fp i64:$a)), + (CVT_f64_u64 $a, CvtRN)>; // f16 -> sint -def : Pat<(i1 (fp_to_sint (f16 
Int16Regs:$a))), - (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint (f16 Int16Regs:$a))), - (CVT_s16_f16 (f16 Int16Regs:$a), CvtRZI)>; -def : Pat<(i32 (fp_to_sint (f16 Int16Regs:$a))), - (CVT_s32_f16 (f16 Int16Regs:$a), CvtRZI)>; -def : Pat<(i64 (fp_to_sint (f16 Int16Regs:$a))), - (CVT_s64_f16 Int16Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_sint f16:$a)), + (SETP_b16ri $a, 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint f16:$a)), + (CVT_s16_f16 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint f16:$a)), + (CVT_s32_f16 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint f16:$a)), + (CVT_s64_f16 $a, CvtRZI)>; // f16 -> uint -def : Pat<(i1 (fp_to_uint (f16 Int16Regs:$a))), - (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint (f16 Int16Regs:$a))), - (CVT_u16_f16 Int16Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint (f16 Int16Regs:$a))), - (CVT_u32_f16 Int16Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint (f16 Int16Regs:$a))), - (CVT_u64_f16 Int16Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_uint f16:$a)), + (SETP_b16ri $a, 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint f16:$a)), + (CVT_u16_f16 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint f16:$a)), + (CVT_u32_f16 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint f16:$a)), + (CVT_u64_f16 $a, CvtRZI)>; // bf16 -> sint -def : Pat<(i1 (fp_to_sint (bf16 Int16Regs:$a))), - (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint (bf16 Int16Regs:$a))), - (CVT_s16_bf16 (bf16 Int16Regs:$a), CvtRZI)>; -def : Pat<(i32 (fp_to_sint (bf16 Int16Regs:$a))), - (CVT_s32_bf16 (bf16 Int16Regs:$a), CvtRZI)>; -def : Pat<(i64 (fp_to_sint (bf16 Int16Regs:$a))), - (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_sint bf16:$a)), + (SETP_b16ri $a, 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint bf16:$a)), + (CVT_s16_bf16 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint bf16:$a)), + (CVT_s32_bf16 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint bf16:$a)), + (CVT_s64_bf16 $a, CvtRZI)>; // bf16 -> uint -def : Pat<(i1 (fp_to_uint (bf16 Int16Regs:$a))), - (SETP_b16ri 
Int16Regs:$a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint (bf16 Int16Regs:$a))), - (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint (bf16 Int16Regs:$a))), - (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint (bf16 Int16Regs:$a))), - (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_uint bf16:$a)), + (SETP_b16ri $a, 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint bf16:$a)), + (CVT_u16_bf16 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint bf16:$a)), + (CVT_u32_bf16 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint bf16:$a)), + (CVT_u64_bf16 $a, CvtRZI)>; // f32 -> sint -def : Pat<(i1 (fp_to_sint Float32Regs:$a)), - (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint Float32Regs:$a)), - (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i16 (fp_to_sint Float32Regs:$a)), - (CVT_s16_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint Float32Regs:$a)), - (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i32 (fp_to_sint Float32Regs:$a)), - (CVT_s32_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint Float32Regs:$a)), - (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i64 (fp_to_sint Float32Regs:$a)), - (CVT_s64_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_sint f32:$a)), + (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint f32:$a)), + (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_sint f32:$a)), + (CVT_s16_f32 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint f32:$a)), + (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_sint f32:$a)), + (CVT_s32_f32 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint f32:$a)), + (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_sint f32:$a)), + (CVT_s64_f32 $a, CvtRZI)>; // f32 -> uint -def : Pat<(i1 (fp_to_uint Float32Regs:$a)), - (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; -def : 
Pat<(i16 (fp_to_uint Float32Regs:$a)), - (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i16 (fp_to_uint Float32Regs:$a)), - (CVT_u16_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint Float32Regs:$a)), - (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i32 (fp_to_uint Float32Regs:$a)), - (CVT_u32_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint Float32Regs:$a)), - (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i64 (fp_to_uint Float32Regs:$a)), - (CVT_u64_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_uint f32:$a)), + (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint f32:$a)), + (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_uint f32:$a)), + (CVT_u16_f32 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint f32:$a)), + (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_uint f32:$a)), + (CVT_u32_f32 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint f32:$a)), + (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_uint f32:$a)), + (CVT_u64_f32 $a, CvtRZI)>; // f64 -> sint -def : Pat<(i1 (fp_to_sint Float64Regs:$a)), - (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint Float64Regs:$a)), - (CVT_s16_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint Float64Regs:$a)), - (CVT_s32_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint Float64Regs:$a)), - (CVT_s64_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_sint f64:$a)), + (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint f64:$a)), + (CVT_s16_f64 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint f64:$a)), + (CVT_s32_f64 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint f64:$a)), + (CVT_s64_f64 $a, CvtRZI)>; // f64 -> uint -def : Pat<(i1 (fp_to_uint Float64Regs:$a)), - (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint Float64Regs:$a)), 
- (CVT_u16_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint Float64Regs:$a)), - (CVT_u32_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint Float64Regs:$a)), - (CVT_u64_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i1 (fp_to_uint f64:$a)), + (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint f64:$a)), + (CVT_u16_f64 $a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint f64:$a)), + (CVT_u32_f64 $a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint f64:$a)), + (CVT_u64_f64 $a, CvtRZI)>; // sext i1 -def : Pat<(i16 (sext Int1Regs:$a)), - (SELP_s16ii -1, 0, Int1Regs:$a)>; -def : Pat<(i32 (sext Int1Regs:$a)), - (SELP_s32ii -1, 0, Int1Regs:$a)>; -def : Pat<(i64 (sext Int1Regs:$a)), - (SELP_s64ii -1, 0, Int1Regs:$a)>; +def : Pat<(i16 (sext i1:$a)), + (SELP_s16ii -1, 0, $a)>; +def : Pat<(i32 (sext i1:$a)), + (SELP_s32ii -1, 0, $a)>; +def : Pat<(i64 (sext i1:$a)), + (SELP_s64ii -1, 0, $a)>; // zext i1 -def : Pat<(i16 (zext Int1Regs:$a)), - (SELP_u16ii 1, 0, Int1Regs:$a)>; -def : Pat<(i32 (zext Int1Regs:$a)), - (SELP_u32ii 1, 0, Int1Regs:$a)>; -def : Pat<(i64 (zext Int1Regs:$a)), - (SELP_u64ii 1, 0, Int1Regs:$a)>; +def : Pat<(i16 (zext i1:$a)), + (SELP_u16ii 1, 0, $a)>; +def : Pat<(i32 (zext i1:$a)), + (SELP_u32ii 1, 0, $a)>; +def : Pat<(i64 (zext i1:$a)), + (SELP_u64ii 1, 0, $a)>; // anyext i1 -def : Pat<(i16 (anyext Int1Regs:$a)), - (SELP_u16ii -1, 0, Int1Regs:$a)>; -def : Pat<(i32 (anyext Int1Regs:$a)), - (SELP_u32ii -1, 0, Int1Regs:$a)>; -def : Pat<(i64 (anyext Int1Regs:$a)), - (SELP_u64ii -1, 0, Int1Regs:$a)>; +def : Pat<(i16 (anyext i1:$a)), + (SELP_u16ii -1, 0, $a)>; +def : Pat<(i32 (anyext i1:$a)), + (SELP_u32ii -1, 0, $a)>; +def : Pat<(i64 (anyext i1:$a)), + (SELP_u64ii -1, 0, $a)>; // sext i16 -def : Pat<(i32 (sext Int16Regs:$a)), - (CVT_s32_s16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i64 (sext Int16Regs:$a)), - (CVT_s64_s16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i32 (sext i16:$a)), + (CVT_s32_s16 $a, CvtNONE)>; +def : Pat<(i64 (sext i16:$a)), + (CVT_s64_s16 
$a, CvtNONE)>; // zext i16 -def : Pat<(i32 (zext Int16Regs:$a)), - (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i64 (zext Int16Regs:$a)), - (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i32 (zext i16:$a)), + (CVT_u32_u16 $a, CvtNONE)>; +def : Pat<(i64 (zext i16:$a)), + (CVT_u64_u16 $a, CvtNONE)>; // anyext i16 -def : Pat<(i32 (anyext Int16Regs:$a)), - (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i64 (anyext Int16Regs:$a)), - (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i32 (anyext i16:$a)), + (CVT_u32_u16 $a, CvtNONE)>; +def : Pat<(i64 (anyext i16:$a)), + (CVT_u64_u16 $a, CvtNONE)>; // sext i32 -def : Pat<(i64 (sext Int32Regs:$a)), - (CVT_s64_s32 Int32Regs:$a, CvtNONE)>; +def : Pat<(i64 (sext i32:$a)), + (CVT_s64_s32 $a, CvtNONE)>; // zext i32 -def : Pat<(i64 (zext Int32Regs:$a)), - (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; +def : Pat<(i64 (zext i32:$a)), + (CVT_u64_u32 $a, CvtNONE)>; // anyext i32 -def : Pat<(i64 (anyext Int32Regs:$a)), - (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; +def : Pat<(i64 (anyext i32:$a)), + (CVT_u64_u32 $a, CvtNONE)>; // truncate i64 -def : Pat<(i32 (trunc Int64Regs:$a)), - (CVT_u32_u64 Int64Regs:$a, CvtNONE)>; -def : Pat<(i16 (trunc Int64Regs:$a)), - (CVT_u16_u64 Int64Regs:$a, CvtNONE)>; -def : Pat<(i1 (trunc Int64Regs:$a)), - (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>; +def : Pat<(i32 (trunc i64:$a)), + (CVT_u32_u64 $a, CvtNONE)>; +def : Pat<(i16 (trunc i64:$a)), + (CVT_u16_u64 $a, CvtNONE)>; +def : Pat<(i1 (trunc i64:$a)), + (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>; // truncate i32 -def : Pat<(i16 (trunc Int32Regs:$a)), - (CVT_u16_u32 Int32Regs:$a, CvtNONE)>; -def : Pat<(i1 (trunc Int32Regs:$a)), - (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>; +def : Pat<(i16 (trunc i32:$a)), + (CVT_u16_u32 $a, CvtNONE)>; +def : Pat<(i1 (trunc i32:$a)), + (SETP_b32ri (ANDb32ri $a, 1), 1, CmpEQ)>; // truncate i16 -def : Pat<(i1 (trunc Int16Regs:$a)), - (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>; +def : Pat<(i1 (trunc 
i16:$a)), + (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>; // sext_inreg -def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>; -def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>; -def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>; -def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>; -def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>; -def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>; +def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>; +def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 $a)>; +def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 $a)>; +def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 $a)>; +def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 $a)>; +def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 $a)>; // Select instructions with 32-bit predicates -def : Pat<(select (i32 Int32Regs:$pred), i16:$a, i16:$b), - (SELP_b16rr Int16Regs:$a, Int16Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select (i32 Int32Regs:$pred), i32:$a, i32:$b), - (SELP_b32rr Int32Regs:$a, Int32Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select (i32 Int32Regs:$pred), Int64Regs:$a, Int64Regs:$b), - (SELP_b64rr Int64Regs:$a, Int64Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select (i32 Int32Regs:$pred), (f16 Int16Regs:$a), (f16 Int16Regs:$b)), - (SELP_f16rr Int16Regs:$a, Int16Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select (i32 Int32Regs:$pred), (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)), - (SELP_bf16rr Int16Regs:$a, Int16Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select (i32 Int32Regs:$pred), Float32Regs:$a, Float32Regs:$b), - (SELP_f32rr Float32Regs:$a, Float32Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select 
(i32 Int32Regs:$pred), Float64Regs:$a, Float64Regs:$b), - (SELP_f64rr Float64Regs:$a, Float64Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, i16:$a, i16:$b), + (SELP_b16rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, i32:$a, i32:$b), + (SELP_b32rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, i64:$a, i64:$b), + (SELP_b64rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, f16:$a, f16:$b), + (SELP_f16rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, bf16:$a, bf16:$b), + (SELP_bf16rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, f32:$a, f32:$b), + (SELP_f32rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; +def : Pat<(select i32:$pred, f64:$a, f64:$b), + (SELP_f64rr $a, $b, + (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; let hasSideEffects = false in { @@ -3610,33 +3409,78 @@ let hasSideEffects = false in { // Using partial vectorized move produces better SASS code for extraction of // upper/lower parts of an integer. 
-def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))), - (I32toI16H Int32Regs:$s)>; -def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))), - (I32toI16H Int32Regs:$s)>; -def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))), - (I64toI32H Int64Regs:$s)>; -def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))), - (I64toI32H Int64Regs:$s)>; - -def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))), - (CVT_INREG_s32_s16 Int32Regs:$src)>; +def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), + (I32toI16H $s)>; +def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), + (I32toI16H $s)>; +def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), + (I64toI32H $s)>; +def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), + (I64toI32H $s)>; + +def: Pat<(i32 (sext (extractelt v2i16:$src, 0))), + (CVT_INREG_s32_s16 $src)>; foreach vt = [v2f16, v2bf16, v2i16] in { -def : Pat<(extractelt (vt Int32Regs:$src), 0), - (I32toI16L Int32Regs:$src)>; -def : Pat<(extractelt (vt Int32Regs:$src), 1), - (I32toI16H Int32Regs:$src)>; +def : Pat<(extractelt vt:$src, 0), + (I32toI16L $src)>; +def : Pat<(extractelt vt:$src, 1), + (I32toI16H $src)>; } -def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))), - (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; -def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), - (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; -def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))), - (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; +def : Pat<(v2f16 (build_vector f16:$a, f16:$b)), + (V2I16toI32 $a, $b)>; +def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)), + (V2I16toI32 $a, $b)>; +def : Pat<(v2i16 (build_vector i16:$a, i16:$b)), + (V2I16toI32 $a, $b)>; -def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))), - (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; +def: Pat<(v2i16 (scalar_to_vector i16:$a)), + (CVT_u32_u16 $a, CvtNONE)>; + +// +// Funnel-Shift +// + +// Create SDNodes so they can be used in the DAG code, e.g. 
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +def fshl_clamp : SDNode<"NVPTXISD::FSHL_CLAMP", SDTIntShiftDOp, []>; +def fshr_clamp : SDNode<"NVPTXISD::FSHR_CLAMP", SDTIntShiftDOp, []>; + +// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so +// no side effects. +let hasSideEffects = false in { + multiclass ShfInst<string mode, SDNode op> { + def _i + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;", + [(set i32:$dst, + (op i32:$hi, i32:$lo, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + + def _r + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;", + [(set i32:$dst, + (op i32:$hi, i32:$lo, i32:$amt))]>, + Requires<[hasHWROT32]>; + } + + defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>; + defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>; + defm SHF_L_WRAP : ShfInst<"l.wrap", fshl>; + defm SHF_R_WRAP : ShfInst<"r.wrap", fshr>; +} + +def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, i32:$amt)), + (SHF_L_CLAMP_r $lo, $hi, $amt)>; +def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, (i32 imm:$amt))), + (SHF_L_CLAMP_i $lo, $hi, imm:$amt)>; +def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)), + (SHF_R_CLAMP_r $lo, $hi, $amt)>; +def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))), + (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>; // Count leading zeros let hasSideEffects = false in { @@ -3647,14 +3491,14 @@ let hasSideEffects = false in { } // 32-bit has a direct PTX instruction -def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>; +def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>; // The return type of the ctlz ISD node is the same as its input, but the PTX // ctz instruction always returns a 32-bit value. 
For ctlz.i64, convert the // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. -def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>; +def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>; +def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the // result back to 16-bits if necessary. We also need to subtract 16 because @@ -3670,11 +3514,11 @@ def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>; // and then ctlz that value. This way we don't have to subtract 16 from the // result. Unfortunately today we don't have a way to generate // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. -def : Pat<(i16 (ctlz Int16Regs:$a)), +def : Pat<(i16 (ctlz i16:$a)), (SUBi16ri (CVT_u16_u32 - (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), - (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; + (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>; +def : Pat<(i32 (zext (i16 (ctlz i16:$a)))), + (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>; // Population count let hasSideEffects = false in { @@ -3685,67 +3529,67 @@ let hasSideEffects = false in { } // 32-bit has a direct PTX instruction -def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>; +def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>; // For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit // to match the LLVM semantics. Just as with ctlz.i64, we provide a second // pattern that avoids the type conversion if we're truncating the result to // i32 anyway. 
-def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>; +def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>; +def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>; // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. // If we know that we're storing into an i32, we can avoid the final trunc. -def : Pat<(ctpop Int16Regs:$a), - (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), - (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; +def : Pat<(ctpop i16:$a), + (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>; +def : Pat<(i32 (zext (i16 (ctpop i16:$a)))), + (POPCr32 (CVT_u32_u16 $a, CvtNONE))>; // fpround f32 -> f16 -def : Pat<(f16 (fpround Float32Regs:$a)), - (CVT_f16_f32 Float32Regs:$a, CvtRN)>; +def : Pat<(f16 (fpround f32:$a)), + (CVT_f16_f32 $a, CvtRN)>; // fpround f32 -> bf16 -def : Pat<(bf16 (fpround Float32Regs:$a)), - (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>; +def : Pat<(bf16 (fpround f32:$a)), + (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>; // fpround f64 -> f16 -def : Pat<(f16 (fpround Float64Regs:$a)), - (CVT_f16_f64 Float64Regs:$a, CvtRN)>; +def : Pat<(f16 (fpround f64:$a)), + (CVT_f16_f64 $a, CvtRN)>; // fpround f64 -> bf16 -def : Pat<(bf16 (fpround Float64Regs:$a)), - (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; +def : Pat<(bf16 (fpround f64:$a)), + (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; // fpround f64 -> f32 -def : Pat<(f32 (fpround Float64Regs:$a)), - (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f32 (fpround Float64Regs:$a)), - (CVT_f32_f64 Float64Regs:$a, CvtRN)>; +def : Pat<(f32 (fpround f64:$a)), + (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpround f64:$a)), + 
(CVT_f32_f64 $a, CvtRN)>; // fpextend f16 -> f32 -def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), - (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), - (CVT_f32_f16 Int16Regs:$a, CvtNONE)>; +def : Pat<(f32 (fpextend f16:$a)), + (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend f16:$a)), + (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), - (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), - (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; +def : Pat<(f32 (fpextend bf16:$a)), + (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), + (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 -def : Pat<(f64 (fpextend (f16 Int16Regs:$a))), - (CVT_f64_f16 Int16Regs:$a, CvtNONE)>; +def : Pat<(f64 (fpextend f16:$a)), + (CVT_f64_f16 $a, CvtNONE)>; // fpextend bf16 -> f64 -def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))), - (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>; +def : Pat<(f64 (fpextend bf16:$a)), + (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>; // fpextend f32 -> f64 -def : Pat<(f64 (fpextend Float32Regs:$a)), - (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f64 (fpextend Float32Regs:$a)), - (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; +def : Pat<(f64 (fpextend f32:$a)), + (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f64 (fpextend f32:$a)), + (CVT_f64_f32 $a, CvtNONE)>; def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; @@ -3753,16 +3597,16 @@ def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone, // fceil, ffloor, froundeven, ftrunc. 
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { - def : Pat<(OpNode (f16 Int16Regs:$a)), - (CVT_f16_f16 Int16Regs:$a, Mode)>; - def : Pat<(OpNode (bf16 Int16Regs:$a)), - (CVT_bf16_bf16 Int16Regs:$a, Mode)>; - def : Pat<(OpNode Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>; - def : Pat<(OpNode Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>; - def : Pat<(OpNode Float64Regs:$a), - (CVT_f64_f64 Float64Regs:$a, Mode)>; + def : Pat<(OpNode f16:$a), + (CVT_f16_f16 $a, Mode)>; + def : Pat<(OpNode bf16:$a), + (CVT_bf16_bf16 $a, Mode)>; + def : Pat<(OpNode f32:$a), + (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>; + def : Pat<(OpNode f32:$a), + (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>; + def : Pat<(OpNode f64:$a), + (CVT_f64_f64 $a, Mode)>; } defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>; @@ -3788,7 +3632,7 @@ let isTerminator=1 in { let isBranch=1 in def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), "@$a bra \t$target;", - [(brcond Int1Regs:$a, bb:$target)]>; + [(brcond i1:$a, bb:$target)]>; let isBranch=1 in def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), "@!$a bra \t$target;", []>; @@ -3798,16 +3642,16 @@ let isTerminator=1 in { "bra.uni \t$target;", [(br bb:$target)]>; } -def : Pat<(brcond (i32 Int32Regs:$a), bb:$target), - (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; +def : Pat<(brcond i32:$a, bb:$target), + (CBranch (SETP_u32ri $a, 0, CmpNE), bb:$target)>; // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a // conditional branch if the target block is the next block so that the code // can fall through to the target block. The invertion is done by 'xor // condition, 1', which will be translated to (setne condition, -1). Since ptx // supports '@!pred bra target', we should use it. 
-def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), - (CBranchOther Int1Regs:$a, bb:$target)>; +def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target), + (CBranchOther $a, bb:$target)>; // Call def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, @@ -3845,9 +3689,12 @@ def Callseq_End : [(callseq_end timm:$amt1, timm:$amt2)]>; // trap instruction +def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>; // Emit an `exit` as well to convey to ptxas that `trap` exits the CFG. // This won't be necessary in a future version of ptxas. -def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>; +def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>; +// brkpt instruction +def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>; // Call prototype wrapper def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -3873,7 +3720,7 @@ def DYNAMIC_STACKALLOC32 : (ins Int32Regs:$size, i32imm:$align), "alloca.u32 \t$ptr, $size, $align;\n\t" "cvta.local.u32 \t$ptr, $ptr;", - [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>, + [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>, Requires<[hasPTX<73>, hasSM<52>]>; def DYNAMIC_STACKALLOC64 : @@ -3881,7 +3728,112 @@ def DYNAMIC_STACKALLOC64 : (ins Int64Regs:$size, i32imm:$align), "alloca.u64 \t$ptr, $size, $align;\n\t" "cvta.local.u64 \t$ptr, $ptr;", - [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>, + [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>, + Requires<[hasPTX<73>, hasSM<52>]>; + + +// +// BRX +// + +def SDTBrxStartProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; +def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>; + +def brx_start : + SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile, + [SDNPHasChain, 
SDNPOutGlue, SDNPSideEffect]>; +def brx_item : + SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def brx_end : + SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile, + [SDNPHasChain, SDNPInGlue, SDNPSideEffect]>; + +let isTerminator = 1, isBranch = 1, isIndirectBranch = 1, isNotDuplicable = 1 in { + + def BRX_START : + NVPTXInst<(outs), (ins i32imm:$id), + "$$L_brx_$id: .branchtargets", + [(brx_start (i32 imm:$id))]>; + + def BRX_ITEM : + NVPTXInst<(outs), (ins brtarget:$target), + "\t$target,", + [(brx_item bb:$target)]>; + + def BRX_END : + NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id), + "\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;", + [(brx_end bb:$target, i32:$val, (i32 imm:$id))]> { + let isBarrier = 1; + } +} + + +foreach a_type = ["s", "u"] in { + foreach b_type = ["s", "u"] in { + + def DOT4_ # a_type # b_type : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;", + [(set i32:$dst, + (!cast<Intrinsic>("int_nvvm_idp4a_" # a_type # "_" # b_type) + i32:$a, i32:$b, i32:$c))]>, + Requires<[hasDotInstructions]>; + + foreach is_hi = [0, -1] in { + defvar lohi_suffix = !if(is_hi, "hi", "lo"); + + def DOT2_ # lohi_suffix # _ # a_type # b_type : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "dp2a." # lohi_suffix # "." # a_type # "32." 
# b_type # "32 \t$dst, $a, $b, $c;", + [(set i32:$dst, + (!cast<Intrinsic>("int_nvvm_idp2a_" # a_type # "_" # b_type) + i32:$a, i32:$b, is_hi, i32:$c))]>, + Requires<[hasDotInstructions]>; + } + } +} + +// +// Stack Manipulation +// + +def SDTStackRestore : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def stackrestore : + SDNode<"NVPTXISD::STACKRESTORE", SDTStackRestore, + [SDNPHasChain, SDNPSideEffect]>; + +def stacksave : + SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf, + [SDNPHasChain, SDNPSideEffect]>; + +def STACKRESTORE_32 : + NVPTXInst<(outs), (ins Int32Regs:$ptr), + "stackrestore.u32 \t$ptr;", + [(stackrestore i32:$ptr)]>, + Requires<[hasPTX<73>, hasSM<52>]>; + +def STACKSAVE_32 : + NVPTXInst<(outs Int32Regs:$dst), (ins), + "stacksave.u32 \t$dst;", + [(set i32:$dst, (i32 stacksave))]>, + Requires<[hasPTX<73>, hasSM<52>]>; + +def STACKRESTORE_64 : + NVPTXInst<(outs), (ins Int64Regs:$ptr), + "stackrestore.u64 \t$ptr;", + [(stackrestore i64:$ptr)]>, + Requires<[hasPTX<73>, hasSM<52>]>; + +def STACKSAVE_64 : + NVPTXInst<(outs Int64Regs:$dst), (ins), + "stacksave.u64 \t$dst;", + [(set i64:$dst, (i64 stacksave))]>, Requires<[hasPTX<73>, hasSM<52>]>; include "NVPTXIntrinsics.td" @@ -3897,14 +3849,98 @@ include "NVPTXIntrinsics.td" def : Pat < (i32 (bswap i32:$a)), - (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>; + (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>; def : Pat < (v2i16 (bswap v2i16:$a)), - (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>; + (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>; def : Pat < (i64 (bswap i64:$a)), (V2I32toI64 - (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)), - (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>; + (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)), + (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>; + + +//////////////////////////////////////////////////////////////////////////////// +// PTX Fence instructions 
+//////////////////////////////////////////////////////////////////////////////// + +def atomic_thread_fence_seq_cst_sys : + NVPTXInst<(outs), (ins), "fence.sc.sys;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; +def atomic_thread_fence_acq_rel_sys : + NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; + +def atomic_thread_fence_seq_cst_gpu : + NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; +def atomic_thread_fence_acq_rel_gpu : + NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; + +def atomic_thread_fence_seq_cst_cluster : + NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>, + Requires<[hasPTX<78>, hasSM<90>]>; +def atomic_thread_fence_acq_rel_cluster : + NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>, + Requires<[hasPTX<78>, hasSM<90>]>; + +def atomic_thread_fence_seq_cst_cta : + NVPTXInst<(outs), (ins), "fence.sc.cta;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; +def atomic_thread_fence_acq_rel_cta : + NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>, + Requires<[hasPTX<60>, hasSM<70>]>; + +def fpimm_any_zero : FPImmLeaf<fAny, [{ + return Imm.isZero(); +}]>; + +def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>; +def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>; + +// Perform substitution if fma only has one use, and also if instruction has +// nnan instruction flag or if the TM has NoNaNsFPMath +def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c), + (fma node:$a, node:$b, node:$c), [{ + return N->hasOneUse() && + (N->getFlags().hasNoNaNs() || TM.Options.NoNaNsFPMath); +}]>; +// fmaxnum will differentiate between positive and negative zeros soon, so this +// PatFrag is for an fmaxnum node with nsz +def NVPTX_fmaxnum_nsz : PatFrag<(ops node:$a, node:$b), + (fmaxnum node:$a, node:$b), [{ + return N->getFlags().hasNoSignedZeros() || TM.Options.NoSignedZerosFPMath; +}]>; + 
+class NVPTXInst_rrr<RegisterClass RC, string Instruction, list<Predicate> Preds> + : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), + !strconcat(Instruction, "\t$dst, $a, $b, $c;"), []>, + Requires<Preds>; + +def FMARELU_F16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>; +def FMARELU_F16_FTZ : NVPTXInst_rrr<Int16Regs, "fma.rn.ftz.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>; +def FMARELU_BF16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.bf16", [hasBF16Math, hasPTX<70>, hasSM<80>]>; +def FMARELU_F16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>; +def FMARELU_F16X2_FTZ : NVPTXInst_rrr<Int32Regs, "fma.rn.ftz.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>; +def FMARELU_BF16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.bf16x2", [hasBF16Math, hasPTX<70>, hasSM<80>]>; + +// FTZ +def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)), + (FMARELU_F16_FTZ $a, $b, $c)>, + Requires<[doF32FTZ]>; +def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)), + (FMARELU_F16X2_FTZ $a, $b, $c)>, + Requires<[doF32FTZ]>; + +// NO FTZ +def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)), + (FMARELU_F16 $a, $b, $c)>; +def : Pat<(bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan bf16:$a, bf16:$b, bf16:$c), fpimm_any_zero)), + (FMARELU_BF16 $a, $b, $c)>; +def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)), + (FMARELU_F16X2 $a, $b, $c)>; +def : Pat<(v2bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2bf16:$a, v2bf16:$b, v2bf16:$c), fpimm_positive_zero_v2bf16)), + (FMARELU_BF16X2 $a, $b, $c)>; |
