diff options
Diffstat (limited to 'contrib/llvm-project/clang/include/clang/Basic/BuiltinsNVPTX.def')
-rw-r--r-- | contrib/llvm-project/clang/include/clang/Basic/BuiltinsNVPTX.def | 296 |
1 files changed, 265 insertions, 31 deletions
diff --git a/contrib/llvm-project/clang/include/clang/Basic/BuiltinsNVPTX.def b/contrib/llvm-project/clang/include/clang/Basic/BuiltinsNVPTX.def index 6b94dd857300..0f2e8260143b 100644 --- a/contrib/llvm-project/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/contrib/llvm-project/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -17,12 +17,21 @@ # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) #endif +#pragma push_macro("SM_53") #pragma push_macro("SM_70") #pragma push_macro("SM_72") #pragma push_macro("SM_75") #pragma push_macro("SM_80") #pragma push_macro("SM_86") -#define SM_86 "sm_86" +#pragma push_macro("SM_87") +#pragma push_macro("SM_89") +#pragma push_macro("SM_90") +#pragma push_macro("SM_90a") +#define SM_90a "sm_90a" +#define SM_90 "sm_90|" SM_90a +#define SM_89 "sm_89|" SM_90 +#define SM_87 "sm_87|" SM_89 +#define SM_86 "sm_86|" SM_87 #define SM_80 "sm_80|" SM_86 #define SM_75 "sm_75|" SM_80 #define SM_72 "sm_72|" SM_75 @@ -30,7 +39,9 @@ #pragma push_macro("SM_60") #define SM_60 "sm_60|sm_61|sm_62|" SM_70 +#define SM_53 "sm_53|" SM_60 +#pragma push_macro("PTX42") #pragma push_macro("PTX60") #pragma push_macro("PTX61") #pragma push_macro("PTX63") @@ -42,7 +53,21 @@ #pragma push_macro("PTX73") #pragma push_macro("PTX74") #pragma push_macro("PTX75") -#define PTX75 "ptx75" +#pragma push_macro("PTX76") +#pragma push_macro("PTX77") +#pragma push_macro("PTX78") +#pragma push_macro("PTX80") +#pragma push_macro("PTX81") +#pragma push_macro("PTX82") +#pragma push_macro("PTX83") +#define PTX83 "ptx83" +#define PTX82 "ptx82|" PTX83 +#define PTX81 "ptx81|" PTX82 +#define PTX80 "ptx80|" PTX81 +#define PTX78 "ptx78|" PTX80 +#define PTX77 "ptx77|" PTX78 +#define PTX76 "ptx76|" PTX77 +#define PTX75 "ptx75|" PTX76 #define PTX74 "ptx74|" PTX75 #define PTX73 "ptx73|" PTX74 #define PTX72 "ptx72|" PTX73 @@ -53,6 +78,7 @@ #define PTX63 "ptx63|" PTX64 #define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 +#define PTX42 "ptx42|" PTX60 #pragma push_macro("AND") #define AND(a, b) "(" a "),(" b ")" @@ -79,6 +105,31 @@ BUILTIN(__nvvm_read_ptx_sreg_nctaid_y, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_nctaid_z, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_nctaid_w, "i", "nc") +TARGET_BUILTIN(__nvvm_read_ptx_sreg_clusterid_x, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_clusterid_y, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_clusterid_z, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_clusterid_w, "i", "nc", AND(SM_90, PTX78)) + +TARGET_BUILTIN(__nvvm_read_ptx_sreg_nclusterid_x, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_nclusterid_y, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_nclusterid_z, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_nclusterid_w, "i", "nc", AND(SM_90, PTX78)) + +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_ctaid_x, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_ctaid_y, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_ctaid_z, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_ctaid_w, "i", "nc", AND(SM_90, PTX78)) + +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_nctaid_x, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_nctaid_y, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_nctaid_z, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_nctaid_w, "i", "nc", AND(SM_90, PTX78)) + +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_ctarank, "i", "nc", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_read_ptx_sreg_cluster_nctarank, "i", "nc", AND(SM_90, PTX78)) + +TARGET_BUILTIN(__nvvm_is_explicit_cluster, "b", "nc", AND(SM_90, PTX78)) + BUILTIN(__nvvm_read_ptx_sreg_laneid, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_warpid, "i", "nc") BUILTIN(__nvvm_read_ptx_sreg_nwarpid, "i", "nc") @@ -107,13 +158,97 @@ BUILTIN(__nvvm_prmt, "UiUiUiUi", "") // Min Max -BUILTIN(__nvvm_fmax_ftz_f, "fff", "") -BUILTIN(__nvvm_fmax_f, "fff", "") -BUILTIN(__nvvm_fmin_ftz_f, "fff", "") -BUILTIN(__nvvm_fmin_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "yyy", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "yyy", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "V2yV2yV2y", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "V2yV2yV2y", "", + AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmin_f, "fff", "") +BUILTIN(__nvvm_fmin_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmin_d, "ddd", "") +TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_bf16, "yyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "yyy", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "yyy", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_bf16x2, "V2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "V2yV2yV2y", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "V2yV2yV2y", "", + AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmax_f, "fff", "") +BUILTIN(__nvvm_fmax_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) BUILTIN(__nvvm_fmax_d, "ddd", "") -BUILTIN(__nvvm_fmin_d, "ddd", "") // Multiplication @@ -202,6 +337,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "") BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_ex2_approx_f, "ff", "") BUILTIN(__nvvm_ex2_approx_d, "dd", "") +TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_lg2_approx_f, "ff", "") @@ -217,6 +354,22 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "") // Fma +TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_bf16, "yyyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "yyyy", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "V2yV2yV2yV2y", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "V2yV2yV2yV2y", "", AND(SM_80, PTX70)) BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "") BUILTIN(__nvvm_fma_rn_f, "ffff", "") BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "") @@ -245,6 +398,8 @@ BUILTIN(__nvvm_rcp_rn_d, "dd", "") BUILTIN(__nvvm_rcp_rz_d, "dd", "") BUILTIN(__nvvm_rcp_rm_d, "dd", "") BUILTIN(__nvvm_rcp_rp_d, "dd", "") + +BUILTIN(__nvvm_rcp_approx_ftz_f, "ff", "") BUILTIN(__nvvm_rcp_approx_ftz_d, "dd", "") // Sqrt @@ -402,20 +557,20 @@ BUILTIN(__nvvm_ull2d_rp, "dULLi", "") BUILTIN(__nvvm_f2h_rn_ftz, "Usf", "") BUILTIN(__nvvm_f2h_rn, "Usf", "") -TARGET_BUILTIN(__nvvm_ff2bf16x2_rn, "ZUiff", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_ff2bf16x2_rn_relu, "ZUiff", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_ff2bf16x2_rz, "ZUiff", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_ff2bf16x2_rz_relu, "ZUiff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff2bf16x2_rn, "V2yff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff2bf16x2_rn_relu, "V2yff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff2bf16x2_rz, "V2yff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_ff2bf16x2_rz_relu, "V2yff", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_ff2f16x2_rn, "V2hff", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_ff2f16x2_rn_relu, "V2hff", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_ff2f16x2_rz, "V2hff", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_ff2f16x2_rz_relu, "V2hff", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_f2bf16_rn, "ZUsf", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_f2bf16_rn_relu, "ZUsf", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_f2bf16_rz, "ZUsf", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "ZUsf", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_f2bf16_rn, "yf", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_f2bf16_rn_relu, "yf", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_f2bf16_rz, "yf", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_f2bf16_rz_relu, "yf", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_f2tf32_rna, "ZUif", "", AND(SM_80,PTX70)) @@ -441,6 +596,11 @@ TARGET_BUILTIN(__nvvm_bar_warp_sync, "vUi", "n", PTX60) TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", PTX60) TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", PTX60) +TARGET_BUILTIN(__nvvm_barrier_cluster_arrive, "v", "n", AND(SM_90,PTX78)) +TARGET_BUILTIN(__nvvm_barrier_cluster_arrive_relaxed, "v", "n", AND(SM_90,PTX80)) +TARGET_BUILTIN(__nvvm_barrier_cluster_wait, "v", "n", AND(SM_90,PTX78)) +TARGET_BUILTIN(__nvvm_fence_sc_cluster, "v", "n", AND(SM_90,PTX78)) + // Shuffle BUILTIN(__nvvm_shfl_down_i32, "iiii", "") @@ -473,11 +633,11 @@ TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60) // Match -TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", PTX60) -TARGET_BUILTIN(__nvvm_match_any_sync_i64, "WiUiWi", "", PTX60) +TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60)) // These return a pair {value, predicate}, which requires custom lowering. -TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", PTX60) -TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "WiUiWii*", "", PTX60) +TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "UiUiWii*", "", AND(SM_70,PTX60)) // Redux TARGET_BUILTIN(__nvvm_redux_sync_add, "iii", "", AND(SM_80,PTX70)) @@ -670,8 +830,50 @@ TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_ll, "LLiLLiD*LLiLLi", "n", SM_60) BUILTIN(__nvvm_compiler_error, "vcC*4", "n") BUILTIN(__nvvm_compiler_warn, "vcC*4", "n") -// __ldg. This is not implemented as a builtin by nvcc. +BUILTIN(__nvvm_ldu_c, "ccC*", "") +BUILTIN(__nvvm_ldu_sc, "ScScC*", "") +BUILTIN(__nvvm_ldu_s, "ssC*", "") +BUILTIN(__nvvm_ldu_i, "iiC*", "") +BUILTIN(__nvvm_ldu_l, "LiLiC*", "") +BUILTIN(__nvvm_ldu_ll, "LLiLLiC*", "") + +BUILTIN(__nvvm_ldu_uc, "UcUcC*", "") +BUILTIN(__nvvm_ldu_us, "UsUsC*", "") +BUILTIN(__nvvm_ldu_ui, "UiUiC*", "") +BUILTIN(__nvvm_ldu_ul, "ULiULiC*", "") +BUILTIN(__nvvm_ldu_ull, "ULLiULLiC*", "") + +BUILTIN(__nvvm_ldu_h, "hhC*", "") +BUILTIN(__nvvm_ldu_f, "ffC*", "") +BUILTIN(__nvvm_ldu_d, "ddC*", "") + +BUILTIN(__nvvm_ldu_c2, "E2cE2cC*", "") +BUILTIN(__nvvm_ldu_sc2, "E2ScE2ScC*", "") +BUILTIN(__nvvm_ldu_c4, "E4cE4cC*", "") +BUILTIN(__nvvm_ldu_sc4, "E4ScE4ScC*", "") +BUILTIN(__nvvm_ldu_s2, "E2sE2sC*", "") +BUILTIN(__nvvm_ldu_s4, "E4sE4sC*", "") +BUILTIN(__nvvm_ldu_i2, "E2iE2iC*", "") +BUILTIN(__nvvm_ldu_i4, "E4iE4iC*", "") +BUILTIN(__nvvm_ldu_l2, "E2LiE2LiC*", "") +BUILTIN(__nvvm_ldu_ll2, "E2LLiE2LLiC*", "") + +BUILTIN(__nvvm_ldu_uc2, "E2UcE2UcC*", "") +BUILTIN(__nvvm_ldu_uc4, "E4UcE4UcC*", "") +BUILTIN(__nvvm_ldu_us2, "E2UsE2UsC*", "") +BUILTIN(__nvvm_ldu_us4, "E4UsE4UsC*", "") +BUILTIN(__nvvm_ldu_ui2, "E2UiE2UiC*", "") +BUILTIN(__nvvm_ldu_ui4, "E4UiE4UiC*", "") +BUILTIN(__nvvm_ldu_ul2, "E2ULiE2ULiC*", "") +BUILTIN(__nvvm_ldu_ull2, "E2ULLiE2ULLiC*", "") + +BUILTIN(__nvvm_ldu_h2, "E2hE2hC*", "") +BUILTIN(__nvvm_ldu_f2, "E2fE2fC*", "") +BUILTIN(__nvvm_ldu_f4, "E4fE4fC*", "") +BUILTIN(__nvvm_ldu_d2, "E2dE2dC*", "") + BUILTIN(__nvvm_ldg_c, "ccC*", "") +BUILTIN(__nvvm_ldg_sc, "ScScC*", "") BUILTIN(__nvvm_ldg_s, "ssC*", "") BUILTIN(__nvvm_ldg_i, "iiC*", "") BUILTIN(__nvvm_ldg_l, "LiLiC*", "") @@ -683,15 +885,19 @@ BUILTIN(__nvvm_ldg_ui, "UiUiC*", "") BUILTIN(__nvvm_ldg_ul, "ULiULiC*", "") BUILTIN(__nvvm_ldg_ull, "ULLiULLiC*", "") +BUILTIN(__nvvm_ldg_h, "hhC*", "") BUILTIN(__nvvm_ldg_f, "ffC*", "") BUILTIN(__nvvm_ldg_d, "ddC*", "") BUILTIN(__nvvm_ldg_c2, "E2cE2cC*", "") +BUILTIN(__nvvm_ldg_sc2, "E2ScE2ScC*", "") BUILTIN(__nvvm_ldg_c4, "E4cE4cC*", "") +BUILTIN(__nvvm_ldg_sc4, "E4ScE4ScC*", "") BUILTIN(__nvvm_ldg_s2, "E2sE2sC*", "") BUILTIN(__nvvm_ldg_s4, "E4sE4sC*", "") BUILTIN(__nvvm_ldg_i2, "E2iE2iC*", "") BUILTIN(__nvvm_ldg_i4, "E4iE4iC*", "") +BUILTIN(__nvvm_ldg_l2, "E2LiE2LiC*", "") BUILTIN(__nvvm_ldg_ll2, "E2LLiE2LLiC*", "") BUILTIN(__nvvm_ldg_uc2, "E2UcE2UcC*", "") @@ -700,8 +906,10 @@ BUILTIN(__nvvm_ldg_us2, "E2UsE2UsC*", "") BUILTIN(__nvvm_ldg_us4, "E4UsE4UsC*", "") BUILTIN(__nvvm_ldg_ui2, "E2UiE2UiC*", "") BUILTIN(__nvvm_ldg_ui4, "E4UiE4UiC*", "") +BUILTIN(__nvvm_ldg_ul2, "E2ULiE2ULiC*", "") BUILTIN(__nvvm_ldg_ull2, "E2ULLiE2ULLiC*", "") +BUILTIN(__nvvm_ldg_h2, "E2hE2hC*", "") BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "") BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "") BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") @@ -711,28 +919,29 @@ BUILTIN(__nvvm_isspacep_const, "bvC*", "nc") BUILTIN(__nvvm_isspacep_global, "bvC*", "nc") BUILTIN(__nvvm_isspacep_local, "bvC*", "nc") BUILTIN(__nvvm_isspacep_shared, "bvC*", "nc") +TARGET_BUILTIN(__nvvm_isspacep_shared_cluster,"bvC*", "nc", AND(SM_90,PTX78)) // Builtins to support WMMA instructions on sm_70 TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX60)) -TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX60)) -TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__hmma_m32n8k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m32n8k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61)) -TARGET_BUILTIN(__hmma_m32n8k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX61)) -TARGET_BUILTIN(__hmma_m32n8k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_st_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_st_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m8n32k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m8n32k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61)) -TARGET_BUILTIN(__hmma_m8n32k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX61)) -TARGET_BUILTIN(__hmma_m8n32k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX60)) @@ -753,7 +962,7 @@ TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX TARGET_BUILTIN(__bmma_m8n8k128_ld_a_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63)) TARGET_BUILTIN(__bmma_m8n8k128_ld_b_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63)) TARGET_BUILTIN(__bmma_m8n8k128_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63)) -TARGET_BUILTIN(__bmma_m8n8k128_mma_and_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_75,PTX71)) +TARGET_BUILTIN(__bmma_m8n8k128_mma_and_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_80,PTX71)) TARGET_BUILTIN(__bmma_m8n8k128_mma_xor_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_75,PTX63)) TARGET_BUILTIN(__bmma_m8n8k128_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63)) TARGET_BUILTIN(__imma_m16n16k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) @@ -818,24 +1027,42 @@ TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PT TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70)) + +// bf16, bf16x2 abs, neg +TARGET_BUILTIN(__nvvm_abs_bf16, "yy", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_abs_bf16x2, "V2yV2y", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16, "yy", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16x2, "V2yV2y", "", AND(SM_80,PTX70)) + +TARGET_BUILTIN(__nvvm_mapa, "v*v*i", "", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_mapa_shared_cluster, "v*3v*3i", "", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_getctarank, "iv*", "", AND(SM_90, PTX78)) +TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) + #undef BUILTIN #undef TARGET_BUILTIN #pragma pop_macro("AND") +#pragma pop_macro("SM_53") #pragma pop_macro("SM_60") #pragma pop_macro("SM_70") #pragma pop_macro("SM_72") #pragma pop_macro("SM_75") #pragma pop_macro("SM_80") #pragma pop_macro("SM_86") +#pragma pop_macro("SM_87") +#pragma pop_macro("SM_89") +#pragma pop_macro("SM_90") +#pragma pop_macro("SM_90a") +#pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") #pragma pop_macro("PTX63") @@ -847,3 +1074,10 @@ TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70)) #pragma pop_macro("PTX73") #pragma pop_macro("PTX74") #pragma pop_macro("PTX75") +#pragma pop_macro("PTX76") +#pragma pop_macro("PTX77") +#pragma pop_macro("PTX78") +#pragma pop_macro("PTX80") +#pragma pop_macro("PTX81") +#pragma pop_macro("PTX82") +#pragma pop_macro("PTX83") |