about summary refs log tree commit diff
path: root/contrib/llvm-project/clang/lib/Headers/amxintrin.h
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/clang/lib/Headers/amxintrin.h')
-rw-r--r-- contrib/llvm-project/clang/lib/Headers/amxintrin.h 67
1 files changed, 49 insertions, 18 deletions
diff --git a/contrib/llvm-project/clang/lib/Headers/amxintrin.h b/contrib/llvm-project/clang/lib/Headers/amxintrin.h
index ec601a58e7c3..baa56f5b28e8 100644
--- a/contrib/llvm-project/clang/lib/Headers/amxintrin.h
+++ b/contrib/llvm-project/clang/lib/Headers/amxintrin.h
@@ -22,6 +22,8 @@
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
+#define __DEFAULT_FN_ATTRS_FP16 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -290,6 +292,13 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
/// This struct pack the shape and tile data together for user. We suggest
/// initializing the struct as early as possible, because compiler depends
/// on the shape information to do configure. The constant value is preferred
@@ -314,8 +323,8 @@ typedef struct __tile1024i_str {
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
-static void __tile_loadd(__tile1024i *dst, const void *base,
- __SIZE_TYPE__ stride) {
+static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
+ __SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
@@ -335,8 +344,8 @@ static void __tile_loadd(__tile1024i *dst, const void *base,
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
-static void __tile_stream_loadd(__tile1024i *dst, const void *base,
- __SIZE_TYPE__ stride) {
+static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
+ __SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
@@ -357,8 +366,8 @@ static void __tile_stream_loadd(__tile1024i *dst, const void *base,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
+static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
@@ -380,8 +389,8 @@ static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
+static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
@@ -403,8 +412,8 @@ static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
+static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
@@ -426,8 +435,8 @@ static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
+static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
@@ -439,14 +448,13 @@ static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
-static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
+static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
+ __tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
@@ -459,7 +467,7 @@ static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
/// \param dst
/// The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
-static void __tile_zero(__tile1024i *dst) {
+static __inline__ void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
@@ -479,15 +487,38 @@ static void __tile_zero(__tile1024i *dst) {
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
-static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
+static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "src0"
+/// and "src1", accumulating the intermediate single-precision (32-bit)
+/// floating-point elements with elements in "dst", and store the 32-bit result
+/// back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_FP16
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
+#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */