Diffstat (limited to 'contrib/arm-optimized-routines/string')
79 files changed, 2123 insertions, 1696 deletions
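A recurring change in the hunks below is replacing the older 0xf00f repmask/addp syndrome with a shrn-based nibble mask (described in the memchr-mte, memrchr and strchrnul-mte comments), while the strcmp/strcpy rewrites keep using the word-at-a-time NUL check spelled out in their comments. A minimal C sketch of both techniques follows, assuming a little-endian AArch64 target with NEON; the function names are illustrative only and are not part of the patch.

#include <arm_neon.h>
#include <stdint.h>

/* Nibble-mask syndrome (see the memchr-mte/memrchr comments below):
   compare 16 bytes at once, then shift each 16-bit lane right by 4 and
   narrow, keeping 4 bits of every comparison byte.  The result is a
   64-bit mask whose trailing-zero count, divided by 4, is the index of
   the first matching byte.  */
static inline uint64_t
nibble_mask (const uint8_t *p, uint8_t c)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (c));  /* 0xff per match.  */
  uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
}

static inline int
first_match (uint64_t synd)
{
  return synd ? __builtin_ctzll (synd) >> 2 : -1;  /* -1 means no match.  */
}

/* Word-at-a-time NUL detection used by the strcmp/strcpy code:
   (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero iff some byte of x is
   zero (little-endian; the assembly byte-reverses first on big-endian).  */
static inline uint64_t
has_nul (uint64_t x)
{
  return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
}

The rbit/clz pair in the assembly plays the role of __builtin_ctzll here: both identify the lowest set nibble, i.e. the earliest matching byte in the chunk.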
diff --git a/contrib/arm-optimized-routines/string/Dir.mk b/contrib/arm-optimized-routines/string/Dir.mk index cf3453f7580d..40ff5acc093e 100644 --- a/contrib/arm-optimized-routines/string/Dir.mk +++ b/contrib/arm-optimized-routines/string/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/string B := build/string diff --git a/contrib/arm-optimized-routines/string/README.contributors b/contrib/arm-optimized-routines/string/README.contributors new file mode 100644 index 000000000000..0b4a51b56366 --- /dev/null +++ b/contrib/arm-optimized-routines/string/README.contributors @@ -0,0 +1,30 @@ +STYLE REQUIREMENTS +================== + +1. Most code in this sub-directory is expected to be upstreamed into glibc so + the GNU Coding Standard and glibc specific conventions should be followed + to ease upstreaming. + +2. ABI and symbols: the code should be written so it is suitable for inclusion + into a libc with minimal changes. This e.g. means that internal symbols + should be hidden and in the implementation reserved namespace according to + ISO C and POSIX rules. If possible the built shared libraries and static + library archives should be usable to override libc symbols at link time (or + at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI + (other than symbol versioning), this cannot be done reliably for static + linking so this is a best effort requirement. + +3. API: include headers should be suitable for benchmarking and testing code + and should not conflict with libc headers. + + +CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY +================================================ +1. Code: + - The assumptions of the code must be clearly documented. + + - Assembly style should be consistent across different implementations. + + +2. Performance: + - Benchmarking is needed on several microarchitectures. diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S index 84339f73cf23..207e22950c6d 100644 --- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S +++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S index f58364ca6fcb..44b8e0114f42 100644 --- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S +++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/contrib/arm-optimized-routines/string/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h index 340b427a505b..131b95e1fea9 100644 --- a/contrib/arm-optimized-routines/string/asmdefs.h +++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h @@ -1,15 +1,13 @@ /* - * Macros for asm code. + * Macros for asm code. AArch64 version. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _ASMDEFS_H #define _ASMDEFS_H -#if defined(__aarch64__) - /* Branch Target Identitication support. */ #define BTI_C hint 34 #define BTI_J hint 36 @@ -23,6 +21,19 @@ #define FEATURE_1_PAC 2 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#ifdef __ILP32__ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 2; \ + .word 4; \ + .word 12; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .text +#else #define GNU_PROPERTY(type, value) \ .section .note.gnu.property, "a"; \ .p2align 3; \ @@ -35,6 +46,7 @@ .word value; \ .word 0; \ .text +#endif /* If set then the GNU Property Note section will be added to mark objects to support BTI and PAC-RET. */ @@ -55,19 +67,6 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) .cfi_startproc; \ BTI_C; -#else - -#define END_FILE - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - .cfi_startproc; - -#endif - #define ENTRY(name) ENTRY_ALIGN(name, 6) #define ENTRY_ALIAS(name) \ @@ -95,4 +94,13 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #define SIZE_ARG(n) #endif +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/check-arch.S b/contrib/arm-optimized-routines/string/aarch64/check-arch.S index 5a54242d7de6..131b7fa36ec2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/check-arch.S +++ b/contrib/arm-optimized-routines/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S index c2e967d1004e..948c3cbc7dd4 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,25 +23,21 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -50,55 +46,53 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - add result, srcin, synd, lsr 2 cmp cntin, synd, lsr 2 + add result, srcin, synd, lsr 2 csel result, result, xzr, hi ret + .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 16 + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 4 L(loop32): - ldr qdata, [src, 16]! + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16]! - subs cntrem, cntrem, 32 + ldr qdata, [src, 16] cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + subs cntrem, cntrem, 32 + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub cntrem, src, srcin fmov synd, dend - add tmp, srcin, cntin - sub cntrem, tmp, src + sub cntrem, cntin, cntrem #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S index c22e6596f19b..b851cf31f238 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S @@ -1,11 +1,11 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr.S b/contrib/arm-optimized-routines/string/aarch64/memchr.S index 353f0d1eac53..fe6cfe2bc0e2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S index 78c5ecaa4cdc..d52ce4555344 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S @@ -1,11 +1,11 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp.S b/contrib/arm-optimized-routines/string/aarch64/memcmp.S index 3b1026642eee..35135e72cc8e 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcmp.S @@ -1,103 +1,84 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. 
*/ - subs limit, limit, 16 + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -105,33 +86,105 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): ret -END (__memcmp_aarch64) +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. 
*/ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo + ret +END (__memcmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S index f97f2c3047b9..9d3027d4d3cd 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 @@ -56,11 +56,12 @@ ENTRY (__memcpy_aarch64_simd) PTR_ARG (1) SIZE_ARG (2) add srcend, src, count - add dstend, dstin, count cmp count, 128 b.hi L(copy_long) + add dstend, dstin, count cmp count, 32 b.hi L(copy32_128) + nop /* Small copies: 0..32 bytes. */ cmp count, 16 @@ -71,6 +72,18 @@ ENTRY (__memcpy_aarch64_simd) str B_q, [dstend, -16] ret + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 /* Copy 8-15 bytes. */ L(copy16): tbz count, 3, L(copy8) @@ -80,7 +93,6 @@ L(copy16): str A_h, [dstend, -8] ret - .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) @@ -90,31 +102,6 @@ L(copy8): str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes using a branchless sequence. */ -L(copy4): - cbz count, L(copy0) - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb C_lw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] -L(copy0): - ret - - .p2align 4 - /* Medium copies: 33..128 bytes. */ -L(copy32_128): - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - .p2align 4 /* Copy 65..128 bytes. */ L(copy128): ldp E_q, F_q, [src, 32] @@ -128,8 +115,24 @@ L(copy96): stp C_q, D_q, [dstend, -32] ret + /* Copy 0..3 bytes using a branchless sequence. 
*/ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 3 /* Copy more than 128 bytes. */ L(copy_long): + add dstend, dstin, count + /* Use backwards copy if there is an overlap. */ sub tmp1, dstin, src cmp tmp1, count @@ -166,6 +169,9 @@ L(copy64_from_end): stp A_q, B_q, [dstend, -32] ret + .p2align 4 + nop + /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S new file mode 100644 index 000000000000..b45c31418717 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S @@ -0,0 +1,21 @@ +/* + * memcpy using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memcpy_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ + .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ + .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ + ret + +END (__memcpy_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S new file mode 100644 index 000000000000..e8a946d7db37 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S @@ -0,0 +1,177 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * + */ + +#include "asmdefs.h" + +#ifdef HAVE_SVE + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. 
*/ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) + +#endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy.S b/contrib/arm-optimized-routines/string/aarch64/memcpy.S index dd254f6f9929..7c0606e2104a 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S new file mode 100644 index 000000000000..6c73017bb16f --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S @@ -0,0 +1,21 @@ +/* + * memmove using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memmove_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ + .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ + .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ + ret + +END (__memmove_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memrchr.S b/contrib/arm-optimized-routines/string/aarch64/memrchr.S index 7b4be847cecb..6418bdf56f41 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memrchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,7 +23,6 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 @@ -31,19 +30,16 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -69,34 +62,36 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! 
+ ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-mops.S b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S new file mode 100644 index 000000000000..ec791493bae9 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S @@ -0,0 +1,20 @@ +/* + * memset using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memset_aarch64_mops) + PTR_ARG (0) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ + .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ + .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ + ret + +END (__memset_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memset.S b/contrib/arm-optimized-routines/string/aarch64/memset.S index 9fcd97579913..553b0fcaefea 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memset.S +++ b/contrib/arm-optimized-routines/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S deleted file mode 100644 index f1c711906515..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S +++ /dev/null @@ -1,10 +0,0 @@ -/* - * stpcpy - copy a string returning pointer to end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define BUILD_STPCPY 1 - -#include "strcpy-mte.S" diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S index 82dd9717b0a0..5d3f14b86026 100644 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S index 4f62aa462389..155c68d75a7b 100644 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S index dcb0e4625870..6ec08f7acc76 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,8 +19,7 @@ #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 @@ -28,39 +27,30 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! 
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S index 13ba9f44f9c5..ff075167bfef 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S @@ -1,11 +1,11 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr.S b/contrib/arm-optimized-routines/string/aarch64/strchr.S index 1063cbfd77aa..37193bd947a7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S index 1b0d0a63094c..543ee88bb285 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -20,38 +20,32 @@ #define src x2 #define tmp1 x1 #define tmp2 x3 -#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vrepmask v4 -#define vend v5 -#define dend d5 +#define vend v4 +#define dend d4 -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. 
Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov tmp2w, 0xf00f - dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) @@ -63,15 +57,22 @@ ENTRY (__strchrnul_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + add src, src, 16 fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S index 428ff1a3d008..0005f9177514 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S index a4230d919b47..666e8d0304c1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S deleted file mode 100644 index 12d1a6b51dd3..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S +++ /dev/null @@ -1,189 +0,0 @@ -/* - * strcmp - compare two strings - * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -/* Assumptions: - * - * ARMv8-a, AArch64. 
- * MTE compatible. - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -#define src1 x0 -#define src2 x1 -#define result x0 - -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define off1 x5 -#define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - - -ENTRY (__strcmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 - b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - -L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 -L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul -L(end): -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 - ret - - .p2align 4 - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp - b L(start_realigned) - -L(misaligned8): - /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) -L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
*/ - b.ne L(done) - tst src1, 7 - b.ne L(do_misaligned) - -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul - b L(end) - -L(done): - sub result, data1, data2 - ret - -END (__strcmp_aarch64_mte) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S index e6d2da5411ca..eaf909a378f1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S @@ -1,11 +1,11 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp.S b/contrib/arm-optimized-routines/string/aarch64/strcmp.S index 7714ebf5577d..137a9aa06681 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcmp.S @@ -1,168 +1,184 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. 
*/ + - /* Start of performance-critical section -- one 64B cache line. */ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. 
*/ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S deleted file mode 100644 index 88c222d61e53..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S +++ /dev/null @@ -1,161 +0,0 @@ -/* - * strcpy/stpcpy - copy a string returning pointer to start/end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. 
- */ - -#include "../asmdefs.h" - -#define dstin x0 -#define srcin x1 -#define result x0 - -#define src x2 -#define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define wtmp w5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 -#define dataq2 q1 - -#ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64_mte -# define IFSTPCPY(X,...) X,__VA_ARGS__ -#else -# define STRCPY __strcpy_aarch64_mte -# define IFSTPCPY(X,...) -#endif - -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ - -ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - bic src, srcin, 15 - mov wtmp, 0xf00f - ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4,,8 -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 - - .p2align 4 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] - str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(start_loop): - sub len, src, srcin - ldr dataq2, [srcin] - add dst, dstin, len - str dataq2, [dstin] - - .p2align 5 -L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! 
- cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - fmov synd, dend -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - clz len, synd - lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) - ret - -END (STRCPY) diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S index f515462e09ae..00e72dce4451 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S @@ -1,11 +1,11 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy.S b/contrib/arm-optimized-routines/string/aarch64/strcpy.S index 6e9ed424b693..97ae37ea4229 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcpy.S @@ -1,311 +1,156 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. - - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY __stpcpy_aarch64 -#else -#define STRCPY __strcpy_aarch64 -#endif - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. 
We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define MIN_PAGE_P2 12 +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) #endif -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect __strcpy_aarch64 to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. */ - b.gt L(page_cross) - -L(page_cross_ok): - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! 
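The "Core algorithm" comment above describes the nibble-mask syndrome used by the MTE-safe routines: cmeq marks each matching byte with 0xff, and a shift-right-and-narrow by 4 folds the 16-byte vector into a 64-bit mask carrying four bits per byte, so counting trailing zeros (rbit+clz in the code) and dividing by four gives the index of the first NUL. Below is a rough scalar C sketch of that idea only; it is not part of the patch, first_nul_in_chunk is an assumed name, and __builtin_ctzll assumes GCC/Clang.

#include <stdint.h>

/* Illustrative sketch: build the 4-bits-per-byte syndrome that cmeq+shrn
   produce for one 16-byte chunk, then locate the first NUL byte with a
   count-trailing-zeros.  Little-endian order is assumed, matching the
   non-__AARCH64EB__ path. */
static int
first_nul_in_chunk (const unsigned char *chunk)	/* 16 bytes, aligned */
{
  uint64_t syndrome = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == 0)
      syndrome |= (uint64_t) 0xf << (4 * i);	/* four bits per byte, in order */
  if (syndrome == 0)
    return -1;					/* no NUL here, keep looping */
  return __builtin_ctzll (syndrome) / 4;	/* index of the first zero byte */
}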
+ cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq L(bulk_entry) + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -L(fp_gt8): - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_le8): - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt L(fp_lt4) -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_lt4): - cbz pos, L(fp_lt2) - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -L(fp_lt2): - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -L(bulk_entry): - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b L(entry_no_page_cross) - - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -L(main_loop): - stp data1, data2, [dst], #16 -L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. 
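The scalar tail handling above, and the strcmp/strncmp routines later in this diff, rest on the documented identity that (X - 1) & ~(X | 0x7f), applied bytewise across a 64-bit word, is non-zero iff the word contains a zero byte. A small C rendering of that check, illustrative only (has_nul_byte and first_nul_index are assumed names); it also shows why the little-endian path can take a count-trailing-zeros directly while the big-endian path must byte-reverse and recompute, as the carry-propagation comment explains.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of X is zero: (X - 1) & ~(X | 0x7f), evaluated
   for every byte of the word in parallel. */
static uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Little-endian only: a borrow can corrupt bits above the first zero
   byte but never below it, so the lowest set bit still marks the first
   NUL; big-endian must byte-swap the data and redo the check. */
static int
first_nul_index (uint64_t x)
{
  uint64_t m = has_nul_byte (x);
  return m ? __builtin_ctzll (m) / 8 : -1;	/* byte index, or -1 if none */
}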
*/ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -L(page_cross): - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(page_cross_ok) - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. */ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, L(fp_le8) - bic has_nul2, tmp3, tmp4 - b L(fp_gt8) + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret END (STRCPY) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S index 7cf41d5c1eac..77235797f7c5 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 @@ -19,35 +19,26 @@ #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -59,19 +50,25 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub src, src, 16 +L(loop_end): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S index 2392493f1a3c..12ebbdba5c93 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S @@ -1,11 +1,11 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen.S b/contrib/arm-optimized-routines/string/aarch64/strlen.S index a1b164a49238..6f6f08f636b2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 @@ -36,6 +36,7 @@ #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,6 +138,7 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 @@ -153,18 +154,12 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. */ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -173,8 +168,6 @@ L(loop): add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S deleted file mode 100644 index c9d6fc8a158b..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S +++ /dev/null @@ -1,307 +0,0 @@ -/* - * strncmp - compare two strings - * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define mask x13 -#define endloop x14 -#define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif - -ENTRY (__strncmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - and count, src1, #7 - b.ne L(misaligned8) - cbnz count, L(mutual_align) - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - .p2align 4 -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit, limit, #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq L(loop_aligned) - /* End of main loop */ - -L(full_check): -#ifndef __AARCH64EB__ - orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. 
*/ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ - rev syndrome, syndrome - rev data1, data1 - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - cmp limit, pos, lsr #3 - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - csel result, result, xzr, hi - ret -#else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. */ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ -L(end_quick): - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b L(start_realigned) - - .p2align 4 - /* Don't bother with dwords for up to 16 bytes. */ -L(misaligned8): - cmp limit, #16 - b.hs L(try_misaligned_words) - -L(byte_loop): - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(byte_loop) -L(done): - sub result, data1, data2 - ret - /* Align the SRC1 to a dword by doing a bytewise compare and then do - the dword loop. */ -L(try_misaligned_words): - cbz count, L(src1_aligned) - - neg count, count - and count, count, #7 - sub limit, limit, count - -L(page_end_loop): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. 
*/ - b.ne L(done) - subs count, count, #1 - b.hi L(page_end_loop) - - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. */ - tbnz offset, 6, L(misaligned_mid_loop) - -L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. */ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. 
*/ - cbnz tmp3, L(syndrome_check) - - ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) - -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif - -L(ret0): - mov result, #0 - ret -END(__strncmp_aarch64_mte) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S index 234190e245b0..6a9e9f7b6437 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S @@ -1,11 +1,11 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp.S b/contrib/arm-optimized-routines/string/aarch64/strncmp.S index 738b6539cab6..128a10c52bb1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,10 +35,24 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -63,56 +74,52 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit_wd, limit_wd, #1 + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. 
*/ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -133,10 +140,11 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -158,22 +166,12 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -196,13 +194,11 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. 
*/ L(try_misaligned_words): - lsr limit_wd, limit, #3 - cbz count, L(do_misaligned) + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -213,48 +209,100 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) -L(do_misaligned): - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo L(done_loop) -L(loop_misaligned): - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, L(page_end_loop) + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) - subs limit_wd, limit_wd, #1 - b.pl L(loop_misaligned) + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -L(done_loop): - /* We found a difference or a NULL before the limit was reached. */ - and limit, limit, #7 - cbz limit, L(not_limit) - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. 
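The diagram above keeps SRC1 8-byte aligned while SRC2 is read 16 aligned bytes at a time; each step rebuilds the next unaligned 8 bytes of SRC2 by shifting two registers toward each other with LS_FW/LS_BK and OR-ing them. A little-endian C sketch of that STEP_A combination, illustrative only (combine_unaligned and its parameters are assumed names, and the explicit zero-offset guard stands in for the mod-64 behaviour of the hardware shifts).

#include <stdint.h>

/* Rebuild 8 unaligned bytes of SRC2 from two aligned 8-byte loads:
   shift the word holding the string start right past its leading bytes,
   shift the following word left, and merge the halves (little-endian).
   byte_off is SRC2's offset from 8-byte alignment. */
static uint64_t
combine_unaligned (uint64_t lo, uint64_t hi, unsigned byte_off)
{
  unsigned bits = 8 * byte_off;		/* the "offset" register, in bits */
  if (bits == 0)
    return lo;				/* already aligned: nothing to merge */
  return (lo >> bits) | (hi << (64 - bits));
}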
This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif L(ret0): mov result, #0 ret - -END ( __strncmp_aarch64) +END(__strncmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S index 5b9ebf7763bc..6c43dc427da7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S @@ -1,11 +1,11 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen.S b/contrib/arm-optimized-routines/string/aarch64/strnlen.S index 48d2495d2082..f2090a7485a5 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strnlen.S +++ b/contrib/arm-optimized-routines/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,39 +20,30 @@ #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. 
A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -64,37 +55,40 @@ L(finish): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -104,9 +98,5 @@ L(end): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S index 1e4fb1a68f7e..bb61ab9ad4e7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,6 @@ #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -31,7 +30,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S index d36d69af37fd..825a7384cfc1 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S @@ -1,11 +1,11 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr.S b/contrib/arm-optimized-routines/string/aarch64/strrchr.S index 56185ff534e3..bf9cb297b6cb 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/contrib/arm-optimized-routines/string/arm/asmdefs.h b/contrib/arm-optimized-routines/string/arm/asmdefs.h new file mode 100644 index 000000000000..e31188804716 --- /dev/null +++ b/contrib/arm-optimized-routines/string/arm/asmdefs.h @@ -0,0 +1,477 @@ +/* + * Macros for asm code. Arm version. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Check whether leaf function PAC signing has been requested in the + -mbranch-protect compile-time option. */ +#define LEAF_PROTECT_BIT 2 + +#ifdef __ARM_FEATURE_PAC_DEFAULT +# define HAVE_PAC_LEAF \ + ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1) +#else +# define HAVE_PAC_LEAF 0 +#endif + +/* Provide default parameters for PAC-code handling in leaf-functions. */ +#if HAVE_PAC_LEAF +# ifndef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 1 +# endif +#else /* !HAVE_PAC_LEAF */ +# undef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 0 +#endif /* HAVE_PAC_LEAF */ + +#define STACK_ALIGN_ENFORCE 0 + +/****************************************************************************** +* Implementation of the prologue and epilogue assembler macros and their +* associated helper functions. +* +* These functions add support for the following: +* +* - M-profile branch target identification (BTI) landing-pads when compiled +* with `-mbranch-protection=bti'. +* - PAC-signing and verification instructions, depending on hardware support +* and whether the PAC-signing of leaf functions has been requested via the +* `-mbranch-protection=pac-ret+leaf' compiler argument. +* - 8-byte stack alignment preservation at function entry, defaulting to the +* value of STACK_ALIGN_ENFORCE. +* +* Notes: +* - Prologue stack alignment is implemented by detecting a push with an odd +* number of registers and prepending a dummy register to the list. +* - If alignment is attempted on a list containing r0, compilation will result +* in an error. +* - If alignment is attempted in a list containing r1, r0 will be prepended to +* the register list and r0 will be restored prior to function return. for +* functions with non-void return types, this will result in the corruption of +* the result register. +* - Stack alignment is enforced via the following helper macro call-chain: +* +* {prologue|epilogue} ->_align8 -> _preprocess_reglist -> +* _preprocess_reglist1 -> {_prologue|_epilogue} +* +* - Debug CFI directives are automatically added to prologues and epilogues, +* assisted by `cfisavelist' and `cfirestorelist', respectively. +* +* Arguments: +* prologue +* -------- +* - first - If `last' specified, this serves as start of general-purpose +* register (GPR) range to push onto stack, otherwise represents +* single GPR to push onto stack. If omitted, no GPRs pushed +* onto stack at prologue. +* - last - If given, specifies inclusive upper-bound of GPR range. +* - push_ip - Determines whether IP register is to be pushed to stack at +* prologue. When pac-signing is requested, this holds the +* the pac-key. Either 1 or 0 to push or not push, respectively. +* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro. +* - push_lr - Determines whether to push lr to the stack on function entry. +* Either 1 or 0 to push or not push, respectively. 
+* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting +* alignment. +* +* epilogue +* -------- +* The epilogue should be called passing the same arguments as those passed to +* the prologue to ensure the stack is not corrupted on function return. +* +* Usage examples: +* +* prologue push_ip=1 -> push {ip} +* epilogue push_ip=1, align8=1 -> pop {r2, ip} +* prologue push_ip=1, push_lr=1 -> push {ip, lr} +* epilogue 1 -> pop {r1} +* prologue 1, align8=1 -> push {r0, r1} +* epilogue 1, push_ip=1 -> pop {r1, ip} +* prologue 1, 4 -> push {r1-r4} +* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip} +* +******************************************************************************/ + +/* Emit .cfi_restore directives for a consecutive sequence of registers. */ + .macro cfirestorelist first, last + .cfi_restore \last + .if \last-\first + cfirestorelist \first, \last-1 + .endif + .endm + +/* Emit .cfi_offset directives for a consecutive sequence of registers. */ + .macro cfisavelist first, last, index=1 + .cfi_offset \last, -4*(\index) + .if \last-\first + cfisavelist \first, \last-1, \index+1 + .endif + .endm + +.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _prologue \first, \first, \push_ip, \push_lr + .exitm + .endif + .endif +#if HAVE_PAC_LEAF +# if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +# else + pac ip, lr, sp +# endif /* __ARM_FEATURE_BTI_DEFAULT */ + .cfi_register 143, 12 +#else +# if __ARM_FEATURE_BTI_DEFAULT + bti +# endif /* __ARM_FEATURE_BTI_DEFAULT */ +#endif /* HAVE_PAC_LEAF */ + .if \first != -1 + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: push register range, ip and lr registers. */ + push {r\first-r\last, ip, lr} + .cfi_adjust_cfa_offset ((\last-\first)+3)*4 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \last, 3 + .else // !\push_lr + /* Case 2: push register range and ip register. */ + push {r\first-r\last, ip} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 143, -4 + cfisavelist \first, \last, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: push register range and lr register. */ + push {r\first-r\last, lr} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 14, -4 + cfisavelist \first, \last, 2 + .else // !\push_lr + /* Case 4: push register range. */ + push {r\first-r\last} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + cfisavelist \first, \last, 1 + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: push single GP register plus ip and lr registers. */ + push {r\first, ip, lr} + .cfi_adjust_cfa_offset 12 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \first, 3 + .else // !\push_lr + /* Case 6: push single GP register plus ip register. */ + push {r\first, ip} + .cfi_adjust_cfa_offset 8 + .cfi_offset 143, -4 + cfisavelist \first, \first, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: push single GP register plus lr register. */ + push {r\first, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + cfisavelist \first, \first, 2 + .else // !\push_lr + /* Case 8: push single GP register. 
*/ + push {r\first} + .cfi_adjust_cfa_offset 4 + cfisavelist \first, \first, 1 + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: push ip and lr registers. */ + push {ip, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + .else // !\push_lr + /* Case 10: push ip register. */ + push {ip} + .cfi_adjust_cfa_offset 4 + .cfi_offset 143, -4 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: push lr register. */ + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_offset 14, -4 + .endif + .endif + .endif +.endm + +.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _epilogue \first, \first, \push_ip, \push_lr + .exitm + .endif + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: pop register range, ip and lr registers. */ + pop {r\first-r\last, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 2: pop register range and ip register. */ + pop {r\first-r\last, ip} + .cfi_register 143, 12 + cfirestorelist \first, \last + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: pop register range and lr register. */ + pop {r\first-r\last, lr} + .cfi_restore 14 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 4: pop register range. */ + pop {r\first-r\last} + cfirestorelist \first, \last + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: pop single GP register plus ip and lr registers. */ + pop {r\first, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 6: pop single GP register plus ip register. */ + pop {r\first, ip} + .cfi_register 143, 12 + cfirestorelist \first, \first + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: pop single GP register plus lr register. */ + pop {r\first, lr} + .cfi_restore 14 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 8: pop single GP register. */ + pop {r\first} + cfirestorelist \first, \first + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: pop ip and lr registers. */ + pop {ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + .else // !\push_lr + /* Case 10: pop ip register. */ + pop {ip} + .cfi_register 143, 12 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: pop lr register. */ + pop {lr} + .cfi_restore 14 + .endif + .endif + .endif +#if HAVE_PAC_LEAF + aut ip, lr, sp +#endif /* HAVE_PAC_LEAF */ + bx lr +.endm + +/* Clean up expressions in 'last'. 
*/ +.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req + .if \last == 0 + \reglist_op \first, 0, \push_ip, \push_lr + .elseif \last == 1 + \reglist_op \first, 1, \push_ip, \push_lr + .elseif \last == 2 + \reglist_op \first, 2, \push_ip, \push_lr + .elseif \last == 3 + \reglist_op \first, 3, \push_ip, \push_lr + .elseif \last == 4 + \reglist_op \first, 4, \push_ip, \push_lr + .elseif \last == 5 + \reglist_op \first, 5, \push_ip, \push_lr + .elseif \last == 6 + \reglist_op \first, 6, \push_ip, \push_lr + .elseif \last == 7 + \reglist_op \first, 7, \push_ip, \push_lr + .elseif \last == 8 + \reglist_op \first, 8, \push_ip, \push_lr + .elseif \last == 9 + \reglist_op \first, 9, \push_ip, \push_lr + .elseif \last == 10 + \reglist_op \first, 10, \push_ip, \push_lr + .elseif \last == 11 + \reglist_op \first, 11, \push_ip, \push_lr + .else + .error "last (\last) out of range" + .endif +.endm + +/* Clean up expressions in 'first'. */ +.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req + .ifb \last + _preprocess_reglist \first \first \push_ip \push_lr + .else + .if \first > \last + .error "last (\last) must be at least as great as first (\first)" + .endif + .if \first == 0 + _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 1 + _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 2 + _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 3 + _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 4 + _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 5 + _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 6 + _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 7 + _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 8 + _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 9 + _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 10 + _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 11 + _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op + .else + .error "first (\first) out of range" + .endif + .endif +.endm + +.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue + .ifb \first + .ifnb \last + .error "can't have last (\last) without specifying first" + .else // \last not blank + .if ((\push_ip + \push_lr) % 2) == 0 + \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr + .exitm + .else // ((\push_ip + \push_lr) % 2) odd + _align8 2, 2, \push_ip, \push_lr, \reglist_op + .exitm + .endif // ((\push_ip + \push_lr) % 2) == 0 + .endif // .ifnb \last + .endif // .ifb \first + + .ifb \last + _align8 \first, \first, \push_ip, \push_lr, \reglist_op + .else + .if \push_ip & 1 <> \push_ip + .error "push_ip may be 0 or 1" + .endif + .if \push_lr & 1 <> \push_lr + .error "push_lr may be 0 or 1" + .endif + .ifeq (\last - \first + \push_ip + \push_lr) % 2 + .if \first == 0 + .error "Alignment required and first register is r0" + .exitm + .endif + _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op + .else + _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op + .endif + .endif +.endm + +.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, _prologue + 
.else + _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue + .else + _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .fnstart; \ + .cfi_startproc; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#if defined (IS_LEAF) +# define END_UNWIND .cantunwind; +#else +# define END_UNWIND +#endif + +#define END(name) \ + .cfi_endproc; \ + END_UNWIND \ + .fnend; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/contrib/arm-optimized-routines/string/arm/check-arch.S b/contrib/arm-optimized-routines/string/arm/check-arch.S index 1cff9345e343..95516710fb85 100644 --- a/contrib/arm-optimized-routines/string/arm/check-arch.S +++ b/contrib/arm-optimized-routines/string/arm/check-arch.S @@ -1,10 +1,13 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__arm__ # error ARCH setting does not match the compiler. #endif + +/* For attributes that may affect ABI. */ +#include "asmdefs.h" diff --git a/contrib/arm-optimized-routines/string/arm/memchr.S b/contrib/arm-optimized-routines/string/arm/memchr.S index 3f1ac4df136f..823d6013eb35 100644 --- a/contrib/arm-optimized-routines/string/arm/memchr.S +++ b/contrib/arm-optimized-routines/string/arm/memchr.S @@ -1,8 +1,8 @@ /* * memchr - scan memory for a character * - * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2010-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* @@ -23,7 +23,11 @@ @ Removed unneeded cbz from align loop .syntax unified +#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M' + /* keep config inherited from -march= */ +#else .arch armv7-a +#endif @ this lets us check a flag in a 00/ff byte easily in either endianness #ifdef __ARMEB__ @@ -32,6 +36,8 @@ #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb +#include "asmdefs.h" + @ --------------------------------------------------------------------------- .thumb_func @@ -39,11 +45,14 @@ .p2align 4,,15 .global __memchr_arm .type __memchr_arm,%function + .fnstart + .cfi_startproc __memchr_arm: @ r0 = start of memory to scan @ r1 = character to look for @ r2 = length @ returns r0 = pointer to character or NULL if not found + prologue and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char cmp r2,#16 @ If it's short don't bother with anything clever @@ -64,6 +73,11 @@ __memchr_arm: 10: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 bic r4, r2, #7 @ Number of double words to work with @@ -83,6 +97,11 @@ __memchr_arm: bne 15b @ (Flags from the subs above) If not run out of bytes then go around again pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 and r1,r1,#0xff @ Get r1 back to a single character from the expansion above and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done @@ -97,16 +116,25 @@ __memchr_arm: bne 21b @ on r2 flags 40: + .cfi_remember_state movs r0,#0 @ not found - bx lr + epilogue 50: + .cfi_restore_state + .cfi_remember_state subs r0,r0,#1 @ found - bx lr + epilogue 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @ r5 has the 00/ff pattern for the first word, r6 has the chained value + .cfi_restore_state @ Standard post-prologue state + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 cmp r5, #0 itte eq moveq r5, r6 @ the end is in the 2nd word @@ -126,7 +154,15 @@ __memchr_arm: 61: pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 subs r0,r0,#1 - bx lr + epilogue + .cfi_endproc + .cantunwind + .fnend .size __memchr_arm, . - __memchr_arm diff --git a/contrib/arm-optimized-routines/string/arm/memcpy.S b/contrib/arm-optimized-routines/string/arm/memcpy.S index 86e64938edb1..2423cfd69061 100644 --- a/contrib/arm-optimized-routines/string/arm/memcpy.S +++ b/contrib/arm-optimized-routines/string/arm/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* @@ -17,7 +17,7 @@ */ -#include "../asmdefs.h" +#include "asmdefs.h" .syntax unified /* This implementation requires ARM state. 
*/ diff --git a/contrib/arm-optimized-routines/string/arm/memset.S b/contrib/arm-optimized-routines/string/arm/memset.S index 11e927368fd1..487b9d6a8f6c 100644 --- a/contrib/arm-optimized-routines/string/arm/memset.S +++ b/contrib/arm-optimized-routines/string/arm/memset.S @@ -2,7 +2,7 @@ * memset - fill memory with a constant * * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* diff --git a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S index b75d4143db57..4d55306810ad 100644 --- a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S +++ b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S @@ -1,10 +1,12 @@ /* * strcmp for ARMv6-M (optimized for performance, not size) * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "asmdefs.h" + #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 .thumb_func diff --git a/contrib/arm-optimized-routines/string/arm/strcmp.S b/contrib/arm-optimized-routines/string/arm/strcmp.S index 51443e343058..74b3d235fb18 100644 --- a/contrib/arm-optimized-routines/string/arm/strcmp.S +++ b/contrib/arm-optimized-routines/string/arm/strcmp.S @@ -1,8 +1,8 @@ /* * strcmp for ARMv7 * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 @@ -12,7 +12,7 @@ is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first @@ -26,6 +26,11 @@ #define STRCMP_NO_PRECHECK 0 +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This version uses Thumb-2 code. */ .thumb .syntax unified @@ -98,8 +103,9 @@ ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1, lsr #24 - bx lr + epilogue push_ip=HAVE_PAC_LEAF #else /* To use the big-endian trick we'd have to reverse all three words. that's slower than this approach. */ @@ -119,21 +125,15 @@ ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1 - bx lr + epilogue push_ip=HAVE_PAC_LEAF #endif .endm - .p2align 5 -L(strcmp_start_addr): -#if STRCMP_NO_PRECHECK == 0 -L(fastpath_exit): - sub r0, r2, r3 - bx lr - nop -#endif -ENTRY_ALIGN (__strcmp_arm, 0) +ENTRY(__strcmp_arm) + prologue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] @@ -143,13 +143,13 @@ ENTRY_ALIGN (__strcmp_arm, 0) bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! 
- .cfi_def_cfa_offset 16 - .cfi_offset 4, -16 - .cfi_offset 5, -12 + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 orr tmp1, src1, src2 strd r6, r7, [sp, #8] - .cfi_offset 6, -8 - .cfi_offset 7, -4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 mvn const_m1, #0 lsl r2, tmp1, #29 cbz r2, L(loop_aligned8) @@ -318,10 +318,19 @@ L(misaligned_exit): mov result, tmp1 ldr r4, [sp], #16 .cfi_restore 4 - bx lr + .cfi_adjust_cfa_offset -16 + epilogue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 +L(fastpath_exit): + .cfi_restore_state + .cfi_remember_state + sub r0, r2, r3 + epilogue push_ip=HAVE_PAC_LEAF + L(aligned_m1): + .cfi_restore_state + .cfi_remember_state add src2, src2, #4 #endif L(src1_aligned): @@ -368,9 +377,9 @@ L(overlap3): /* R6/7 Not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 neg result, result - bx lr - + epilogue push_ip=HAVE_PAC_LEAF 6: .cfi_restore_state S2LO data1, data1, #24 @@ -445,7 +454,8 @@ L(strcmp_done_equal): /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 - bx lr + .cfi_adjust_cfa_offset -16 + epilogue push_ip=HAVE_PAC_LEAF L(strcmp_tail): .cfi_restore_state @@ -467,8 +477,9 @@ L(strcmp_tail): /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 sub result, result, data2, lsr #24 - bx lr + epilogue push_ip=HAVE_PAC_LEAF END (__strcmp_arm) diff --git a/contrib/arm-optimized-routines/string/arm/strcpy.c b/contrib/arm-optimized-routines/string/arm/strcpy.c index 02cf94ff4be0..b5728a2534f0 100644 --- a/contrib/arm-optimized-routines/string/arm/strcpy.c +++ b/contrib/arm-optimized-routines/string/arm/strcpy.c @@ -2,7 +2,7 @@ * strcpy * * Copyright (c) 2008-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if defined (__thumb2__) && !defined (__thumb__) diff --git a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S index 5ad30c941586..5eb8671bdc8b 100644 --- a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S +++ b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string * - * Copyright (c) 2010-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2010-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 @@ -13,7 +13,7 @@ */ -#include "../asmdefs.h" +#include "asmdefs.h" #ifdef __ARMEB__ #define S2LO lsl @@ -23,6 +23,11 @@ #define S2HI lsl #endif +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This code requires Thumb. */ .thumb .syntax unified @@ -41,8 +46,8 @@ #define tmp2 r5 ENTRY (__strlen_armv6t2) + prologue 4 5 push_ip=HAVE_PAC_LEAF pld [srcin, #0] - strd r4, r5, [sp, #-8]! bic src, srcin, #7 mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ @@ -92,6 +97,7 @@ L(start_realigned): beq L(loop_aligned) L(null_found): + .cfi_remember_state cmp data1a, #0 itt eq addeq result, result, #4 @@ -100,11 +106,11 @@ L(null_found): rev data1a, data1a #endif clz data1a, data1a - ldrd r4, r5, [sp], #8 add result, result, data1a, lsr #3 /* Bits -> Bytes. 
*/ - bx lr + epilogue 4 5 push_ip=HAVE_PAC_LEAF L(misaligned8): + .cfi_restore_state ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 diff --git a/contrib/arm-optimized-routines/string/bench/memcpy.c b/contrib/arm-optimized-routines/string/bench/memcpy.c index d5d4ea7e0309..b628f9b60d96 100644 --- a/contrib/arm-optimized-routines/string/bench/memcpy.c +++ b/contrib/arm-optimized-routines/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,14 +13,15 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 500000 -#define MAX_COPIES 8192 -#define SIZE (256*1024) +#define ITERS3 200000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) -static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); -static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); +static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); #define F(x) {#x, x}, @@ -30,15 +31,21 @@ static const struct fun void *(*fun)(void *, const void *, size_t); } funtab[] = { - F(memcpy) #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops) +# endif #elif __arm__ F(__memcpy_arm) #endif + F(memcpy) #undef F {0, 0} }; @@ -109,7 +116,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t copy[MAX_COPIES]; +static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -140,14 +147,14 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. 
*/ - for (int i = 0; i < MAX_COPIES; i++) + for (int i = 0; i < NUM_TESTS; i++) { - copy[i].dst = (rand32 (0) & (max_size - 1)); - copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].src = (rand32 (0) & (max_size - 1)); - copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += copy[i].len; + test_arr[i].dst = (rand32 (0) & (max_size - 1)); + test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].src = (rand32 (0) & (max_size - 1)); + test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; } return total; @@ -160,25 +167,27 @@ int main (void) memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); - printf("Random memcpy:\n"); + printf("Random memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); rand32 (0x12345678); - for (int size = 16384; size <= SIZE; size *= 2) + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; @@ -187,74 +196,147 @@ int main (void) printf( "avg %.2f\n", (double)total / tsum); } - printf ("\nMedium memcpy:\n"); + size_t total = 0; + uint64_t tsum = 0; + printf ("%22s ", "memcpy_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %.2f\n", (double)total / tsum); + + + printf ("\nAligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 16; size <= 512; size *= 2) + for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS2 / t); + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } - printf ("\nLarge memcpy:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nUnaligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nLarge memcpy (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned forwards memmove:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); + + + printf ("\nUnaligned forwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned backwards memmove:\n"); + printf ("\nUnaligned backwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } + printf ("\n"); return 0; } diff --git a/contrib/arm-optimized-routines/string/bench/memset.c b/contrib/arm-optimized-routines/string/bench/memset.c new file mode 100644 index 000000000000..990e23ba9a36 --- /dev/null +++ b/contrib/arm-optimized-routines/string/bench/memset.c @@ -0,0 +1,243 @@ +/* + * memset benchmark. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 1000000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) + +static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun)(void *, int, size_t); +} funtab[] = +{ +#if __aarch64__ + F(__memset_aarch64) +#elif __arm__ + F(__memset_arm) +#endif + F(memset) +#undef F + {0, 0} +}; + +typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; +static memset_test_t test_arr[NUM_TESTS]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t len_arr[SIZE_NUM]; + +/* Frequency data for memset sizes up to 4096 based on SPEC2017. */ +static freq_data_t memset_len_freq[] = +{ +{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, +{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, +{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, +{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, +{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, +{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, +{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, +{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, +{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, +{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, +{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, +{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, +{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, +{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, +{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, +{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, +{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, +{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, +{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, +{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, +{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, +{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, +{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, +{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, +{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, +{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, +{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, +{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, +{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK 
(ALIGN_NUM-1) +static uint8_t align_arr[ALIGN_NUM]; + +/* Alignment data for memset based on SPEC2017. */ +static align_data_t memset_align_freq[] = +{ + {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} +}; + +static void +init_memset_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) + for (j = 0, size = memset_len_freq[i].size; j < freq; j++) + len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) + for (j = 0, size = memset_align_freq[i].align; j < freq; j++) + align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_memset (size_t max_size) +{ + size_t total = 0; + /* Create a random set of memsets with the given size and alignment + distributions. */ + for (int i = 0; i < NUM_TESTS; i++) + { + test_arr[i].offset = (rand32 (0) & (max_size - 1)); + test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; + } + + return total; +} + + +int main (void) +{ + init_memset_distribution (); + + memset (a, 1, sizeof (a)); + + printf("Random memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", funtab[f].name); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + } + + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", "memset_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + + + printf ("\nMedium memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + + + printf ("\nLarge memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, 0, 
size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n\n"); + + return 0; +} diff --git a/contrib/arm-optimized-routines/string/bench/strlen.c b/contrib/arm-optimized-routines/string/bench/strlen.c index cc0f04bee547..f05d0d5b89e6 100644 --- a/contrib/arm-optimized-routines/string/bench/strlen.c +++ b/contrib/arm-optimized-routines/string/bench/strlen.c @@ -1,8 +1,8 @@ /* * strlen benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,10 +13,10 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 2000 +#define ITERS 5000 #define ITERS2 20000000 #define ITERS3 2000000 -#define NUM_STRLEN 16384 +#define NUM_TESTS 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 @@ -49,7 +49,7 @@ static const struct fun }; #undef F -static uint16_t strlen_tests[NUM_STRLEN]; +static uint16_t strlen_tests[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,7 +117,7 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_STRLEN; n++) + for (int n = 0; n < NUM_TESTS; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; @@ -141,14 +141,14 @@ int main (void) size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); diff --git a/contrib/arm-optimized-routines/string/include/benchlib.h b/contrib/arm-optimized-routines/string/include/benchlib.h index 0f2ce2eb6bce..f1bbea388cd2 100644 --- a/contrib/arm-optimized-routines/string/include/benchlib.h +++ b/contrib/arm-optimized-routines/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/include/stringlib.h b/contrib/arm-optimized-routines/string/include/stringlib.h index 378c3cd2d645..01da7ebfc18d 100644 --- a/contrib/arm-optimized-routines/string/include/stringlib.h +++ b/contrib/arm-optimized-routines/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stddef.h> @@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); -char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); -char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); -int __strcmp_aarch64_mte (const char *, const char *); -int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE +void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); @@ -54,6 +52,11 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if WANT_MOPS +void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_mops (void *, int, size_t); +# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c index d8c02d92d626..c45fa6662a77 100644 --- a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c +++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c index 221c223a2f31..a4a7861620d1 100644 --- a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c +++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/contrib/arm-optimized-routines/string/test/memchr.c b/contrib/arm-optimized-routines/string/test/memchr.c index 0ff77f5710bf..c6a94481c0ad 100644 --- a/contrib/arm-optimized-routines/string/test/memchr.c +++ b/contrib/arm-optimized-routines/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/test/memcmp.c b/contrib/arm-optimized-routines/string/test/memcmp.c index 7a7cf9cff35a..f9236b83a60d 100644 --- a/contrib/arm-optimized-routines/string/test/memcmp.c +++ b/contrib/arm-optimized-routines/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/test/memcpy.c b/contrib/arm-optimized-routines/string/test/memcpy.c index ce0ceeef5ee8..dc95844bd45a 100644 --- a/contrib/arm-optimized-routines/string/test/memcpy.c +++ b/contrib/arm-optimized-routines/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -28,6 +28,12 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops, 1) +# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/contrib/arm-optimized-routines/string/test/memmove.c b/contrib/arm-optimized-routines/string/test/memmove.c index 689b68c98af2..b85dd1e864ef 100644 --- a/contrib/arm-optimized-routines/string/test/memmove.c +++ b/contrib/arm-optimized-routines/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -28,6 +28,12 @@ static const struct fun # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memmove_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memmove_aarch64_mops, 1) +# endif #endif {0, 0, 0} // clang-format on diff --git a/contrib/arm-optimized-routines/string/test/memrchr.c b/contrib/arm-optimized-routines/string/test/memrchr.c index adf96f049cc9..4171a56daefd 100644 --- a/contrib/arm-optimized-routines/string/test/memrchr.c +++ b/contrib/arm-optimized-routines/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/contrib/arm-optimized-routines/string/test/memset.c b/contrib/arm-optimized-routines/string/test/memset.c index f1721442dbaf..7d09c267ffec 100644 --- a/contrib/arm-optimized-routines/string/test/memset.c +++ b/contrib/arm-optimized-routines/string/test/memset.c @@ -1,8 +1,8 @@ /* * memset test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -25,6 +25,9 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if WANT_MOPS + F(__memset_aarch64_mops, 1) +# endif #elif __arm__ F(__memset_arm, 0) #endif diff --git a/contrib/arm-optimized-routines/string/test/mte.h b/contrib/arm-optimized-routines/string/test/mte.h index e67cbd9d2d40..40b0ecf6c194 100644 --- a/contrib/arm-optimized-routines/string/test/mte.h +++ b/contrib/arm-optimized-routines/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef __TEST_MTE_H diff --git a/contrib/arm-optimized-routines/string/test/stpcpy.c b/contrib/arm-optimized-routines/string/test/stpcpy.c index 1827e68c9a30..0300892a1f3c 100644 --- a/contrib/arm-optimized-routines/string/test/stpcpy.c +++ b/contrib/arm-optimized-routines/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE @@ -28,8 +28,7 @@ static const struct fun // clang-format off F(stpcpy, 0) #if __aarch64__ - F(__stpcpy_aarch64, 0) - F(__stpcpy_aarch64_mte, 1) + F(__stpcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/strchr.c b/contrib/arm-optimized-routines/string/test/strchr.c index f3ae982ef0ad..66180acfb57c 100644 --- a/contrib/arm-optimized-routines/string/test/strchr.c +++ b/contrib/arm-optimized-routines/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/test/strchrnul.c b/contrib/arm-optimized-routines/string/test/strchrnul.c index 6c30ab2123f1..aad0bf59da66 100644 --- a/contrib/arm-optimized-routines/string/test/strchrnul.c +++ b/contrib/arm-optimized-routines/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/contrib/arm-optimized-routines/string/test/strcmp.c b/contrib/arm-optimized-routines/string/test/strcmp.c index d57b54ed50a8..4aa95f4f2f1d 100644 --- a/contrib/arm-optimized-routines/string/test/strcmp.c +++ b/contrib/arm-optimized-routines/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 0) - F(__strcmp_aarch64_mte, 1) + F(__strcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/strcpy.c b/contrib/arm-optimized-routines/string/test/strcpy.c index e84cace9c8c6..af297f90396a 100644 --- a/contrib/arm-optimized-routines/string/test/strcpy.c +++ b/contrib/arm-optimized-routines/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcpy, 0) #if __aarch64__ - F(__strcpy_aarch64, 0) - F(__strcpy_aarch64_mte, 1) + F(__strcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/stringtest.h b/contrib/arm-optimized-routines/string/test/stringtest.h index fe855fc21736..6bb7e1fdfeca 100644 --- a/contrib/arm-optimized-routines/string/test/stringtest.h +++ b/contrib/arm-optimized-routines/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <ctype.h> diff --git a/contrib/arm-optimized-routines/string/test/strlen.c b/contrib/arm-optimized-routines/string/test/strlen.c index 6278380f26df..47ef3dcf0ef0 100644 --- a/contrib/arm-optimized-routines/string/test/strlen.c +++ b/contrib/arm-optimized-routines/string/test/strlen.c @@ -1,15 +1,14 @@ /* * strlen test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <sys/mman.h> #include <limits.h> #include "mte.h" #include "stringlib.h" diff --git a/contrib/arm-optimized-routines/string/test/strncmp.c b/contrib/arm-optimized-routines/string/test/strncmp.c index 018a8a431ab8..4bbab6f93450 100644 --- a/contrib/arm-optimized-routines/string/test/strncmp.c +++ b/contrib/arm-optimized-routines/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 0) - F(__strncmp_aarch64_mte, 1) + F(__strncmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/strnlen.c b/contrib/arm-optimized-routines/string/test/strnlen.c index 0dea00eaf8e3..a800fd1993cd 100644 --- a/contrib/arm-optimized-routines/string/test/strnlen.c +++ b/contrib/arm-optimized-routines/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/contrib/arm-optimized-routines/string/test/strrchr.c b/contrib/arm-optimized-routines/string/test/strrchr.c index fedbdc52fcc1..580ca497f8a4 100644 --- a/contrib/arm-optimized-routines/string/test/strrchr.c +++ b/contrib/arm-optimized-routines/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/x86_64/check-arch.S b/contrib/arm-optimized-routines/string/x86_64/check-arch.S index 26ade0a0c7db..5afcf7b7ee54 100644 --- a/contrib/arm-optimized-routines/string/x86_64/check-arch.S +++ b/contrib/arm-optimized-routines/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__x86_64__
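As a sketch of how the new string/arm/asmdefs.h macros are meant to be consumed (not part of the diff above), a minimal Thumb-2 leaf routine following the pattern visible in the strcmp.S and strlen-armv6t2.S hunks might look like the following; the symbol name __return_zero_arm is hypothetical and chosen only for illustration:

	/* Minimal leaf routine using the ENTRY/prologue/epilogue/END macros
	   introduced by the arm/asmdefs.h file in this patch.  */
	#include "asmdefs.h"

	/* Leaf function: request the .cantunwind directive before .fnend,
	   as the strcmp.S and strlen-armv6t2.S hunks above do.  */
	#define IS_LEAF

		.thumb
		.syntax unified

	ENTRY (__return_zero_arm)
		/* Emits pacbti/pac and pushes ip when PAC leaf signing is
		   enabled; otherwise only a BTI landing pad if configured.  */
		prologue push_ip=HAVE_PAC_LEAF
		movs	r0, #0
		/* Pops ip and authenticates when PAC leaf signing is enabled,
		   then returns with bx lr.  */
		epilogue push_ip=HAVE_PAC_LEAF
	END (__return_zero_arm)

Building such a file with -mbranch-protection=pac-ret+leaf should make HAVE_PAC_LEAF evaluate to 1, so the prologue/epilogue pair signs and authenticates the return address via ip, mirroring what the strcmp.S and strlen-armv6t2.S changes in this patch do.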