Diffstat (limited to 'contrib/arm-optimized-routines/string')
80 files changed, 2377 insertions, 2038 deletions
diff --git a/contrib/arm-optimized-routines/string/Dir.mk b/contrib/arm-optimized-routines/string/Dir.mk
index cf3453f7580d..dd8283ec4977 100644
--- a/contrib/arm-optimized-routines/string/Dir.mk
+++ b/contrib/arm-optimized-routines/string/Dir.mk
@@ -1,7 +1,7 @@
 # Makefile fragment - requires GNU make
 #
 # Copyright (c) 2019-2021, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/string
 B := build/string
@@ -13,9 +13,12 @@ all-string bench-string check-string install-string clean-string:
 else
 
 string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-lib-srcs += $(wildcard $(S)/$(ARCH)/experimental/*.[cS])
 string-test-srcs := $(wildcard $(S)/test/*.c)
 string-bench-srcs := $(wildcard $(S)/bench/*.c)
 
+string-arch-include-dir := $(wildcard $(S)/$(ARCH))
+string-arch-includes := $(wildcard $(S)/$(ARCH)/*.h)
 string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
 
 string-libs := \
@@ -43,6 +46,7 @@ string-tests := \
 
 string-benches := \
 	build/bin/bench/memcpy \
+	build/bin/bench/memset \
 	build/bin/bench/strlen
 
 string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
@@ -64,8 +68,8 @@ string-files := \
 
 all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
 
-$(string-objs): $(string-includes)
-$(string-objs): CFLAGS_ALL += $(string-cflags)
+$(string-objs): $(string-includes) $(string-arch-includes)
+$(string-objs): CFLAGS_ALL += $(string-cflags) -I$(string-arch-include-dir)
 
 $(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
 
@@ -101,6 +105,7 @@ check-string: $(string-tests-out)
 bench-string: $(string-benches)
 	$(EMULATOR) build/bin/bench/strlen
 	$(EMULATOR) build/bin/bench/memcpy
+	$(EMULATOR) build/bin/bench/memset
 
 install-string: \
  $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
diff --git a/contrib/arm-optimized-routines/string/README.contributors b/contrib/arm-optimized-routines/string/README.contributors
new file mode 100644
index 000000000000..0b4a51b56366
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/README.contributors
@@ -0,0 +1,30 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc so
+   the GNU Coding Standard and glibc specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This e.g. means that internal symbols
+   should be hidden and in the implementation reserved namespace according to
+   ISO C and POSIX rules. If possible the built shared libraries and static
+   library archives should be usable to override libc symbols at link time (or
+   at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI
+   (other than symbol versioning), this cannot be done reliably for static
+   linking so this is a best effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY
+================================================
+1. Code:
+   - The assumptions of the code must be clearly documented.
+
+   - Assembly style should be consistent across different implementations.
+
+
+2. Performance:
+   - Benchmarking is needed on several microarchitectures.
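
Point 2 of the README above (hidden, reserved-namespace internal symbols behind an interposable public name) is the usual hidden-alias pattern. The C sketch below illustrates it in miniature; it is not part of this patch, the name __memcpy_sketch is made up for the example, and the byte-wise loop merely stands in for the optimized AArch64 routines.

/* Sketch only: a hidden, reserved-namespace implementation symbol plus a
   public alias that can override the libc symbol at link time or via
   LD_PRELOAD.  */
#include <stddef.h>

__attribute__ ((visibility ("hidden")))
void *
__memcpy_sketch (void *restrict dst, const void *restrict src, size_t n)
{
  /* Trivial byte copy; the real routines are hand-written assembly.  */
  unsigned char *d = dst;
  const unsigned char *s = src;
  while (n--)
    *d++ = *s++;
  return dst;
}

/* The public ABI name is only an alias of the hidden definition.  */
void *memcpy (void *restrict, const void *restrict, size_t)
  __attribute__ ((alias ("__memcpy_sketch")));

Built as a shared object, such a file exports only the public name, so it can interpose the libc symbol while keeping the implementation symbol private, which is the property the README asks for.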
diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S index 84339f73cf23..34b5789240da 100644 --- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S +++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING @@ -27,9 +27,6 @@ #define zva_val x4 ENTRY (__mtag_tag_region) - PTR_ARG (0) - SIZE_ARG (1) - add dstend, dstin, count cmp count, 96 diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S index f58364ca6fcb..2fa248e25621 100644 --- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S +++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING @@ -27,9 +27,6 @@ #define zva_val x4 ENTRY (__mtag_tag_zero_region) - PTR_ARG (0) - SIZE_ARG (1) - add dstend, dstin, count cmp count, 96 diff --git a/contrib/arm-optimized-routines/string/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h index 340b427a505b..90166676977a 100644 --- a/contrib/arm-optimized-routines/string/asmdefs.h +++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h @@ -1,15 +1,13 @@ /* - * Macros for asm code. + * Macros for asm code. AArch64 version. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _ASMDEFS_H #define _ASMDEFS_H -#if defined(__aarch64__) - /* Branch Target Identitication support. 
*/ #define BTI_C hint 34 #define BTI_J hint 36 @@ -55,19 +53,6 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) .cfi_startproc; \ BTI_C; -#else - -#define END_FILE - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - .cfi_startproc; - -#endif - #define ENTRY(name) ENTRY_ALIGN(name, 6) #define ENTRY_ALIAS(name) \ @@ -81,18 +66,4 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #define L(l) .L ## l -#ifdef __ILP32__ - /* Sanitize padding bits of pointer arguments as per aapcs64 */ -#define PTR_ARG(n) mov w##n, w##n -#else -#define PTR_ARG(n) -#endif - -#ifdef __ILP32__ - /* Sanitize padding bits of size arguments as per aapcs64 */ -#define SIZE_ARG(n) mov w##n, w##n -#else -#define SIZE_ARG(n) -#endif - #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/check-arch.S b/contrib/arm-optimized-routines/string/aarch64/check-arch.S index 5a54242d7de6..131b7fa36ec2 100644 --- a/contrib/arm-optimized-routines/string/aarch64/check-arch.S +++ b/contrib/arm-optimized-routines/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/memchr-sve.S index c22e6596f19b..b314551f3e0f 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/memchr-sve.S @@ -1,13 +1,14 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,8 +16,6 @@ */ ENTRY (__memchr_aarch64_sve) - PTR_ARG (0) - SIZE_ARG (2) dup z1.b, w1 /* duplicate c to a vector */ setffr /* initialize FFR */ mov x3, 0 /* initialize off */ @@ -59,6 +58,3 @@ ENTRY (__memchr_aarch64_sve) ret END (__memchr_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/memcmp-sve.S index 78c5ecaa4cdc..ad3534836d04 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/memcmp-sve.S @@ -1,13 +1,14 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,9 +16,6 @@ */ ENTRY (__memcmp_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) mov x3, 0 /* initialize off */ 0: whilelo p0.b, x3, x2 /* while off < max */ @@ -46,6 +44,3 @@ ENTRY (__memcmp_aarch64_sve) ret END (__memcmp_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/stpcpy-sve.S index 82dd9717b0a0..5d3f14b86026 100644 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strchr-sve.S index 13ba9f44f9c5..7d74ae9ff232 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strchr-sve.S @@ -1,13 +1,14 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -22,7 +23,6 @@ #endif ENTRY (FUNC) - PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ @@ -65,6 +65,3 @@ ENTRY (FUNC) b 0b END (FUNC) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strchrnul-sve.S index 428ff1a3d008..0005f9177514 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strchrnul-sve.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strcmp-sve.S index e6d2da5411ca..b6c249588534 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strcmp-sve.S @@ -1,13 +1,14 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,8 +16,6 @@ */ ENTRY (__strcmp_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) setffr /* initialize FFR */ ptrue p1.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ @@ -54,6 +53,3 @@ ENTRY (__strcmp_aarch64_sve) b 1b END (__strcmp_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strcpy-sve.S index f515462e09ae..57b77c8a00e7 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strcpy-sve.S @@ -1,13 +1,14 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -22,8 +23,6 @@ #endif ENTRY (FUNC) - PTR_ARG (0) - PTR_ARG (1) setffr /* initialize FFR */ ptrue p2.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ @@ -66,6 +65,3 @@ ENTRY (FUNC) ret END (FUNC) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strlen-sve.S index 2392493f1a3c..c83155052c07 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strlen-sve.S @@ -1,13 +1,14 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,7 +16,6 @@ */ ENTRY (__strlen_aarch64_sve) - PTR_ARG (0) setffr /* initialize FFR */ ptrue p2.b /* all ones; loop invariant */ mov x1, 0 /* initialize length */ @@ -50,6 +50,3 @@ ENTRY (__strlen_aarch64_sve) b 0b END (__strlen_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strncmp-sve.S index 234190e245b0..a281e642d8aa 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strncmp-sve.S @@ -1,13 +1,14 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,9 +16,6 @@ */ ENTRY (__strncmp_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) setffr /* initialize FFR */ mov x3, 0 /* initialize off */ @@ -64,6 +62,3 @@ ENTRY (__strncmp_aarch64_sve) ret END (__strncmp_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strnlen-sve.S index 5b9ebf7763bc..11d835a1b13c 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strnlen-sve.S @@ -1,13 +1,14 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,8 +16,6 @@ */ ENTRY (__strnlen_aarch64_sve) - PTR_ARG (0) - SIZE_ARG (1) setffr /* initialize FFR */ mov x2, 0 /* initialize len */ b 1f @@ -69,6 +68,3 @@ ENTRY (__strnlen_aarch64_sve) ret END (__strnlen_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/experimental/strrchr-sve.S index d36d69af37fd..731edaddf156 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S +++ b/contrib/arm-optimized-routines/string/aarch64/experimental/strrchr-sve.S @@ -1,13 +1,14 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" + +.arch armv8-a+sve -#if __ARM_FEATURE_SVE /* Assumptions: * * ARMv8-a, AArch64 @@ -15,7 +16,6 @@ */ ENTRY (__strrchr_aarch64_sve) - PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ @@ -79,6 +79,3 @@ ENTRY (__strrchr_aarch64_sve) ret END (__strrchr_aarch64_sve) - -#endif - diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S index c2e967d1004e..68bd0af9a8c5 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,82 +23,74 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. 
For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memchr_aarch64_mte) - PTR_ARG (0) - SIZE_ARG (2) bic src, srcin, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - add result, srcin, synd, lsr 2 cmp cntin, synd, lsr 2 + add result, srcin, synd, lsr 2 csel result, result, xzr, hi ret + .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 16 + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 4 L(loop32): - ldr qdata, [src, 16]! + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16]! - subs cntrem, cntrem, 32 + ldr qdata, [src, 16] cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + subs cntrem, cntrem, 32 + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub cntrem, src, srcin fmov synd, dend - add tmp, srcin, cntin - sub cntrem, tmp, src + sub cntrem, cntin, cntrem #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr.S b/contrib/arm-optimized-routines/string/aarch64/memchr.S index 353f0d1eac53..d12a38abbc30 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 @@ -47,8 +47,6 @@ */ ENTRY (__memchr_aarch64) - PTR_ARG (0) - SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. 
*/ cbz cntin, L(zero_length) /* diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp.S b/contrib/arm-optimized-routines/string/aarch64/memcmp.S index 3b1026642eee..43439de4db69 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcmp.S @@ -1,103 +1,80 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 -ENTRY (__memcmp_aarch64) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 +ENTRY (__memcmp_aarch64) + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. 
*/ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -105,33 +82,105 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): ret -END (__memcmp_aarch64) +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo + ret +END (__memcmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S index f97f2c3047b9..cbf4c581500e 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 @@ -52,15 +52,13 @@ ENTRY_ALIAS (__memmove_aarch64_simd) ENTRY (__memcpy_aarch64_simd) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) add srcend, src, count - add dstend, dstin, count cmp count, 128 b.hi L(copy_long) + add dstend, dstin, count cmp count, 32 b.hi L(copy32_128) + nop /* Small copies: 0..32 bytes. */ cmp count, 16 @@ -71,6 +69,18 @@ ENTRY (__memcpy_aarch64_simd) str B_q, [dstend, -16] ret + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 /* Copy 8-15 bytes. */ L(copy16): tbz count, 3, L(copy8) @@ -80,7 +90,6 @@ L(copy16): str A_h, [dstend, -8] ret - .p2align 3 /* Copy 4-7 bytes. */ L(copy8): tbz count, 2, L(copy4) @@ -90,31 +99,6 @@ L(copy8): str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes using a branchless sequence. */ -L(copy4): - cbz count, L(copy0) - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb C_lw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb C_lw, [dstend, -1] -L(copy0): - ret - - .p2align 4 - /* Medium copies: 33..128 bytes. */ -L(copy32_128): - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - .p2align 4 /* Copy 65..128 bytes. */ L(copy128): ldp E_q, F_q, [src, 32] @@ -128,8 +112,24 @@ L(copy96): stp C_q, D_q, [dstend, -32] ret + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 3 /* Copy more than 128 bytes. */ L(copy_long): + add dstend, dstin, count + /* Use backwards copy if there is an overlap. */ sub tmp1, dstin, src cmp tmp1, count @@ -166,6 +166,9 @@ L(copy64_from_end): stp A_q, B_q, [dstend, -32] ret + .p2align 4 + nop + /* Large backwards copy for overlapping copies. Copy 16 bytes and then align srcend to 16-byte alignment. */ L(copy_long_backwards): diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S new file mode 100644 index 000000000000..03ae95570c04 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S @@ -0,0 +1,17 @@ +/* + * memcpy using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memcpy_aarch64_mops) + mov x3, x0 + .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ + .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ + .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ + ret + +END (__memcpy_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S new file mode 100644 index 000000000000..9b05cb2a58ee --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S @@ -0,0 +1,169 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * + */ + +#include "asmdefs.h" + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. 
*/ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy.S b/contrib/arm-optimized-routines/string/aarch64/memcpy.S index dd254f6f9929..351f1a11f097 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 @@ -55,9 +55,6 @@ ENTRY_ALIAS (__memmove_aarch64) ENTRY (__memcpy_aarch64) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) add srcend, src, count add dstend, dstin, count cmp count, 128 diff --git a/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S new file mode 100644 index 000000000000..d9839f86e9b4 --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S @@ -0,0 +1,17 @@ +/* + * memmove using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memmove_aarch64_mops) + mov x3, x0 + .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ + .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ + .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ + ret + +END (__memmove_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memrchr.S b/contrib/arm-optimized-routines/string/aarch64/memrchr.S index 7b4be847cecb..ed38478a6faa 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memrchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,7 +23,6 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 @@ -31,34 +30,27 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. 
For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memrchr_aarch64) - PTR_ARG (0) add end, srcin, cntin sub endm1, end, 1 bic src, endm1, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -69,34 +61,36 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! + ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-mops.S b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S new file mode 100644 index 000000000000..00d8e7d2c05f --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S @@ -0,0 +1,17 @@ +/* + * memset using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memset_aarch64_mops) + mov x3, x0 + .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ + .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ + .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ + ret + +END (__memset_aarch64_mops) diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-sve.S b/contrib/arm-optimized-routines/string/aarch64/memset-sve.S new file mode 100644 index 000000000000..efaeaece284e --- /dev/null +++ b/contrib/arm-optimized-routines/string/aarch64/memset-sve.S @@ -0,0 +1,114 @@ +/* + * memset - fill memory with a constant byte + * + * Copyright (c) 2024-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. 
+ * + */ + +#include "asmdefs.h" + +.arch armv8-a+sve + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 +#define vlen x5 +#define off x3 +#define dstend2 x5 + +ENTRY (__memset_aarch64_sve) + dup v0.16B, valw + cmp count, 16 + b.lo L(set_16) + + add dstend, dstin, count + cmp count, 64 + b.hs L(set_128) + + /* Set 16..63 bytes. */ + mov off, 16 + and off, off, count, lsr 1 + sub dstend2, dstend, off + str q0, [dstin] + str q0, [dstin, off] + str q0, [dstend2, -16] + str q0, [dstend, -16] + ret + + .p2align 4 +L(set_16): + whilelo p0.b, xzr, count + st1b z0.b, p0, [dstin] + ret + + .p2align 4 +L(set_128): + bic dst, dstin, 15 + cmp count, 128 + b.hi L(set_long) + stp q0, q0, [dstin] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 4 +L(set_long): + cmp count, 256 + b.lo L(no_zva) + tst valw, 255 + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dstin] + str q0, [dst, 16] + bic dst, dstin, 31 + stp q0, q0, [dst, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + sub x8, dstend, 1 /* Write last bytes before ZVA loop. */ + bic x8, x8, 15 + stp q0, q0, [x8, -48] + str q0, [x8, -16] + str q0, [dstend, -16] + + .p2align 4 +L(zva64_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva64_loop) + ret + +L(no_zva): + str q0, [dstin] + sub count, dstend, dst /* Count is 16 too large. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 16] + stp q0, q0, [dst, 48] + add dst, dst, 64 + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +END (__memset_aarch64_sve) diff --git a/contrib/arm-optimized-routines/string/aarch64/memset.S b/contrib/arm-optimized-routines/string/aarch64/memset.S index 9fcd97579913..906a4dcf46c6 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memset.S +++ b/contrib/arm-optimized-routines/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 @@ -20,93 +20,98 @@ #define dst x3 #define dstend x4 #define zva_val x5 +#define off x3 +#define dstend2 x5 ENTRY (__memset_aarch64) - PTR_ARG (0) - SIZE_ARG (2) - dup v0.16B, valw - add dstend, dstin, count - - cmp count, 96 - b.hi L(set_long) cmp count, 16 - b.hs L(set_medium) - mov val, v0.D[0] + b.lo L(set_small) - /* Set 0..15 bytes. */ - tbz count, 3, 1f - str val, [dstin] - str val, [dstend, -8] + add dstend, dstin, count + cmp count, 64 + b.hs L(set_128) + + /* Set 16..63 bytes. */ + mov off, 16 + and off, off, count, lsr 1 + sub dstend2, dstend, off + str q0, [dstin] + str q0, [dstin, off] + str q0, [dstend2, -16] + str q0, [dstend, -16] ret + .p2align 4 -1: tbz count, 2, 2f - str valw, [dstin] - str valw, [dstend, -4] + /* Set 0..15 bytes. 
*/ +L(set_small): + add dstend, dstin, count + cmp count, 4 + b.lo 2f + lsr off, count, 3 + sub dstend2, dstend, off, lsl 2 + str s0, [dstin] + str s0, [dstin, off, lsl 2] + str s0, [dstend2, -4] + str s0, [dstend, -4] ret + + /* Set 0..3 bytes. */ 2: cbz count, 3f + lsr off, count, 1 strb valw, [dstin] - tbz count, 1, 3f - strh valw, [dstend, -2] + strb valw, [dstin, off] + strb valw, [dstend, -1] 3: ret - /* Set 17..96 bytes. */ -L(set_medium): - str q0, [dstin] - tbnz count, 6, L(set96) - str q0, [dstend, -16] - tbz count, 5, 1f - str q0, [dstin, 16] - str q0, [dstend, -32] -1: ret - .p2align 4 - /* Set 64..96 bytes. Write 64 bytes from the start and - 32 bytes from the end. */ -L(set96): - str q0, [dstin, 16] +L(set_128): + bic dst, dstin, 15 + cmp count, 128 + b.hi L(set_long) + stp q0, q0, [dstin] stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret .p2align 4 L(set_long): - and valw, valw, 255 - bic dst, dstin, 15 str q0, [dstin] - cmp count, 160 - ccmp valw, 0, 0, hs + str q0, [dst, 16] + tst valw, 255 b.ne L(no_zva) - #ifndef SKIP_ZVA_CHECK mrs zva_val, dczid_el0 and zva_val, zva_val, 31 cmp zva_val, 4 /* ZVA size is 64 bytes. */ b.ne L(no_zva) #endif - str q0, [dst, 16] stp q0, q0, [dst, 32] - bic dst, dst, 63 + bic dst, dstin, 63 sub count, dstend, dst /* Count is now 64 too large. */ - sub count, count, 128 /* Adjust count and bias for loop. */ + sub count, count, 64 + 64 /* Adjust count and bias for loop. */ + + /* Write last bytes before ZVA loop. */ + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] .p2align 4 -L(zva_loop): +L(zva64_loop): add dst, dst, 64 dc zva, dst subs count, count, 64 - b.hi L(zva_loop) - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + b.hi L(zva64_loop) ret + .p2align 3 L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - sub dst, dst, 16 /* Dst is biased by -32. */ - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ + sub count, dstend, dst /* Count is 32 too large. */ + sub count, count, 64 + 32 /* Adjust count and bias for loop. */ L(no_zva_loop): stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! + stp q0, q0, [dst, 64] + add dst, dst, 64 subs count, count, 64 b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] @@ -114,4 +119,3 @@ L(no_zva_loop): ret END (__memset_aarch64) - diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S deleted file mode 100644 index f1c711906515..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S +++ /dev/null @@ -1,10 +0,0 @@ -/* - * stpcpy - copy a string returning pointer to end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define BUILD_STPCPY 1 - -#include "strcpy-mte.S" diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S index 4f62aa462389..155c68d75a7b 100644 --- a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S index dcb0e4625870..42b747311bc6 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,8 +19,7 @@ #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 @@ -28,39 +27,29 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -74,28 +63,34 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! 
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr.S b/contrib/arm-optimized-routines/string/aarch64/strchr.S index 1063cbfd77aa..c1d01e9635b6 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 @@ -51,7 +51,6 @@ /* Locals and temporaries. */ ENTRY (__strchr_aarch64) - PTR_ARG (0) /* Magic constant 0xc0300c03 to allow us to identify which lane matches the requested byte. Even bits are set if the character matches, odd bits if either the char is NUL or matches. */ diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S index 1b0d0a63094c..b3180cdf9e2c 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -20,38 +20,31 @@ #define src x2 #define tmp1 x1 #define tmp2 x3 -#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vrepmask v4 -#define vend v5 -#define dend d5 +#define vend v4 +#define dend d4 -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. 
We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov tmp2w, 0xf00f - dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) @@ -63,15 +56,22 @@ ENTRY (__strchrnul_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + add src, src, 16 fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S index a4230d919b47..0a32c46c30c5 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S +++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 @@ -47,7 +47,6 @@ /* Locals and temporaries. */ ENTRY (__strchrnul_aarch64) - PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S deleted file mode 100644 index 12d1a6b51dd3..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S +++ /dev/null @@ -1,189 +0,0 @@ -/* - * strcmp - compare two strings - * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -/* Assumptions: - * - * ARMv8-a, AArch64. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -#define src1 x0 -#define src2 x1 -#define result x0 - -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define off1 x5 -#define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. 
*/ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - - -ENTRY (__strcmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 - b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - -L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 -L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul -L(end): -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 - ret - - .p2align 4 - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp - b L(start_realigned) - -L(misaligned8): - /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) -L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
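
The byte-at-a-time fallback above has simple scalar semantics; as an illustrative C sketch only (reference behaviour, hypothetical name, not part of the patch):

/* Reference for the byte loop: advance while bytes match and are non-NUL,
   then return the difference of the first mismatching (unsigned) bytes.  */
static int
strcmp_bytewise (const char *s1, const char *s2)
{
  const unsigned char *p1 = (const unsigned char *) s1;
  const unsigned char *p2 = (const unsigned char *) s2;
  while (*p1 != 0 && *p1 == *p2)
    {
      p1++;
      p2++;
    }
  return (int) *p1 - (int) *p2;
}
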
*/ - b.ne L(done) - tst src1, 7 - b.ne L(do_misaligned) - -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul - b L(end) - -L(done): - sub result, data1, data2 - ret - -END (__strcmp_aarch64_mte) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp.S b/contrib/arm-optimized-routines/string/aarch64/strcmp.S index 7714ebf5577d..7c0d0485a89b 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcmp.S @@ -1,168 +1,182 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + - /* Start of performance-critical section -- one 64B cache line. */ ENTRY (__strcmp_aarch64) - PTR_ARG (0) - PTR_ARG (1) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. 
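
The zero-byte test quoted above translates directly into C; a minimal sketch, assuming 64-bit words and a hypothetical helper name:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of X is zero: (X - 1) & ~(X | 0x7f) per byte.
   As the comment notes, borrows can also mark 0x01 bytes sitting above a
   real NUL, which is why the big-endian paths byte-reverse first.  */
static inline uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}
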
*/ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. 
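
A rough C model of the mutual-alignment fixup described above, assuming a little-endian layout and a hypothetical helper name: the load is rounded down to an 8-byte boundary and the bytes before the true start are forced to 0xff so they can neither look like a NUL nor like a difference.

#include <stdint.h>
#include <string.h>

/* Load the aligned 8 bytes containing P and force the bytes that precede P
   to 0xff (little-endian: those are the low-order bytes of the word).  */
static uint64_t
load_start_masked (const unsigned char *p)
{
  unsigned misalign = (unsigned) ((uintptr_t) p & 7);
  uint64_t word;
  memcpy (&word, p - misalign, sizeof word);       /* aligned load */
  if (misalign != 0)
    word |= ~(~0ULL << (8 * misalign));            /* 0xff padding */
  return word;
}
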
*/ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S deleted file mode 100644 index 88c222d61e53..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S +++ /dev/null @@ -1,161 +0,0 @@ -/* - * strcpy/stpcpy - copy a string returning pointer to start/end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define dstin x0 -#define srcin x1 -#define result x0 - -#define src x2 -#define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define wtmp w5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 -#define dataq2 q1 - -#ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64_mte -# define IFSTPCPY(X,...) X,__VA_ARGS__ -#else -# define STRCPY __strcpy_aarch64_mte -# define IFSTPCPY(X,...) 
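
The BUILD_STPCPY/IFSTPCPY arrangement above builds two entry points from one body; the same idea in C, as a sketch with hypothetical names:

#include <string.h>

/* strcpy returns the destination; stpcpy returns a pointer to the copied
   NUL terminator.  One body serves both, selected at compile time.  */
#ifdef BUILD_STPCPY
# define FUNC_NAME sketch_stpcpy
# define RETURN_PTR(dst, len) ((dst) + (len))
#else
# define FUNC_NAME sketch_strcpy
# define RETURN_PTR(dst, len) (dst)
#endif

char *
FUNC_NAME (char *restrict dst, const char *restrict src)
{
  size_t len = strlen (src);
  memcpy (dst, src, len + 1);       /* include the trailing NUL */
  return RETURN_PTR (dst, len);
}
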
-#endif - -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ - -ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - bic src, srcin, 15 - mov wtmp, 0xf00f - ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4,,8 -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 - - .p2align 4 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] - str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(start_loop): - sub len, src, srcin - ldr dataq2, [srcin] - add dst, dstin, len - str dataq2, [dstin] - - .p2align 5 -L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - fmov synd, dend -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - clz len, synd - lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) - ret - -END (STRCPY) diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy.S b/contrib/arm-optimized-routines/string/aarch64/strcpy.S index 6e9ed424b693..5852616e6024 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strcpy.S +++ b/contrib/arm-optimized-routines/string/aarch64/strcpy.S @@ -1,311 +1,154 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. 
- - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY __stpcpy_aarch64 -#else -#define STRCPY __strcpy_aarch64 -#endif - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define MIN_PAGE_P2 12 +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) #endif -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect __strcpy_aarch64 to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. 
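
The strategy spelled out above — find the length first, then do one block copy — carries over to the new code, whose short-string paths issue two possibly overlapping chunk copies instead of a byte loop. A C sketch of that trick, assuming 8 <= len+1 <= 16 and a hypothetical helper name:

#include <stdint.h>
#include <string.h>

/* Copy LEN string bytes plus the NUL using two overlapping 8-byte moves,
   mirroring what the short paths do with 8- and 16-byte chunks.  */
static void
copy_8_to_16 (char *dst, const char *src, size_t len)
{
  size_t n = len + 1;               /* total bytes, including NUL */
  uint64_t head, tail;
  memcpy (&head, src, 8);
  memcpy (&tail, src + n - 8, 8);   /* overlaps HEAD when n < 16  */
  memcpy (dst, &head, 8);
  memcpy (dst + n - 8, &tail, 8);
}
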
*/ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. */ - b.gt L(page_cross) - -L(page_cross_ok): - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq L(bulk_entry) + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -L(fp_gt8): - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_le8): - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt L(fp_lt4) -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_lt4): - cbz pos, L(fp_lt2) - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -L(fp_lt2): - /* Null-terminated string. Last character must be zero! 
*/ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -L(bulk_entry): - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b L(entry_no_page_cross) - - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -L(main_loop): - stp data1, data2, [dst], #16 -L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -L(page_cross): - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! 
+ cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(page_cross_ok) - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. */ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, L(fp_le8) - bic has_nul2, tmp3, tmp4 - b L(fp_gt8) + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret END (STRCPY) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S index 7cf41d5c1eac..afa72eed9a43 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 @@ -19,62 +19,71 @@ #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. 
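
A scalar C model of the nibble-mask syndrome described above, assuming a little-endian view and a hypothetical helper name: each byte of the 16-byte compare result contributes one nibble, and the index of the first set nibble is the index of the first zero byte.

#include <stdint.h>

/* CMP holds the per-byte compare result (0x00 or 0xff).  shrn #4 packs it
   into a 64-bit mask with 4 bits per byte, so on little-endian rbit+clz
   (i.e. a count of trailing zeros) divided by 4 yields the byte index.  */
static int
first_zero_index (const uint8_t cmp[16])
{
  uint64_t mask = 0;
  for (int i = 0; i < 16; i++)
    if (cmp[i])
      mask |= 0xfULL << (4 * i);
  if (mask == 0)
    return -1;                      /* no zero byte in this chunk */
  return __builtin_ctzll (mask) / 4;
}
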
*/ ENTRY (__strlen_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift - cbz synd, L(loop) + cbz synd, L(next16) rbit synd, synd clz result, synd lsr result, result, 2 ret - .p2align 5 -L(loop): - ldr data, [src, 16]! +L(next16): + ldr data, [src, 16] cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + add src, src, 16 +#ifndef __AARCH64EB__ + rbit synd, synd +#endif sub result, src, srcin + clz tmp, synd + add result, result, tmp, lsr 2 + ret + + .p2align 5 +L(loop): + ldr data, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 + addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + addhn vend.8b, vhas_nul.8h, vhas_nul.8h fmov synd, dend + cbz synd, L(loop) + add src, src, 16 +L(loop_end): + sub result, shift, src, lsl 2 /* (srcin - src) << 2. */ #ifndef __AARCH64EB__ rbit synd, synd + sub result, result, 3 #endif clz tmp, synd - add result, result, tmp, lsr 2 + sub result, tmp, result + lsr result, result, 2 ret END (__strlen_aarch64_mte) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen.S b/contrib/arm-optimized-routines/string/aarch64/strlen.S index a1b164a49238..0ebb26be844c 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strlen.S +++ b/contrib/arm-optimized-routines/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 @@ -36,6 +36,7 @@ #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -74,7 +75,6 @@ character, return the length, if not, continue in the main loop. */ ENTRY (__strlen_aarch64) - PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 cmp tmp1, MIN_PAGE_SIZE - 32 b.hi L(page_cross) @@ -110,7 +110,6 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,6 +137,7 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 @@ -153,18 +153,12 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. 
*/ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -173,8 +167,6 @@ L(loop): add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S deleted file mode 100644 index c9d6fc8a158b..000000000000 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S +++ /dev/null @@ -1,307 +0,0 @@ -/* - * strncmp - compare two strings - * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define mask x13 -#define endloop x14 -#define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif - -ENTRY (__strncmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - and count, src1, #7 - b.ne L(misaligned8) - cbnz count, L(mutual_align) - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - .p2align 4 -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit, limit, #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq L(loop_aligned) - /* End of main loop */ - -L(full_check): -#ifndef __AARCH64EB__ - orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ - rev syndrome, syndrome - rev data1, data1 - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - cmp limit, pos, lsr #3 - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - csel result, result, xzr, hi - ret -#else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. */ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. 
*/ - orr has_nul, has_nul, mask - -L(not_limit): - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ -L(end_quick): - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b L(start_realigned) - - .p2align 4 - /* Don't bother with dwords for up to 16 bytes. */ -L(misaligned8): - cmp limit, #16 - b.hs L(try_misaligned_words) - -L(byte_loop): - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(byte_loop) -L(done): - sub result, data1, data2 - ret - /* Align the SRC1 to a dword by doing a bytewise compare and then do - the dword loop. */ -L(try_misaligned_words): - cbz count, L(src1_aligned) - - neg count, count - and count, count, #7 - sub limit, limit, count - -L(page_end_loop): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne L(done) - subs count, count, #1 - b.hi L(page_end_loop) - - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. 
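
A little-endian C sketch of STEP_A from the diagram above (illustrative only): two aligned 8-byte words loaded from SRC2 are shifted by the byte offset and merged, reconstructing the 8 bytes that line up with the current SRC1 word.

#include <stdint.h>

/* TMP1 and TMP2 are consecutive aligned 8-byte words from SRC2 and
   OFF_BYTES (1..7) is SRC2's misalignment; the result is what an
   unaligned 8-byte load at the original SRC2 position would return.  */
static uint64_t
combine_src2 (uint64_t tmp1, uint64_t tmp2, unsigned off_bytes)
{
  unsigned off_bits = 8 * off_bytes;
  return (tmp1 >> off_bits) | (tmp2 << (64 - off_bits));
}
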
After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. */ - tbnz offset, 6, L(misaligned_mid_loop) - -L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. */ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) - -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif - -L(ret0): - mov result, #0 - ret -END(__strncmp_aarch64_mte) - diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp.S b/contrib/arm-optimized-routines/string/aarch64/strncmp.S index 738b6539cab6..493a0f06ed1d 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strncmp.S +++ b/contrib/arm-optimized-routines/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. 
*/ #define src1 x0 @@ -35,15 +35,26 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif ENTRY (__strncmp_aarch64) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -51,9 +62,6 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -63,56 +71,52 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit_wd, limit_wd, #1 + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. 
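
For the little-endian L(syndrome_check) path shown earlier in this hunk, the semantics (not the exact bit manipulation) can be modelled in C as below; the helper name is hypothetical, and LIMIT is the number of bytes still permitted in this word.

#include <stdint.h>

/* SYNDROME marks bytes that differ or are NUL.  Locate the first such byte;
   if it lies at or beyond LIMIT the strings compare equal within the limit,
   otherwise return the unsigned-byte difference.  */
static int
syndrome_to_result (uint64_t data1, uint64_t data2,
                    uint64_t syndrome, uint64_t limit)
{
  unsigned byte = __builtin_ctzll (syndrome) / 8;
  if (byte >= limit)
    return 0;
  unsigned char b1 = (data1 >> (8 * byte)) & 0xff;
  unsigned char b2 = (data2 >> (8 * byte)) & 0xff;
  return (int) b1 - (int) b2;
}
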
*/ @@ -133,10 +137,11 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -158,22 +163,12 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -196,13 +191,11 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - lsr limit_wd, limit, #3 - cbz count, L(do_misaligned) + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -213,48 +206,100 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) -L(do_misaligned): - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo L(done_loop) -L(loop_misaligned): - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, L(page_end_loop) + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
*/ - ccmp diff, #0, #0, eq - b.ne L(not_limit) - subs limit_wd, limit_wd, #1 - b.pl L(loop_misaligned) + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -L(done_loop): - /* We found a difference or a NULL before the limit was reached. */ - and limit, limit, #7 - cbz limit, L(not_limit) - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif L(ret0): mov result, #0 ret - -END ( __strncmp_aarch64) +END(__strncmp_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen.S b/contrib/arm-optimized-routines/string/aarch64/strnlen.S index 48d2495d2082..6a96ec268f1a 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strnlen.S +++ b/contrib/arm-optimized-routines/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
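
As a behavioural reference for the routine being patched here (semantics only, not the vector algorithm): strnlen never scans past the limit and never returns more than it.

#include <stddef.h>

/* Reference semantics: length of S, but stop scanning at MAXLEN bytes and
   never return more than MAXLEN.  */
static size_t
strnlen_ref (const char *s, size_t maxlen)
{
  size_t i = 0;
  while (i < maxlen && s[i] != '\0')
    i++;
  return i;
}
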
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,39 +20,28 @@ #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) - PTR_ARG (0) - SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -64,37 +53,40 @@ L(finish): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -104,9 +96,5 @@ L(end): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S index 1e4fb1a68f7e..8668ce6d2916 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,6 @@ #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -31,7 +30,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -44,58 +42,69 @@ if the relevant byte matched the NUL end of string. */ ENTRY (__strrchr_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr.S b/contrib/arm-optimized-routines/string/aarch64/strrchr.S index 56185ff534e3..f5713f4260fb 100644 --- a/contrib/arm-optimized-routines/string/aarch64/strrchr.S +++ b/contrib/arm-optimized-routines/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 @@ -55,7 +55,6 @@ identify exactly which byte is causing the termination, and why. */ ENTRY (__strrchr_aarch64) - PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. 
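
Both strrchr variants implement the following scalar behaviour — remember the last position of the requested byte and stop at the terminator, which itself counts as part of the string. A reference sketch only, not the SIMD algorithm:

#include <stddef.h>

/* Last occurrence of C in S; strrchr (s, 0) returns the end of the string. */
static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  unsigned char ch = (unsigned char) c;
  for (;; s++)
    {
      if (*(const unsigned char *) s == ch)
        last = s;
      if (*s == '\0')
        return (char *) last;
    }
}
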
*/ diff --git a/contrib/arm-optimized-routines/string/arm/asmdefs.h b/contrib/arm-optimized-routines/string/arm/asmdefs.h new file mode 100644 index 000000000000..e31188804716 --- /dev/null +++ b/contrib/arm-optimized-routines/string/arm/asmdefs.h @@ -0,0 +1,477 @@ +/* + * Macros for asm code. Arm version. + * + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Check whether leaf function PAC signing has been requested in the + -mbranch-protect compile-time option. */ +#define LEAF_PROTECT_BIT 2 + +#ifdef __ARM_FEATURE_PAC_DEFAULT +# define HAVE_PAC_LEAF \ + ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1) +#else +# define HAVE_PAC_LEAF 0 +#endif + +/* Provide default parameters for PAC-code handling in leaf-functions. */ +#if HAVE_PAC_LEAF +# ifndef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 1 +# endif +#else /* !HAVE_PAC_LEAF */ +# undef PAC_LEAF_PUSH_IP +# define PAC_LEAF_PUSH_IP 0 +#endif /* HAVE_PAC_LEAF */ + +#define STACK_ALIGN_ENFORCE 0 + +/****************************************************************************** +* Implementation of the prologue and epilogue assembler macros and their +* associated helper functions. +* +* These functions add support for the following: +* +* - M-profile branch target identification (BTI) landing-pads when compiled +* with `-mbranch-protection=bti'. +* - PAC-signing and verification instructions, depending on hardware support +* and whether the PAC-signing of leaf functions has been requested via the +* `-mbranch-protection=pac-ret+leaf' compiler argument. +* - 8-byte stack alignment preservation at function entry, defaulting to the +* value of STACK_ALIGN_ENFORCE. +* +* Notes: +* - Prologue stack alignment is implemented by detecting a push with an odd +* number of registers and prepending a dummy register to the list. +* - If alignment is attempted on a list containing r0, compilation will result +* in an error. +* - If alignment is attempted in a list containing r1, r0 will be prepended to +* the register list and r0 will be restored prior to function return. for +* functions with non-void return types, this will result in the corruption of +* the result register. +* - Stack alignment is enforced via the following helper macro call-chain: +* +* {prologue|epilogue} ->_align8 -> _preprocess_reglist -> +* _preprocess_reglist1 -> {_prologue|_epilogue} +* +* - Debug CFI directives are automatically added to prologues and epilogues, +* assisted by `cfisavelist' and `cfirestorelist', respectively. +* +* Arguments: +* prologue +* -------- +* - first - If `last' specified, this serves as start of general-purpose +* register (GPR) range to push onto stack, otherwise represents +* single GPR to push onto stack. If omitted, no GPRs pushed +* onto stack at prologue. +* - last - If given, specifies inclusive upper-bound of GPR range. +* - push_ip - Determines whether IP register is to be pushed to stack at +* prologue. When pac-signing is requested, this holds the +* the pac-key. Either 1 or 0 to push or not push, respectively. +* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro. +* - push_lr - Determines whether to push lr to the stack on function entry. +* Either 1 or 0 to push or not push, respectively. +* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting +* alignment. 
+* +* epilogue +* -------- +* The epilogue should be called passing the same arguments as those passed to +* the prologue to ensure the stack is not corrupted on function return. +* +* Usage examples: +* +* prologue push_ip=1 -> push {ip} +* epilogue push_ip=1, align8=1 -> pop {r2, ip} +* prologue push_ip=1, push_lr=1 -> push {ip, lr} +* epilogue 1 -> pop {r1} +* prologue 1, align8=1 -> push {r0, r1} +* epilogue 1, push_ip=1 -> pop {r1, ip} +* prologue 1, 4 -> push {r1-r4} +* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip} +* +******************************************************************************/ + +/* Emit .cfi_restore directives for a consecutive sequence of registers. */ + .macro cfirestorelist first, last + .cfi_restore \last + .if \last-\first + cfirestorelist \first, \last-1 + .endif + .endm + +/* Emit .cfi_offset directives for a consecutive sequence of registers. */ + .macro cfisavelist first, last, index=1 + .cfi_offset \last, -4*(\index) + .if \last-\first + cfisavelist \first, \last-1, \index+1 + .endif + .endm + +.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _prologue \first, \first, \push_ip, \push_lr + .exitm + .endif + .endif +#if HAVE_PAC_LEAF +# if __ARM_FEATURE_BTI_DEFAULT + pacbti ip, lr, sp +# else + pac ip, lr, sp +# endif /* __ARM_FEATURE_BTI_DEFAULT */ + .cfi_register 143, 12 +#else +# if __ARM_FEATURE_BTI_DEFAULT + bti +# endif /* __ARM_FEATURE_BTI_DEFAULT */ +#endif /* HAVE_PAC_LEAF */ + .if \first != -1 + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: push register range, ip and lr registers. */ + push {r\first-r\last, ip, lr} + .cfi_adjust_cfa_offset ((\last-\first)+3)*4 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \last, 3 + .else // !\push_lr + /* Case 2: push register range and ip register. */ + push {r\first-r\last, ip} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 143, -4 + cfisavelist \first, \last, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: push register range and lr register. */ + push {r\first-r\last, lr} + .cfi_adjust_cfa_offset ((\last-\first)+2)*4 + .cfi_offset 14, -4 + cfisavelist \first, \last, 2 + .else // !\push_lr + /* Case 4: push register range. */ + push {r\first-r\last} + .cfi_adjust_cfa_offset ((\last-\first)+1)*4 + cfisavelist \first, \last, 1 + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: push single GP register plus ip and lr registers. */ + push {r\first, ip, lr} + .cfi_adjust_cfa_offset 12 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + cfisavelist \first, \first, 3 + .else // !\push_lr + /* Case 6: push single GP register plus ip register. */ + push {r\first, ip} + .cfi_adjust_cfa_offset 8 + .cfi_offset 143, -4 + cfisavelist \first, \first, 2 + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: push single GP register plus lr register. */ + push {r\first, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + cfisavelist \first, \first, 2 + .else // !\push_lr + /* Case 8: push single GP register. 
*/ + push {r\first} + .cfi_adjust_cfa_offset 4 + cfisavelist \first, \first, 1 + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: push ip and lr registers. */ + push {ip, lr} + .cfi_adjust_cfa_offset 8 + .cfi_offset 14, -4 + .cfi_offset 143, -8 + .else // !\push_lr + /* Case 10: push ip register. */ + push {ip} + .cfi_adjust_cfa_offset 4 + .cfi_offset 143, -4 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: push lr register. */ + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_offset 14, -4 + .endif + .endif + .endif +.endm + +.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0 + .if \push_ip & 1 != \push_ip + .error "push_ip may be either 0 or 1" + .endif + .if \push_lr & 1 != \push_lr + .error "push_lr may be either 0 or 1" + .endif + .if \first != -1 + .if \last == -1 + /* Upper-bound not provided: Set upper = lower. */ + _epilogue \first, \first, \push_ip, \push_lr + .exitm + .endif + .if \last != \first + .if \last >= 13 + .error "SP cannot be in the save list" + .endif + .if \push_ip + .if \push_lr + /* Case 1: pop register range, ip and lr registers. */ + pop {r\first-r\last, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 2: pop register range and ip register. */ + pop {r\first-r\last, ip} + .cfi_register 143, 12 + cfirestorelist \first, \last + .endif + .else // !\push_ip + .if \push_lr + /* Case 3: pop register range and lr register. */ + pop {r\first-r\last, lr} + .cfi_restore 14 + cfirestorelist \first, \last + .else // !\push_lr + /* Case 4: pop register range. */ + pop {r\first-r\last} + cfirestorelist \first, \last + .endif + .endif + .else // \last == \first + .if \push_ip + .if \push_lr + /* Case 5: pop single GP register plus ip and lr registers. */ + pop {r\first, ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 6: pop single GP register plus ip register. */ + pop {r\first, ip} + .cfi_register 143, 12 + cfirestorelist \first, \first + .endif + .else // !\push_ip + .if \push_lr + /* Case 7: pop single GP register plus lr register. */ + pop {r\first, lr} + .cfi_restore 14 + cfirestorelist \first, \first + .else // !\push_lr + /* Case 8: pop single GP register. */ + pop {r\first} + cfirestorelist \first, \first + .endif + .endif + .endif + .else // \first == -1 + .if \push_ip + .if \push_lr + /* Case 9: pop ip and lr registers. */ + pop {ip, lr} + .cfi_restore 14 + .cfi_register 143, 12 + .else // !\push_lr + /* Case 10: pop ip register. */ + pop {ip} + .cfi_register 143, 12 + .endif + .else // !\push_ip + .if \push_lr + /* Case 11: pop lr register. */ + pop {lr} + .cfi_restore 14 + .endif + .endif + .endif +#if HAVE_PAC_LEAF + aut ip, lr, sp +#endif /* HAVE_PAC_LEAF */ + bx lr +.endm + +/* Clean up expressions in 'last'. 
*/ +.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req + .if \last == 0 + \reglist_op \first, 0, \push_ip, \push_lr + .elseif \last == 1 + \reglist_op \first, 1, \push_ip, \push_lr + .elseif \last == 2 + \reglist_op \first, 2, \push_ip, \push_lr + .elseif \last == 3 + \reglist_op \first, 3, \push_ip, \push_lr + .elseif \last == 4 + \reglist_op \first, 4, \push_ip, \push_lr + .elseif \last == 5 + \reglist_op \first, 5, \push_ip, \push_lr + .elseif \last == 6 + \reglist_op \first, 6, \push_ip, \push_lr + .elseif \last == 7 + \reglist_op \first, 7, \push_ip, \push_lr + .elseif \last == 8 + \reglist_op \first, 8, \push_ip, \push_lr + .elseif \last == 9 + \reglist_op \first, 9, \push_ip, \push_lr + .elseif \last == 10 + \reglist_op \first, 10, \push_ip, \push_lr + .elseif \last == 11 + \reglist_op \first, 11, \push_ip, \push_lr + .else + .error "last (\last) out of range" + .endif +.endm + +/* Clean up expressions in 'first'. */ +.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req + .ifb \last + _preprocess_reglist \first \first \push_ip \push_lr + .else + .if \first > \last + .error "last (\last) must be at least as great as first (\first)" + .endif + .if \first == 0 + _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 1 + _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 2 + _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 3 + _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 4 + _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 5 + _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 6 + _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 7 + _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 8 + _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 9 + _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 10 + _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op + .elseif \first == 11 + _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op + .else + .error "first (\first) out of range" + .endif + .endif +.endm + +.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue + .ifb \first + .ifnb \last + .error "can't have last (\last) without specifying first" + .else // \last not blank + .if ((\push_ip + \push_lr) % 2) == 0 + \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr + .exitm + .else // ((\push_ip + \push_lr) % 2) odd + _align8 2, 2, \push_ip, \push_lr, \reglist_op + .exitm + .endif // ((\push_ip + \push_lr) % 2) == 0 + .endif // .ifnb \last + .endif // .ifb \first + + .ifb \last + _align8 \first, \first, \push_ip, \push_lr, \reglist_op + .else + .if \push_ip & 1 <> \push_ip + .error "push_ip may be 0 or 1" + .endif + .if \push_lr & 1 <> \push_lr + .error "push_lr may be 0 or 1" + .endif + .ifeq (\last - \first + \push_ip + \push_lr) % 2 + .if \first == 0 + .error "Alignment required and first register is r0" + .exitm + .endif + _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op + .else + _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op + .endif + .endif +.endm + +.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, _prologue + 
.else + _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE + .if \align8 + _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue + .else + _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr + .endif +.endm + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .fnstart; \ + .cfi_startproc; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#if defined (IS_LEAF) +# define END_UNWIND .cantunwind; +#else +# define END_UNWIND +#endif + +#define END(name) \ + .cfi_endproc; \ + END_UNWIND \ + .fnend; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/contrib/arm-optimized-routines/string/arm/check-arch.S b/contrib/arm-optimized-routines/string/arm/check-arch.S index 1cff9345e343..95516710fb85 100644 --- a/contrib/arm-optimized-routines/string/arm/check-arch.S +++ b/contrib/arm-optimized-routines/string/arm/check-arch.S @@ -1,10 +1,13 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__arm__ # error ARCH setting does not match the compiler. #endif + +/* For attributes that may affect ABI. */ +#include "asmdefs.h" diff --git a/contrib/arm-optimized-routines/string/arm/memchr.S b/contrib/arm-optimized-routines/string/arm/memchr.S index 3f1ac4df136f..823d6013eb35 100644 --- a/contrib/arm-optimized-routines/string/arm/memchr.S +++ b/contrib/arm-optimized-routines/string/arm/memchr.S @@ -1,8 +1,8 @@ /* * memchr - scan memory for a character * - * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2010-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* @@ -23,7 +23,11 @@ @ Removed unneeded cbz from align loop .syntax unified +#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M' + /* keep config inherited from -march= */ +#else .arch armv7-a +#endif @ this lets us check a flag in a 00/ff byte easily in either endianness #ifdef __ARMEB__ @@ -32,6 +36,8 @@ #define CHARTSTMASK(c) 1<<(c*8) #endif .thumb +#include "asmdefs.h" + @ --------------------------------------------------------------------------- .thumb_func @@ -39,11 +45,14 @@ .p2align 4,,15 .global __memchr_arm .type __memchr_arm,%function + .fnstart + .cfi_startproc __memchr_arm: @ r0 = start of memory to scan @ r1 = character to look for @ r2 = length @ returns r0 = pointer to character or NULL if not found + prologue and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char cmp r2,#16 @ If it's short don't bother with anything clever @@ -64,6 +73,11 @@ __memchr_arm: 10: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes orr r1, r1, r1, lsl #16 bic r4, r2, #7 @ Number of double words to work with @@ -83,6 +97,11 @@ __memchr_arm: bne 15b @ (Flags from the subs above) If not run out of bytes then go around again pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 and r1,r1,#0xff @ Get r1 back to a single character from the expansion above and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done @@ -97,16 +116,25 @@ __memchr_arm: bne 21b @ on r2 flags 40: + .cfi_remember_state movs r0,#0 @ not found - bx lr + epilogue 50: + .cfi_restore_state + .cfi_remember_state subs r0,r0,#1 @ found - bx lr + epilogue 60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was @ r0 points to the start of the double word after the one that was tested @ r5 has the 00/ff pattern for the first word, r6 has the chained value + .cfi_restore_state @ Standard post-prologue state + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 cmp r5, #0 itte eq moveq r5, r6 @ the end is in the 2nd word @@ -126,7 +154,15 @@ __memchr_arm: 61: pop {r4,r5,r6,r7} + .cfi_restore 7 + .cfi_restore 6 + .cfi_restore 5 + .cfi_restore 4 + .cfi_adjust_cfa_offset -16 subs r0,r0,#1 - bx lr + epilogue + .cfi_endproc + .cantunwind + .fnend .size __memchr_arm, . - __memchr_arm diff --git a/contrib/arm-optimized-routines/string/arm/memcpy.S b/contrib/arm-optimized-routines/string/arm/memcpy.S index 86e64938edb1..2423cfd69061 100644 --- a/contrib/arm-optimized-routines/string/arm/memcpy.S +++ b/contrib/arm-optimized-routines/string/arm/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* @@ -17,7 +17,7 @@ */ -#include "../asmdefs.h" +#include "asmdefs.h" .syntax unified /* This implementation requires ARM state. 
*/ diff --git a/contrib/arm-optimized-routines/string/arm/memset.S b/contrib/arm-optimized-routines/string/arm/memset.S index 11e927368fd1..487b9d6a8f6c 100644 --- a/contrib/arm-optimized-routines/string/arm/memset.S +++ b/contrib/arm-optimized-routines/string/arm/memset.S @@ -2,7 +2,7 @@ * memset - fill memory with a constant * * Copyright (c) 2010-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* diff --git a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S index b75d4143db57..4d55306810ad 100644 --- a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S +++ b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S @@ -1,10 +1,12 @@ /* * strcmp for ARMv6-M (optimized for performance, not size) * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "asmdefs.h" + #if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 .thumb_func diff --git a/contrib/arm-optimized-routines/string/arm/strcmp.S b/contrib/arm-optimized-routines/string/arm/strcmp.S index 51443e343058..74b3d235fb18 100644 --- a/contrib/arm-optimized-routines/string/arm/strcmp.S +++ b/contrib/arm-optimized-routines/string/arm/strcmp.S @@ -1,8 +1,8 @@ /* * strcmp for ARMv7 * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 @@ -12,7 +12,7 @@ is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first @@ -26,6 +26,11 @@ #define STRCMP_NO_PRECHECK 0 +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This version uses Thumb-2 code. */ .thumb .syntax unified @@ -98,8 +103,9 @@ ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1, lsr #24 - bx lr + epilogue push_ip=HAVE_PAC_LEAF #else /* To use the big-endian trick we'd have to reverse all three words. that's slower than this approach. */ @@ -119,21 +125,15 @@ ldrd r4, r5, [sp], #16 .cfi_restore 4 .cfi_restore 5 + .cfi_adjust_cfa_offset -16 sub result, result, r1 - bx lr + epilogue push_ip=HAVE_PAC_LEAF #endif .endm - .p2align 5 -L(strcmp_start_addr): -#if STRCMP_NO_PRECHECK == 0 -L(fastpath_exit): - sub r0, r2, r3 - bx lr - nop -#endif -ENTRY_ALIGN (__strcmp_arm, 0) +ENTRY(__strcmp_arm) + prologue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] @@ -143,13 +143,13 @@ ENTRY_ALIGN (__strcmp_arm, 0) bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! 
- .cfi_def_cfa_offset 16 - .cfi_offset 4, -16 - .cfi_offset 5, -12 + .cfi_adjust_cfa_offset 16 + .cfi_rel_offset 4, 0 + .cfi_rel_offset 5, 4 orr tmp1, src1, src2 strd r6, r7, [sp, #8] - .cfi_offset 6, -8 - .cfi_offset 7, -4 + .cfi_rel_offset 6, 8 + .cfi_rel_offset 7, 12 mvn const_m1, #0 lsl r2, tmp1, #29 cbz r2, L(loop_aligned8) @@ -318,10 +318,19 @@ L(misaligned_exit): mov result, tmp1 ldr r4, [sp], #16 .cfi_restore 4 - bx lr + .cfi_adjust_cfa_offset -16 + epilogue push_ip=HAVE_PAC_LEAF #if STRCMP_NO_PRECHECK == 0 +L(fastpath_exit): + .cfi_restore_state + .cfi_remember_state + sub r0, r2, r3 + epilogue push_ip=HAVE_PAC_LEAF + L(aligned_m1): + .cfi_restore_state + .cfi_remember_state add src2, src2, #4 #endif L(src1_aligned): @@ -368,9 +377,9 @@ L(overlap3): /* R6/7 Not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 neg result, result - bx lr - + epilogue push_ip=HAVE_PAC_LEAF 6: .cfi_restore_state S2LO data1, data1, #24 @@ -445,7 +454,8 @@ L(strcmp_done_equal): /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 - bx lr + .cfi_adjust_cfa_offset -16 + epilogue push_ip=HAVE_PAC_LEAF L(strcmp_tail): .cfi_restore_state @@ -467,8 +477,9 @@ L(strcmp_tail): /* R6/7 not used in this sequence. */ .cfi_restore 6 .cfi_restore 7 + .cfi_adjust_cfa_offset -16 sub result, result, data2, lsr #24 - bx lr + epilogue push_ip=HAVE_PAC_LEAF END (__strcmp_arm) diff --git a/contrib/arm-optimized-routines/string/arm/strcpy.c b/contrib/arm-optimized-routines/string/arm/strcpy.c index 02cf94ff4be0..b5728a2534f0 100644 --- a/contrib/arm-optimized-routines/string/arm/strcpy.c +++ b/contrib/arm-optimized-routines/string/arm/strcpy.c @@ -2,7 +2,7 @@ * strcpy * * Copyright (c) 2008-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if defined (__thumb2__) && !defined (__thumb__) diff --git a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S index 5ad30c941586..5eb8671bdc8b 100644 --- a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S +++ b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string * - * Copyright (c) 2010-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2010-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 @@ -13,7 +13,7 @@ */ -#include "../asmdefs.h" +#include "asmdefs.h" #ifdef __ARMEB__ #define S2LO lsl @@ -23,6 +23,11 @@ #define S2HI lsl #endif +/* Ensure the .cantunwind directive is prepended to .fnend. + Leaf functions cannot throw exceptions - EHABI only supports + synchronous exceptions. */ +#define IS_LEAF + /* This code requires Thumb. */ .thumb .syntax unified @@ -41,8 +46,8 @@ #define tmp2 r5 ENTRY (__strlen_armv6t2) + prologue 4 5 push_ip=HAVE_PAC_LEAF pld [srcin, #0] - strd r4, r5, [sp, #-8]! bic src, srcin, #7 mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ @@ -92,6 +97,7 @@ L(start_realigned): beq L(loop_aligned) L(null_found): + .cfi_remember_state cmp data1a, #0 itt eq addeq result, result, #4 @@ -100,11 +106,11 @@ L(null_found): rev data1a, data1a #endif clz data1a, data1a - ldrd r4, r5, [sp], #8 add result, result, data1a, lsr #3 /* Bits -> Bytes. 
*/ - bx lr + epilogue 4 5 push_ip=HAVE_PAC_LEAF L(misaligned8): + .cfi_restore_state ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 diff --git a/contrib/arm-optimized-routines/string/bench/memcpy.c b/contrib/arm-optimized-routines/string/bench/memcpy.c index d5d4ea7e0309..583fa505db75 100644 --- a/contrib/arm-optimized-routines/string/bench/memcpy.c +++ b/contrib/arm-optimized-routines/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,35 +13,25 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 500000 -#define MAX_COPIES 8192 -#define SIZE (256*1024) - -static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); -static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); - -#define F(x) {#x, x}, - -static const struct fun -{ - const char *name; - void *(*fun)(void *, const void *, size_t); -} funtab[] = -{ - F(memcpy) -#if __aarch64__ - F(__memcpy_aarch64) -# if __ARM_NEON - F(__memcpy_aarch64_simd) -# endif -#elif __arm__ - F(__memcpy_arm) -#endif -#undef F - {0, 0} -}; +#define ITERS3 200000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) + +static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096))); +static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096))); + +#define DOTEST(STR,TESTFN) \ + printf (STR); \ + RUN (TESTFN, memcpy); \ + RUNA64 (TESTFN, __memcpy_aarch64); \ + RUNA64 (TESTFN, __memcpy_aarch64_simd); \ + RUNSVE (TESTFN, __memcpy_aarch64_sve); \ + RUNMOPS (TESTFN, __memcpy_aarch64_mops); \ + RUNA32 (TESTFN, __memcpy_arm); \ + printf ("\n"); typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -109,7 +99,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t copy[MAX_COPIES]; +static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -140,121 +130,138 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. 
*/ - for (int i = 0; i < MAX_COPIES; i++) + for (int i = 0; i < NUM_TESTS; i++) { - copy[i].dst = (rand32 (0) & (max_size - 1)); - copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].src = (rand32 (0) & (max_size - 1)); - copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += copy[i].len; + test_arr[i].dst = (rand32 (0) & (max_size - 1)); + test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].src = (rand32 (0) & (max_size - 1)); + test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; } return total; } -int main (void) +static void inline __attribute ((always_inline)) +memcpy_random (const char *name, void *(*fn)(void *, const void *, size_t)) { - init_copy_distribution (); + printf ("%22s ", name); + uint64_t total = 0, tsum = 0; + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + uint64_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %5.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %5.2f\n", (double)total / tsum); +} - memset (a, 1, sizeof (a)); - memset (b, 2, sizeof (b)); +static void inline __attribute ((always_inline)) +memcpy_medium_aligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf("Random memcpy:\n"); - for (int f = 0; funtab[f].name != 0; f++) + for (int size = 8; size <= 512; size *= 2) { - size_t total = 0; - uint64_t tsum = 0; - printf ("%22s (B/ns) ", funtab[f].name); - rand32 (0x12345678); - - for (int size = 16384; size <= SIZE; size *= 2) - { - size_t copy_size = init_copies (size) * ITERS; - - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); - t = clock_get_ns () - t; - total += copy_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); - } - printf( "avg %.2f\n", (double)total / tsum); + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + fn (b, a, size); + t = clock_get_ns () - t; + printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t); } + printf ("\n"); +} + +static void inline __attribute ((always_inline)) +memcpy_medium_unaligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nMedium memcpy:\n"); - for (int f = 0; funtab[f].name != 0; f++) + for (int size = 8; size <= 512; size *= 2) { - printf ("%22s (B/ns) ", funtab[f].name); - - for (int size = 16; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (b, a, size); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS2 / t); - } - printf ("\n"); + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + fn (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t); } + printf ("\n"); +} - printf ("\nLarge memcpy:\n"); - for (int f = 0; funtab[f].name != 0; f++) +static void inline __attribute ((always_inline)) +memcpy_large (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); + + for (int size = 1024; size <= 65536; size *= 2) { - printf ("%22s (B/ns) ", funtab[f].name); - - for (int size = 1024; size <= 32768; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (b, a, size); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); - } - printf ("\n"); + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + fn (b, a, size); + t = clock_get_ns () - t; + printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t); } + printf ("\n"); +} + +static void inline __attribute ((always_inline)) +memmove_forward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nUnaligned forwards memmove:\n"); - for (int f = 0; funtab[f].name != 0; f++) + for (int size = 1024; size <= 65536; size *= 2) { - printf ("%22s (B/ns) ", funtab[f].name); - - for (int size = 1024; size <= 32768; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a, a + 256 + (i & 31), size); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); - } - printf ("\n"); + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + fn (a, a + 256 + (i & 31), size); + t = clock_get_ns () - t; + printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t); } + printf ("\n"); +} + +static void inline __attribute ((always_inline)) +memmove_backward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nUnaligned backwards memmove:\n"); - for (int f = 0; funtab[f].name != 0; f++) + for (int size = 1024; size <= 65536; size *= 2) { - printf ("%22s (B/ns) ", funtab[f].name); - - for (int size = 1024; size <= 32768; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a + 256 + (i & 31), a, size); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); - } - printf ("\n"); + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + fn (a + 256 + (i & 31), a, size); + t = clock_get_ns () - t; + printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t); } + printf ("\n"); +} + +int main (void) +{ + init_copy_distribution (); + + memset (a, 1, sizeof (a)); + memset (b, 2, sizeof (b)); + + DOTEST ("Random memcpy (bytes/ns):\n", memcpy_random); + DOTEST ("Medium memcpy aligned (bytes/ns):\n", memcpy_medium_aligned); + DOTEST ("Medium memcpy unaligned (bytes/ns):\n", memcpy_medium_unaligned); + DOTEST ("Large memcpy (bytes/ns):\n", memcpy_large); + DOTEST ("Forwards memmove unaligned (bytes/ns):\n", memmove_forward_unaligned); + DOTEST ("Backwards memmove unaligned (bytes/ns):\n", memmove_backward_unaligned); + return 0; } diff --git a/contrib/arm-optimized-routines/string/bench/memset.c b/contrib/arm-optimized-routines/string/bench/memset.c new file mode 100644 index 000000000000..07474e469146 --- /dev/null +++ b/contrib/arm-optimized-routines/string/bench/memset.c @@ -0,0 +1,190 @@ +/* + * memset benchmark. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 1000000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) + +static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(4096))); + +#define DOTEST(STR,TESTFN) \ + printf (STR); \ + RUN (TESTFN, memset); \ + RUNA64 (TESTFN, __memset_aarch64); \ + RUNSVE (TESTFN, __memset_aarch64_sve); \ + RUNMOPS (TESTFN, __memset_mops); \ + RUNA32 (TESTFN, __memset_arm); \ + printf ("\n"); + +typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; +static memset_test_t test_arr[NUM_TESTS]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t len_arr[SIZE_NUM]; + +/* Frequency data for memset sizes up to 4096 based on SPEC2017. 
*/ +static freq_data_t memset_len_freq[] = +{ +{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, +{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, +{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, +{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, +{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, +{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, +{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, +{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, +{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, +{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, +{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, +{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, +{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, +{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, +{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, +{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, +{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, +{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, +{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, +{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, +{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, +{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, +{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, +{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, +{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, +{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, +{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, +{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, +{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM-1) +static uint8_t align_arr[ALIGN_NUM]; + +/* Alignment data for memset based on SPEC2017. */ +static align_data_t memset_align_freq[] = +{ + {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} +}; + +static void +init_memset_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) + for (j = 0, size = memset_len_freq[i].size; j < freq; j++) + len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) + for (j = 0, size = memset_align_freq[i].align; j < freq; j++) + align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_memset (size_t max_size) +{ + size_t total = 0; + /* Create a random set of memsets with the given size and alignment + distributions. 
*/ + for (int i = 0; i < NUM_TESTS; i++) + { + test_arr[i].offset = (rand32 (0) & (max_size - 1)); + test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; + } + + return total; +} + +static void inline __attribute ((always_inline)) +memset_random (const char *name, void *(*set)(void *, int, size_t)) +{ + uint64_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", name); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + uint64_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + set (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + set (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %5.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %5.2f\n", (double)total_size / tsum); +} + +static void inline __attribute ((always_inline)) +memset_medium (const char *name, void *(*set)(void *, int, size_t)) +{ + printf ("%22s ", name); + + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + set (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); +} + +static void inline __attribute ((always_inline)) +memset_large (const char *name, void *(*set)(void *, int, size_t)) +{ + printf ("%22s ", name); + + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + set (a, 0, size); + t = clock_get_ns () - t; + printf ("%dKB: %6.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); +} + +int main (void) +{ + init_memset_distribution (); + + memset (a, 1, sizeof (a)); + + DOTEST ("Random memset (bytes/ns):\n", memset_random); + DOTEST ("Medium memset (bytes/ns):\n", memset_medium); + DOTEST ("Large memset (bytes/ns):\n", memset_large); + return 0; +} diff --git a/contrib/arm-optimized-routines/string/bench/strlen.c b/contrib/arm-optimized-routines/string/bench/strlen.c index cc0f04bee547..a8dd55cf5fc4 100644 --- a/contrib/arm-optimized-routines/string/bench/strlen.c +++ b/contrib/arm-optimized-routines/string/bench/strlen.c @@ -1,8 +1,8 @@ /* * strlen benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,43 +13,26 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 2000 -#define ITERS2 20000000 -#define ITERS3 2000000 -#define NUM_STRLEN 16384 +#define ITERS 5000 +#define ITERS2 40000000 +#define ITERS3 4000000 +#define NUM_TESTS 65536 #define MAX_ALIGN 32 -#define MAX_STRLEN 256 +#define MAX_STRLEN 128 static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); -#define F(x, mte) {#x, x, mte}, - -static const struct fun -{ - const char *name; - size_t (*fun) (const char *s); - int test_mte; -} funtab[] = { - // clang-format off - F(strlen, 0) -#if __aarch64__ - F(__strlen_aarch64, 0) - F(__strlen_aarch64_mte, 1) -# if __ARM_FEATURE_SVE - F(__strlen_aarch64_sve, 1) -# endif -#elif __arm__ -# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 - F(__strlen_armv6t2, 0) -# endif -#endif - {0, 0, 0} - // clang-format on -}; -#undef F +#define DOTEST(STR,TESTFN) \ + printf (STR); \ + RUN (TESTFN, strlen); \ + RUNA64 (TESTFN, __strlen_aarch64); \ + RUNA64 (TESTFN, __strlen_aarch64_mte); \ + RUNSVE (TESTFN, __strlen_aarch64_sve); \ + RUNT32 (TESTFN, __strlen_armv6t2); \ + printf ("\n"); -static uint16_t strlen_tests[NUM_STRLEN]; +static uint16_t strlen_tests[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,105 +100,126 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_STRLEN; n++) + for (int n = 0; n < NUM_TESTS; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; strlen_tests[n] = index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len; + assert ((strlen_tests[n] & (align - 1)) == 0); + assert (strlen (a + strlen_tests[n]) == exp_len); } } static volatile size_t maskv = 0; -int main (void) +static void inline __attribute ((always_inline)) +strlen_random (const char *name, size_t (*fn)(const char *)) { - rand32 (0x12345678); - init_strlen_distribution (); - init_strlen_tests (); + size_t res = 0, mask = maskv; + uint64_t strlen_size = 0; + printf ("%22s ", name); + + for (int c = 0; c < NUM_TESTS; c++) + strlen_size += fn (a + strlen_tests[c]) + 1; + strlen_size *= ITERS; + + /* Measure throughput of strlen. */ + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + res += fn (a + strlen_tests[c]); + t = clock_get_ns () - t; + printf ("tp: %.3f ", (double)strlen_size / t); + + /* Measure latency of strlen result with (res & mask). 
*/ + t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + res += fn (a + strlen_tests[c] + (res & mask)); + t = clock_get_ns () - t; + printf ("lat: %.3f\n", (double)strlen_size / t); + maskv = res & mask; +} - printf ("\nRandom strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - size_t res = 0, strlen_size = 0, mask = maskv; - printf ("%22s ", funtab[f].name); +static void inline __attribute ((always_inline)) +strlen_small_aligned (const char *name, size_t (*fn)(const char *)) +{ + printf ("%22s ", name); - for (int c = 0; c < NUM_STRLEN; c++) - strlen_size += funtab[f].fun (a + strlen_tests[c]); - strlen_size *= ITERS; + size_t res = 0, mask = maskv; + for (int size = 1; size <= 64; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; - /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_STRLEN; c++) - res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); + for (int i = 0; i < ITERS2; i++) + res += fn (a + (i & mask)); t = clock_get_ns () - t; - printf ("%.2f\n", (double)strlen_size / t); + printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } + maskv &= res; + printf ("\n"); +} - printf ("\nSmall aligned strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1; size <= 64; size *= 2) - { - memset (a, 'x', size); - a[size - 1] = 0; - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); - } - printf ("\n"); - } +static void inline __attribute ((always_inline)) +strlen_small_unaligned (const char *name, size_t (*fn)(const char *)) +{ + printf ("%22s ", name); - printf ("\nSmall unaligned strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) + size_t res = 0, mask = maskv; + int align = 9; + for (int size = 1; size <= 64; size *= 2) { - printf ("%22s ", funtab[f].name); - - int align = 9; - for (int size = 1; size <= 64; size *= 2) - { - memset (a + align, 'x', size); - a[align + size - 1] = 0; - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a + align); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); - } - printf ("\n"); + memset (a + align, 'x', size); + a[align + size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + res += fn (a + align + (i & mask)); + t = clock_get_ns () - t; + printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } + maskv &= res; + printf ("\n"); +} - printf ("\nMedium strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) +static void inline __attribute ((always_inline)) +strlen_medium (const char *name, size_t (*fn)(const char *)) +{ + printf ("%22s ", name); + + size_t res = 0, mask = maskv; + for (int size = 128; size <= 4096; size *= 2) { - printf ("%22s ", funtab[f].name); - - for (int size = 128; size <= 4096; size *= 2) - { - memset (a, 'x', size); - a[size - 1] = 0; - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? 
size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); - } - printf ("\n"); - } + memset (a, 'x', size); + a[size - 1] = 0; + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + res += fn (a + (i & mask)); + t = clock_get_ns () - t; + printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + maskv &= res; printf ("\n"); +} + +int main (void) +{ + rand32 (0x12345678); + init_strlen_distribution (); + init_strlen_tests (); + + DOTEST ("Random strlen (bytes/ns):\n", strlen_random); + DOTEST ("Small aligned strlen (bytes/ns):\n", strlen_small_aligned); + DOTEST ("Small unaligned strlen (bytes/ns):\n", strlen_small_unaligned); + DOTEST ("Medium strlen (bytes/ns):\n", strlen_medium); return 0; } diff --git a/contrib/arm-optimized-routines/string/include/benchlib.h b/contrib/arm-optimized-routines/string/include/benchlib.h index 0f2ce2eb6bce..486504e99ddf 100644 --- a/contrib/arm-optimized-routines/string/include/benchlib.h +++ b/contrib/arm-optimized-routines/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -30,4 +30,35 @@ rand32 (uint32_t seed) return res; } +/* Macros to run a benchmark BENCH using string function FN. */ +#define RUN(BENCH, FN) BENCH(#FN, FN) +#if __aarch64__ +# define RUNA64(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNA64(BENCH, FN) +#endif + +#if __ARM_FEATURE_SVE +# define RUNSVE(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNSVE(BENCH, FN) +#endif + +#if WANT_MOPS +# define RUNMOPS(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNMOPS(BENCH, FN) +#endif + +#if __arm__ +# define RUNA32(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNA32(BENCH, FN) +#endif + +#if __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 +# define RUNT32(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNT32(BENCH, FN) +#endif diff --git a/contrib/arm-optimized-routines/string/include/stringlib.h b/contrib/arm-optimized-routines/string/include/stringlib.h index 378c3cd2d645..bb9db930f132 100644 --- a/contrib/arm-optimized-routines/string/include/stringlib.h +++ b/contrib/arm-optimized-routines/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stddef.h> @@ -29,19 +29,16 @@ size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); -char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); -char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); -int __strcmp_aarch64_mte (const char *, const char *); -int __strncmp_aarch64_mte (const char *, const char *, size_t); -#if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); -#endif # if __ARM_FEATURE_SVE +void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_sve (void *, int, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); @@ -54,6 +51,11 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if WANT_MOPS +void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_mops (void *, int, size_t); +# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c index d8c02d92d626..c45fa6662a77 100644 --- a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c +++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c index 221c223a2f31..a4a7861620d1 100644 --- a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c +++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/contrib/arm-optimized-routines/string/test/memchr.c b/contrib/arm-optimized-routines/string/test/memchr.c index 0ff77f5710bf..c6a94481c0ad 100644 --- a/contrib/arm-optimized-routines/string/test/memchr.c +++ b/contrib/arm-optimized-routines/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/test/memcmp.c b/contrib/arm-optimized-routines/string/test/memcmp.c index 7a7cf9cff35a..f9236b83a60d 100644 --- a/contrib/arm-optimized-routines/string/test/memcmp.c +++ b/contrib/arm-optimized-routines/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/test/memcpy.c b/contrib/arm-optimized-routines/string/test/memcpy.c index ce0ceeef5ee8..98255e06f31c 100644 --- a/contrib/arm-optimized-routines/string/test/memcpy.c +++ b/contrib/arm-optimized-routines/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -25,8 +25,12 @@ static const struct fun F(memcpy, 0) #if __aarch64__ F(__memcpy_aarch64, 1) -# if __ARM_NEON F(__memcpy_aarch64_simd, 1) +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops, 1) # endif #elif __arm__ F(__memcpy_arm, 0) diff --git a/contrib/arm-optimized-routines/string/test/memmove.c b/contrib/arm-optimized-routines/string/test/memmove.c index 689b68c98af2..ff3f7652f763 100644 --- a/contrib/arm-optimized-routines/string/test/memmove.c +++ b/contrib/arm-optimized-routines/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -25,8 +25,12 @@ static const struct fun F(memmove, 0) #if __aarch64__ F(__memmove_aarch64, 1) -# if __ARM_NEON F(__memmove_aarch64_simd, 1) +# if __ARM_FEATURE_SVE + F(__memmove_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memmove_aarch64_mops, 1) # endif #endif {0, 0, 0} diff --git a/contrib/arm-optimized-routines/string/test/memrchr.c b/contrib/arm-optimized-routines/string/test/memrchr.c index adf96f049cc9..4171a56daefd 100644 --- a/contrib/arm-optimized-routines/string/test/memrchr.c +++ b/contrib/arm-optimized-routines/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/contrib/arm-optimized-routines/string/test/memset.c b/contrib/arm-optimized-routines/string/test/memset.c index f1721442dbaf..a9639f9b28b0 100644 --- a/contrib/arm-optimized-routines/string/test/memset.c +++ b/contrib/arm-optimized-routines/string/test/memset.c @@ -1,8 +1,8 @@ /* * memset test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -25,6 +25,12 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if __ARM_FEATURE_SVE + F(__memset_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memset_aarch64_mops, 1) +# endif #elif __arm__ F(__memset_arm, 0) #endif diff --git a/contrib/arm-optimized-routines/string/test/mte.h b/contrib/arm-optimized-routines/string/test/mte.h index e67cbd9d2d40..40b0ecf6c194 100644 --- a/contrib/arm-optimized-routines/string/test/mte.h +++ b/contrib/arm-optimized-routines/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef __TEST_MTE_H diff --git a/contrib/arm-optimized-routines/string/test/stpcpy.c b/contrib/arm-optimized-routines/string/test/stpcpy.c index 1827e68c9a30..0300892a1f3c 100644 --- a/contrib/arm-optimized-routines/string/test/stpcpy.c +++ b/contrib/arm-optimized-routines/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE @@ -28,8 +28,7 @@ static const struct fun // clang-format off F(stpcpy, 0) #if __aarch64__ - F(__stpcpy_aarch64, 0) - F(__stpcpy_aarch64_mte, 1) + F(__stpcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/strchr.c b/contrib/arm-optimized-routines/string/test/strchr.c index f3ae982ef0ad..66180acfb57c 100644 --- a/contrib/arm-optimized-routines/string/test/strchr.c +++ b/contrib/arm-optimized-routines/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/string/test/strchrnul.c b/contrib/arm-optimized-routines/string/test/strchrnul.c index 6c30ab2123f1..aad0bf59da66 100644 --- a/contrib/arm-optimized-routines/string/test/strchrnul.c +++ b/contrib/arm-optimized-routines/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/contrib/arm-optimized-routines/string/test/strcmp.c b/contrib/arm-optimized-routines/string/test/strcmp.c index d57b54ed50a8..4aa95f4f2f1d 100644 --- a/contrib/arm-optimized-routines/string/test/strcmp.c +++ b/contrib/arm-optimized-routines/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 0) - F(__strcmp_aarch64_mte, 1) + F(__strcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/strcpy.c b/contrib/arm-optimized-routines/string/test/strcpy.c index e84cace9c8c6..af297f90396a 100644 --- a/contrib/arm-optimized-routines/string/test/strcpy.c +++ b/contrib/arm-optimized-routines/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcpy, 0) #if __aarch64__ - F(__strcpy_aarch64, 0) - F(__strcpy_aarch64_mte, 1) + F(__strcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/stringtest.h b/contrib/arm-optimized-routines/string/test/stringtest.h index fe855fc21736..6bb7e1fdfeca 100644 --- a/contrib/arm-optimized-routines/string/test/stringtest.h +++ b/contrib/arm-optimized-routines/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <ctype.h> diff --git a/contrib/arm-optimized-routines/string/test/strlen.c b/contrib/arm-optimized-routines/string/test/strlen.c index 6278380f26df..47ef3dcf0ef0 100644 --- a/contrib/arm-optimized-routines/string/test/strlen.c +++ b/contrib/arm-optimized-routines/string/test/strlen.c @@ -1,15 +1,14 @@ /* * strlen test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <sys/mman.h> #include <limits.h> #include "mte.h" #include "stringlib.h" diff --git a/contrib/arm-optimized-routines/string/test/strncmp.c b/contrib/arm-optimized-routines/string/test/strncmp.c index 018a8a431ab8..4bbab6f93450 100644 --- a/contrib/arm-optimized-routines/string/test/strncmp.c +++ b/contrib/arm-optimized-routines/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 0) - F(__strncmp_aarch64_mte, 1) + F(__strncmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif diff --git a/contrib/arm-optimized-routines/string/test/strnlen.c b/contrib/arm-optimized-routines/string/test/strnlen.c index 0dea00eaf8e3..a800fd1993cd 100644 --- a/contrib/arm-optimized-routines/string/test/strnlen.c +++ b/contrib/arm-optimized-routines/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE
diff --git a/contrib/arm-optimized-routines/string/test/strrchr.c b/contrib/arm-optimized-routines/string/test/strrchr.c index fedbdc52fcc1..580ca497f8a4 100644 --- a/contrib/arm-optimized-routines/string/test/strrchr.c +++ b/contrib/arm-optimized-routines/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h>
diff --git a/contrib/arm-optimized-routines/string/x86_64/check-arch.S b/contrib/arm-optimized-routines/string/x86_64/check-arch.S index 26ade0a0c7db..5afcf7b7ee54 100644 --- a/contrib/arm-optimized-routines/string/x86_64/check-arch.S +++ b/contrib/arm-optimized-routines/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__x86_64__
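
The test-harness hunks above all touch the same construct: each test registers its implementations in a NULL-terminated table of F(name, mte_safe) entries, with new SVE variants listed under __ARM_FEATURE_SVE, new MOPS variants under WANT_MOPS, and the former separate *_mte entries folded into a single MTE-safe default. The sketch below is a minimal, self-contained illustration of that table pattern, not the project's stringtest harness: the struct layout is inferred from the F(...) entries and the {0, 0, 0} sentinel visible in the hunks, and copy_bytewise is a made-up stand-in so the example links without the stringlib objects.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in variant so the sketch links without the stringlib
   objects; the real tables reference __memcpy_aarch64 and friends.  */
static void *
copy_bytewise (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;
  while (n--)
    *d++ = *s++;
  return dst;
}

static const struct fun
{
  const char *name;				/* symbol under test */
  void *(*fun) (void *, const void *, size_t);	/* memcpy-like signature */
  int test_mte;					/* nonzero: MTE-safe variant */
} funtab[] = {
#define F(x, mte_safe) { #x, x, mte_safe },
  F (memcpy, 0)
  F (copy_bytewise, 1)
  /* A real table would add entries such as F (__memcpy_aarch64_sve, 1)
     under __ARM_FEATURE_SVE and F (__memcpy_aarch64_mops, 1) under
     WANT_MOPS, as in the hunks above.  */
#undef F
  { 0, 0, 0 }					/* sentinel terminates the walk */
};

int
main (void)
{
  char src[32] = "function table walk";
  char dst[32];

  /* Walk the table until the null sentinel and run every registered
     variant, checking that it reproduced the source buffer.  */
  for (const struct fun *p = funtab; p->name; p++)
    {
      memset (dst, 0, sizeof dst);
      p->fun (dst, src, sizeof src);
      printf ("%-16s %s\n", p->name,
	      memcmp (dst, src, sizeof src) == 0 ? "ok" : "FAIL");
    }
  return 0;
}

Compiled with a plain C compiler, the loop visits each registered variant in turn and reports whether its output matches the source buffer, which is the same control flow the per-entry mte_safe flag and the compile-time guards feed into in the real tests.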
