Diffstat (limited to 'contrib/arm-optimized-routines/string')
-rw-r--r--  contrib/arm-optimized-routines/string/Dir.mk |   2
-rw-r--r--  contrib/arm-optimized-routines/string/README.contributors |  30
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/asmdefs.h (renamed from contrib/arm-optimized-routines/string/asmdefs.h) |  44
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/check-arch.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memchr-mte.S |  58
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memchr-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memchr.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcmp.S | 239
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S |  66
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S |  21
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S | 177
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcpy.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memmove-mops.S |  21
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memrchr.S |  51
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memset-mops.S |  20
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memset.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S |  10
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S |   2
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/stpcpy.S |   2
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strchr-mte.S |  58
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strchr-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strchr.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S |  47
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S |   2
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strchrnul.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S | 189
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strcmp.S | 238
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S | 161
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strcpy.S | 395
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strlen-mte.S |  41
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strlen-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strlen.S |  21
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S | 307
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strncmp.S | 238
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strnlen.S |  60
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S |  58
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/strrchr.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/arm/asmdefs.h | 477
-rw-r--r--  contrib/arm-optimized-routines/string/arm/check-arch.S |   7
-rw-r--r--  contrib/arm-optimized-routines/string/arm/memchr.S |  46
-rw-r--r--  contrib/arm-optimized-routines/string/arm/memcpy.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/arm/memset.S |   2
-rw-r--r--  contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S |   6
-rw-r--r--  contrib/arm-optimized-routines/string/arm/strcmp.S |  59
-rw-r--r--  contrib/arm-optimized-routines/string/arm/strcpy.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S |  18
-rw-r--r--  contrib/arm-optimized-routines/string/bench/memcpy.c | 170
-rw-r--r--  contrib/arm-optimized-routines/string/bench/memset.c | 243
-rw-r--r--  contrib/arm-optimized-routines/string/bench/strlen.c |  16
-rw-r--r--  contrib/arm-optimized-routines/string/include/benchlib.h |   2
-rw-r--r--  contrib/arm-optimized-routines/string/include/stringlib.h |  15
-rw-r--r--  contrib/arm-optimized-routines/string/test/__mtag_tag_region.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/memchr.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/memcmp.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/memcpy.c |  10
-rw-r--r--  contrib/arm-optimized-routines/string/test/memmove.c |  10
-rw-r--r--  contrib/arm-optimized-routines/string/test/memrchr.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/memset.c |   7
-rw-r--r--  contrib/arm-optimized-routines/string/test/mte.h |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/stpcpy.c |   7
-rw-r--r--  contrib/arm-optimized-routines/string/test/strchr.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/strchrnul.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/strcmp.c |   7
-rw-r--r--  contrib/arm-optimized-routines/string/test/strcpy.c |   7
-rw-r--r--  contrib/arm-optimized-routines/string/test/stringtest.h |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/strlen.c |   5
-rw-r--r--  contrib/arm-optimized-routines/string/test/strncmp.c |   7
-rw-r--r--  contrib/arm-optimized-routines/string/test/strnlen.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/test/strrchr.c |   2
-rw-r--r--  contrib/arm-optimized-routines/string/x86_64/check-arch.S |   2
79 files changed, 2123 insertions, 1696 deletions
diff --git a/contrib/arm-optimized-routines/string/Dir.mk b/contrib/arm-optimized-routines/string/Dir.mk
index cf3453f7580d..40ff5acc093e 100644
--- a/contrib/arm-optimized-routines/string/Dir.mk
+++ b/contrib/arm-optimized-routines/string/Dir.mk
@@ -1,7 +1,7 @@
# Makefile fragment - requires GNU make
#
# Copyright (c) 2019-2021, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/string
B := build/string
diff --git a/contrib/arm-optimized-routines/string/README.contributors b/contrib/arm-optimized-routines/string/README.contributors
new file mode 100644
index 000000000000..0b4a51b56366
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/README.contributors
@@ -0,0 +1,30 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc so
+ the GNU Coding Standard and glibc specific conventions should be followed
+ to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+ into a libc with minimal changes. This e.g. means that internal symbols
+ should be hidden and in the implementation reserved namespace according to
+ ISO C and POSIX rules. If possible the built shared libraries and static
+ library archives should be usable to override libc symbols at link time (or
+ at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI
+ (other than symbol versioning), this cannot be done reliably for static
+ linking so this is a best effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+ and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY
+================================================
+1. Code:
+ - The assumptions of the code must be clearly documented.
+
+ - Assembly style should be consistent across different implementations.
+
+
+2. Performance:
+ - Benchmarking is needed on several microarchitectures.
diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S
index 84339f73cf23..207e22950c6d 100644
--- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S
+++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_region.S
@@ -1,8 +1,8 @@
/*
* __mtag_tag_region - tag memory
*
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -15,7 +15,7 @@
* The memory region may remain untagged if tagging is not enabled.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_MEMORY_TAGGING
diff --git a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S
index f58364ca6fcb..44b8e0114f42 100644
--- a/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S
+++ b/contrib/arm-optimized-routines/string/aarch64/__mtag_tag_zero_region.S
@@ -1,8 +1,8 @@
/*
* __mtag_tag_zero_region - tag memory and fill it with zero bytes
*
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -15,7 +15,7 @@
* The memory region may remain untagged if tagging is not enabled.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_MEMORY_TAGGING
diff --git a/contrib/arm-optimized-routines/string/asmdefs.h b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h
index 340b427a505b..131b95e1fea9 100644
--- a/contrib/arm-optimized-routines/string/asmdefs.h
+++ b/contrib/arm-optimized-routines/string/aarch64/asmdefs.h
@@ -1,15 +1,13 @@
/*
- * Macros for asm code.
+ * Macros for asm code. AArch64 version.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
-#if defined(__aarch64__)
-
/* Branch Target Identification support. */
#define BTI_C hint 34
#define BTI_J hint 36
@@ -23,6 +21,19 @@
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#ifdef __ILP32__
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 2; \
+ .word 4; \
+ .word 12; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .text
+#else
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
@@ -35,6 +46,7 @@
.word value; \
.word 0; \
.text
+#endif
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
@@ -55,19 +67,6 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
.cfi_startproc; \
BTI_C;
-#else
-
-#define END_FILE
-
-#define ENTRY_ALIGN(name, alignment) \
- .global name; \
- .type name,%function; \
- .align alignment; \
- name: \
- .cfi_startproc;
-
-#endif
-
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
@@ -95,4 +94,13 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#define SIZE_ARG(n)
#endif
+/* Compiler supports SVE instructions */
+#ifndef HAVE_SVE
+# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
+# define HAVE_SVE 1
+# else
+# define HAVE_SVE 0
+# endif
+#endif
+
#endif
diff --git a/contrib/arm-optimized-routines/string/aarch64/check-arch.S b/contrib/arm-optimized-routines/string/aarch64/check-arch.S
index 5a54242d7de6..131b7fa36ec2 100644
--- a/contrib/arm-optimized-routines/string/aarch64/check-arch.S
+++ b/contrib/arm-optimized-routines/string/aarch64/check-arch.S
@@ -1,8 +1,8 @@
/*
* check ARCH setting.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if !__aarch64__
@@ -10,4 +10,4 @@
#endif
/* Include for GNU property notes. */
-#include "../asmdefs.h"
+#include "asmdefs.h"
diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S
index c2e967d1004e..948c3cbc7dd4 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memchr-mte.S
@@ -1,8 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -23,25 +23,21 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define vrepchr v0
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memchr_aarch64_mte)
PTR_ARG (0)
@@ -50,55 +46,53 @@ ENTRY (__memchr_aarch64_mte)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
rbit synd, synd
clz synd, synd
- add result, srcin, synd, lsr 2
cmp cntin, synd, lsr 2
+ add result, srcin, synd, lsr 2
csel result, result, xzr, hi
ret
+ .p2align 3
L(start_loop):
sub tmp, src, srcin
- add tmp, tmp, 16
+ add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.ls L(nomatch)
+ b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
.p2align 4
L(loop32):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, 16]!
- subs cntrem, cntrem, 32
+ ldr qdata, [src, 16]
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.ls L(end)
+ subs cntrem, cntrem, 32
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
+L(end_2):
+ add src, src, 16
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ sub cntrem, src, srcin
fmov synd, dend
- add tmp, srcin, cntin
- sub cntrem, tmp, src
+ sub cntrem, cntin, cntrem
#ifndef __AARCH64EB__
rbit synd, synd
#endif
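
The nibble-mask comment added above is easier to see in scalar form. Below is an illustrative little-endian C model (not part of the diff) of how a 16-byte chunk is reduced to a 64-bit syndrome with four bits per byte; the SIMD code builds the same mask with cmeq followed by shrn #4 and uses rbit+clz where the model uses count-trailing-zeros.

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the nibble-mask syndrome: 4 bits per byte, set on match.
   Index of the first match is ctz/4; 16 means "no match in this chunk".  */
static size_t first_match_index (const unsigned char *chunk, unsigned char c)
{
	uint64_t syndrome = 0;
	for (int i = 0; i < 16; i++)
		if (chunk[i] == c)
			syndrome |= 0xfULL << (4 * i);
	if (syndrome == 0)
		return 16;
	return (size_t) __builtin_ctzll (syndrome) / 4;
}
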
diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S
index c22e6596f19b..b851cf31f238 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memchr-sve.S
@@ -1,11 +1,11 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/memchr.S b/contrib/arm-optimized-routines/string/aarch64/memchr.S
index 353f0d1eac53..fe6cfe2bc0e2 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memchr.S
@@ -1,8 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S
index 78c5ecaa4cdc..d52ce4555344 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcmp-sve.S
@@ -1,11 +1,11 @@
/*
* memcmp - compare memory
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcmp.S b/contrib/arm-optimized-routines/string/aarch64/memcmp.S
index 3b1026642eee..35135e72cc8e 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcmp.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcmp.S
@@ -1,103 +1,84 @@
/* memcmp - compare memory
*
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result w0
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define data3 x5
+#define data3w w5
+#define data4 x6
+#define data4w w6
+#define tmp x6
+#define src1end x7
+#define src2end x8
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data1h x4
-#define data2 x5
-#define data2w w5
-#define data2h x6
-#define tmp1 x7
-#define tmp2 x8
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
- subs limit, limit, 8
- b.lo L(less8)
-
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- b.ne L(return)
-
- subs limit, limit, 8
- b.gt L(more16)
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- b L(return)
-
-L(more16):
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- bne L(return)
-
- /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
- strings. */
- subs limit, limit, 16
+ cmp limit, 16
+ b.lo L(less16)
+ ldp data1, data3, [src1]
+ ldp data2, data4, [src2]
+ ccmp data1, data2, 0, ne
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ add src1end, src1, limit
+ add src2end, src2, limit
+ cmp limit, 32
b.ls L(last_bytes)
+ cmp limit, 160
+ b.hs L(loop_align)
+ sub limit, limit, 32
- /* We overlap loads between 0-32 bytes at either side of SRC1 when we
- try to align, so limit it only to strings larger than 128 bytes. */
- cmp limit, 96
- b.ls L(loop16)
-
- /* Align src1 and adjust src2 with bytes not yet done. */
- and tmp1, src1, 15
- add limit, limit, tmp1
- sub src1, src1, tmp1
- sub src2, src2, tmp1
-
- /* Loop performing 16 bytes per iteration using aligned src1.
- Limit is pre-decremented by 16 and must be larger than zero.
- Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-L(loop16):
- ldp data1, data1h, [src1], 16
- ldp data2, data2h, [src2], 16
- subs limit, limit, 16
- ccmp data1, data2, 0, hi
- ccmp data1h, data2h, 0, eq
- b.eq L(loop16)
-
+L(loop32):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ cmp limit, 16
+ b.ls L(last_bytes)
+
+ ldp data1, data3, [src1, 32]
+ ldp data2, data4, [src2, 32]
cmp data1, data2
- bne L(return)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ add src1, src1, 32
+ add src2, src2, 32
+L(last64):
+ subs limit, limit, 32
+ b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- add src1, src1, limit
- add src2, src2, limit
- ldp data1, data1h, [src1]
- ldp data2, data2h, [src2]
- cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ldp data1, data3, [src1end, -16]
+ ldp data2, data4, [src2end, -16]
+L(return2):
cmp data1, data2
+ csel data1, data1, data3, ne
+ csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
@@ -105,33 +86,105 @@ L(return):
rev data1, data1
rev data2, data2
#endif
- cmp data1, data2
-L(ret_eq):
+ cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
- /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less16):
+ add src1end, src1, limit
+ add src2end, src2, limit
+ tbz limit, 3, L(less8)
+ ldr data1, [src1]
+ ldr data2, [src2]
+ ldr data3, [src1end, -8]
+ ldr data4, [src2end, -8]
+ b L(return2)
+
+ .p2align 4
L(less8):
- adds limit, limit, 4
- b.lo L(less4)
- ldr data1w, [src1], 4
- ldr data2w, [src2], 4
+ tbz limit, 2, L(less4)
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ldr data3w, [src1end, -4]
+ ldr data4w, [src2end, -4]
+ b L(return2)
+
+L(less4):
+ tbz limit, 1, L(less2)
+ ldrh data1w, [src1]
+ ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
- sub limit, limit, 4
-L(less4):
- adds limit, limit, 4
- beq L(ret_eq)
-L(byte_loop):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- subs limit, limit, 1
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq L(byte_loop)
+L(less2):
+ mov result, 0
+ tbz limit, 0, L(return_zero)
+ ldrb data1w, [src1end, -1]
+ ldrb data2w, [src2end, -1]
sub result, data1w, data2w
+L(return_zero):
ret
-END (__memcmp_aarch64)
+L(loop_align):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ /* Align src2 and adjust src1, src2 and limit. */
+ and tmp, src2, 15
+ sub tmp, tmp, 16
+ sub src2, src2, tmp
+ add limit, limit, tmp
+ sub src1, src1, tmp
+ sub limit, limit, 64 + 16
+
+ .p2align 4
+L(loop64):
+ ldr q0, [src1, 16]
+ ldr q1, [src2, 16]
+ subs limit, limit, 64
+ ldr q2, [src1, 32]
+ ldr q3, [src2, 32]
+ eor v0.16b, v0.16b, v1.16b
+ eor v1.16b, v2.16b, v3.16b
+ ldr q2, [src1, 48]
+ ldr q3, [src2, 48]
+ umaxp v0.16b, v0.16b, v1.16b
+ ldr q4, [src1, 64]!
+ ldr q5, [src2, 64]!
+ eor v1.16b, v2.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ umaxp v1.16b, v1.16b, v2.16b
+ umaxp v0.16b, v0.16b, v1.16b
+ umaxp v0.16b, v0.16b, v0.16b
+ fmov tmp, d0
+ ccmp tmp, 0, 0, hi
+ b.eq L(loop64)
+
+ /* If equal, process last 1-64 bytes using scalar loop. */
+ add limit, limit, 64 + 16
+ cbz tmp, L(last64)
+
+ /* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+ rev16 tmp, tmp
+#endif
+ rev tmp, tmp
+ clz tmp, tmp
+ bic tmp, tmp, 7
+ sub tmp, tmp, 48
+ ldr data1, [src1, tmp]
+ ldr data2, [src2, tmp]
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ mov result, 1
+ cmp data1, data2
+ cneg result, result, lo
+ ret
+END (__memcmp_aarch64)
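
One idea the rewritten memcmp relies on, worth spelling out: the last 1-16 bytes are handled by reloading from end-16 with possibly overlapping unaligned loads, and L(return2) picks whichever pair of words differed first. A rough C equivalent, an illustration under little-endian assumptions rather than the actual routine:

#include <stdint.h>
#include <string.h>

static uint64_t load64 (const void *p)
{
	uint64_t v;
	memcpy (&v, p, 8);		/* unaligned load */
	return v;
}

/* Compare the final 16 bytes by loading from end-16; the loads may overlap
   bytes already compared, which is harmless since those bytes were equal.  */
static int cmp_last16 (const unsigned char *s1end, const unsigned char *s2end)
{
	uint64_t data1 = load64 (s1end - 16), data2 = load64 (s2end - 16);
	uint64_t data3 = load64 (s1end - 8),  data4 = load64 (s2end - 8);
	if (data1 == data2)		/* L(return2): keep the differing pair */
	{
		data1 = data3;
		data2 = data4;
	}
	/* Byte-reverse so integer order matches memory (byte) order.  */
	data1 = __builtin_bswap64 (data1);
	data2 = __builtin_bswap64 (data2);
	return (data1 > data2) - (data1 < data2);
}
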
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
index f97f2c3047b9..9d3027d4d3cd 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define src x1
@@ -56,11 +56,12 @@ ENTRY (__memcpy_aarch64_simd)
PTR_ARG (1)
SIZE_ARG (2)
add srcend, src, count
- add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
+ add dstend, dstin, count
cmp count, 32
b.hi L(copy32_128)
+ nop
/* Small copies: 0..32 bytes. */
cmp count, 16
@@ -71,6 +72,18 @@ ENTRY (__memcpy_aarch64_simd)
str B_q, [dstend, -16]
ret
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
@@ -80,7 +93,6 @@ L(copy16):
str A_h, [dstend, -8]
ret
- .p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
@@ -90,31 +102,6 @@ L(copy8):
str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb C_lw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb C_lw, [dstend, -1]
-L(copy0):
- ret
-
- .p2align 4
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
- cmp count, 64
- b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
- ret
-
- .p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
@@ -128,8 +115,24 @@ L(copy96):
stp C_q, D_q, [dstend, -32]
ret
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 3
/* Copy more than 128 bytes. */
L(copy_long):
+ add dstend, dstin, count
+
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
@@ -166,6 +169,9 @@ L(copy64_from_end):
stp A_q, B_q, [dstend, -32]
ret
+ .p2align 4
+ nop
+
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
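
The L(copy4) block that this hunk moves is the classic branchless 0..3-byte copy: three byte moves at offsets 0, n/2 and n-1 cover every n in 1..3, overlapping where needed. A hypothetical C rendering for reference:

#include <stddef.h>

static void copy_0_3 (unsigned char *dst, const unsigned char *src, size_t n)
{
	if (n == 0)
		return;
	size_t mid = n >> 1;		/* 0 for n=1, 1 for n=2 or 3 */
	unsigned char a = src[0];
	unsigned char b = src[mid];
	unsigned char c = src[n - 1];
	dst[0] = a;			/* the three stores overlap for n < 3 */
	dst[mid] = b;
	dst[n - 1] = c;
}
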
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S
new file mode 100644
index 000000000000..b45c31418717
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-mops.S
@@ -0,0 +1,21 @@
+/*
+ * memcpy using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memcpy_aarch64_mops)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ mov x3, x0
+ .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
+ .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
+ .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */
+ ret
+
+END (__memcpy_aarch64_mops)
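
The cpyfp/cpyfm/cpyfe sequence only works on cores with FEAT_MOPS, so callers are expected to pick this entry point at run time. A hypothetical Linux-style selector; the dispatch itself is an assumption for illustration, only the __memcpy_aarch64_* symbol names come from this diff:

#include <stddef.h>
#include <sys/auxv.h>

#ifndef HWCAP2_MOPS
# define HWCAP2_MOPS 0		/* old headers: never pick the MOPS path */
#endif

void *__memcpy_aarch64_mops (void *, const void *, size_t);
void *__memcpy_aarch64_simd (void *, const void *, size_t);

/* Return the memcpy implementation to use on this CPU.  */
static void *(*select_memcpy (void)) (void *, const void *, size_t)
{
	return (getauxval (AT_HWCAP2) & HWCAP2_MOPS)
	       ? __memcpy_aarch64_mops
	       : __memcpy_aarch64_simd;
}
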
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
new file mode 100644
index 000000000000..e8a946d7db37
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
@@ -0,0 +1,177 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#ifdef HAVE_SVE
+
+.arch armv8-a+sve
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+ SVE vectors are used to speedup small copies.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ cmp count, 128
+ b.hi L(copy_long)
+ cntb vlen
+ cmp count, vlen, lsl 1
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ whilelo p1.b, vlen, count
+ ld1b z0.b, p0/z, [src, 0, mul vl]
+ ld1b z1.b, p1/z, [src, 1, mul vl]
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z1.b, p1, [dstin, 1, mul vl]
+ ret
+
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ add srcend, src, count
+ add dstend, dstin, count
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(return)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(return):
+ ret
+
+END (__memcpy_aarch64_sve)
+
+#endif
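
The header comment of the new memcpy-sve.S notes that SVE is used to speed up small copies: two predicated loads/stores cover any size up to two vectors with no tail loop. A sketch with ACLE intrinsics, assuming <arm_sve.h>, SVE-enabled compilation and count <= 2 vectors (illustrative, not the shipped code):

#include <arm_sve.h>
#include <stdint.h>
#include <stddef.h>

/* Copy n <= 2*svcntb() bytes with two predicated vector moves,
   mirroring the whilelo/ld1b/st1b sequence in the assembly above.  */
static void small_copy_sve (uint8_t *dst, const uint8_t *src, size_t n)
{
	uint64_t vl = svcntb ();			/* bytes per SVE vector */
	svbool_t p0 = svwhilelt_b8_u64 (0, n);		/* lanes 0 .. n-1       */
	svbool_t p1 = svwhilelt_b8_u64 (vl, n);		/* lanes vl .. n-1      */
	svst1_u8 (p0, dst, svld1_u8 (p0, src));
	svst1_u8 (p1, dst + vl, svld1_u8 (p1, src + vl));
}
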
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy.S b/contrib/arm-optimized-routines/string/aarch64/memcpy.S
index dd254f6f9929..7c0606e2104a 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcpy.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define src x1
diff --git a/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S
new file mode 100644
index 000000000000..6c73017bb16f
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memmove-mops.S
@@ -0,0 +1,21 @@
+/*
+ * memmove using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memmove_aarch64_mops)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ mov x3, x0
+ .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
+ .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
+ .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */
+ ret
+
+END (__memmove_aarch64_mops)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memrchr.S b/contrib/arm-optimized-routines/string/aarch64/memrchr.S
index 7b4be847cecb..6418bdf56f41 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memrchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memrchr.S
@@ -1,8 +1,8 @@
/*
* memrchr - find last character in a memory zone.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -23,7 +23,6 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define end x8
#define endm1 x9
@@ -31,19 +30,16 @@
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memrchr_aarch64)
PTR_ARG (0)
@@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
@@ -69,34 +62,36 @@ ENTRY (__memrchr_aarch64)
csel result, result, xzr, hi
ret
+ nop
L(start_loop):
- sub tmp, end, src
- subs cntrem, cntin, tmp
+ subs cntrem, src, srcin
b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
+ sub cntrem, cntrem, 1
+ tbz cntrem, 4, L(loop32_2)
+ add src, src, 16
- .p2align 4
+ .p2align 5
L(loop32):
- ldr qdata, [src, -16]!
+ ldr qdata, [src, -32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, -16]!
+ ldr qdata, [src, -16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.ls L(end)
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
+L(end_2):
+ sub src, src, 16
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, src, 15
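
memrchr uses the same nibble mask as memchr-mte but wants the last match, so the interesting bit is located with count-leading-zeros instead. A scalar companion (little-endian assumption) to the sketch after the memchr-mte diff:

#include <stdint.h>

/* Index of the last matching byte in a 16-byte chunk, or -1 if none.  */
static int last_match_index (const unsigned char *chunk, unsigned char c)
{
	uint64_t syndrome = 0;
	for (int i = 0; i < 16; i++)
		if (chunk[i] == c)
			syndrome |= 0xfULL << (4 * i);
	if (syndrome == 0)
		return -1;
	return 15 - __builtin_clzll (syndrome) / 4;
}
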
diff --git a/contrib/arm-optimized-routines/string/aarch64/memset-mops.S b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S
new file mode 100644
index 000000000000..ec791493bae9
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/aarch64/memset-mops.S
@@ -0,0 +1,20 @@
+/*
+ * memset using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memset_aarch64_mops)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ mov x3, x0
+ .inst 0x19c10443 /* setp [x3]!, x2!, x1 */
+ .inst 0x19c14443 /* setm [x3]!, x2!, x1 */
+ .inst 0x19c18443 /* sete [x3]!, x2!, x1 */
+ ret
+
+END (__memset_aarch64_mops)
diff --git a/contrib/arm-optimized-routines/string/aarch64/memset.S b/contrib/arm-optimized-routines/string/aarch64/memset.S
index 9fcd97579913..553b0fcaefea 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memset.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memset.S
@@ -1,8 +1,8 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define val x1
diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S
deleted file mode 100644
index f1c711906515..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-mte.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-mte.S"
diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S
index 82dd9717b0a0..5d3f14b86026 100644
--- a/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy-sve.S
@@ -2,7 +2,7 @@
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define BUILD_STPCPY 1
diff --git a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S
index 4f62aa462389..155c68d75a7b 100644
--- a/contrib/arm-optimized-routines/string/aarch64/stpcpy.S
+++ b/contrib/arm-optimized-routines/string/aarch64/stpcpy.S
@@ -2,7 +2,7 @@
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define BUILD_STPCPY 1
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S
index dcb0e4625870..6ec08f7acc76 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchr-mte.S
@@ -1,8 +1,8 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -19,8 +19,7 @@
#define src x2
#define tmp1 x1
-#define wtmp2 w3
-#define tmp3 x3
+#define tmp2 x3
#define vrepchr v0
#define vdata v1
@@ -28,39 +27,30 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
-#define vrepmask2 v5
-#define vend v6
-#define dend d6
+#define vend v5
+#define dend d5
/* Core algorithm.
For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
- requested character, bits 2-3 are set if the byte is NUL (or matched), and
- bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
- bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
- in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ per byte. Bits 0-1 are set if the relevant byte matched the requested
+ character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+ zeroes gives the position of the matching byte if it is a multiple of 4.
+ If it is not a multiple of 4, there was no match. */
ENTRY (__strchr_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov wtmp2, 0x3003
- dup vrepmask.8h, wtmp2
+ movi vrepmask.16b, 0x33
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp2, 0xf00f
- dup vrepmask2.8h, wtmp2
-
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- lsl tmp3, srcin, 2
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
-
+ lsl tmp2, srcin, 2
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
- lsr tmp1, tmp1, tmp3
+ lsr tmp1, tmp1, tmp2
cbz tmp1, L(loop)
rbit tmp1, tmp1
@@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte)
.p2align 4
L(loop):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbnz tmp1, L(end)
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov tmp1, dend
cbz tmp1, L(loop)
+ sub src, src, 16
+L(end):
#ifdef __AARCH64EB__
bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
rbit tmp1, tmp1
#endif
+ add src, src, 16
clz tmp1, tmp1
- /* Tmp1 is an even multiple of 2 if the target character was
- found first. Otherwise we've found the end of string. */
+ /* Tmp1 is a multiple of 4 if the target character was found. */
tst tmp1, 2
add result, src, tmp1, lsr 2
csel result, result, xzr, eq
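
The rewritten syndrome in strchr-mte encodes two facts per byte, as the new comment says: bits 0-1 for "matches the character", bits 2-3 for "is NUL or matches". A scalar model (little-endian, illustrative only) showing why "ctz is a multiple of 4" means a real match:

#include <stdint.h>

/* Returns the match index (0-15), -1 if the string ends first,
   or 16 if neither the character nor NUL occurs in this chunk.  */
static int chunk_strchr (const unsigned char *chunk, unsigned char c)
{
	uint64_t syndrome = 0;
	for (int i = 0; i < 16; i++)
	{
		unsigned nib = 0;
		if (chunk[i] == c)
			nib |= 0x3;		/* bits 0-1: character match */
		if (chunk[i] == 0 || chunk[i] == c)
			nib |= 0xc;		/* bits 2-3: NUL or match    */
		syndrome |= (uint64_t) nib << (4 * i);
	}
	if (syndrome == 0)
		return 16;
	unsigned tz = (unsigned) __builtin_ctzll (syndrome);
	return (tz & 2) ? -1 : (int) (tz / 4);	/* tz%4 != 0: NUL came first */
}
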
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S
index 13ba9f44f9c5..ff075167bfef 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchr-sve.S
@@ -1,11 +1,11 @@
/*
* strchr/strchrnul - find a character in a string
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchr.S b/contrib/arm-optimized-routines/string/aarch64/strchr.S
index 1063cbfd77aa..37193bd947a7 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchr.S
@@ -1,8 +1,8 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S
index 1b0d0a63094c..543ee88bb285 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -20,38 +20,32 @@
#define src x2
#define tmp1 x1
#define tmp2 x3
-#define tmp2w w3
#define vrepchr v0
#define vdata v1
#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask v4
-#define vend v5
-#define dend d5
+#define vend v4
+#define dend d4
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__strchrnul_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov tmp2w, 0xf00f
- dup vrepmask.8h, tmp2w
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
lsl tmp2, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
cbz tmp1, L(loop)
@@ -63,15 +57,22 @@ ENTRY (__strchrnul_aarch64_mte)
.p2align 4
L(loop):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
+ fmov tmp1, dend
+ cbnz tmp1, L(end)
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
fmov tmp1, dend
cbz tmp1, L(loop)
-
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+L(end):
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ add src, src, 16
fmov tmp1, dend
#ifndef __AARCH64EB__
rbit tmp1, tmp1
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S
index 428ff1a3d008..0005f9177514 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul-sve.S
@@ -2,7 +2,7 @@
* strchrnul - find a character or nul in a string
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define BUILD_STRCHRNUL
diff --git a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S
index a4230d919b47..666e8d0304c1 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strchrnul.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strchrnul.S
@@ -1,8 +1,8 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S
deleted file mode 100644
index 12d1a6b51dd3..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/strcmp-mte.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * strcmp - compare two strings
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-#define src1 x0
-#define src2 x1
-#define result x0
-
-#define data1 x2
-#define data1w w2
-#define data2 x3
-#define data2w w3
-#define has_nul x4
-#define diff x5
-#define off1 x5
-#define syndrome x6
-#define tmp x6
-#define data3 x7
-#define zeroones x8
-#define shift x9
-#define off2 x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes. */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word.
- Since carry propagation makes 0x1 bytes before a NUL byte appear
- NUL too in big-endian, byte-reverse the data before the NUL check. */
-
-
-ENTRY (__strcmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- sub off2, src2, src1
- mov zeroones, REP8_01
- and tmp, src1, 7
- tst off2, 7
- b.ne L(misaligned8)
- cbnz tmp, L(mutual_align)
-
- .p2align 4
-
-L(loop_aligned):
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
-L(start_realigned):
-#ifdef __AARCH64EB__
- rev tmp, data1
- sub has_nul, tmp, zeroones
- orr tmp, tmp, REP8_7f
-#else
- sub has_nul, data1, zeroones
- orr tmp, data1, REP8_7f
-#endif
- bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
- ccmp data1, data2, 0, eq
- b.eq L(loop_aligned)
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
-L(end):
-#ifndef __AARCH64EB__
- rev syndrome, syndrome
- rev data1, data1
- rev data2, data2
-#endif
- clz shift, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- lsl data1, data1, shift
- lsl data2, data2, shift
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, 56
- sub result, data1, data2, lsr 56
- ret
-
- .p2align 4
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, 7
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
- neg shift, src2, lsl 3 /* Bits to alignment -64. */
- mov tmp, -1
- LS_FW tmp, tmp, shift
- orr data1, data1, tmp
- orr data2, data2, tmp
- b L(start_realigned)
-
-L(misaligned8):
- /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond the end of SRC2. */
- cbz tmp, L(src1_aligned)
-L(do_misaligned):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- cmp data1w, 0
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.ne L(done)
- tst src1, 7
- b.ne L(do_misaligned)
-
-L(src1_aligned):
- neg shift, src2, lsl 3
- bic src2, src2, 7
- ldr data3, [src2], 8
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- lsr tmp, zeroones, shift
- orr data3, data3, tmp
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- bics has_nul, has_nul, tmp
- b.ne L(tail)
-
- sub off1, src2, src1
-
- .p2align 4
-
-L(loop_unaligned):
- ldr data3, [src1, off1]
- ldr data2, [src1, off2]
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- ldr data1, [src1], 8
- bics has_nul, has_nul, tmp
- ccmp data1, data2, 0, eq
- b.eq L(loop_unaligned)
-
- lsl tmp, has_nul, shift
-#ifdef __AARCH64EB__
- rev tmp, tmp
-#endif
- eor diff, data1, data2
- orr syndrome, diff, tmp
- cbnz syndrome, L(end)
-L(tail):
- ldr data1, [src1]
- neg shift, shift
- lsr data2, data3, shift
- lsr has_nul, has_nul, shift
-#ifdef __AARCH64EB__
- rev data2, data2
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
- b L(end)
-
-L(done):
- sub result, data1, data2
- ret
-
-END (__strcmp_aarch64_mte)
-
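
Both the removed strcmp-mte.S and the rewritten strcmp.S below lean on the word-at-a-time NUL test quoted in their comments: (X - 1) & ~(X | 0x7f) is non-zero iff some byte of X is zero. A minimal C statement of that identity (illustration only); pinpointing which byte is zero additionally needs the byte-reversal the comment describes, since the borrow can mark bytes beyond the first NUL.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero; matches the sub/orr/bics sequence
   has_nul = (x - REP8_01) & ~(x | REP8_7f) used by the strcmp variants.  */
static int has_nul_byte (uint64_t x)
{
	return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
}
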
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S
index e6d2da5411ca..eaf909a378f1 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strcmp-sve.S
@@ -1,11 +1,11 @@
/*
* __strcmp_aarch64_sve - compare two strings
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcmp.S b/contrib/arm-optimized-routines/string/aarch64/strcmp.S
index 7714ebf5577d..137a9aa06681 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcmp.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strcmp.S
@@ -1,168 +1,184 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
-/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
+#define off1 x5
#define syndrome x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define zeroones x10
-#define pos x11
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
- /* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
L(start_realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
L(end):
-#ifndef __AARCH64EB__
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, syndrome
rev data2, data2
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#else
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
- lsl data1, data1, pos
- lsl data2, data2, pos
+ lsl data1, data1, shift
+ lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
ret
-#endif
+
+ .p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
- the bytes that preceed the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- orr data1, data1, tmp2
- orr data2, data2, tmp2
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond page boundary in
- SRC2. */
- tst src1, #7
- b.eq L(loop_misaligned)
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
L(do_misaligned):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
- tst src1, #7
+ tst src1, 7
b.ne L(do_misaligned)
-L(loop_misaligned):
- /* Test if we are within the last dword of the end of a 4K page. If
- yes then jump back to the misaligned loop to copy a byte at a time. */
- and tmp1, src2, #0xff8
- eor tmp1, tmp1, #0xff8
- cbz tmp1, L(do_misaligned)
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_misaligned)
b L(end)
L(done):
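The rewritten strcmp above keeps the classic word-at-a-time NUL test spelled out in its header comment: (X - 1) & ~(X | 0x7f...7f) is non-zero iff some byte of the 64-bit word X is zero, and big-endian builds byte-reverse the word first because borrow propagation can otherwise make a 0x01 byte adjacent to a real NUL look like a NUL as well. A minimal C model of the predicate, offered only as an illustration of the trick (has_zero_byte is an invented name, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff at least one byte of x is 0x00: the same
       (X - 1) & ~(X | 0x7f..7f) form computed by the sub/orr/bics above.  */
    static uint64_t has_zero_byte (uint64_t x)
    {
        return (x - REP8_01) & ~(x | REP8_7f);
    }

    int main (void)
    {
        printf ("%d\n", has_zero_byte (0x6867666564636261ULL) != 0); /* "abcdefgh": 0 */
        printf ("%d\n", has_zero_byte (0x6867006564636261ULL) != 0); /* embedded NUL: 1 */
        return 0;
    }

The same sub/orr/bic(s) triple recurs in every routine of this patch that scans for a terminator one 64-bit word at a time.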
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d61e53..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin x0
-#define srcin x1
-#define result x0
-
-#define src x2
-#define dst x3
-#define len x4
-#define synd x4
-#define tmp x5
-#define wtmp w5
-#define shift x5
-#define data1 x6
-#define dataw1 w6
-#define data2 x7
-#define dataw2 w7
-
-#define dataq q0
-#define vdata v0
-#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
-#define dataq2 q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- bic src, srcin, 15
- mov wtmp, 0xf00f
- ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
- cmeq vhas_nul.16b, vdata.16b, 0
- lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- lsr synd, synd, shift
- cbnz synd, L(tail)
-
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(start_loop)
-
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- sub tmp, src, srcin
- clz len, synd
- add len, tmp, len, lsr 2
- tbz len, 4, L(less16)
- sub tmp, len, 15
- ldr dataq, [srcin]
- ldr dataq2, [srcin, tmp]
- str dataq, [dstin]
- str dataq2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4,,8
-L(tail):
- rbit synd, synd
- clz len, synd
- lsr len, len, 2
-
- .p2align 4
-L(less16):
- tbz len, 3, L(less8)
- sub tmp, len, 7
- ldr data1, [srcin]
- ldr data2, [srcin, tmp]
- str data1, [dstin]
- str data2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(less8):
- subs tmp, len, 3
- b.lo L(less4)
- ldr dataw1, [srcin]
- ldr dataw2, [srcin, tmp]
- str dataw1, [dstin]
- str dataw2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
-L(less4):
- cbz len, L(zerobyte)
- ldrh dataw1, [srcin]
- strh dataw1, [dstin]
-L(zerobyte):
- strb wzr, [dstin, len]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(start_loop):
- sub len, src, srcin
- ldr dataq2, [srcin]
- add dst, dstin, len
- str dataq2, [dstin]
-
- .p2align 5
-L(loop):
- str dataq, [dst], 16
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov synd, dend
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- clz len, synd
- lsr len, len, 2
- sub tmp, len, 15
- ldr dataq, [src, tmp]
- str dataq, [dst, tmp]
- IFSTPCPY (add result, dst, len)
- ret
-
-END (STRCPY)
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S b/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S
index f515462e09ae..00e72dce4451 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strcpy-sve.S
@@ -1,11 +1,11 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strcpy.S b/contrib/arm-optimized-routines/string/aarch64/strcpy.S
index 6e9ed424b693..97ae37ea4229 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strcpy.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strcpy.S
@@ -1,311 +1,156 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
-
- To test the page crossing code path more thoroughly, compile with
- -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
#define dstin x0
#define srcin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
#define dst x3
-#define data1 x4
-#define data1w w4
-#define data2 x5
-#define data2w w5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define data1a x13
-#define data2a x14
-#define pos x15
-#define len x16
-#define to_align x17
+#define len x4
+#define synd x4
+#define tmp x5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vend v2
+#define dend d2
+#define dataq2 q1
#ifdef BUILD_STPCPY
-#define STRCPY __stpcpy_aarch64
-#else
-#define STRCPY __strcpy_aarch64
-#endif
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- /* AArch64 systems have a minimum page size of 4k. We can do a quick
- page size check for crossing this boundary on entry and if we
- do not, then we can short-circuit much of the entry code. We
- expect early page-crossing strings to be rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
- predictable, even with random strings.
-
- We don't bother checking for larger page sizes, the cost of setting
- up the correct page size is just not worth the extra gain from
- a small reduction in the cases taking the slow path. Note that
- we only care about whether the first fetch, which may be
- misaligned, crosses a page boundary - after that we move to aligned
- fetches for the remainder of the string. */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
- /* Make everything that isn't Qword aligned look like a page cross. */
-#define MIN_PAGE_P2 4
+# define STRCPY __stpcpy_aarch64
+# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
-#define MIN_PAGE_P2 12
+# define STRCPY __strcpy_aarch64
+# define IFSTPCPY(X,...)
#endif
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
- /* For moderately short strings, the fastest way to do the copy is to
- calculate the length of the string in the same way as strlen, then
- essentially do a memcpy of the result. This avoids the need for
- multiple byte copies and further means that by the time we
- reach the bulk copy loop we know we can always use DWord
- accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
- with the same source string, so branch prediction is likely to
- always be difficult - we mitigate against this by preferring
- conditional select operations over branches whenever this is
- feasible. */
- and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
- mov zeroones, #REP8_01
- and to_align, srcin, #15
- cmp tmp2, #(MIN_PAGE_SIZE - 16)
- neg tmp1, to_align
- /* The first fetch will straddle a (possible) page boundary iff
- srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
- aligned string will never fail the page align check, so will
- always take the fast path. */
- b.gt L(page_cross)
-
-L(page_cross_ok):
- ldp data1, data2, [srcin]
-#ifdef __AARCH64EB__
- /* Because we expect the end to be found within 16 characters
- (profiling shows this is the most common case), it's worth
- swapping the bytes now to save having to recalculate the
- termination syndrome later. We preserve data1 and data2
- so that we can re-use the values later on. */
- rev tmp2, data1
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne L(fp_le8)
- rev tmp4, data2
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne L(fp_le8)
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
+ bic src, srcin, 15
+ ld1 {vdata.16b}, [src]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- bics has_nul2, tmp3, tmp4
- b.eq L(bulk_entry)
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
- /* The string is short (<=16 bytes). We don't know exactly how
- short though, yet. Work out the exact length so that we can
- quickly select the optimal copy strategy. */
-L(fp_gt8):
- rev has_nul2, has_nul2
- clz pos, has_nul2
- mov tmp2, #56
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- sub pos, tmp2, pos
-#ifdef __AARCH64EB__
- lsr data2, data2, pos
-#else
- lsl data2, data2, pos
-#endif
- str data2, [dst, #1]
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
str data1, [dstin]
-#ifdef BUILD_STPCPY
- add dstin, dst, #8
-#endif
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-L(fp_le8):
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- subs tmp2, pos, #24 /* Pos in bits. */
- b.lt L(fp_lt4)
-#ifdef __AARCH64EB__
- mov tmp2, #56
- sub pos, tmp2, pos
- lsr data2, data1, pos
- lsr data1, data1, #32
-#else
- lsr data2, data1, tmp2
-#endif
- /* 4->7 bytes to copy. */
- str data2w, [dst, #-3]
- str data1w, [dstin]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-L(fp_lt4):
- cbz pos, L(fp_lt2)
- /* 2->3 bytes to copy. */
-#ifdef __AARCH64EB__
- lsr data1, data1, #48
-#endif
- strh data1w, [dstin]
- /* Fall-through, one byte (max) to go. */
-L(fp_lt2):
- /* Null-terminated string. Last character must be zero! */
- strb wzr, [dst]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
- ret
-
- .p2align 6
- /* Aligning here ensures that the entry code and main loop all lies
- within one 64-byte cache line. */
-L(bulk_entry):
- sub to_align, to_align, #16
- stp data1, data2, [dstin]
- sub src, srcin, to_align
- sub dst, dstin, to_align
- b L(entry_no_page_cross)
-
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-L(main_loop):
- stp data1, data2, [dst], #16
-L(entry_no_page_cross):
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(main_loop)
- /* Since we know we are copying at least 16 bytes, the fastest way
- to deal with the tail is to determine the location of the
- trailing NUL, then (re)copy the 16 bytes leading up to that. */
- cmp has_nul1, #0
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, ne
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
- csel has_nul1, has_nul1, has_nul2, ne
-#endif
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #72
- add pos, pos, #8
- csel pos, pos, tmp1, ne
- add src, src, pos, lsr #3
- add dst, dst, pos, lsr #3
- ldp data1, data2, [src, #-32]
- stp data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
- sub dstin, dst, #1
-#endif
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
ret
-L(page_cross):
- bic src, srcin, #15
- /* Start by loading two words at [srcin & ~15], then forcing the
- bytes that precede srcin to 0xff. This means they never look
- like termination bytes. */
- ldp data1, data2, [src]
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- tst to_align, #7
- csetm tmp2, ne
-#ifdef __AARCH64EB__
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ .p2align 4
+L(start_loop):
+ sub tmp, srcin, dstin
+ ldr dataq2, [srcin]
+ sub dst, src, tmp
+ str dataq2, [dstin]
+L(loop):
+ str dataq, [dst], 32
+ ldr dataq, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loopend)
+ str dataq, [dst, -16]
+ ldr dataq, [src, 32]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+ add dst, dst, 16
+L(loopend):
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ sub dst, dst, 31
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- cmp to_align, #8
- csinv data1, data1, xzr, lt
- csel data2, data2, data2a, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(page_cross_ok)
- /* We now need to make data1 and data2 look like they've been
- loaded directly from srcin. Do a rotate on the 128-bit value. */
- lsl tmp1, to_align, #3 /* Bytes->bits. */
- neg tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
- lsl data1a, data1, tmp1
- lsr tmp4, data2, tmp2
- lsl data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- rev tmp2, data1
- rev tmp4, data2
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- lsr data1a, data1, tmp1
- lsl tmp4, data2, tmp2
- lsr data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
-#endif
- bic has_nul1, tmp1, tmp2
- cbnz has_nul1, L(fp_le8)
- bic has_nul2, tmp3, tmp4
- b L(fp_gt8)
+ clz len, synd
+ lsr len, len, 2
+ add dst, dst, len
+ ldr dataq, [dst, tmp]
+ str dataq, [dst]
+ IFSTPCPY (add result, dst, 15)
+ ret
END (STRCPY)
-
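Once the new strcpy/stpcpy code above has derived len (the index of the terminating NUL) from the nibble mask, the short-string paths avoid a byte loop entirely: L(less16), L(less8) and L(less4) each copy the whole string, NUL included, with two overlapping fixed-width loads and stores. A hedged C sketch of the 8-to-15 byte case; memcpy stands in for the unaligned ldr/str pair and copy_tail_8_15 is an invented name:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Copy src[0..len] (len = index of the NUL, 8 <= len <= 15) into dst
       using two overlapping 8-byte moves, mirroring the L(less16) path.  */
    static void copy_tail_8_15 (char *dst, const char *src, size_t len)
    {
        uint64_t head, tail;
        memcpy (&head, src, 8);              /* ldr data1, [srcin]      */
        memcpy (&tail, src + len - 7, 8);    /* ldr data2, [srcin, tmp] */
        memcpy (dst, &head, 8);              /* str data1, [dstin]      */
        memcpy (dst + len - 7, &tail, 8);    /* str data2, [dstin, tmp] */
    }

    int main (void)
    {
        char out[16];
        copy_tail_8_15 (out, "0123456789ab", 12);
        puts (out);                          /* prints 0123456789ab */
        return 0;
    }

The two stores overlap, but together they cover every byte from 0 through len, so the terminator is copied without any per-byte branching.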
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
index 7cf41d5c1eac..77235797f7c5 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define result x0
@@ -19,35 +19,26 @@
#define src x1
#define synd x2
#define tmp x3
-#define wtmp w3
#define shift x4
#define data q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
ENTRY (__strlen_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(loop)
@@ -59,19 +50,25 @@ ENTRY (__strlen_aarch64_mte)
.p2align 5
L(loop):
- ldr data, [src, 16]!
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loop_end)
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ sub src, src, 16
+L(loop_end):
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
+ add result, result, 16
clz tmp, synd
add result, result, tmp, lsr 2
ret
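The strlen rewrite above drops the 0xf00f repmask/addp sequence in favour of a single shrn #4: the sixteen cmeq result bytes (each 0x00 or 0xff) are narrowed into a 64-bit value carrying four bits per input byte, and on little-endian the rbit/clz pair counts trailing zeros, which divided by four gives the index of the first NUL. A scalar C model of that mask, assuming GCC/Clang's __builtin_ctzll; nibble_mask and first_nul are invented names:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the NEON nibble mask: four bits per byte of the
       16-byte chunk, set when that byte is zero (little-endian order).  */
    static uint64_t nibble_mask (const unsigned char *chunk)
    {
        uint64_t mask = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == 0)
                mask |= 0xfULL << (4 * i);
        return mask;
    }

    /* Index of the first NUL in the chunk, or 16 if there is none.  */
    static int first_nul (const unsigned char *chunk)
    {
        uint64_t m = nibble_mask (chunk);
        return m ? __builtin_ctzll (m) / 4 : 16;
    }

    int main (void)
    {
        unsigned char chunk[16] = "hello";   /* NUL at index 5, rest zeroed */
        printf ("%d\n", first_nul (chunk));  /* prints 5 */
        return 0;
    }

The same shrn-based mask is used by the new strcpy, strnlen and strrchr-mte code elsewhere in this patch.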
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S
index 2392493f1a3c..12ebbdba5c93 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen-sve.S
@@ -1,11 +1,11 @@
/*
* __strlen_aarch64_sve - compute the length of a string
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strlen.S b/contrib/arm-optimized-routines/string/aarch64/strlen.S
index a1b164a49238..6f6f08f636b2 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strlen.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strlen.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Not MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define len x0
@@ -36,6 +36,7 @@
#define tmp x2
#define tmpw w2
#define synd x3
+#define syndw w3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
@@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64)
add len, len, tmp1, lsr 3
ret
- .p2align 3
/* Look for a NUL byte at offset 16..31 in the string. */
L(bytes16_31):
ldp data1, data2, [srcin, 16]
@@ -138,6 +138,7 @@ L(bytes16_31):
add len, len, tmp1, lsr 3
ret
+ nop
L(loop_entry):
bic src, srcin, 31
@@ -153,18 +154,12 @@ L(loop):
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
- tst synd, 0xffffffff
- b.ne 1f
+ cbnz syndw, 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
-#ifdef __AARCH64EB__
- bic maskv.8h, 0xf0
-#else
- bic maskv.8h, 0x0f, lsl 8
-#endif
- umaxp maskv.16b, maskv.16b, maskv.16b
+ shrn maskv.8b, maskv.8h, 4
fmov synd, maskd
#ifndef __AARCH64EB__
rbit synd, synd
@@ -173,8 +168,6 @@ L(loop):
add len, len, tmp, lsr 2
ret
- .p2align 4
-
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S
deleted file mode 100644
index c9d6fc8a158b..000000000000
--- a/contrib/arm-optimized-routines/string/aarch64/strncmp-mte.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * strncmp - compare two strings
- *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result x0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data2 x4
-#define data2w w4
-#define has_nul x5
-#define diff x6
-#define syndrome x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define zeroones x11
-#define pos x12
-#define mask x13
-#define endloop x14
-#define count mask
-#define offset pos
-#define neg_offset x15
-
-/* Define endian dependent shift operations.
- On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes.
- LS_BK means shifting towards later bytes.
- */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
-
-ENTRY (__strncmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
- cbz limit, L(ret0)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
- and count, src1, #7
- b.ne L(misaligned8)
- cbnz count, L(mutual_align)
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- .p2align 4
-L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-L(start_realigned):
- subs limit, limit, #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, hi /* Last Dword or differences. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp endloop, #0, #0, eq
- b.eq L(loop_aligned)
- /* End of main loop */
-
-L(full_check):
-#ifndef __AARCH64EB__
- orr syndrome, diff, has_nul
- add limit, limit, 8 /* Rewind limit to before last subs. */
-L(syndrome_check):
- /* Limit was reached. Check if the NUL byte or the difference
- is before the limit. */
- rev syndrome, syndrome
- rev data1, data1
- clz pos, syndrome
- rev data2, data2
- lsl data1, data1, pos
- cmp limit, pos, lsr #3
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- csel result, result, xzr, hi
- ret
-#else
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit, #63, L(not_limit)
- add tmp1, limit, 8
- cbz limit, L(not_limit)
-
- lsl limit, tmp1, #3 /* Bits -> bytes. */
- mov mask, #~0
- lsr mask, mask, limit
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
-L(end_quick):
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#endif
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point.
- We also need to adjust the limit calculations, but without
- overflowing if the limit is near ULONG_MAX. */
- bic src1, src1, #7
- bic src2, src2, #7
- ldr data1, [src1], #8
- neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
- ldr data2, [src2], #8
- mov tmp2, #~0
- LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
- /* Adjust the limit and ensure it doesn't overflow. */
- adds limit, limit, count
- csinv limit, limit, xzr, lo
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b L(start_realigned)
-
- .p2align 4
- /* Don't bother with dwords for up to 16 bytes. */
-L(misaligned8):
- cmp limit, #16
- b.hs L(try_misaligned_words)
-
-L(byte_loop):
- /* Perhaps we can do better than this. */
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
- ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(byte_loop)
-L(done):
- sub result, data1, data2
- ret
- /* Align the SRC1 to a dword by doing a bytewise compare and then do
- the dword loop. */
-L(try_misaligned_words):
- cbz count, L(src1_aligned)
-
- neg count, count
- and count, count, #7
- sub limit, limit, count
-
-L(page_end_loop):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne L(done)
- subs count, count, #1
- b.hi L(page_end_loop)
-
- /* The following diagram explains the comparison of misaligned strings.
- The bytes are shown in natural order. For little-endian, it is
- reversed in the registers. The "x" bytes are before the string.
- The "|" separates data that is loaded at one time.
- src1 | a a a a a a a a | b b b c c c c c | . . .
- src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
-
- After shifting in each step, the data looks like this:
- STEP_A STEP_B STEP_C
- data1 a a a a a a a a b b b c c c c c b b b c c c c c
- data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
-
- The bytes with "0" are eliminated from the syndrome via mask.
-
- Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
- time from SRC2. The comparison happens in 3 steps. After each step
- the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
- /* Calculate offset from 8 byte alignment to string start in bits. No
- need to mask offset since shifts are ignoring upper bits. */
- lsl offset, src2, #3
- bic src2, src2, #0xf
- mov mask, -1
- neg neg_offset, offset
- ldr data1, [src1], #8
- ldp tmp1, tmp2, [src2], #16
- LS_BK mask, mask, neg_offset
- and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
- /* Skip the first compare if data in tmp1 is irrelevant. */
- tbnz offset, 6, L(misaligned_mid_loop)
-
-L(loop_misaligned):
- /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
- LS_FW data2, tmp1, offset
- LS_BK tmp1, tmp2, neg_offset
- subs limit, limit, #8
- orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
- sub has_nul, data1, zeroones
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr tmp3, data1, #REP8_7f
- csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
- orr tmp3, endloop, has_nul
- cbnz tmp3, L(full_check)
-
- ldr data1, [src1], #8
-L(misaligned_mid_loop):
- /* STEP_B: Compare first part of data1 to second part of tmp2. */
- LS_FW data2, tmp2, offset
-#ifdef __AARCH64EB__
- /* For big-endian we do a byte reverse to avoid carry-propagation
- problem described above. This way we can reuse the has_nul in the
- next step and also use syndrome value trick at the end. */
- rev tmp3, data1
- #define data1_fixed tmp3
-#else
- #define data1_fixed data1
-#endif
- sub has_nul, data1_fixed, zeroones
- orr tmp3, data1_fixed, #REP8_7f
- eor diff, data2, data1 /* Non-zero if differences found. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- cmp limit, neg_offset, lsr #3
- orr syndrome, diff, has_nul
- bic syndrome, syndrome, mask /* Ignore later bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- /* STEP_C: Compare second part of data1 to first part of tmp1. */
- ldp tmp1, tmp2, [src2], #16
- cmp limit, #8
- LS_BK data2, tmp1, neg_offset
- eor diff, data2, data1 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- and syndrome, syndrome, mask /* Ignore earlier bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- ldr data1, [src1], #8
- sub limit, limit, #8
- b L(loop_misaligned)
-
-#ifdef __AARCH64EB__
-L(syndrome_check):
- clz pos, syndrome
- cmp pos, limit, lsl #3
- b.lo L(end_quick)
-#endif
-
-L(ret0):
- mov result, #0
- ret
-END(__strncmp_aarch64_mte)
-
diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S b/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S
index 234190e245b0..6a9e9f7b6437 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strncmp-sve.S
@@ -1,11 +1,11 @@
/*
* strncmp - compare two strings with limit
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strncmp.S b/contrib/arm-optimized-routines/string/aarch64/strncmp.S
index 738b6539cab6..128a10c52bb1 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strncmp.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strncmp.S
@@ -1,20 +1,20 @@
/*
* strncmp - compare two strings
*
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -35,10 +35,24 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
-#define limit_wd x13
-#define mask x14
-#define endloop x15
+#define mask x13
+#define endloop x14
#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
@@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -63,56 +74,52 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
- subs limit_wd, limit_wd, #1
+ subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, L(not_limit)
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq L(not_limit)
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
+L(full_check):
+#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
-
-#ifndef __AARCH64EB__
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
+ cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
ret
#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -133,10 +140,11 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
+L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -158,22 +166,12 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#endif
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, count
- add tmp3, tmp3, count
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
- add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
.p2align 4
@@ -196,13 +194,11 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
- lsr limit_wd, limit, #3
- cbz count, L(do_misaligned)
+ cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
- lsr limit_wd, limit, #3
L(page_end_loop):
ldrb data1w, [src1], #1
@@ -213,48 +209,100 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)
-L(do_misaligned):
- /* Prepare ourselves for the next page crossing. Unlike the aligned
- loop, we fetch 1 less dword because we risk crossing bounds on
- SRC2. */
- mov count, #8
- subs limit_wd, limit_wd, #1
- b.lo L(done_loop)
-L(loop_misaligned):
- and tmp2, src2, #0xff8
- eor tmp2, tmp2, #0xff8
- cbz tmp2, L(page_end_loop)
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
- subs limit_wd, limit_wd, #1
- b.pl L(loop_misaligned)
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
-L(done_loop):
- /* We found a difference or a NULL before the limit was reached. */
- and limit, limit, #7
- cbz limit, L(not_limit)
- /* Read the last word. */
- sub src1, src1, 8
- sub src2, src2, 8
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
L(ret0):
mov result, #0
ret
-
-END ( __strncmp_aarch64)
+END(__strncmp_aarch64)
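In the new misaligned path above, SRC2 is read in aligned 16-byte pairs (tmp1/tmp2) and the eight bytes that line up with the current aligned SRC1 word are rebuilt from two of those registers with one LS_FW shift, one LS_BK shift and an orr, exactly as the STEP_A/STEP_B/STEP_C diagram describes. A little-endian C model of that combine, assuming the byte offset is between 1 and 7 so both shift counts stay in 1..63 (combine_le is an invented name):

    #include <stdint.h>
    #include <stdio.h>

    /* Rebuild the 8 bytes of SRC2 that correspond to an aligned 8-byte load
       from SRC1.  lo_word and hi_word are consecutive aligned 64-bit loads
       from SRC2; off_bits = 8 * (SRC2 misalignment), in the range 8..56.
       Little-endian, so LS_FW is a right shift and LS_BK a left shift.  */
    static uint64_t combine_le (uint64_t lo_word, uint64_t hi_word, unsigned off_bits)
    {
        return (lo_word >> off_bits) | (hi_word << (64 - off_bits));
    }

    int main (void)
    {
        /* Bytes 0..15 of SRC2 as two little-endian words; misalignment 3.  */
        uint64_t lo = 0x0706050403020100ULL;
        uint64_t hi = 0x0f0e0d0c0b0a0908ULL;
        printf ("%016llx\n", (unsigned long long) combine_le (lo, hi, 3 * 8));
        /* prints 0a09080706050403: bytes 3..10 in string order */
        return 0;
    }

No range check is needed in the assembly because AArch64 register shifts use only the low six bits of the count; plain C must keep the count strictly between 0 and 64 to avoid undefined behaviour.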
diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S b/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S
index 5b9ebf7763bc..6c43dc427da7 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strnlen-sve.S
@@ -1,11 +1,11 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strnlen.S b/contrib/arm-optimized-routines/string/aarch64/strnlen.S
index 48d2495d2082..f2090a7485a5 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strnlen.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strnlen.S
@@ -1,8 +1,8 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define cntin x1
@@ -20,39 +20,30 @@
#define src x2
#define synd x3
#define shift x4
-#define wtmp w4
#define tmp x4
#define cntrem x5
#define qdata q0
#define vdata v0
#define vhas_chr v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
ENTRY (__strnlen_aarch64)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
- mov wtmp, 0xf00f
cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src], 16
- dup vrepmask.8h, wtmp
+ ld1 {vdata.16b}, [src]
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
@@ -64,37 +55,40 @@ L(finish):
csel result, cntin, result, ls
ret
+L(nomatch):
+ mov result, cntin
+ ret
+
L(start_loop):
sub tmp, src, srcin
+ add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.ls L(nomatch)
+ b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
.p2align 5
L(loop32):
- ldr qdata, [src], 16
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src], 16
+ ldr qdata, [src, 16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
- b.ls L(end)
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-
+L(end_2):
+ add src, src, 16
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- sub src, src, 16
- mov synd, vend.d[0]
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
sub result, src, srcin
+ fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
@@ -104,9 +98,5 @@ L(end):
csel result, cntin, result, ls
ret
-L(nomatch):
- mov result, cntin
- ret
-
END (__strnlen_aarch64)
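Two details of the strnlen rewrite above are easy to miss. First, the initial aligned 16-byte load can cover bytes before srcin; because the nibble mask carries four bits per byte and a register shift uses only the low six bits of its count, "lsl shift, srcin, 2" directly gives the number of mask bits to discard. Second, the final csel clamps whatever length the scan found to cntin. A hedged C model of both steps, again assuming __builtin_ctzll; discard_prefix and clamp_to_limit are invented names:

    #include <stdint.h>
    #include <stdio.h>

    /* Drop the mask nibbles for the bytes that precede srcin in the first,
       aligned 16-byte chunk: four mask bits per byte, and only the low six
       bits of the count matter, as with "lsr synd, synd, shift" above.  */
    static uint64_t discard_prefix (uint64_t mask, uintptr_t srcin)
    {
        return mask >> ((srcin & 15) * 4);
    }

    /* Clamp the length found by the scan to the caller's limit, as the
       final "csel result, cntin, result, ls" does.  */
    static size_t clamp_to_limit (size_t found, size_t cntin)
    {
        return found < cntin ? found : cntin;
    }

    int main (void)
    {
        /* The aligned chunk starts 3 bytes before srcin, so a NUL seen at
           chunk byte 6 is really only 3 bytes past srcin.  */
        uint64_t m = discard_prefix (0xfULL << (4 * 6), 3);
        size_t len = (size_t) (__builtin_ctzll (m) / 4);
        printf ("%zu\n", clamp_to_limit (len, 2));   /* prints 2: limit wins */
        return 0;
    }

The same prefix-discard shift appears in the strcpy and strlen entry paths earlier in this patch.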
diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S
index 1e4fb1a68f7e..bb61ab9ad4e7 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S
@@ -1,8 +1,8 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -19,7 +19,6 @@
#define src x2
#define tmp x3
-#define wtmp w3
#define synd x3
#define shift x4
#define src_match x4
@@ -31,7 +30,6 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
-#define vrepmask2 v5
#define vend v5
#define dend d5
@@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
- mov wtmp, 0x3003
- dup vrepmask.8h, wtmp
- tst srcin, 15
- beq L(loop1)
-
- ld1 {vdata.16b}, [src], 16
+ movi vrepmask.16b, 0x33
+ ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp, 0xf00f
- dup vrepmask2.8h, wtmp
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4
lsl shift, srcin, 2
fmov synd, dend
lsr synd, synd, shift
lsl synd, synd, shift
ands nul_match, synd, 0xcccccccccccccccc
bne L(tail)
- cbnz synd, L(loop2)
+ cbnz synd, L(loop2_start)
- .p2align 5
+ .p2align 4
L(loop1):
- ld1 {vdata.16b}, [src], 16
+ ldr q1, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loop1_end)
+ ldr q1, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop1)
-
+ sub src, src, 16
+L(loop1_end):
+ add src, src, 16
cmeq vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ rbit synd, synd
+#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- bic vhas_nul.8h, 0x0f, lsl 8
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
+#endif
ands nul_match, synd, 0xcccccccccccccccc
- beq L(loop2)
-
+ beq L(loop2_start)
L(tail):
sub nul_match, nul_match, 1
and chr_match, synd, 0x3333333333333333
ands chr_match, chr_match, nul_match
- sub result, src, 1
+ add result, src, 15
clz tmp, chr_match
sub result, result, tmp, lsr 2
csel result, result, xzr, ne
ret
.p2align 4
+ nop
+ nop
+L(loop2_start):
+ add src, src, 16
+ bic vrepmask.8h, 0xf0
+
L(loop2):
cmp synd, 0
csel src_match, src, src_match, ne
diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S b/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S
index d36d69af37fd..825a7384cfc1 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strrchr-sve.S
@@ -1,11 +1,11 @@
/*
* strrchr - find the last of a character in a string
*
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/contrib/arm-optimized-routines/string/aarch64/strrchr.S b/contrib/arm-optimized-routines/string/aarch64/strrchr.S
index 56185ff534e3..bf9cb297b6cb 100644
--- a/contrib/arm-optimized-routines/string/aarch64/strrchr.S
+++ b/contrib/arm-optimized-routines/string/aarch64/strrchr.S
@@ -1,8 +1,8 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/contrib/arm-optimized-routines/string/arm/asmdefs.h b/contrib/arm-optimized-routines/string/arm/asmdefs.h
new file mode 100644
index 000000000000..e31188804716
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/arm/asmdefs.h
@@ -0,0 +1,477 @@
+/*
+ * Macros for asm code. Arm version.
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Check whether leaf function PAC signing has been requested in the
+ -mbranch-protection compile-time option. */
+#define LEAF_PROTECT_BIT 2
+
+#ifdef __ARM_FEATURE_PAC_DEFAULT
+# define HAVE_PAC_LEAF \
+ ((__ARM_FEATURE_PAC_DEFAULT & (1 << LEAF_PROTECT_BIT)) && 1)
+#else
+# define HAVE_PAC_LEAF 0
+#endif
+
+/* Provide default parameters for PAC-code handling in leaf-functions. */
+#if HAVE_PAC_LEAF
+# ifndef PAC_LEAF_PUSH_IP
+# define PAC_LEAF_PUSH_IP 1
+# endif
+#else /* !HAVE_PAC_LEAF */
+# undef PAC_LEAF_PUSH_IP
+# define PAC_LEAF_PUSH_IP 0
+#endif /* HAVE_PAC_LEAF */
+
+#define STACK_ALIGN_ENFORCE 0
+
+/******************************************************************************
+* Implementation of the prologue and epilogue assembler macros and their
+* associated helper functions.
+*
+* These functions add support for the following:
+*
+* - M-profile branch target identification (BTI) landing-pads when compiled
+* with `-mbranch-protection=bti'.
+* - PAC-signing and verification instructions, depending on hardware support
+* and whether the PAC-signing of leaf functions has been requested via the
+* `-mbranch-protection=pac-ret+leaf' compiler argument.
+* - 8-byte stack alignment preservation at function entry, defaulting to the
+* value of STACK_ALIGN_ENFORCE.
+*
+* Notes:
+* - Prologue stack alignment is implemented by detecting a push with an odd
+* number of registers and prepending a dummy register to the list.
+* - If alignment is attempted on a list containing r0, compilation will result
+* in an error.
+* - If alignment is attempted in a list containing r1, r0 will be prepended to
+* the register list and r0 will be restored prior to function return. For
+* functions with non-void return types, this will result in the corruption of
+* the result register.
+* - Stack alignment is enforced via the following helper macro call-chain:
+*
+* {prologue|epilogue} -> _align8 -> _preprocess_reglist ->
+* _preprocess_reglist1 -> {_prologue|_epilogue}
+*
+* - Debug CFI directives are automatically added to prologues and epilogues,
+* assisted by `cfisavelist' and `cfirestorelist', respectively.
+*
+* Arguments:
+* prologue
+* --------
+* - first - If `last' specified, this serves as start of general-purpose
+* register (GPR) range to push onto stack, otherwise represents
+* single GPR to push onto stack. If omitted, no GPRs pushed
+* onto stack at prologue.
+* - last - If given, specifies inclusive upper-bound of GPR range.
+* - push_ip - Determines whether IP register is to be pushed to stack at
+* prologue. When PAC-signing is requested, this holds
+* the pac-key. Either 1 or 0 to push or not push, respectively.
+* Default behavior: Set to value of PAC_LEAF_PUSH_IP macro.
+* - push_lr - Determines whether to push lr to the stack on function entry.
+* Either 1 or 0 to push or not push, respectively.
+* - align8 - Whether to enforce alignment. Either 1 or 0, with 1 requesting
+* alignment.
+*
+* epilogue
+* --------
+* The epilogue should be called passing the same arguments as those passed to
+* the prologue to ensure the stack is not corrupted on function return.
+*
+* Usage examples:
+*
+* prologue push_ip=1 -> push {ip}
+* epilogue push_ip=1, align8=1 -> pop {r2, ip}
+* prologue push_ip=1, push_lr=1 -> push {ip, lr}
+* epilogue 1 -> pop {r1}
+* prologue 1, align8=1 -> push {r0, r1}
+* epilogue 1, push_ip=1 -> pop {r1, ip}
+* prologue 1, 4 -> push {r1-r4}
+* epilogue 1, 4 push_ip=1 -> pop {r1-r4, ip}
+*
+******************************************************************************/
+
+/* Emit .cfi_restore directives for a consecutive sequence of registers. */
+ .macro cfirestorelist first, last
+ .cfi_restore \last
+ .if \last-\first
+ cfirestorelist \first, \last-1
+ .endif
+ .endm
+
+/* Emit .cfi_offset directives for a consecutive sequence of registers. */
+ .macro cfisavelist first, last, index=1
+ .cfi_offset \last, -4*(\index)
+ .if \last-\first
+ cfisavelist \first, \last-1, \index+1
+ .endif
+ .endm
+
+.macro _prologue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0
+ .if \push_ip & 1 != \push_ip
+ .error "push_ip may be either 0 or 1"
+ .endif
+ .if \push_lr & 1 != \push_lr
+ .error "push_lr may be either 0 or 1"
+ .endif
+ .if \first != -1
+ .if \last == -1
+ /* Upper-bound not provided: Set upper = lower. */
+ _prologue \first, \first, \push_ip, \push_lr
+ .exitm
+ .endif
+ .endif
+#if HAVE_PAC_LEAF
+# if __ARM_FEATURE_BTI_DEFAULT
+ pacbti ip, lr, sp
+# else
+ pac ip, lr, sp
+# endif /* __ARM_FEATURE_BTI_DEFAULT */
+ .cfi_register 143, 12
+#else
+# if __ARM_FEATURE_BTI_DEFAULT
+ bti
+# endif /* __ARM_FEATURE_BTI_DEFAULT */
+#endif /* HAVE_PAC_LEAF */
+ .if \first != -1
+ .if \last != \first
+ .if \last >= 13
+ .error "SP cannot be in the save list"
+ .endif
+ .if \push_ip
+ .if \push_lr
+ /* Case 1: push register range, ip and lr registers. */
+ push {r\first-r\last, ip, lr}
+ .cfi_adjust_cfa_offset ((\last-\first)+3)*4
+ .cfi_offset 14, -4
+ .cfi_offset 143, -8
+ cfisavelist \first, \last, 3
+ .else // !\push_lr
+ /* Case 2: push register range and ip register. */
+ push {r\first-r\last, ip}
+ .cfi_adjust_cfa_offset ((\last-\first)+2)*4
+ .cfi_offset 143, -4
+ cfisavelist \first, \last, 2
+ .endif
+ .else // !\push_ip
+ .if \push_lr
+ /* Case 3: push register range and lr register. */
+ push {r\first-r\last, lr}
+ .cfi_adjust_cfa_offset ((\last-\first)+2)*4
+ .cfi_offset 14, -4
+ cfisavelist \first, \last, 2
+ .else // !\push_lr
+ /* Case 4: push register range. */
+ push {r\first-r\last}
+ .cfi_adjust_cfa_offset ((\last-\first)+1)*4
+ cfisavelist \first, \last, 1
+ .endif
+ .endif
+ .else // \last == \first
+ .if \push_ip
+ .if \push_lr
+ /* Case 5: push single GP register plus ip and lr registers. */
+ push {r\first, ip, lr}
+ .cfi_adjust_cfa_offset 12
+ .cfi_offset 14, -4
+ .cfi_offset 143, -8
+ cfisavelist \first, \first, 3
+ .else // !\push_lr
+ /* Case 6: push single GP register plus ip register. */
+ push {r\first, ip}
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset 143, -4
+ cfisavelist \first, \first, 2
+ .endif
+ .else // !\push_ip
+ .if \push_lr
+ /* Case 7: push single GP register plus lr register. */
+ push {r\first, lr}
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset 14, -4
+ cfisavelist \first, \first, 2
+ .else // !\push_lr
+ /* Case 8: push single GP register. */
+ push {r\first}
+ .cfi_adjust_cfa_offset 4
+ cfisavelist \first, \first, 1
+ .endif
+ .endif
+ .endif
+ .else // \first == -1
+ .if \push_ip
+ .if \push_lr
+ /* Case 9: push ip and lr registers. */
+ push {ip, lr}
+ .cfi_adjust_cfa_offset 8
+ .cfi_offset 14, -4
+ .cfi_offset 143, -8
+ .else // !\push_lr
+ /* Case 10: push ip register. */
+ push {ip}
+ .cfi_adjust_cfa_offset 4
+ .cfi_offset 143, -4
+ .endif
+ .else // !\push_ip
+ .if \push_lr
+ /* Case 11: push lr register. */
+ push {lr}
+ .cfi_adjust_cfa_offset 4
+ .cfi_offset 14, -4
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro _epilogue first=-1, last=-1, push_ip=PAC_LEAF_PUSH_IP, push_lr=0
+ .if \push_ip & 1 != \push_ip
+ .error "push_ip may be either 0 or 1"
+ .endif
+ .if \push_lr & 1 != \push_lr
+ .error "push_lr may be either 0 or 1"
+ .endif
+ .if \first != -1
+ .if \last == -1
+ /* Upper-bound not provided: Set upper = lower. */
+ _epilogue \first, \first, \push_ip, \push_lr
+ .exitm
+ .endif
+ .if \last != \first
+ .if \last >= 13
+ .error "SP cannot be in the save list"
+ .endif
+ .if \push_ip
+ .if \push_lr
+ /* Case 1: pop register range, ip and lr registers. */
+ pop {r\first-r\last, ip, lr}
+ .cfi_restore 14
+ .cfi_register 143, 12
+ cfirestorelist \first, \last
+ .else // !\push_lr
+ /* Case 2: pop register range and ip register. */
+ pop {r\first-r\last, ip}
+ .cfi_register 143, 12
+ cfirestorelist \first, \last
+ .endif
+ .else // !\push_ip
+ .if \push_lr
+ /* Case 3: pop register range and lr register. */
+ pop {r\first-r\last, lr}
+ .cfi_restore 14
+ cfirestorelist \first, \last
+ .else // !\push_lr
+ /* Case 4: pop register range. */
+ pop {r\first-r\last}
+ cfirestorelist \first, \last
+ .endif
+ .endif
+ .else // \last == \first
+ .if \push_ip
+ .if \push_lr
+ /* Case 5: pop single GP register plus ip and lr registers. */
+ pop {r\first, ip, lr}
+ .cfi_restore 14
+ .cfi_register 143, 12
+ cfirestorelist \first, \first
+ .else // !\push_lr
+ /* Case 6: pop single GP register plus ip register. */
+ pop {r\first, ip}
+ .cfi_register 143, 12
+ cfirestorelist \first, \first
+ .endif
+ .else // !\push_ip
+ .if \push_lr
+ /* Case 7: pop single GP register plus lr register. */
+ pop {r\first, lr}
+ .cfi_restore 14
+ cfirestorelist \first, \first
+ .else // !\push_lr
+ /* Case 8: pop single GP register. */
+ pop {r\first}
+ cfirestorelist \first, \first
+ .endif
+ .endif
+ .endif
+ .else // \first == -1
+ .if \push_ip
+ .if \push_lr
+ /* Case 9: pop ip and lr registers. */
+ pop {ip, lr}
+ .cfi_restore 14
+ .cfi_register 143, 12
+ .else // !\push_lr
+ /* Case 10: pop ip register. */
+ pop {ip}
+ .cfi_register 143, 12
+ .endif
+ .else // !\push_ip
+ .if \push_lr
+ /* Case 11: pop lr register. */
+ pop {lr}
+ .cfi_restore 14
+ .endif
+ .endif
+ .endif
+#if HAVE_PAC_LEAF
+ aut ip, lr, sp
+#endif /* HAVE_PAC_LEAF */
+ bx lr
+.endm
+
+/* Clean up expressions in 'last'. */
+.macro _preprocess_reglist1 first:req, last:req, push_ip:req, push_lr:req, reglist_op:req
+ .if \last == 0
+ \reglist_op \first, 0, \push_ip, \push_lr
+ .elseif \last == 1
+ \reglist_op \first, 1, \push_ip, \push_lr
+ .elseif \last == 2
+ \reglist_op \first, 2, \push_ip, \push_lr
+ .elseif \last == 3
+ \reglist_op \first, 3, \push_ip, \push_lr
+ .elseif \last == 4
+ \reglist_op \first, 4, \push_ip, \push_lr
+ .elseif \last == 5
+ \reglist_op \first, 5, \push_ip, \push_lr
+ .elseif \last == 6
+ \reglist_op \first, 6, \push_ip, \push_lr
+ .elseif \last == 7
+ \reglist_op \first, 7, \push_ip, \push_lr
+ .elseif \last == 8
+ \reglist_op \first, 8, \push_ip, \push_lr
+ .elseif \last == 9
+ \reglist_op \first, 9, \push_ip, \push_lr
+ .elseif \last == 10
+ \reglist_op \first, 10, \push_ip, \push_lr
+ .elseif \last == 11
+ \reglist_op \first, 11, \push_ip, \push_lr
+ .else
+ .error "last (\last) out of range"
+ .endif
+.endm
+
+/* Clean up expressions in 'first'. */
+.macro _preprocess_reglist first:req, last, push_ip=0, push_lr=0, reglist_op:req
+ .ifb \last
+ _preprocess_reglist \first, \first, \push_ip, \push_lr, \reglist_op
+ .else
+ .if \first > \last
+ .error "last (\last) must be at least as great as first (\first)"
+ .endif
+ .if \first == 0
+ _preprocess_reglist1 0, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 1
+ _preprocess_reglist1 1, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 2
+ _preprocess_reglist1 2, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 3
+ _preprocess_reglist1 3, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 4
+ _preprocess_reglist1 4, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 5
+ _preprocess_reglist1 5, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 6
+ _preprocess_reglist1 6, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 7
+ _preprocess_reglist1 7, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 8
+ _preprocess_reglist1 8, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 9
+ _preprocess_reglist1 9, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 10
+ _preprocess_reglist1 10, \last, \push_ip, \push_lr, \reglist_op
+ .elseif \first == 11
+ _preprocess_reglist1 11, \last, \push_ip, \push_lr, \reglist_op
+ .else
+ .error "first (\first) out of range"
+ .endif
+ .endif
+.endm
+
+.macro _align8 first, last, push_ip=0, push_lr=0, reglist_op=_prologue
+ .ifb \first
+ .ifnb \last
+ .error "can't have last (\last) without specifying first"
+ .else // \last blank
+ .if ((\push_ip + \push_lr) % 2) == 0
+ \reglist_op first=-1, last=-1, push_ip=\push_ip, push_lr=\push_lr
+ .exitm
+ .else // ((\push_ip + \push_lr) % 2) odd
+ _align8 2, 2, \push_ip, \push_lr, \reglist_op
+ .exitm
+ .endif // ((\push_ip + \push_lr) % 2) == 0
+ .endif // .ifnb \last
+ .endif // .ifb \first
+
+ .ifb \last
+ _align8 \first, \first, \push_ip, \push_lr, \reglist_op
+ .else
+ .if \push_ip & 1 <> \push_ip
+ .error "push_ip may be 0 or 1"
+ .endif
+ .if \push_lr & 1 <> \push_lr
+ .error "push_lr may be 0 or 1"
+ .endif
+ .ifeq (\last - \first + \push_ip + \push_lr) % 2
+ .if \first == 0
+ .error "Alignment required and first register is r0"
+ .exitm
+ .endif
+ _preprocess_reglist \first-1, \last, \push_ip, \push_lr, \reglist_op
+ .else
+ _preprocess_reglist \first \last, \push_ip, \push_lr, \reglist_op
+ .endif
+ .endif
+.endm
+
+.macro prologue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE
+ .if \align8
+ _align8 \first, \last, \push_ip, \push_lr, _prologue
+ .else
+ _prologue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr
+ .endif
+.endm
+
+.macro epilogue first, last, push_ip=PAC_LEAF_PUSH_IP, push_lr=0, align8=STACK_ALIGN_ENFORCE
+ .if \align8
+ _align8 \first, \last, \push_ip, \push_lr, reglist_op=_epilogue
+ .else
+ _epilogue first=\first, last=\last, push_ip=\push_ip, push_lr=\push_lr
+ .endif
+.endm
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .fnstart; \
+ .cfi_startproc;
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#if defined (IS_LEAF)
+# define END_UNWIND .cantunwind;
+#else
+# define END_UNWIND
+#endif
+
+#define END(name) \
+ .cfi_endproc; \
+ END_UNWIND \
+ .fnend; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#endif
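
For orientation, here is a minimal usage sketch of the macros this header introduces. It is illustrative only and not part of the patch: the routine name __example_arm is made up, and the r4/r5 register choice simply mirrors the strlen-armv6t2.S conversion later in this diff. Built with -mbranch-protection=pac-ret+leaf+bti, the prologue line below expands to pacbti ip, lr, sp followed by push {r4-r5, ip}, and the epilogue to pop {r4-r5, ip}, aut ip, lr, sp and bx lr; without branch protection it reduces to a plain push/pop and bx lr.

#define IS_LEAF			/* leaf routine: END () then appends .cantunwind */
#include "asmdefs.h"

	.syntax unified
	.thumb

ENTRY (__example_arm)
	prologue 4 5 push_ip=HAVE_PAC_LEAF
	/* ... function body clobbering only r4 and r5 ... */
	epilogue 4 5 push_ip=HAVE_PAC_LEAF
END (__example_arm)
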
diff --git a/contrib/arm-optimized-routines/string/arm/check-arch.S b/contrib/arm-optimized-routines/string/arm/check-arch.S
index 1cff9345e343..95516710fb85 100644
--- a/contrib/arm-optimized-routines/string/arm/check-arch.S
+++ b/contrib/arm-optimized-routines/string/arm/check-arch.S
@@ -1,10 +1,13 @@
/*
* check ARCH setting.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if !__arm__
# error ARCH setting does not match the compiler.
#endif
+
+/* For attributes that may affect ABI. */
+#include "asmdefs.h"
diff --git a/contrib/arm-optimized-routines/string/arm/memchr.S b/contrib/arm-optimized-routines/string/arm/memchr.S
index 3f1ac4df136f..823d6013eb35 100644
--- a/contrib/arm-optimized-routines/string/arm/memchr.S
+++ b/contrib/arm-optimized-routines/string/arm/memchr.S
@@ -1,8 +1,8 @@
/*
* memchr - scan memory for a character
*
- * Copyright (c) 2010-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2010-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/*
@@ -23,7 +23,11 @@
@ Removed unneeded cbz from align loop
.syntax unified
+#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M'
+ /* keep config inherited from -march= */
+#else
.arch armv7-a
+#endif
@ this lets us check a flag in a 00/ff byte easily in either endianness
#ifdef __ARMEB__
@@ -32,6 +36,8 @@
#define CHARTSTMASK(c) 1<<(c*8)
#endif
.thumb
+#include "asmdefs.h"
+
@ ---------------------------------------------------------------------------
.thumb_func
@@ -39,11 +45,14 @@
.p2align 4,,15
.global __memchr_arm
.type __memchr_arm,%function
+ .fnstart
+ .cfi_startproc
__memchr_arm:
@ r0 = start of memory to scan
@ r1 = character to look for
@ r2 = length
@ returns r0 = pointer to character or NULL if not found
+ prologue
and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
cmp r2,#16 @ If it's short don't bother with anything clever
@@ -64,6 +73,11 @@ __memchr_arm:
10:
@ At this point, we are aligned, we know we have at least 8 bytes to work with
push {r4,r5,r6,r7}
+ .cfi_adjust_cfa_offset 16
+ .cfi_rel_offset 4, 0
+ .cfi_rel_offset 5, 4
+ .cfi_rel_offset 6, 8
+ .cfi_rel_offset 7, 12
orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
orr r1, r1, r1, lsl #16
bic r4, r2, #7 @ Number of double words to work with
@@ -83,6 +97,11 @@ __memchr_arm:
bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
pop {r4,r5,r6,r7}
+ .cfi_restore 7
+ .cfi_restore 6
+ .cfi_restore 5
+ .cfi_restore 4
+ .cfi_adjust_cfa_offset -16
and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
@@ -97,16 +116,25 @@ __memchr_arm:
bne 21b @ on r2 flags
40:
+ .cfi_remember_state
movs r0,#0 @ not found
- bx lr
+ epilogue
50:
+ .cfi_restore_state
+ .cfi_remember_state
subs r0,r0,#1 @ found
- bx lr
+ epilogue
60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
@ r0 points to the start of the double word after the one that was tested
@ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ .cfi_restore_state @ Standard post-prologue state
+ .cfi_adjust_cfa_offset 16
+ .cfi_rel_offset 4, 0
+ .cfi_rel_offset 5, 4
+ .cfi_rel_offset 6, 8
+ .cfi_rel_offset 7, 12
cmp r5, #0
itte eq
moveq r5, r6 @ the end is in the 2nd word
@@ -126,7 +154,15 @@ __memchr_arm:
61:
pop {r4,r5,r6,r7}
+ .cfi_restore 7
+ .cfi_restore 6
+ .cfi_restore 5
+ .cfi_restore 4
+ .cfi_adjust_cfa_offset -16
subs r0,r0,#1
- bx lr
+ epilogue
+ .cfi_endproc
+ .cantunwind
+ .fnend
.size __memchr_arm, . - __memchr_arm
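
An aside on the unwind annotations introduced above (explanatory only, not part of the patch): .cfi_remember_state and .cfi_restore_state are standard gas directives that snapshot and reinstate the current call-frame information. They matter here because the labels are reached on paths with different frame layouts: label 50 shares the post-prologue frame of label 40, while label 60 is branched to from the fast-path loop before the pop {r4,r5,r6,r7}, so after restoring the snapshot its path re-asserts those save slots. The general shape is:

	.cfi_remember_state	@ snapshot the unwind state at this point
	movs	r0, #0
	epilogue		@ this return path pops and changes that state
50:
	.cfi_restore_state	@ code reached here still has the old frame;
				@ reinstate the snapshot so unwinding stays correct
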
diff --git a/contrib/arm-optimized-routines/string/arm/memcpy.S b/contrib/arm-optimized-routines/string/arm/memcpy.S
index 86e64938edb1..2423cfd69061 100644
--- a/contrib/arm-optimized-routines/string/arm/memcpy.S
+++ b/contrib/arm-optimized-routines/string/arm/memcpy.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/*
@@ -17,7 +17,7 @@
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
.syntax unified
/* This implementation requires ARM state. */
diff --git a/contrib/arm-optimized-routines/string/arm/memset.S b/contrib/arm-optimized-routines/string/arm/memset.S
index 11e927368fd1..487b9d6a8f6c 100644
--- a/contrib/arm-optimized-routines/string/arm/memset.S
+++ b/contrib/arm-optimized-routines/string/arm/memset.S
@@ -2,7 +2,7 @@
* memset - fill memory with a constant
*
* Copyright (c) 2010-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/*
diff --git a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S
index b75d4143db57..4d55306810ad 100644
--- a/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S
+++ b/contrib/arm-optimized-routines/string/arm/strcmp-armv6m.S
@@ -1,10 +1,12 @@
/*
* strcmp for ARMv6-M (optimized for performance, not size)
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "asmdefs.h"
+
#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
.thumb_func
diff --git a/contrib/arm-optimized-routines/string/arm/strcmp.S b/contrib/arm-optimized-routines/string/arm/strcmp.S
index 51443e343058..74b3d235fb18 100644
--- a/contrib/arm-optimized-routines/string/arm/strcmp.S
+++ b/contrib/arm-optimized-routines/string/arm/strcmp.S
@@ -1,8 +1,8 @@
/*
* strcmp for ARMv7
*
- * Copyright (c) 2012-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
@@ -12,7 +12,7 @@
is sufficiently aligned. Use saturating arithmetic to optimize
the compares. */
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Build Options:
STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
@@ -26,6 +26,11 @@
#define STRCMP_NO_PRECHECK 0
+/* Ensure the .cantunwind directive is prepended to .fnend.
+ Leaf functions cannot throw exceptions - EHABI only supports
+ synchronous exceptions. */
+#define IS_LEAF
+
/* This version uses Thumb-2 code. */
.thumb
.syntax unified
@@ -98,8 +103,9 @@
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
+ .cfi_adjust_cfa_offset -16
sub result, result, r1, lsr #24
- bx lr
+ epilogue push_ip=HAVE_PAC_LEAF
#else
/* To use the big-endian trick we'd have to reverse all three words.
that's slower than this approach. */
@@ -119,21 +125,15 @@
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
+ .cfi_adjust_cfa_offset -16
sub result, result, r1
- bx lr
+ epilogue push_ip=HAVE_PAC_LEAF
#endif
.endm
- .p2align 5
-L(strcmp_start_addr):
-#if STRCMP_NO_PRECHECK == 0
-L(fastpath_exit):
- sub r0, r2, r3
- bx lr
- nop
-#endif
-ENTRY_ALIGN (__strcmp_arm, 0)
+ENTRY(__strcmp_arm)
+ prologue push_ip=HAVE_PAC_LEAF
#if STRCMP_NO_PRECHECK == 0
ldrb r2, [src1]
ldrb r3, [src2]
@@ -143,13 +143,13 @@ ENTRY_ALIGN (__strcmp_arm, 0)
bne L(fastpath_exit)
#endif
strd r4, r5, [sp, #-16]!
- .cfi_def_cfa_offset 16
- .cfi_offset 4, -16
- .cfi_offset 5, -12
+ .cfi_adjust_cfa_offset 16
+ .cfi_rel_offset 4, 0
+ .cfi_rel_offset 5, 4
orr tmp1, src1, src2
strd r6, r7, [sp, #8]
- .cfi_offset 6, -8
- .cfi_offset 7, -4
+ .cfi_rel_offset 6, 8
+ .cfi_rel_offset 7, 12
mvn const_m1, #0
lsl r2, tmp1, #29
cbz r2, L(loop_aligned8)
@@ -318,10 +318,19 @@ L(misaligned_exit):
mov result, tmp1
ldr r4, [sp], #16
.cfi_restore 4
- bx lr
+ .cfi_adjust_cfa_offset -16
+ epilogue push_ip=HAVE_PAC_LEAF
#if STRCMP_NO_PRECHECK == 0
+L(fastpath_exit):
+ .cfi_restore_state
+ .cfi_remember_state
+ sub r0, r2, r3
+ epilogue push_ip=HAVE_PAC_LEAF
+
L(aligned_m1):
+ .cfi_restore_state
+ .cfi_remember_state
add src2, src2, #4
#endif
L(src1_aligned):
@@ -368,9 +377,9 @@ L(overlap3):
/* R6/7 Not used in this sequence. */
.cfi_restore 6
.cfi_restore 7
+ .cfi_adjust_cfa_offset -16
neg result, result
- bx lr
-
+ epilogue push_ip=HAVE_PAC_LEAF
6:
.cfi_restore_state
S2LO data1, data1, #24
@@ -445,7 +454,8 @@ L(strcmp_done_equal):
/* R6/7 not used in this sequence. */
.cfi_restore 6
.cfi_restore 7
- bx lr
+ .cfi_adjust_cfa_offset -16
+ epilogue push_ip=HAVE_PAC_LEAF
L(strcmp_tail):
.cfi_restore_state
@@ -467,8 +477,9 @@ L(strcmp_tail):
/* R6/7 not used in this sequence. */
.cfi_restore 6
.cfi_restore 7
+ .cfi_adjust_cfa_offset -16
sub result, result, data2, lsr #24
- bx lr
+ epilogue push_ip=HAVE_PAC_LEAF
END (__strcmp_arm)
diff --git a/contrib/arm-optimized-routines/string/arm/strcpy.c b/contrib/arm-optimized-routines/string/arm/strcpy.c
index 02cf94ff4be0..b5728a2534f0 100644
--- a/contrib/arm-optimized-routines/string/arm/strcpy.c
+++ b/contrib/arm-optimized-routines/string/arm/strcpy.c
@@ -2,7 +2,7 @@
* strcpy
*
* Copyright (c) 2008-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if defined (__thumb2__) && !defined (__thumb__)
diff --git a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S
index 5ad30c941586..5eb8671bdc8b 100644
--- a/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S
+++ b/contrib/arm-optimized-routines/string/arm/strlen-armv6t2.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string
*
- * Copyright (c) 2010-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2010-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
@@ -13,7 +13,7 @@
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#ifdef __ARMEB__
#define S2LO lsl
@@ -23,6 +23,11 @@
#define S2HI lsl
#endif
+/* Ensure the .cantunwind directive is prepended to .fnend.
+ Leaf functions cannot throw exceptions - EHABI only supports
+ synchronous exceptions. */
+#define IS_LEAF
+
/* This code requires Thumb. */
.thumb
.syntax unified
@@ -41,8 +46,8 @@
#define tmp2 r5
ENTRY (__strlen_armv6t2)
+ prologue 4 5 push_ip=HAVE_PAC_LEAF
pld [srcin, #0]
- strd r4, r5, [sp, #-8]!
bic src, srcin, #7
mvn const_m1, #0
ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
@@ -92,6 +97,7 @@ L(start_realigned):
beq L(loop_aligned)
L(null_found):
+ .cfi_remember_state
cmp data1a, #0
itt eq
addeq result, result, #4
@@ -100,11 +106,11 @@ L(null_found):
rev data1a, data1a
#endif
clz data1a, data1a
- ldrd r4, r5, [sp], #8
add result, result, data1a, lsr #3 /* Bits -> Bytes. */
- bx lr
+ epilogue 4 5 push_ip=HAVE_PAC_LEAF
L(misaligned8):
+ .cfi_restore_state
ldrd data1a, data1b, [src]
and tmp2, tmp1, #3
rsb result, tmp1, #0
diff --git a/contrib/arm-optimized-routines/string/bench/memcpy.c b/contrib/arm-optimized-routines/string/bench/memcpy.c
index d5d4ea7e0309..b628f9b60d96 100644
--- a/contrib/arm-optimized-routines/string/bench/memcpy.c
+++ b/contrib/arm-optimized-routines/string/bench/memcpy.c
@@ -1,8 +1,8 @@
/*
* memcpy benchmark.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define _GNU_SOURCE
@@ -13,14 +13,15 @@
#include "stringlib.h"
#include "benchlib.h"
-#define ITERS 5000
+#define ITERS 5000
#define ITERS2 20000000
-#define ITERS3 500000
-#define MAX_COPIES 8192
-#define SIZE (256*1024)
+#define ITERS3 200000
+#define NUM_TESTS 16384
+#define MIN_SIZE 32768
+#define MAX_SIZE (1024 * 1024)
-static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
-static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
+static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
#define F(x) {#x, x},
@@ -30,15 +31,21 @@ static const struct fun
void *(*fun)(void *, const void *, size_t);
} funtab[] =
{
- F(memcpy)
#if __aarch64__
F(__memcpy_aarch64)
# if __ARM_NEON
F(__memcpy_aarch64_simd)
# endif
+# if __ARM_FEATURE_SVE
+ F(__memcpy_aarch64_sve)
+# endif
+# if WANT_MOPS
+ F(__memcpy_aarch64_mops)
+# endif
#elif __arm__
F(__memcpy_arm)
#endif
+ F(memcpy)
#undef F
{0, 0}
};
@@ -109,7 +116,7 @@ typedef struct
uint64_t len : 16;
} copy_t;
-static copy_t copy[MAX_COPIES];
+static copy_t test_arr[NUM_TESTS];
typedef char *(*proto_t) (char *, const char *, size_t);
@@ -140,14 +147,14 @@ init_copies (size_t max_size)
size_t total = 0;
/* Create a random set of copies with the given size and alignment
distributions. */
- for (int i = 0; i < MAX_COPIES; i++)
+ for (int i = 0; i < NUM_TESTS; i++)
{
- copy[i].dst = (rand32 (0) & (max_size - 1));
- copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
- copy[i].src = (rand32 (0) & (max_size - 1));
- copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
- copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
- total += copy[i].len;
+ test_arr[i].dst = (rand32 (0) & (max_size - 1));
+ test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+ test_arr[i].src = (rand32 (0) & (max_size - 1));
+ test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+ test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
+ total += test_arr[i].len;
}
return total;
@@ -160,25 +167,27 @@ int main (void)
memset (a, 1, sizeof (a));
memset (b, 2, sizeof (b));
- printf("Random memcpy:\n");
+ printf("Random memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total = 0;
uint64_t tsum = 0;
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
rand32 (0x12345678);
- for (int size = 16384; size <= SIZE; size *= 2)
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;
- for (int c = 0; c < MAX_COPIES; c++)
- funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
+ test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < MAX_COPIES; c++)
- funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
+ test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
@@ -187,74 +196,147 @@ int main (void)
printf( "avg %.2f\n", (double)total / tsum);
}
- printf ("\nMedium memcpy:\n");
+ size_t total = 0;
+ uint64_t tsum = 0;
+ printf ("%22s ", "memcpy_call");
+ rand32 (0x12345678);
+
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ {
+ size_t copy_size = init_copies (size) * ITERS;
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+ t = clock_get_ns () - t;
+ total += copy_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ }
+ printf( "avg %.2f\n", (double)total / tsum);
+
+
+ printf ("\nAligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 16; size <= 512; size *= 2)
+ for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}
- printf ("\nLarge memcpy:\n");
+ printf ("%22s ", "memcpy_call");
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ memcpy (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+
+
+ printf ("\nUnaligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 1024; size <= 32768; size *= 2)
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (b + 3, a + 1, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("%22s ", "memcpy_call");
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ memcpy (b + 3, a + 1, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+
+
+ printf ("\nLarge memcpy (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("\nUnaligned forwards memmove:\n");
+ printf ("%22s ", "memcpy_call");
+ for (int size = 1024; size <= 65536; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ memcpy (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+
+
+ printf ("\nUnaligned forwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 1024; size <= 32768; size *= 2)
+ for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, a + 256 + (i & 31), size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("\nUnaligned backwards memmove:\n");
+ printf ("\nUnaligned backwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 1024; size <= 32768; size *= 2)
+ for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a + 256 + (i & 31), a, size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
+ printf ("\n");
return 0;
}
diff --git a/contrib/arm-optimized-routines/string/bench/memset.c b/contrib/arm-optimized-routines/string/bench/memset.c
new file mode 100644
index 000000000000..990e23ba9a36
--- /dev/null
+++ b/contrib/arm-optimized-routines/string/bench/memset.c
@@ -0,0 +1,243 @@
+/*
+ * memset benchmark.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 5000
+#define ITERS2 20000000
+#define ITERS3 1000000
+#define NUM_TESTS 16384
+#define MIN_SIZE 32768
+#define MAX_SIZE (1024 * 1024)
+
+static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, int, size_t);
+} funtab[] =
+{
+#if __aarch64__
+ F(__memset_aarch64)
+#elif __arm__
+ F(__memset_arm)
+#endif
+ F(memset)
+#undef F
+ {0, 0}
+};
+
+typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
+static memset_test_t test_arr[NUM_TESTS];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM-1)
+static uint8_t len_arr[SIZE_NUM];
+
+/* Frequency data for memset sizes up to 4096 based on SPEC2017. */
+static freq_data_t memset_len_freq[] =
+{
+{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412},
+{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414},
+{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192},
+{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140},
+{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118},
+{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74},
+{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54},
+{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33},
+{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22},
+{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15},
+{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11},
+{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6},
+{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5},
+{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3},
+{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2},
+{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2},
+{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2},
+{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1},
+{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1},
+{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1},
+{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1},
+{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1},
+{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1},
+{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1},
+{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1},
+{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1},
+{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1},
+{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1},
+{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM-1)
+static uint8_t align_arr[ALIGN_NUM];
+
+/* Alignment data for memset based on SPEC2017. */
+static align_data_t memset_align_freq[] =
+{
+ {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0}
+};
+
+static void
+init_memset_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++)
+ for (j = 0, size = memset_len_freq[i].size; j < freq; j++)
+ len_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = memset_align_freq[i].align; j < freq; j++)
+ align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+}
+
+static size_t
+init_memset (size_t max_size)
+{
+ size_t total = 0;
+ /* Create a random set of memsets with the given size and alignment
+ distributions. */
+ for (int i = 0; i < NUM_TESTS; i++)
+ {
+ test_arr[i].offset = (rand32 (0) & (max_size - 1));
+ test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK];
+ test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK];
+ total += test_arr[i].len;
+ }
+
+ return total;
+}
+
+
+int main (void)
+{
+ init_memset_distribution ();
+
+ memset (a, 1, sizeof (a));
+
+ printf("Random memset (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t total_size = 0;
+ uint64_t tsum = 0;
+ printf ("%22s ", funtab[f].name);
+ rand32 (0x12345678);
+
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ {
+ size_t memset_size = init_memset (size) * ITERS;
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
+ t = clock_get_ns () - t;
+ total_size += memset_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+ }
+ printf( "avg %.2f\n", (double)total_size / tsum);
+ }
+
+ size_t total_size = 0;
+ uint64_t tsum = 0;
+ printf ("%22s ", "memset_call");
+ rand32 (0x12345678);
+
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ {
+ size_t memset_size = init_memset (size) * ITERS;
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ memset (a + test_arr[c].offset, 0, test_arr[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ memset (a + test_arr[c].offset, 0, test_arr[c].len);
+ t = clock_get_ns () - t;
+ total_size += memset_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+ }
+ printf( "avg %.2f\n", (double)total_size / tsum);
+
+
+ printf ("\nMedium memset (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("%22s ", "memset_call");
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ memset (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+
+
+ printf ("\nLarge memset (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1024; size <= 65536; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("%22s ", "memset_call");
+ for (int size = 1024; size <= 65536; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ memset (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ }
+ printf ("\n\n");
+
+ return 0;
+}
diff --git a/contrib/arm-optimized-routines/string/bench/strlen.c b/contrib/arm-optimized-routines/string/bench/strlen.c
index cc0f04bee547..f05d0d5b89e6 100644
--- a/contrib/arm-optimized-routines/string/bench/strlen.c
+++ b/contrib/arm-optimized-routines/string/bench/strlen.c
@@ -1,8 +1,8 @@
/*
* strlen benchmark.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define _GNU_SOURCE
@@ -13,10 +13,10 @@
#include "stringlib.h"
#include "benchlib.h"
-#define ITERS 2000
+#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 2000000
-#define NUM_STRLEN 16384
+#define NUM_TESTS 16384
#define MAX_ALIGN 32
#define MAX_STRLEN 256
@@ -49,7 +49,7 @@ static const struct fun
};
#undef F
-static uint16_t strlen_tests[NUM_STRLEN];
+static uint16_t strlen_tests[NUM_TESTS];
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -117,7 +117,7 @@ init_strlen_tests (void)
/* Create a random set of strlen input strings using the string length
and alignment distributions. */
- for (int n = 0; n < NUM_STRLEN; n++)
+ for (int n = 0; n < NUM_TESTS; n++)
{
int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
@@ -141,14 +141,14 @@ int main (void)
size_t res = 0, strlen_size = 0, mask = maskv;
printf ("%22s ", funtab[f].name);
- for (int c = 0; c < NUM_STRLEN; c++)
+ for (int c = 0; c < NUM_TESTS; c++)
strlen_size += funtab[f].fun (a + strlen_tests[c]);
strlen_size *= ITERS;
/* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_STRLEN; c++)
+ for (int c = 0; c < NUM_TESTS; c++)
res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
t = clock_get_ns () - t;
printf ("%.2f\n", (double)strlen_size / t);
diff --git a/contrib/arm-optimized-routines/string/include/benchlib.h b/contrib/arm-optimized-routines/string/include/benchlib.h
index 0f2ce2eb6bce..f1bbea388cd2 100644
--- a/contrib/arm-optimized-routines/string/include/benchlib.h
+++ b/contrib/arm-optimized-routines/string/include/benchlib.h
@@ -2,7 +2,7 @@
* Benchmark support functions.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/string/include/stringlib.h b/contrib/arm-optimized-routines/string/include/stringlib.h
index 378c3cd2d645..01da7ebfc18d 100644
--- a/contrib/arm-optimized-routines/string/include/stringlib.h
+++ b/contrib/arm-optimized-routines/string/include/stringlib.h
@@ -1,8 +1,8 @@
/*
* Public API.
*
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stddef.h>
@@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
void * __memchr_aarch64_mte (const void *, int, size_t);
-char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
-int __strcmp_aarch64_mte (const char *, const char *);
-int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
#endif
# if __ARM_FEATURE_SVE
+void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);
@@ -54,6 +52,11 @@ size_t __strlen_aarch64_sve (const char *);
size_t __strnlen_aarch64_sve (const char *, size_t);
int __strncmp_aarch64_sve (const char *, const char *, size_t);
# endif
+# if WANT_MOPS
+void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64_mops (void *, int, size_t);
+# endif
# if __ARM_FEATURE_MEMORY_TAGGING
void *__mtag_tag_region (void *, size_t);
void *__mtag_tag_zero_region (void *, size_t);
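
Because the new SVE and MOPS entry points keep the standard memcpy signature, a caller can select one behind the same feature guards this header uses. A minimal sketch follows; the wrapper name copy_block is illustrative and not part of the library, and WANT_MOPS is assumed to come from the build configuration, as in the test and benchmark files in this patch.

#include <string.h>
#include "stringlib.h"

/* Pick an implementation available in this build; fall back to the
   C library's memcpy. The preference order here is arbitrary.  */
static inline void *
copy_block (void *__restrict dst, const void *__restrict src, size_t n)
{
#if __aarch64__ && WANT_MOPS
  return __memcpy_aarch64_mops (dst, src, n);
#elif __aarch64__ && __ARM_FEATURE_SVE
  return __memcpy_aarch64_sve (dst, src, n);
#elif __aarch64__
  return __memcpy_aarch64 (dst, src, n);
#else
  return memcpy (dst, src, n);
#endif
}
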
diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c
index d8c02d92d626..c45fa6662a77 100644
--- a/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c
+++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_region.c
@@ -2,7 +2,7 @@
* __mtag_tag_region test.
*
* Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
diff --git a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c
index 221c223a2f31..a4a7861620d1 100644
--- a/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c
+++ b/contrib/arm-optimized-routines/string/test/__mtag_tag_zero_region.c
@@ -2,7 +2,7 @@
* __mtag_tag_zero_region test.
*
* Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
diff --git a/contrib/arm-optimized-routines/string/test/memchr.c b/contrib/arm-optimized-routines/string/test/memchr.c
index 0ff77f5710bf..c6a94481c0ad 100644
--- a/contrib/arm-optimized-routines/string/test/memchr.c
+++ b/contrib/arm-optimized-routines/string/test/memchr.c
@@ -2,7 +2,7 @@
* memchr test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/string/test/memcmp.c b/contrib/arm-optimized-routines/string/test/memcmp.c
index 7a7cf9cff35a..f9236b83a60d 100644
--- a/contrib/arm-optimized-routines/string/test/memcmp.c
+++ b/contrib/arm-optimized-routines/string/test/memcmp.c
@@ -2,7 +2,7 @@
* memcmp test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/string/test/memcpy.c b/contrib/arm-optimized-routines/string/test/memcpy.c
index ce0ceeef5ee8..dc95844bd45a 100644
--- a/contrib/arm-optimized-routines/string/test/memcpy.c
+++ b/contrib/arm-optimized-routines/string/test/memcpy.c
@@ -1,8 +1,8 @@
/*
* memcpy test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -28,6 +28,12 @@ static const struct fun
# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
# endif
+# if __ARM_FEATURE_SVE
+ F(__memcpy_aarch64_sve, 1)
+# endif
+# if WANT_MOPS
+ F(__memcpy_aarch64_mops, 1)
+# endif
#elif __arm__
F(__memcpy_arm, 0)
#endif
diff --git a/contrib/arm-optimized-routines/string/test/memmove.c b/contrib/arm-optimized-routines/string/test/memmove.c
index 689b68c98af2..b85dd1e864ef 100644
--- a/contrib/arm-optimized-routines/string/test/memmove.c
+++ b/contrib/arm-optimized-routines/string/test/memmove.c
@@ -1,8 +1,8 @@
/*
* memmove test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -28,6 +28,12 @@ static const struct fun
# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
# endif
+# if __ARM_FEATURE_SVE
+ F(__memmove_aarch64_sve, 1)
+# endif
+# if WANT_MOPS
+ F(__memmove_aarch64_mops, 1)
+# endif
#endif
{0, 0, 0}
// clang-format on
diff --git a/contrib/arm-optimized-routines/string/test/memrchr.c b/contrib/arm-optimized-routines/string/test/memrchr.c
index adf96f049cc9..4171a56daefd 100644
--- a/contrib/arm-optimized-routines/string/test/memrchr.c
+++ b/contrib/arm-optimized-routines/string/test/memrchr.c
@@ -2,7 +2,7 @@
* memchr test.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _GNU_SOURCE
diff --git a/contrib/arm-optimized-routines/string/test/memset.c b/contrib/arm-optimized-routines/string/test/memset.c
index f1721442dbaf..7d09c267ffec 100644
--- a/contrib/arm-optimized-routines/string/test/memset.c
+++ b/contrib/arm-optimized-routines/string/test/memset.c
@@ -1,8 +1,8 @@
/*
* memset test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -25,6 +25,9 @@ static const struct fun
F(memset, 0)
#if __aarch64__
F(__memset_aarch64, 1)
+# if WANT_MOPS
+ F(__memset_aarch64_mops, 1)
+# endif
#elif __arm__
F(__memset_arm, 0)
#endif
diff --git a/contrib/arm-optimized-routines/string/test/mte.h b/contrib/arm-optimized-routines/string/test/mte.h
index e67cbd9d2d40..40b0ecf6c194 100644
--- a/contrib/arm-optimized-routines/string/test/mte.h
+++ b/contrib/arm-optimized-routines/string/test/mte.h
@@ -2,7 +2,7 @@
* Memory tagging testing code.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef __TEST_MTE_H
diff --git a/contrib/arm-optimized-routines/string/test/stpcpy.c b/contrib/arm-optimized-routines/string/test/stpcpy.c
index 1827e68c9a30..0300892a1f3c 100644
--- a/contrib/arm-optimized-routines/string/test/stpcpy.c
+++ b/contrib/arm-optimized-routines/string/test/stpcpy.c
@@ -1,8 +1,8 @@
/*
* stpcpy test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _GNU_SOURCE
@@ -28,8 +28,7 @@ static const struct fun
// clang-format off
F(stpcpy, 0)
#if __aarch64__
- F(__stpcpy_aarch64, 0)
- F(__stpcpy_aarch64_mte, 1)
+ F(__stpcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__stpcpy_aarch64_sve, 1)
# endif
diff --git a/contrib/arm-optimized-routines/string/test/strchr.c b/contrib/arm-optimized-routines/string/test/strchr.c
index f3ae982ef0ad..66180acfb57c 100644
--- a/contrib/arm-optimized-routines/string/test/strchr.c
+++ b/contrib/arm-optimized-routines/string/test/strchr.c
@@ -2,7 +2,7 @@
* strchr test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/string/test/strchrnul.c b/contrib/arm-optimized-routines/string/test/strchrnul.c
index 6c30ab2123f1..aad0bf59da66 100644
--- a/contrib/arm-optimized-routines/string/test/strchrnul.c
+++ b/contrib/arm-optimized-routines/string/test/strchrnul.c
@@ -2,7 +2,7 @@
* strchrnul test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _GNU_SOURCE
diff --git a/contrib/arm-optimized-routines/string/test/strcmp.c b/contrib/arm-optimized-routines/string/test/strcmp.c
index d57b54ed50a8..4aa95f4f2f1d 100644
--- a/contrib/arm-optimized-routines/string/test/strcmp.c
+++ b/contrib/arm-optimized-routines/string/test/strcmp.c
@@ -1,8 +1,8 @@
/*
* strcmp test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcmp, 0)
#if __aarch64__
- F(__strcmp_aarch64, 0)
- F(__strcmp_aarch64_mte, 1)
+ F(__strcmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcmp_aarch64_sve, 1)
# endif
diff --git a/contrib/arm-optimized-routines/string/test/strcpy.c b/contrib/arm-optimized-routines/string/test/strcpy.c
index e84cace9c8c6..af297f90396a 100644
--- a/contrib/arm-optimized-routines/string/test/strcpy.c
+++ b/contrib/arm-optimized-routines/string/test/strcpy.c
@@ -1,8 +1,8 @@
/*
* strcpy test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcpy, 0)
#if __aarch64__
- F(__strcpy_aarch64, 0)
- F(__strcpy_aarch64_mte, 1)
+ F(__strcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcpy_aarch64_sve, 1)
# endif
diff --git a/contrib/arm-optimized-routines/string/test/stringtest.h b/contrib/arm-optimized-routines/string/test/stringtest.h
index fe855fc21736..6bb7e1fdfeca 100644
--- a/contrib/arm-optimized-routines/string/test/stringtest.h
+++ b/contrib/arm-optimized-routines/string/test/stringtest.h
@@ -2,7 +2,7 @@
* Common string test code.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <ctype.h>
diff --git a/contrib/arm-optimized-routines/string/test/strlen.c b/contrib/arm-optimized-routines/string/test/strlen.c
index 6278380f26df..47ef3dcf0ef0 100644
--- a/contrib/arm-optimized-routines/string/test/strlen.c
+++ b/contrib/arm-optimized-routines/string/test/strlen.c
@@ -1,15 +1,14 @@
/*
* strlen test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/mman.h>
#include <limits.h>
#include "mte.h"
#include "stringlib.h"
diff --git a/contrib/arm-optimized-routines/string/test/strncmp.c b/contrib/arm-optimized-routines/string/test/strncmp.c
index 018a8a431ab8..4bbab6f93450 100644
--- a/contrib/arm-optimized-routines/string/test/strncmp.c
+++ b/contrib/arm-optimized-routines/string/test/strncmp.c
@@ -1,8 +1,8 @@
/*
* strncmp test.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strncmp, 0)
#if __aarch64__
- F(__strncmp_aarch64, 0)
- F(__strncmp_aarch64_mte, 1)
+ F(__strncmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strncmp_aarch64_sve, 1)
# endif
diff --git a/contrib/arm-optimized-routines/string/test/strnlen.c b/contrib/arm-optimized-routines/string/test/strnlen.c
index 0dea00eaf8e3..a800fd1993cd 100644
--- a/contrib/arm-optimized-routines/string/test/strnlen.c
+++ b/contrib/arm-optimized-routines/string/test/strnlen.c
@@ -2,7 +2,7 @@
* strnlen test.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _GNU_SOURCE
diff --git a/contrib/arm-optimized-routines/string/test/strrchr.c b/contrib/arm-optimized-routines/string/test/strrchr.c
index fedbdc52fcc1..580ca497f8a4 100644
--- a/contrib/arm-optimized-routines/string/test/strrchr.c
+++ b/contrib/arm-optimized-routines/string/test/strrchr.c
@@ -2,7 +2,7 @@
* strrchr test.
*
* Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/string/x86_64/check-arch.S b/contrib/arm-optimized-routines/string/x86_64/check-arch.S
index 26ade0a0c7db..5afcf7b7ee54 100644
--- a/contrib/arm-optimized-routines/string/x86_64/check-arch.S
+++ b/contrib/arm-optimized-routines/string/x86_64/check-arch.S
@@ -2,7 +2,7 @@
* check ARCH setting.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if !__x86_64__