author      Andrew Turner <andrew@FreeBSD.org>   2022-08-10 11:54:14 +0000
committer   Andrew Turner <andrew@FreeBSD.org>   2022-08-10 14:15:46 +0000
commit      01c4cb317efa9e5e905a88340c396a9683db72e7 (patch)
tree        f1526cbf2d05dec37108a2d1e206ba62e7f8b38a
parent      f9f37c002ab5a580accfe26b731eef45e798b435 (diff)
download    src-01c4cb317efa9e5e905a88340c396a9683db72e7.tar.gz
            src-01c4cb317efa9e5e905a88340c396a9683db72e7.zip

Update the Arm Optimized Routines (vendor/arm-optimized-routines/20220210-89ca9c3)
Import revision 89ca9c3629eb6a62c28918db929a6fe80b141825 from
https://github.com/ARM-software/optimized-routines. This is the last
revision before the license was updated to MIT or Apache 2.0 + LLVM
exception. As no changes have happened to the string functions since the
license update, pull in the last version before it to reduce the diff.

This version:
- Improves the performance of memcmp
- Adds an SVE implementation of memcpy
- Uses the MTE versions of some str* functions, as they are faster

Sponsored by:	The FreeBSD Foundation
-rw-r--r--  README                        |    2
-rw-r--r--  math/cosf.c                   |    2
-rw-r--r--  math/sincosf.c                |    2
-rw-r--r--  math/sincosf.h                |    2
-rw-r--r--  math/sinf.c                   |    2
-rw-r--r--  string/aarch64/memcmp.S       |  237
-rw-r--r--  string/aarch64/memcpy-sve.S   |  180
-rw-r--r--  string/aarch64/stpcpy-mte.S   |   10
-rw-r--r--  string/aarch64/strcmp-mte.S   |  189
-rw-r--r--  string/aarch64/strcmp.S       |  234
-rw-r--r--  string/aarch64/strcpy-mte.S   |  161
-rw-r--r--  string/aarch64/strcpy.S       |  394
-rw-r--r--  string/aarch64/strncmp-mte.S  |  307
-rw-r--r--  string/aarch64/strncmp.S      |  234
-rw-r--r--  string/bench/memcpy.c         |  165
-rw-r--r--  string/bench/memset.c         |  243
-rw-r--r--  string/bench/strlen.c         |   14
-rw-r--r--  string/include/stringlib.h    |    6
-rw-r--r--  string/test/memcpy.c          |    3
-rw-r--r--  string/test/memmove.c         |    3
-rw-r--r--  string/test/stpcpy.c          |    3
-rw-r--r--  string/test/strcmp.c          |    3
-rw-r--r--  string/test/strcpy.c          |    3
-rw-r--r--  string/test/strncmp.c         |    3
24 files changed, 1102 insertions, 1300 deletions
diff --git a/README b/README
index ae465e93fea7..9e1a34fdc65d 100644
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code
to projects that require copyright assignment is possible.
Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.11.
+release is v21.02.
Source code layout:
diff --git a/math/cosf.c b/math/cosf.c
index f29f19474e23..67a3798b573e 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -22,7 +22,7 @@ cosf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4))
+ if (abstop12 (y) < abstop12 (pio4f))
{
double x2 = x * x;
diff --git a/math/sincosf.c b/math/sincosf.c
index 9746f1c22e6c..6fb299d10309 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4))
+ if (abstop12 (y) < abstop12 (pio4f))
{
double x2 = x * x;
diff --git a/math/sincosf.h b/math/sincosf.h
index 1e80fc9ba8e1..59124699f552 100644
--- a/math/sincosf.h
+++ b/math/sincosf.h
@@ -12,7 +12,7 @@
/* 2PI * 2^-64. */
static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4. */
-static const double pio4 = 0x1.921FB54442D18p-1;
+static const float pio4f = 0x1.921FB6p-1f;
/* The constants and polynomials for sine and cosine. */
typedef struct
diff --git a/math/sinf.c b/math/sinf.c
index ddbc1daf74a9..4d2cbd6fae72 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -21,7 +21,7 @@ sinf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4))
+ if (abstop12 (y) < abstop12 (pio4f))
{
s = x * x;
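
[Editor's note] The pio4 -> pio4f change in the hunks above is confined to the fast-path threshold test: abstop12 () looks only at the top sign-and-exponent bits of its float argument, and the constant it is compared against is now defined directly in single precision. A minimal C sketch of that test follows; it is an illustration, not part of the import, and the asuint/abstop12 helpers are written out as assumed from the upstream headers.

#include <stdint.h>
#include <string.h>

/* Reinterpret the bits of a float as a uint32_t (asuint in the sources).  */
static inline uint32_t asuint (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

/* Top 12 bits of a float with the sign bit masked off (abstop12).  */
static inline uint32_t abstop12 (float x)
{
  return (asuint (x) >> 20) & 0x7ff;
}

static const float pio4f = 0x1.921FB6p-1f;   /* PI / 4 as a float.  */

/* Fast-path test used by sinf/cosf/sincosf: roughly |y| < PI/4.  */
static inline int small_argument (float y)
{
  return abstop12 (y) < abstop12 (pio4f);
}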
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 3b1026642eee..7ca1135edec7 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,103 +1,84 @@
/* memcmp - compare memory
*
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
#include "../asmdefs.h"
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result w0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data1h x4
-#define data2 x5
-#define data2w w5
-#define data2h x6
-#define tmp1 x7
-#define tmp2 x8
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define data3 x5
+#define data3w w5
+#define data4 x6
+#define data4w w6
+#define tmp x6
+#define src1end x7
+#define src2end x8
+
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
- subs limit, limit, 8
- b.lo L(less8)
-
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- b.ne L(return)
-
- subs limit, limit, 8
- b.gt L(more16)
-
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- b L(return)
-
-L(more16):
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- bne L(return)
- /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
- strings. */
- subs limit, limit, 16
+ cmp limit, 16
+ b.lo L(less16)
+ ldp data1, data3, [src1]
+ ldp data2, data4, [src2]
+ ccmp data1, data2, 0, ne
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ add src1end, src1, limit
+ add src2end, src2, limit
+ cmp limit, 32
b.ls L(last_bytes)
+ cmp limit, 160
+ b.hs L(loop_align)
+ sub limit, limit, 32
- /* We overlap loads between 0-32 bytes at either side of SRC1 when we
- try to align, so limit it only to strings larger than 128 bytes. */
- cmp limit, 96
- b.ls L(loop16)
-
- /* Align src1 and adjust src2 with bytes not yet done. */
- and tmp1, src1, 15
- add limit, limit, tmp1
- sub src1, src1, tmp1
- sub src2, src2, tmp1
-
- /* Loop performing 16 bytes per iteration using aligned src1.
- Limit is pre-decremented by 16 and must be larger than zero.
- Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-L(loop16):
- ldp data1, data1h, [src1], 16
- ldp data2, data2h, [src2], 16
- subs limit, limit, 16
- ccmp data1, data2, 0, hi
- ccmp data1h, data2h, 0, eq
- b.eq L(loop16)
-
+L(loop32):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ cmp limit, 16
+ b.ls L(last_bytes)
+
+ ldp data1, data3, [src1, 32]
+ ldp data2, data4, [src2, 32]
cmp data1, data2
- bne L(return)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ add src1, src1, 32
+ add src2, src2, 32
+L(last64):
+ subs limit, limit, 32
+ b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- add src1, src1, limit
- add src2, src2, limit
- ldp data1, data1h, [src1]
- ldp data2, data2h, [src2]
- cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ldp data1, data3, [src1end, -16]
+ ldp data2, data4, [src2end, -16]
+L(return2):
cmp data1, data2
+ csel data1, data1, data3, ne
+ csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
@@ -105,33 +86,105 @@ L(return):
rev data1, data1
rev data2, data2
#endif
- cmp data1, data2
-L(ret_eq):
+ cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
- /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less16):
+ add src1end, src1, limit
+ add src2end, src2, limit
+ tbz limit, 3, L(less8)
+ ldr data1, [src1]
+ ldr data2, [src2]
+ ldr data3, [src1end, -8]
+ ldr data4, [src2end, -8]
+ b L(return2)
+
+ .p2align 4
L(less8):
- adds limit, limit, 4
- b.lo L(less4)
- ldr data1w, [src1], 4
- ldr data2w, [src2], 4
+ tbz limit, 2, L(less4)
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ldr data3w, [src1end, -4]
+ ldr data4w, [src2end, -4]
+ b L(return2)
+
+L(less4):
+ tbz limit, 1, L(less2)
+ ldrh data1w, [src1]
+ ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
- sub limit, limit, 4
-L(less4):
- adds limit, limit, 4
- beq L(ret_eq)
-L(byte_loop):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- subs limit, limit, 1
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq L(byte_loop)
+L(less2):
+ mov result, 0
+ tbz limit, 0, L(return_zero)
+ ldrb data1w, [src1end, -1]
+ ldrb data2w, [src2end, -1]
sub result, data1w, data2w
+L(return_zero):
ret
-END (__memcmp_aarch64)
+L(loop_align):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ /* Align src2 and adjust src1, src2 and limit. */
+ and tmp, src2, 15
+ sub tmp, tmp, 16
+ sub src2, src2, tmp
+ add limit, limit, tmp
+ sub src1, src1, tmp
+ sub limit, limit, 64 + 16
+
+ .p2align 4
+L(loop64):
+ ldr q0, [src1, 16]
+ ldr q1, [src2, 16]
+ subs limit, limit, 64
+ ldr q2, [src1, 32]
+ ldr q3, [src2, 32]
+ eor v0.16b, v0.16b, v1.16b
+ eor v1.16b, v2.16b, v3.16b
+ ldr q2, [src1, 48]
+ ldr q3, [src2, 48]
+ umaxp v0.16b, v0.16b, v1.16b
+ ldr q4, [src1, 64]!
+ ldr q5, [src2, 64]!
+ eor v1.16b, v2.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ umaxp v1.16b, v1.16b, v2.16b
+ umaxp v0.16b, v0.16b, v1.16b
+ umaxp v0.16b, v0.16b, v0.16b
+ fmov tmp, d0
+ ccmp tmp, 0, 0, hi
+ b.eq L(loop64)
+
+ /* If equal, process last 1-64 bytes using scalar loop. */
+ add limit, limit, 64 + 16
+ cbz tmp, L(last64)
+
+ /* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+ rev16 tmp, tmp
+#endif
+ rev tmp, tmp
+ clz tmp, tmp
+ bic tmp, tmp, 7
+ sub tmp, tmp, 48
+ ldr data1, [src1, tmp]
+ ldr data2, [src2, tmp]
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ mov result, 1
+ cmp data1, data2
+ cneg result, result, lo
+ ret
+
+END (__memcmp_aarch64)
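
[Editor's note] Every path in the rewritten __memcmp_aarch64 ends in the same return sequence: the csel pair picks the first 8-byte words that differ, the words are byte-reversed on little-endian so the first differing byte becomes the most significant one, and a single unsigned comparison then yields the sign of the result (cset/cneg). A rough C model of that tail, offered as an illustration only:

#include <stdint.h>

/* Model of L(return2)/L(return): data1/data2 and data3/data4 are the two
   8-byte pairs just loaded; pick the first pair that differs, then byte-swap
   so an unsigned word compare orders bytes the way memcmp does.  */
static int word_compare (uint64_t data1, uint64_t data2,
                         uint64_t data3, uint64_t data4)
{
  if (data1 == data2)                     /* csel: fall through to second pair */
    {
      data1 = data3;
      data2 = data4;
    }
#ifndef __AARCH64EB__
  data1 = __builtin_bswap64 (data1);      /* rev data1 */
  data2 = __builtin_bswap64 (data2);      /* rev data2 */
#endif
  if (data1 == data2)
    return 0;                             /* cset result, ne -> 0 when equal */
  return data1 < data2 ? -1 : 1;          /* cneg result, result, lo */
}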
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
new file mode 100644
index 000000000000..f85e8009f3c5
--- /dev/null
+++ b/string/aarch64/memcpy-sve.S
@@ -0,0 +1,180 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#if __ARM_FEATURE_SVE
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+ SVE vectors are used to speedup small copies.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ cntb vlen
+ tbnz vlen, 4, L(vlen128)
+ ld1b z0.b, p0/z, [src]
+ st1b z0.b, p0, [dstin]
+ ret
+
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ add srcend, src, count
+ add dstend, dstin, count
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+L(vlen128):
+ whilelo p1.b, vlen, count
+ ld1b z0.b, p0/z, [src, 0, mul vl]
+ ld1b z1.b, p1/z, [src, 1, mul vl]
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z1.b, p1, [dstin, 1, mul vl]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(return)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(return):
+ ret
+
+END (__memcpy_aarch64_sve)
+#endif
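
[Editor's note] The new SVE memcpy keeps small copies branch-light by building a byte predicate from the length: whilelo activates one lane per remaining byte (capped at the vector length), so a predicated load/store pair copies any size up to one vector, or two in the L(vlen128) case, with no scalar tail loop. A hedged sketch of that idea using ACLE SVE intrinsics follows; it is an illustration under the same SVE assumption as the file, not the imported code, and needs SVE enabled at compile time (e.g. -march=armv8-a+sve).

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Predicated small copy, modelling the count <= 32 path of
   __memcpy_aarch64_sve on little-endian.  */
static void small_copy_sve (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint64_t vlen = svcntb ();                    /* bytes per SVE vector (cntb) */
  svbool_t p0 = svwhilelt_b8_u64 (0, count);    /* lanes [0, count), capped    */

  if (count <= vlen)
    {
      /* One predicated load/store covers the whole copy (ld1b/st1b).  */
      svst1_u8 (p0, dst, svld1_u8 (p0, src));
      return;
    }

  /* L(vlen128): with 128-bit vectors, a 17..32 byte copy needs a second
     predicate covering lanes [vlen, count).  */
  svbool_t p1 = svwhilelt_b8_u64 (vlen, count);
  svst1_u8 (p0, dst, svld1_u8 (p0, src));
  svst1_u8 (p1, dst + vlen, svld1_u8 (p1, src + vlen));
}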
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
deleted file mode 100644
index f1c711906515..000000000000
--- a/string/aarch64/stpcpy-mte.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-mte.S"
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
deleted file mode 100644
index 12d1a6b51dd3..000000000000
--- a/string/aarch64/strcmp-mte.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * strcmp - compare two strings
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-#define src1 x0
-#define src2 x1
-#define result x0
-
-#define data1 x2
-#define data1w w2
-#define data2 x3
-#define data2w w3
-#define has_nul x4
-#define diff x5
-#define off1 x5
-#define syndrome x6
-#define tmp x6
-#define data3 x7
-#define zeroones x8
-#define shift x9
-#define off2 x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes. */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word.
- Since carry propagation makes 0x1 bytes before a NUL byte appear
- NUL too in big-endian, byte-reverse the data before the NUL check. */
-
-
-ENTRY (__strcmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- sub off2, src2, src1
- mov zeroones, REP8_01
- and tmp, src1, 7
- tst off2, 7
- b.ne L(misaligned8)
- cbnz tmp, L(mutual_align)
-
- .p2align 4
-
-L(loop_aligned):
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
-L(start_realigned):
-#ifdef __AARCH64EB__
- rev tmp, data1
- sub has_nul, tmp, zeroones
- orr tmp, tmp, REP8_7f
-#else
- sub has_nul, data1, zeroones
- orr tmp, data1, REP8_7f
-#endif
- bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
- ccmp data1, data2, 0, eq
- b.eq L(loop_aligned)
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
-L(end):
-#ifndef __AARCH64EB__
- rev syndrome, syndrome
- rev data1, data1
- rev data2, data2
-#endif
- clz shift, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- lsl data1, data1, shift
- lsl data2, data2, shift
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, 56
- sub result, data1, data2, lsr 56
- ret
-
- .p2align 4
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, 7
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
- neg shift, src2, lsl 3 /* Bits to alignment -64. */
- mov tmp, -1
- LS_FW tmp, tmp, shift
- orr data1, data1, tmp
- orr data2, data2, tmp
- b L(start_realigned)
-
-L(misaligned8):
- /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond the end of SRC2. */
- cbz tmp, L(src1_aligned)
-L(do_misaligned):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- cmp data1w, 0
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.ne L(done)
- tst src1, 7
- b.ne L(do_misaligned)
-
-L(src1_aligned):
- neg shift, src2, lsl 3
- bic src2, src2, 7
- ldr data3, [src2], 8
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- lsr tmp, zeroones, shift
- orr data3, data3, tmp
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- bics has_nul, has_nul, tmp
- b.ne L(tail)
-
- sub off1, src2, src1
-
- .p2align 4
-
-L(loop_unaligned):
- ldr data3, [src1, off1]
- ldr data2, [src1, off2]
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- ldr data1, [src1], 8
- bics has_nul, has_nul, tmp
- ccmp data1, data2, 0, eq
- b.eq L(loop_unaligned)
-
- lsl tmp, has_nul, shift
-#ifdef __AARCH64EB__
- rev tmp, tmp
-#endif
- eor diff, data1, data2
- orr syndrome, diff, tmp
- cbnz syndrome, L(end)
-L(tail):
- ldr data1, [src1]
- neg shift, shift
- lsr data2, data3, shift
- lsr has_nul, has_nul, shift
-#ifdef __AARCH64EB__
- rev data2, data2
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
- b L(end)
-
-L(done):
- sub result, data1, data2
- ret
-
-END (__strcmp_aarch64_mte)
-
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 7714ebf5577d..6e77845ae6ff 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,168 +1,184 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2012-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
+
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
-/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
+#define off1 x5
#define syndrome x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define zeroones x10
-#define pos x11
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
- /* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
L(start_realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
L(end):
-#ifndef __AARCH64EB__
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, syndrome
rev data2, data2
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#else
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
- lsl data1, data1, pos
- lsl data2, data2, pos
+ lsl data1, data1, shift
+ lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
ret
-#endif
+
+ .p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
- the bytes that preceed the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- orr data1, data1, tmp2
- orr data2, data2, tmp2
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond page boundary in
- SRC2. */
- tst src1, #7
- b.eq L(loop_misaligned)
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
L(do_misaligned):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
- tst src1, #7
+ tst src1, 7
b.ne L(do_misaligned)
-L(loop_misaligned):
- /* Test if we are within the last dword of the end of a 4K page. If
- yes then jump back to the misaligned loop to copy a byte at a time. */
- and tmp1, src2, #0xff8
- eor tmp1, tmp1, #0xff8
- cbz tmp1, L(do_misaligned)
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_misaligned)
b L(end)
L(done):
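
[Editor's note] The MTE-compatible strcmp above, like the other routines in this import, finds the terminator with the word-at-a-time trick spelled out in its header comment: (X - 1) & ~(X | 0x7f), applied across a 64-bit word, is non-zero iff some byte of X is zero. A small C rendering of that check, purely illustrative:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero: within each byte, (b - 1) & ~(b | 0x7f)
   sets bit 7 only when b == 0.  As the file's comment notes, borrow
   propagation across bytes can also flag 0x01 bytes adjacent to a NUL, which
   is why the code byte-reverses big-endian data before this check.  */
static inline uint64_t has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}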
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d61e53..000000000000
--- a/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin x0
-#define srcin x1
-#define result x0
-
-#define src x2
-#define dst x3
-#define len x4
-#define synd x4
-#define tmp x5
-#define wtmp w5
-#define shift x5
-#define data1 x6
-#define dataw1 w6
-#define data2 x7
-#define dataw2 w7
-
-#define dataq q0
-#define vdata v0
-#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
-#define dataq2 q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- bic src, srcin, 15
- mov wtmp, 0xf00f
- ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
- cmeq vhas_nul.16b, vdata.16b, 0
- lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- lsr synd, synd, shift
- cbnz synd, L(tail)
-
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(start_loop)
-
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- sub tmp, src, srcin
- clz len, synd
- add len, tmp, len, lsr 2
- tbz len, 4, L(less16)
- sub tmp, len, 15
- ldr dataq, [srcin]
- ldr dataq2, [srcin, tmp]
- str dataq, [dstin]
- str dataq2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4,,8
-L(tail):
- rbit synd, synd
- clz len, synd
- lsr len, len, 2
-
- .p2align 4
-L(less16):
- tbz len, 3, L(less8)
- sub tmp, len, 7
- ldr data1, [srcin]
- ldr data2, [srcin, tmp]
- str data1, [dstin]
- str data2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(less8):
- subs tmp, len, 3
- b.lo L(less4)
- ldr dataw1, [srcin]
- ldr dataw2, [srcin, tmp]
- str dataw1, [dstin]
- str dataw2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
-L(less4):
- cbz len, L(zerobyte)
- ldrh dataw1, [srcin]
- strh dataw1, [dstin]
-L(zerobyte):
- strb wzr, [dstin, len]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(start_loop):
- sub len, src, srcin
- ldr dataq2, [srcin]
- add dst, dstin, len
- str dataq2, [dstin]
-
- .p2align 5
-L(loop):
- str dataq, [dst], 16
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov synd, dend
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- clz len, synd
- lsr len, len, 2
- sub tmp, len, 15
- ldr dataq, [src, tmp]
- str dataq, [dst, tmp]
- IFSTPCPY (add result, dst, len)
- ret
-
-END (STRCPY)
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 6e9ed424b693..b99e49403be8 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,311 +1,161 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2020-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include "../asmdefs.h"
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
-
- To test the page crossing code path more thoroughly, compile with
- -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
#define dstin x0
#define srcin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
#define dst x3
-#define data1 x4
-#define data1w w4
-#define data2 x5
-#define data2w w5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define data1a x13
-#define data2a x14
-#define pos x15
-#define len x16
-#define to_align x17
+#define len x4
+#define synd x4
+#define tmp x5
+#define wtmp w5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+#define dataq2 q1
#ifdef BUILD_STPCPY
-#define STRCPY __stpcpy_aarch64
+# define STRCPY __stpcpy_aarch64
+# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
-#define STRCPY __strcpy_aarch64
+# define STRCPY __strcpy_aarch64
+# define IFSTPCPY(X,...)
#endif
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+/* Core algorithm:
- /* AArch64 systems have a minimum page size of 4k. We can do a quick
- page size check for crossing this boundary on entry and if we
- do not, then we can short-circuit much of the entry code. We
- expect early page-crossing strings to be rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
- predictable, even with random strings.
-
- We don't bother checking for larger page sizes, the cost of setting
- up the correct page size is just not worth the extra gain from
- a small reduction in the cases taking the slow path. Note that
- we only care about whether the first fetch, which may be
- misaligned, crosses a page boundary - after that we move to aligned
- fetches for the remainder of the string. */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
- /* Make everything that isn't Qword aligned look like a page cross. */
-#define MIN_PAGE_P2 4
-#else
-#define MIN_PAGE_P2 12
-#endif
-
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
- /* For moderately short strings, the fastest way to do the copy is to
- calculate the length of the string in the same way as strlen, then
- essentially do a memcpy of the result. This avoids the need for
- multiple byte copies and further means that by the time we
- reach the bulk copy loop we know we can always use DWord
- accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
- with the same source string, so branch prediction is likely to
- always be difficult - we mitigate against this by preferring
- conditional select operations over branches whenever this is
- feasible. */
- and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
- mov zeroones, #REP8_01
- and to_align, srcin, #15
- cmp tmp2, #(MIN_PAGE_SIZE - 16)
- neg tmp1, to_align
- /* The first fetch will straddle a (possible) page boundary iff
- srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
- aligned string will never fail the page align check, so will
- always take the fast path. */
- b.gt L(page_cross)
-
-L(page_cross_ok):
- ldp data1, data2, [srcin]
-#ifdef __AARCH64EB__
- /* Because we expect the end to be found within 16 characters
- (profiling shows this is the most common case), it's worth
- swapping the bytes now to save having to recalculate the
- termination syndrome later. We preserve data1 and data2
- so that we can re-use the values later on. */
- rev tmp2, data1
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne L(fp_le8)
- rev tmp4, data2
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne L(fp_le8)
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- bics has_nul2, tmp3, tmp4
- b.eq L(bulk_entry)
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
- /* The string is short (<=16 bytes). We don't know exactly how
- short though, yet. Work out the exact length so that we can
- quickly select the optimal copy strategy. */
-L(fp_gt8):
- rev has_nul2, has_nul2
- clz pos, has_nul2
- mov tmp2, #56
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- sub pos, tmp2, pos
-#ifdef __AARCH64EB__
- lsr data2, data2, pos
-#else
- lsl data2, data2, pos
-#endif
- str data2, [dst, #1]
+ .p2align 4,,8
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+
+ .p2align 4
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
str data1, [dstin]
-#ifdef BUILD_STPCPY
- add dstin, dst, #8
-#endif
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-L(fp_le8):
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- subs tmp2, pos, #24 /* Pos in bits. */
- b.lt L(fp_lt4)
-#ifdef __AARCH64EB__
- mov tmp2, #56
- sub pos, tmp2, pos
- lsr data2, data1, pos
- lsr data1, data1, #32
-#else
- lsr data2, data1, tmp2
-#endif
- /* 4->7 bytes to copy. */
- str data2w, [dst, #-3]
- str data1w, [dstin]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-L(fp_lt4):
- cbz pos, L(fp_lt2)
- /* 2->3 bytes to copy. */
-#ifdef __AARCH64EB__
- lsr data1, data1, #48
-#endif
- strh data1w, [dstin]
- /* Fall-through, one byte (max) to go. */
-L(fp_lt2):
- /* Null-terminated string. Last character must be zero! */
- strb wzr, [dst]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
- ret
-
- .p2align 6
- /* Aligning here ensures that the entry code and main loop all lies
- within one 64-byte cache line. */
-L(bulk_entry):
- sub to_align, to_align, #16
- stp data1, data2, [dstin]
- sub src, srcin, to_align
- sub dst, dstin, to_align
- b L(entry_no_page_cross)
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-L(main_loop):
- stp data1, data2, [dst], #16
-L(entry_no_page_cross):
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(main_loop)
-
- /* Since we know we are copying at least 16 bytes, the fastest way
- to deal with the tail is to determine the location of the
- trailing NUL, then (re)copy the 16 bytes leading up to that. */
- cmp has_nul1, #0
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, ne
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
- csel has_nul1, has_nul1, has_nul2, ne
-#endif
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #72
- add pos, pos, #8
- csel pos, pos, tmp1, ne
- add src, src, pos, lsr #3
- add dst, dst, pos, lsr #3
- ldp data1, data2, [src, #-32]
- stp data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
- sub dstin, dst, #1
-#endif
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
ret
-L(page_cross):
- bic src, srcin, #15
- /* Start by loading two words at [srcin & ~15], then forcing the
- bytes that precede srcin to 0xff. This means they never look
- like termination bytes. */
- ldp data1, data2, [src]
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- tst to_align, #7
- csetm tmp2, ne
-#ifdef __AARCH64EB__
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ .p2align 4
+L(start_loop):
+ sub len, src, srcin
+ ldr dataq2, [srcin]
+ add dst, dstin, len
+ str dataq2, [dstin]
+
+ .p2align 5
+L(loop):
+ str dataq, [dst], 16
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- cmp to_align, #8
- csinv data1, data1, xzr, lt
- csel data2, data2, data2a, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(page_cross_ok)
- /* We now need to make data1 and data2 look like they've been
- loaded directly from srcin. Do a rotate on the 128-bit value. */
- lsl tmp1, to_align, #3 /* Bytes->bits. */
- neg tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
- lsl data1a, data1, tmp1
- lsr tmp4, data2, tmp2
- lsl data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- rev tmp2, data1
- rev tmp4, data2
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- lsr data1a, data1, tmp1
- lsl tmp4, data2, tmp2
- lsr data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
-#endif
- bic has_nul1, tmp1, tmp2
- cbnz has_nul1, L(fp_le8)
- bic has_nul2, tmp3, tmp4
- b L(fp_gt8)
+ clz len, synd
+ lsr len, len, 2
+ sub tmp, len, 15
+ ldr dataq, [src, tmp]
+ str dataq, [dst, tmp]
+ IFSTPCPY (add result, dst, len)
+ ret
END (STRCPY)
-
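
[Editor's note] The rewritten strcpy/stpcpy above relies on the syndrome described in its core-algorithm comment: cmeq marks NUL bytes, the repeating 0xf00f mask keeps four bits per byte, and addp folds the 128-bit result into a 64-bit word whose trailing-zero count, divided by four, is the byte index of the terminator. A hedged NEON-intrinsics sketch of that computation for the little-endian case, as an illustration only:

#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first NUL in a 16-byte chunk, or -1 if none.
   Models the little-endian syndrome path (the assembly uses rbit + clz
   where this sketch uses a trailing-zero count).  */
static int first_nul_index (const uint8_t *chunk)
{
  uint8x16_t data = vld1q_u8 (chunk);
  uint8x16_t nul  = vceqzq_u8 (data);           /* cmeq v, v, 0: 0xff per NUL byte */
  uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t bits = vandq_u8 (nul, mask);       /* keep 4 bits per byte            */
  uint8x16_t fold = vpaddq_u8 (bits, bits);     /* addp: 128 bits -> 64 bits       */
  uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (fold), 0);

  if (synd == 0)
    return -1;                                  /* no terminator in this chunk     */
  return __builtin_ctzll (synd) >> 2;           /* one nibble per input byte       */
}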
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
deleted file mode 100644
index c9d6fc8a158b..000000000000
--- a/string/aarch64/strncmp-mte.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * strncmp - compare two strings
- *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result x0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data2 x4
-#define data2w w4
-#define has_nul x5
-#define diff x6
-#define syndrome x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define zeroones x11
-#define pos x12
-#define mask x13
-#define endloop x14
-#define count mask
-#define offset pos
-#define neg_offset x15
-
-/* Define endian dependent shift operations.
- On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes.
- LS_BK means shifting towards later bytes.
- */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
-
-ENTRY (__strncmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
- cbz limit, L(ret0)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
- and count, src1, #7
- b.ne L(misaligned8)
- cbnz count, L(mutual_align)
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- .p2align 4
-L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-L(start_realigned):
- subs limit, limit, #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, hi /* Last Dword or differences. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp endloop, #0, #0, eq
- b.eq L(loop_aligned)
- /* End of main loop */
-
-L(full_check):
-#ifndef __AARCH64EB__
- orr syndrome, diff, has_nul
- add limit, limit, 8 /* Rewind limit to before last subs. */
-L(syndrome_check):
- /* Limit was reached. Check if the NUL byte or the difference
- is before the limit. */
- rev syndrome, syndrome
- rev data1, data1
- clz pos, syndrome
- rev data2, data2
- lsl data1, data1, pos
- cmp limit, pos, lsr #3
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- csel result, result, xzr, hi
- ret
-#else
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit, #63, L(not_limit)
- add tmp1, limit, 8
- cbz limit, L(not_limit)
-
- lsl limit, tmp1, #3 /* Bits -> bytes. */
- mov mask, #~0
- lsr mask, mask, limit
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
-L(end_quick):
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#endif
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point.
- We also need to adjust the limit calculations, but without
- overflowing if the limit is near ULONG_MAX. */
- bic src1, src1, #7
- bic src2, src2, #7
- ldr data1, [src1], #8
- neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
- ldr data2, [src2], #8
- mov tmp2, #~0
- LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
- /* Adjust the limit and ensure it doesn't overflow. */
- adds limit, limit, count
- csinv limit, limit, xzr, lo
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b L(start_realigned)
-
- .p2align 4
- /* Don't bother with dwords for up to 16 bytes. */
-L(misaligned8):
- cmp limit, #16
- b.hs L(try_misaligned_words)
-
-L(byte_loop):
- /* Perhaps we can do better than this. */
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
- ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(byte_loop)
-L(done):
- sub result, data1, data2
- ret
- /* Align the SRC1 to a dword by doing a bytewise compare and then do
- the dword loop. */
-L(try_misaligned_words):
- cbz count, L(src1_aligned)
-
- neg count, count
- and count, count, #7
- sub limit, limit, count
-
-L(page_end_loop):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne L(done)
- subs count, count, #1
- b.hi L(page_end_loop)
-
- /* The following diagram explains the comparison of misaligned strings.
- The bytes are shown in natural order. For little-endian, it is
- reversed in the registers. The "x" bytes are before the string.
- The "|" separates data that is loaded at one time.
- src1 | a a a a a a a a | b b b c c c c c | . . .
- src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
-
- After shifting in each step, the data looks like this:
- STEP_A STEP_B STEP_C
- data1 a a a a a a a a b b b c c c c c b b b c c c c c
- data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
-
- The bytes with "0" are eliminated from the syndrome via mask.
-
- Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
- time from SRC2. The comparison happens in 3 steps. After each step
- the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
- /* Calculate offset from 8 byte alignment to string start in bits. No
- need to mask offset since shifts are ignoring upper bits. */
- lsl offset, src2, #3
- bic src2, src2, #0xf
- mov mask, -1
- neg neg_offset, offset
- ldr data1, [src1], #8
- ldp tmp1, tmp2, [src2], #16
- LS_BK mask, mask, neg_offset
- and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
- /* Skip the first compare if data in tmp1 is irrelevant. */
- tbnz offset, 6, L(misaligned_mid_loop)
-
-L(loop_misaligned):
- /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
- LS_FW data2, tmp1, offset
- LS_BK tmp1, tmp2, neg_offset
- subs limit, limit, #8
- orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
- sub has_nul, data1, zeroones
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr tmp3, data1, #REP8_7f
- csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
- orr tmp3, endloop, has_nul
- cbnz tmp3, L(full_check)
-
- ldr data1, [src1], #8
-L(misaligned_mid_loop):
- /* STEP_B: Compare first part of data1 to second part of tmp2. */
- LS_FW data2, tmp2, offset
-#ifdef __AARCH64EB__
- /* For big-endian we do a byte reverse to avoid carry-propagation
- problem described above. This way we can reuse the has_nul in the
- next step and also use syndrome value trick at the end. */
- rev tmp3, data1
- #define data1_fixed tmp3
-#else
- #define data1_fixed data1
-#endif
- sub has_nul, data1_fixed, zeroones
- orr tmp3, data1_fixed, #REP8_7f
- eor diff, data2, data1 /* Non-zero if differences found. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- cmp limit, neg_offset, lsr #3
- orr syndrome, diff, has_nul
- bic syndrome, syndrome, mask /* Ignore later bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- /* STEP_C: Compare second part of data1 to first part of tmp1. */
- ldp tmp1, tmp2, [src2], #16
- cmp limit, #8
- LS_BK data2, tmp1, neg_offset
- eor diff, data2, data1 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- and syndrome, syndrome, mask /* Ignore earlier bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- ldr data1, [src1], #8
- sub limit, limit, #8
- b L(loop_misaligned)
-
-#ifdef __AARCH64EB__
-L(syndrome_check):
- clz pos, syndrome
- cmp pos, limit, lsl #3
- b.lo L(end_quick)
-#endif
-
-L(ret0):
- mov result, #0
- ret
-END(__strncmp_aarch64_mte)
-
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 738b6539cab6..7e636b4a593d 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,20 +1,20 @@
/*
* strncmp - compare two strings
*
- * Copyright (c) 2013-2021, Arm Limited.
+ * Copyright (c) 2013-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -35,10 +35,24 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
-#define limit_wd x13
-#define mask x14
-#define endloop x15
+#define mask x13
+#define endloop x14
#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
@@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -63,56 +74,52 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
- subs limit_wd, limit_wd, #1
+ subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, L(not_limit)
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq L(not_limit)
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
+L(full_check):
+#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
-
-#ifndef __AARCH64EB__
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
+ cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
ret
#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -133,10 +140,11 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
+L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -158,22 +166,12 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#endif
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, count
- add tmp3, tmp3, count
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
- add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
.p2align 4
@@ -196,13 +194,11 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
- lsr limit_wd, limit, #3
- cbz count, L(do_misaligned)
+ cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
- lsr limit_wd, limit, #3
L(page_end_loop):
ldrb data1w, [src1], #1
@@ -213,48 +209,100 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)
-L(do_misaligned):
- /* Prepare ourselves for the next page crossing. Unlike the aligned
- loop, we fetch 1 less dword because we risk crossing bounds on
- SRC2. */
- mov count, #8
- subs limit_wd, limit_wd, #1
- b.lo L(done_loop)
-L(loop_misaligned):
- and tmp2, src2, #0xff8
- eor tmp2, tmp2, #0xff8
- cbz tmp2, L(page_end_loop)
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
- subs limit_wd, limit_wd, #1
- b.pl L(loop_misaligned)
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
-L(done_loop):
- /* We found a difference or a NULL before the limit was reached. */
- and limit, limit, #7
- cbz limit, L(not_limit)
- /* Read the last word. */
- sub src1, src1, 8
- sub src2, src2, 8
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
L(ret0):
mov result, #0
ret
-
-END ( __strncmp_aarch64)
+END(__strncmp_aarch64)
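The strncmp rework above leans on the zero-byte detector described in the comment at the top of the first hunk: (X - 1) & ~(X | 0x7f) is non-zero iff some byte of X is zero, and OR-ing that with the XOR of the two input words produces the "syndrome" whose first set bit marks either a NUL or a difference. A standalone C sketch of the detector (illustration only, not part of the imported sources; REP8_* mirror the constants used by the sub/orr/bics sequence):

#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7F 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff any byte of x is zero:
   (x - 0x01..01) & ~(x | 0x7f..7f) == (x - 0x01..01) & ~x & 0x80..80.  */
static uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7F);
}

int
main (void)
{
  uint64_t abcdefgh = 0x6867666564636261ULL; /* "abcdefgh" loaded little-endian */
  uint64_t abc_nul  = 0x6867666500636261ULL; /* "abc\0efgh" loaded little-endian */
  printf ("%d %d\n", has_zero_byte (abcdefgh) != 0, has_zero_byte (abc_nul) != 0);
  return 0;
}

Compiled on its own this prints "0 1": only the word containing an embedded NUL trips the detector, which is exactly the has_nul term folded into the syndrome above.
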
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index d5d4ea7e0309..6bd27633e224 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -1,7 +1,7 @@
/*
* memcpy benchmark.
*
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -13,14 +13,15 @@
#include "stringlib.h"
#include "benchlib.h"
-#define ITERS 5000
+#define ITERS 5000
#define ITERS2 20000000
-#define ITERS3 500000
-#define MAX_COPIES 8192
-#define SIZE (256*1024)
+#define ITERS3 200000
+#define NUM_TESTS 16384
+#define MIN_SIZE 32768
+#define MAX_SIZE (1024 * 1024)
-static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
-static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
+static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
#define F(x) {#x, x},
@@ -30,15 +31,18 @@ static const struct fun
void *(*fun)(void *, const void *, size_t);
} funtab[] =
{
- F(memcpy)
#if __aarch64__
F(__memcpy_aarch64)
# if __ARM_NEON
F(__memcpy_aarch64_simd)
# endif
+# if __ARM_FEATURE_SVE
+ F(__memcpy_aarch64_sve)
+# endif
#elif __arm__
F(__memcpy_arm)
#endif
+ F(memcpy)
#undef F
{0, 0}
};
@@ -109,7 +113,7 @@ typedef struct
uint64_t len : 16;
} copy_t;
-static copy_t copy[MAX_COPIES];
+static copy_t test_arr[NUM_TESTS];
typedef char *(*proto_t) (char *, const char *, size_t);
@@ -140,14 +144,14 @@ init_copies (size_t max_size)
size_t total = 0;
/* Create a random set of copies with the given size and alignment
distributions. */
- for (int i = 0; i < MAX_COPIES; i++)
+ for (int i = 0; i < NUM_TESTS; i++)
{
- copy[i].dst = (rand32 (0) & (max_size - 1));
- copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
- copy[i].src = (rand32 (0) & (max_size - 1));
- copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
- copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
- total += copy[i].len;
+ test_arr[i].dst = (rand32 (0) & (max_size - 1));
+ test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+ test_arr[i].src = (rand32 (0) & (max_size - 1));
+ test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+ test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
+ total += test_arr[i].len;
}
return total;
@@ -160,25 +164,27 @@ int main (void)
memset (a, 1, sizeof (a));
memset (b, 2, sizeof (b));
- printf("Random memcpy:\n");
+ printf("Random memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total = 0;
uint64_t tsum = 0;
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
rand32 (0x12345678);
- for (int size = 16384; size <= SIZE; size *= 2)
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;
- for (int c = 0; c < MAX_COPIES; c++)
- funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
+ test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < MAX_COPIES; c++)
- funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
+ test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
@@ -187,74 +193,147 @@ int main (void)
printf( "avg %.2f\n", (double)total / tsum);
}
- printf ("\nMedium memcpy:\n");
+ size_t total = 0;
+ uint64_t tsum = 0;
+ printf ("%22s ", "memcpy_call");
+ rand32 (0x12345678);
+
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ {
+ size_t copy_size = init_copies (size) * ITERS;
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+ t = clock_get_ns () - t;
+ total += copy_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ }
+ printf( "avg %.2f\n", (double)total / tsum);
+
+
+ printf ("\nAligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 16; size <= 512; size *= 2)
+ for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("%22s ", "memcpy_call");
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ memcpy (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+
+
+ printf ("\nUnaligned medium memcpy (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (b + 3, a + 1, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}
- printf ("\nLarge memcpy:\n");
+ printf ("%22s ", "memcpy_call");
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ memcpy (b + 3, a + 1, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+
+
+ printf ("\nLarge memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 1024; size <= 32768; size *= 2)
+ for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("\nUnaligned forwards memmove:\n");
+ printf ("%22s ", "memcpy_call");
+ for (int size = 1024; size <= 65536; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ memcpy (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+
+
+ printf ("\nUnaligned forwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 1024; size <= 32768; size *= 2)
+ for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, a + 256 + (i & 31), size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
- printf ("\nUnaligned backwards memmove:\n");
+ printf ("\nUnaligned backwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
- printf ("%22s (B/ns) ", funtab[f].name);
+ printf ("%22s ", funtab[f].name);
- for (int size = 1024; size <= 32768; size *= 2)
+ for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a + 256 + (i & 31), a, size);
t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
+ printf ("\n");
return 0;
}
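For reference, every bytes-per-nanosecond figure the reworked memcpy benchmark prints follows the same pattern: one untimed warm-up pass over the test set, then ITERS timed passes, reporting (bytes copied) / (elapsed ns) per size and as a weighted average. A self-contained sketch of that timing pattern (clock_ns() below is a stand-in for the library's clock_get_ns(); the volatile sink only exists to keep the copies from being optimized away):

#define _POSIX_C_SOURCE 199309L
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static uint64_t
clock_ns (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return (uint64_t) ts.tv_sec * 1000000000u + ts.tv_nsec;
}

int
main (void)
{
  enum { SIZE = 4096, ITERS = 100000 };
  static uint8_t src[SIZE], dst[SIZE];
  volatile uint8_t sink;

  memcpy (dst, src, SIZE);            /* warm-up pass, not timed */

  uint64_t t = clock_ns ();
  for (int i = 0; i < ITERS; i++)
    memcpy (dst, src, SIZE);
  t = clock_ns () - t;

  sink = dst[0];                      /* keep the copies observable */
  (void) sink;
  printf ("%dB: %.2f bytes/ns\n", SIZE, (double) SIZE * ITERS / t);
  return 0;
}
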
diff --git a/string/bench/memset.c b/string/bench/memset.c
new file mode 100644
index 000000000000..2d6196931307
--- /dev/null
+++ b/string/bench/memset.c
@@ -0,0 +1,243 @@
+/*
+ * memset benchmark.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 5000
+#define ITERS2 20000000
+#define ITERS3 1000000
+#define NUM_TESTS 16384
+#define MIN_SIZE 32768
+#define MAX_SIZE (1024 * 1024)
+
+static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, int, size_t);
+} funtab[] =
+{
+#if __aarch64__
+ F(__memset_aarch64)
+#elif __arm__
+ F(__memset_arm)
+#endif
+ F(memset)
+#undef F
+ {0, 0}
+};
+
+typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
+static memset_test_t test_arr[NUM_TESTS];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM-1)
+static uint8_t len_arr[SIZE_NUM];
+
+/* Frequency data for memset sizes up to 4096 based on SPEC2017. */
+static freq_data_t memset_len_freq[] =
+{
+{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412},
+{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414},
+{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192},
+{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140},
+{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118},
+{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74},
+{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54},
+{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33},
+{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22},
+{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15},
+{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11},
+{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6},
+{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5},
+{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3},
+{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2},
+{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2},
+{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2},
+{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1},
+{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1},
+{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1},
+{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1},
+{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1},
+{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1},
+{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1},
+{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1},
+{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1},
+{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1},
+{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1},
+{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM-1)
+static uint8_t align_arr[ALIGN_NUM];
+
+/* Alignment data for memset based on SPEC2017. */
+static align_data_t memset_align_freq[] =
+{
+ {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0}
+};
+
+static void
+init_memset_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++)
+ for (j = 0, size = memset_len_freq[i].size; j < freq; j++)
+ len_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = memset_align_freq[i].align; j < freq; j++)
+ align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+}
+
+static size_t
+init_memset (size_t max_size)
+{
+ size_t total = 0;
+ /* Create a random set of memsets with the given size and alignment
+ distributions. */
+ for (int i = 0; i < NUM_TESTS; i++)
+ {
+ test_arr[i].offset = (rand32 (0) & (max_size - 1));
+ test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK];
+ test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK];
+ total += test_arr[i].len;
+ }
+
+ return total;
+}
+
+
+int main (void)
+{
+ init_memset_distribution ();
+
+ memset (a, 1, sizeof (a));
+
+ printf("Random memset (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t total_size = 0;
+ uint64_t tsum = 0;
+ printf ("%22s ", funtab[f].name);
+ rand32 (0x12345678);
+
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ {
+ size_t memset_size = init_memset (size) * ITERS;
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
+ t = clock_get_ns () - t;
+ total_size += memset_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+ }
+ printf( "avg %.2f\n", (double)total_size / tsum);
+ }
+
+ size_t total_size = 0;
+ uint64_t tsum = 0;
+ printf ("%22s ", "memset_call");
+ rand32 (0x12345678);
+
+ for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
+ {
+ size_t memset_size = init_memset (size) * ITERS;
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ memset (a + test_arr[c].offset, 0, test_arr[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ memset (a + test_arr[c].offset, 0, test_arr[c].len);
+ t = clock_get_ns () - t;
+ total_size += memset_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+ }
+ printf( "avg %.2f\n", (double)total_size / tsum);
+
+
+ printf ("\nMedium memset (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("%22s ", "memset_call");
+ for (int size = 8; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ memset (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ }
+
+
+ printf ("\nLarge memset (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1024; size <= 65536; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("%22s ", "memset_call");
+ for (int size = 1024; size <= 65536; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ memset (a, 0, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ }
+ printf ("\n\n");
+
+ return 0;
+}
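The new memset benchmark draws its lengths and alignments from the SPEC2017-derived frequency tables by expanding each {value, frequency} pair into a flat array whose length is a power of two, then sampling it with a masked random index; the memcpy and strlen benchmarks use the same technique. A small standalone sketch of that expansion-and-sampling step (the table values here are made up for illustration and do not come from the import):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

#define N 16                           /* must equal the sum of all freqs */
static uint16_t samples[N];

static freq_data_t table[] = { {8, 8}, {16, 4}, {32, 4}, {0, 0} };

int
main (void)
{
  int n = 0;
  for (int i = 0; table[i].freq != 0; i++)
    for (int j = 0; j < table[i].freq; j++)
      samples[n++] = table[i].size;
  assert (n == N);

  /* A masked random index now yields 8 half the time and 16 or 32 a
     quarter of the time each, matching the frequencies in the table.  */
  for (int k = 0; k < 8; k++)
    printf ("%u ", samples[rand () & (N - 1)]);
  printf ("\n");
  return 0;
}
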
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
index cc0f04bee547..b7eee6e905ab 100644
--- a/string/bench/strlen.c
+++ b/string/bench/strlen.c
@@ -1,7 +1,7 @@
/*
* strlen benchmark.
*
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -13,10 +13,10 @@
#include "stringlib.h"
#include "benchlib.h"
-#define ITERS 2000
+#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 2000000
-#define NUM_STRLEN 16384
+#define NUM_TESTS 16384
#define MAX_ALIGN 32
#define MAX_STRLEN 256
@@ -49,7 +49,7 @@ static const struct fun
};
#undef F
-static uint16_t strlen_tests[NUM_STRLEN];
+static uint16_t strlen_tests[NUM_TESTS];
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -117,7 +117,7 @@ init_strlen_tests (void)
/* Create a random set of strlen input strings using the string length
and alignment distributions. */
- for (int n = 0; n < NUM_STRLEN; n++)
+ for (int n = 0; n < NUM_TESTS; n++)
{
int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
@@ -141,14 +141,14 @@ int main (void)
size_t res = 0, strlen_size = 0, mask = maskv;
printf ("%22s ", funtab[f].name);
- for (int c = 0; c < NUM_STRLEN; c++)
+ for (int c = 0; c < NUM_TESTS; c++)
strlen_size += funtab[f].fun (a + strlen_tests[c]);
strlen_size *= ITERS;
/* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_STRLEN; c++)
+ for (int c = 0; c < NUM_TESTS; c++)
res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
t = clock_get_ns () - t;
printf ("%.2f\n", (double)strlen_size / t);
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 378c3cd2d645..85e630279ceb 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
void * __memchr_aarch64_mte (const void *, int, size_t);
-char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
-int __strcmp_aarch64_mte (const char *, const char *);
-int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
#endif
# if __ARM_FEATURE_SVE
+void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);
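With __memcpy_aarch64_sve now declared in stringlib.h, a caller building inside the optimized-routines tree can select it at compile time with the same preprocessor guards the benchmark and test tables use. A hypothetical wrapper sketch (copy_best is an invented name; it assumes the stringlib.h shown above is on the include path and a GCC/Clang toolchain that accepts __restrict):

#include <string.h>
#include "stringlib.h"

/* Pick the strongest memcpy that was compiled in, falling back to libc.  */
static inline void *
copy_best (void *__restrict dst, const void *__restrict src, size_t n)
{
#if __aarch64__ && __ARM_FEATURE_SVE
  return __memcpy_aarch64_sve (dst, src, n);
#elif __aarch64__
  return __memcpy_aarch64 (dst, src, n);
#else
  return memcpy (dst, src, n);
#endif
}
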
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index ce0ceeef5ee8..21b35b990b9b 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -28,6 +28,9 @@ static const struct fun
# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
# endif
+# if __ARM_FEATURE_SVE
+ F(__memcpy_aarch64_sve, 1)
+# endif
#elif __arm__
F(__memcpy_arm, 0)
#endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 689b68c98af2..12a70574c7c5 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -28,6 +28,9 @@ static const struct fun
# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
# endif
+# if __ARM_FEATURE_SVE
+ F(__memmove_aarch64_sve, 1)
+# endif
#endif
{0, 0, 0}
// clang-format on
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index 1827e68c9a30..1b61245bf8df 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -28,8 +28,7 @@ static const struct fun
// clang-format off
F(stpcpy, 0)
#if __aarch64__
- F(__stpcpy_aarch64, 0)
- F(__stpcpy_aarch64_mte, 1)
+ F(__stpcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__stpcpy_aarch64_sve, 1)
# endif
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index d57b54ed50a8..0262397dec88 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcmp, 0)
#if __aarch64__
- F(__strcmp_aarch64, 0)
- F(__strcmp_aarch64_mte, 1)
+ F(__strcmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcmp_aarch64_sve, 1)
# endif
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index e84cace9c8c6..6de3bed590ef 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcpy, 0)
#if __aarch64__
- F(__strcpy_aarch64, 0)
- F(__strcpy_aarch64_mte, 1)
+ F(__strcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcpy_aarch64_sve, 1)
# endif
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 018a8a431ab8..f8c2167f8f1e 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strncmp, 0)
#if __aarch64__
- F(__strncmp_aarch64, 0)
- F(__strncmp_aarch64_mte, 1)
+ F(__strncmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strncmp_aarch64_sve, 1)
# endif