Diffstat (limited to 'string')
-rw-r--r--  string/Dir.mk | 113
-rw-r--r--  string/aarch64/__mtag_tag_region.S | 100
-rw-r--r--  string/aarch64/__mtag_tag_zero_region.S | 100
-rw-r--r--  string/aarch64/check-arch.S | 13
-rw-r--r--  string/aarch64/memchr-mte.S | 116
-rw-r--r--  string/aarch64/memchr-sve.S | 64
-rw-r--r--  string/aarch64/memchr.S | 146
-rw-r--r--  string/aarch64/memcmp-sve.S | 51
-rw-r--r--  string/aarch64/memcmp.S | 137
-rw-r--r--  string/aarch64/memcpy-advsimd.S | 206
-rw-r--r--  string/aarch64/memcpy.S | 243
-rw-r--r--  string/aarch64/memrchr.S | 117
-rw-r--r--  string/aarch64/memset.S | 117
-rw-r--r--  string/aarch64/stpcpy-mte.S | 10
-rw-r--r--  string/aarch64/stpcpy-sve.S | 10
-rw-r--r--  string/aarch64/stpcpy.S | 10
-rw-r--r--  string/aarch64/strchr-mte.S | 105
-rw-r--r--  string/aarch64/strchr-sve.S | 70
-rw-r--r--  string/aarch64/strchr.S | 126
-rw-r--r--  string/aarch64/strchrnul-mte.S | 84
-rw-r--r--  string/aarch64/strchrnul-sve.S | 9
-rw-r--r--  string/aarch64/strchrnul.S | 114
-rw-r--r--  string/aarch64/strcmp-mte.S | 189
-rw-r--r--  string/aarch64/strcmp-sve.S | 59
-rw-r--r--  string/aarch64/strcmp.S | 173
-rw-r--r--  string/aarch64/strcpy-mte.S | 161
-rw-r--r--  string/aarch64/strcpy-sve.S | 71
-rw-r--r--  string/aarch64/strcpy.S | 311
-rw-r--r--  string/aarch64/strlen-mte.S | 80
-rw-r--r--  string/aarch64/strlen-sve.S | 55
-rw-r--r--  string/aarch64/strlen.S | 200
-rw-r--r--  string/aarch64/strncmp-mte.S | 307
-rw-r--r--  string/aarch64/strncmp-sve.S | 69
-rw-r--r--  string/aarch64/strncmp.S | 260
-rw-r--r--  string/aarch64/strnlen-sve.S | 74
-rw-r--r--  string/aarch64/strnlen.S | 112
-rw-r--r--  string/aarch64/strrchr-mte.S | 127
-rw-r--r--  string/aarch64/strrchr-sve.S | 84
-rw-r--r--  string/aarch64/strrchr.S | 149
-rw-r--r--  string/arm/check-arch.S | 10
-rw-r--r--  string/arm/memchr.S | 132
-rw-r--r--  string/arm/memcpy.S | 587
-rw-r--r--  string/arm/memset.S | 98
-rw-r--r--  string/arm/strcmp-armv6m.S | 117
-rw-r--r--  string/arm/strcmp.S | 475
-rw-r--r--  string/arm/strcpy.c | 133
-rw-r--r--  string/arm/strlen-armv6t2.S | 124
-rw-r--r--  string/asmdefs.h | 98
-rw-r--r--  string/bench/memcpy.c | 260
-rw-r--r--  string/bench/strlen.c | 221
-rw-r--r--  string/include/benchlib.h | 33
-rw-r--r--  string/include/stringlib.h | 69
-rw-r--r--  string/test/__mtag_tag_region.c | 147
-rw-r--r--  string/test/__mtag_tag_zero_region.c | 147
-rw-r--r--  string/test/memchr.c | 110
-rw-r--r--  string/test/memcmp.c | 125
-rw-r--r--  string/test/memcpy.c | 120
-rw-r--r--  string/test/memmove.c | 164
-rw-r--r--  string/test/memrchr.c | 106
-rw-r--r--  string/test/memset.c | 129
-rw-r--r--  string/test/mte.h | 142
-rw-r--r--  string/test/stpcpy.c | 125
-rw-r--r--  string/test/strchr.c | 121
-rw-r--r--  string/test/strchrnul.c | 126
-rw-r--r--  string/test/strcmp.c | 132
-rw-r--r--  string/test/strcpy.c | 123
-rw-r--r--  string/test/stringtest.h | 55
-rw-r--r--  string/test/strlen.c | 103
-rw-r--r--  string/test/strncmp.c | 139
-rw-r--r--  string/test/strnlen.c | 109
-rw-r--r--  string/test/strrchr.c | 121
-rw-r--r--  string/x86_64/check-arch.S | 10
72 files changed, 9253 insertions, 0 deletions
diff --git a/string/Dir.mk b/string/Dir.mk
new file mode 100644
index 000000000000..cf3453f7580d
--- /dev/null
+++ b/string/Dir.mk
@@ -0,0 +1,113 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2019-2021, Arm Limited.
+# SPDX-License-Identifier: MIT
+
+S := $(srcdir)/string
+B := build/string
+
+ifeq ($(ARCH),)
+all-string bench-string check-string install-string clean-string:
+ @echo "*** Please set ARCH in config.mk. ***"
+ @exit 1
+else
+
+string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-test-srcs := $(wildcard $(S)/test/*.c)
+string-bench-srcs := $(wildcard $(S)/bench/*.c)
+
+string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+
+string-libs := \
+ build/lib/libstringlib.so \
+ build/lib/libstringlib.a \
+
+string-tests := \
+ build/bin/test/memcpy \
+ build/bin/test/memmove \
+ build/bin/test/memset \
+ build/bin/test/memchr \
+ build/bin/test/memrchr \
+ build/bin/test/memcmp \
+ build/bin/test/__mtag_tag_region \
+ build/bin/test/__mtag_tag_zero_region \
+ build/bin/test/strcpy \
+ build/bin/test/stpcpy \
+ build/bin/test/strcmp \
+ build/bin/test/strchr \
+ build/bin/test/strrchr \
+ build/bin/test/strchrnul \
+ build/bin/test/strlen \
+ build/bin/test/strnlen \
+ build/bin/test/strncmp
+
+string-benches := \
+ build/bin/bench/memcpy \
+ build/bin/bench/strlen
+
+string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
+string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
+string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs)))
+
+string-objs := \
+ $(string-lib-objs) \
+ $(string-lib-objs:%.o=%.os) \
+ $(string-test-objs) \
+ $(string-bench-objs)
+
+string-files := \
+ $(string-objs) \
+ $(string-libs) \
+ $(string-tests) \
+ $(string-benches) \
+ $(string-includes) \
+
+all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
+
+$(string-objs): $(string-includes)
+$(string-objs): CFLAGS_ALL += $(string-cflags)
+
+$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
+
+build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os)
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
+
+build/lib/libstringlib.a: $(string-lib-objs)
+ rm -f $@
+ $(AR) rc $@ $^
+ $(RANLIB) $@
+
+build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/include/%.h: $(S)/include/%.h
+ cp $< $@
+
+build/bin/%.sh: $(S)/test/%.sh
+ cp $< $@
+
+string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out)
+
+build/string/test/%.out: build/bin/test/%
+ $(EMULATOR) $^ | tee $@.tmp
+ mv $@.tmp $@
+
+check-string: $(string-tests-out)
+ ! grep FAIL $^
+
+bench-string: $(string-benches)
+ $(EMULATOR) build/bin/bench/strlen
+ $(EMULATOR) build/bin/bench/memcpy
+
+install-string: \
+ $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
+ $(string-includes:build/include/%=$(DESTDIR)$(includedir)/%)
+
+clean-string:
+ rm -f $(string-files)
+endif
+
+.PHONY: all-string bench-string check-string install-string clean-string
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
new file mode 100644
index 000000000000..84339f73cf23
--- /dev/null
+++ b/string/aarch64/__mtag_tag_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_region - tag memory
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin x0
+#define count x1
+#define dst x2
+#define dstend x3
+#define tmp x4
+#define zva_val x4
+
+ENTRY (__mtag_tag_region)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+
+ tbnz count, 6, L(set96)
+
+ /* Set 0, 16, 32, or 48 bytes. */
+ lsr tmp, count, 5
+ add tmp, dstin, tmp, lsl 4
+ cbz count, L(end)
+ stg dstin, [dstin]
+ stg dstin, [tmp]
+ stg dstin, [dstend, -16]
+L(end):
+ ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ st2g dstin, [dstin]
+ st2g dstin, [dstin, 32]
+ st2g dstin, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Size is > 96 bytes. */
+L(set_long):
+ cmp count, 160
+ b.lo L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ st2g dstin, [dstin]
+ st2g dstin, [dstin, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc gva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ st2g dstin, [dstend, -64]
+ st2g dstin, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub dst, dstin, 32 /* Dst is biased by -32. */
+ sub count, count, 64 /* Adjust count for loop. */
+L(no_zva_loop):
+ st2g dstin, [dst, 32]
+ st2g dstin, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ st2g dstin, [dstend, -64]
+ st2g dstin, [dstend, -32]
+ ret
+
+END (__mtag_tag_region)
+#endif
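
As a behavioural reference only, the interface contract stated in the header comment (16-byte aligned address, size a multiple of 16, the passed pointer returned) can be modelled in C as below. This is a hedged sketch: stg() is a hypothetical stand-in for the STG instruction, and the real routine batches the work with ST2G and DC GVA as shown above.

    #include <stddef.h>

    extern void stg (void *tag_src, void *addr);   /* hypothetical STG wrapper */

    static void *tag_region_model (void *dst, size_t len)
    {
        /* len is a multiple of 16 and dst is 16-byte aligned (caller contract). */
        for (size_t i = 0; i < len; i += 16)
            stg (dst, (char *) dst + i);           /* store dst's tag at each granule */
        return dst;                                /* returns the passed pointer */
    }
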
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
new file mode 100644
index 000000000000..f58364ca6fcb
--- /dev/null
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_zero_region - tag memory and fill it with zero bytes
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin x0
+#define count x1
+#define dst x2
+#define dstend x3
+#define tmp x4
+#define zva_val x4
+
+ENTRY (__mtag_tag_zero_region)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+
+ tbnz count, 6, L(set96)
+
+ /* Set 0, 16, 32, or 48 bytes. */
+ lsr tmp, count, 5
+ add tmp, dstin, tmp, lsl 4
+ cbz count, L(end)
+ stzg dstin, [dstin]
+ stzg dstin, [tmp]
+ stzg dstin, [dstend, -16]
+L(end):
+ ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ stz2g dstin, [dstin]
+ stz2g dstin, [dstin, 32]
+ stz2g dstin, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Size is > 96 bytes. */
+L(set_long):
+ cmp count, 160
+ b.lo L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ stz2g dstin, [dstin]
+ stz2g dstin, [dstin, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc gzva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stz2g dstin, [dstend, -64]
+ stz2g dstin, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub dst, dstin, 32 /* Dst is biased by -32. */
+ sub count, count, 64 /* Adjust count for loop. */
+L(no_zva_loop):
+ stz2g dstin, [dst, 32]
+ stz2g dstin, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stz2g dstin, [dstend, -64]
+ stz2g dstin, [dstend, -32]
+ ret
+
+END (__mtag_tag_zero_region)
+#endif
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
new file mode 100644
index 000000000000..5a54242d7de6
--- /dev/null
+++ b/string/aarch64/check-arch.S
@@ -0,0 +1,13 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__aarch64__
+# error ARCH setting does not match the compiler.
+#endif
+
+/* Include for GNU property notes. */
+#include "../asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
new file mode 100644
index 000000000000..c2e967d1004e
--- /dev/null
+++ b/string/aarch64/memchr-mte.S
@@ -0,0 +1,116 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__memchr_aarch64_mte)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ bic src, srcin, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+
+ rbit synd, synd
+ clz synd, synd
+ add result, srcin, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, src, srcin
+ add tmp, tmp, 16
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, 16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ add tmp, srcin, cntin
+ sub cntrem, tmp, src
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ cmp cntrem, synd, lsr 2
+ add result, src, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (__memchr_aarch64_mte)
+
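
A scalar C sketch of the 4-bits-per-byte syndrome described in the comment above; this is an illustration only, since the real code builds the syndrome with CMEQ/AND/ADDP and transfers it to a general register with FMOV.

    #include <stdint.h>

    /* Nibble i of the syndrome is 0xf if byte i of the chunk matches c. */
    static int first_match_index (const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= (uint64_t) 0xf << (i * 4);
        if (synd == 0)
            return -1;                         /* no match in this chunk */
        return __builtin_ctzll (synd) / 4;     /* trailing zeros -> byte index */
    }
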
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
new file mode 100644
index 000000000000..c22e6596f19b
--- /dev/null
+++ b/string/aarch64/memchr-sve.S
@@ -0,0 +1,64 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__memchr_aarch64_sve)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ dup z1.b, w1 /* duplicate c to a vector */
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+
+ .p2align 4
+0: whilelo p1.b, x3, x2 /* make sure off < max */
+ b.none 9f
+
+ /* Read a vector's worth of bytes, bounded by max,
+ stopping on first fault. */
+ ldff1b z0.b, p1/z, [x0, x3]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector bounded by max is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x3 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ b.none 0b
+ decb x3 /* undo speculate */
+
+ /* Found C. */
+1: brkb p2.b, p1/z, p2.b /* find the first c */
+ add x0, x0, x3 /* form partial pointer */
+ incp x0, p2.b /* form final pointer to c */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b
+ b.any 1b
+
+ /* No C found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x3, p0.b
+ b 0b
+
+ /* Found end of count. */
+9: mov x0, 0 /* return null */
+ ret
+
+END (__memchr_aarch64_sve)
+
+#endif
+
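
The first-fault loop above can be read as the scalar model below, where V stands for the SVE vector length in bytes. It is a simplification that ignores partial first-fault reads, which the real code handles with RDFFRS and the FFR.

    #include <stddef.h>

    static const void *memchr_sve_model (const unsigned char *s, unsigned char c,
                                         size_t n, size_t V)
    {
        for (size_t off = 0; off < n; off += V) {          /* whilelo: off < max */
            size_t chunk = (n - off < V) ? n - off : V;    /* bound the read by max */
            for (size_t i = 0; i < chunk; i++)             /* cmpeq; brkb; incp */
                if (s[off + i] == c)
                    return s + off + i;
        }
        return 0;                                          /* end of count: NULL */
    }
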
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
new file mode 100644
index 000000000000..353f0d1eac53
--- /dev/null
+++ b/string/aarch64/memchr.S
@@ -0,0 +1,146 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte matched.
+ */
+
+ENTRY (__memchr_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq L(loop)
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls L(end)
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, L(loop)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* Only do the clear for the last possible block */
+ b.hs L(tail)
+
+L(masklast):
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+L(tail):
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+
+END (__memchr_aarch64)
+
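
A scalar sketch of the two-bits-per-byte syndrome and of the masking applied to the first, unaligned 32-byte block (soff is the misalignment within the block, as in the code above); the NEON code arrives at the same value via the 0x40100401 mask and ADDP reductions.

    #include <stdint.h>

    static uint64_t syndrome32 (const unsigned char *block, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 32; i++)              /* bit 0 of each 2-bit tuple */
            if (block[i] == c)
                synd |= (uint64_t) 1 << (i * 2);
        return synd;
    }

    /* For the first block, matches in the soff bytes before the real start are
       discarded by shifting the low soff*2 bits out and back in, exactly as the
       lsr/lsl pair does above.  */
    static uint64_t mask_leading (uint64_t synd, unsigned soff)   /* soff < 32 */
    {
        return (synd >> (2 * soff)) << (2 * soff);
    }
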
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
new file mode 100644
index 000000000000..78c5ecaa4cdc
--- /dev/null
+++ b/string/aarch64/memcmp-sve.S
@@ -0,0 +1,51 @@
+/*
+ * memcmp - compare memory
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__memcmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ld1b z0.b, p0/z, [x0, x3] /* read vectors bounded by max. */
+ ld1b z1.b, p0/z, [x1, x3]
+
+ /* Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+
+ cmpne p1.b, p0/z, z0.b, z1.b /* while no inequalities */
+ b.none 0b
+
+ /* Found inequality. */
+1: brkb p1.b, p0/z, p1.b /* find first such */
+ lasta w0, p1, z0.b /* extract each byte */
+ lasta w1, p1, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equality */
+ ret
+
+END (__memcmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
new file mode 100644
index 000000000000..3b1026642eee
--- /dev/null
+++ b/string/aarch64/memcmp.S
@@ -0,0 +1,137 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#include "../asmdefs.h"
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
+
+ENTRY (__memcmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
+ ret
+
+END (__memcmp_aarch64)
+
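
On little-endian, the L(return) sequence byte-reverses both 8-byte words so that a plain unsigned comparison orders them by the first differing byte. A small C sketch of that computation, assuming the 64-bit loads used above:

    #include <stdint.h>

    static int return_value (uint64_t data1, uint64_t data2)
    {
        data1 = __builtin_bswap64 (data1);   /* the rev instructions */
        data2 = __builtin_bswap64 (data2);
        if (data1 == data2)
            return 0;                        /* cset/cneg yield 0 */
        return data1 < data2 ? -1 : 1;       /* ordered by first differing byte */
    }
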
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
new file mode 100644
index 000000000000..f97f2c3047b9
--- /dev/null
+++ b/string/aarch64/memcpy-advsimd.S
@@ -0,0 +1,206 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_simd)
+ENTRY (__memcpy_aarch64_simd)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+ ret
+
+END (__memcpy_aarch64_simd)
+
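
The structure described in the block comment (small, medium and large copies, with the overlap check only on the large path) can be summarised by the C sketch below. The copy_* helpers are hypothetical placeholders for the load/store sequences above, not real functions.

    #include <stddef.h>
    #include <stdint.h>

    extern void copy_0_32 (void *, const void *, size_t);
    extern void copy_33_128 (void *, const void *, size_t);
    extern void copy_large_forwards (void *, const void *, size_t);
    extern void copy_large_backwards (void *, const void *, size_t);

    void *memcpy_dispatch_sketch (void *dst, const void *src, size_t n)
    {
        if (n <= 32)
            copy_0_32 (dst, src, n);             /* overlapping small accesses */
        else if (n <= 128)
            copy_33_128 (dst, src, n);           /* Q-register pairs from both ends */
        else if ((uintptr_t) dst - (uintptr_t) src >= n)
            copy_large_forwards (dst, src, n);   /* 64 bytes per iteration */
        else
            copy_large_backwards (dst, src, n);  /* overlap: copy from the end */
        return dst;
    }
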
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
new file mode 100644
index 000000000000..dd254f6f9929
--- /dev/null
+++ b/string/aarch64/memcpy.S
@@ -0,0 +1,243 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64)
+ENTRY (__memcpy_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+END (__memcpy_aarch64)
+
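
The overlap test at L(copy_long) relies on unsigned wrap-around: one subtract-and-compare decides whether a forward copy would overwrite source bytes that have not been read yet (dst == src is filtered out earlier by the cbz). A one-line C equivalent of the check:

    #include <stddef.h>
    #include <stdint.h>

    /* True when dst lies inside [src, src + n), so the copy must run backwards.
       Matches: sub tmp1, dstin, src; cmp tmp1, count; b.lo L(copy_long_backwards). */
    static int must_copy_backwards (uintptr_t dst, uintptr_t src, size_t n)
    {
        return (uintptr_t) (dst - src) < (uintptr_t) n;
    }
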
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
new file mode 100644
index 000000000000..7b4be847cecb
--- /dev/null
+++ b/string/aarch64/memrchr.S
@@ -0,0 +1,117 @@
+/*
+ * memrchr - find last character in a memory zone.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+#define end x8
+#define endm1 x9
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__memrchr_aarch64)
+ PTR_ARG (0)
+ add end, srcin, cntin
+ sub endm1, end, 1
+ bic src, endm1, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ neg shift, end, lsl 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsl synd, synd, shift
+ cbz synd, L(start_loop)
+
+ clz synd, synd
+ sub result, endm1, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, -16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, -16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+
+ add tmp, src, 15
+#ifdef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ sub tmp, tmp, synd, lsr 2
+ cmp tmp, srcin
+ csel result, tmp, xzr, hs
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (__memrchr_aarch64)
+
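
Because memrchr searches from the end, it masks off syndrome bits past the end of the buffer and counts leading rather than trailing zeros to locate the last match. A scalar sketch using the same 4-bits-per-byte syndrome:

    #include <stdint.h>

    /* Index of the last matching byte in a 16-byte chunk, or -1 if none. */
    static int last_match_index (const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= (uint64_t) 0xf << (i * 4);
        if (synd == 0)
            return -1;
        return 15 - __builtin_clzll (synd) / 4;   /* leading zeros -> last byte */
    }
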
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
new file mode 100644
index 000000000000..9fcd97579913
--- /dev/null
+++ b/string/aarch64/memset.S
@@ -0,0 +1,117 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+
+ENTRY (__memset_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ .p2align 4
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_aarch64)
+
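
Whether the DC ZVA path may be used is decided by reading DCZID_EL0, as in the SKIP_ZVA_CHECK block above. A hedged C sketch of that check; read_dczid_el0() is a hypothetical accessor standing in for the MRS instruction:

    #include <stdint.h>

    extern uint64_t read_dczid_el0 (void);   /* hypothetical: mrs x, dczid_el0 */

    /* DCZID_EL0[3:0] is log2 of the block size in 4-byte words and bit 4 is the
       "zeroing prohibited" flag, so masking with 31 and comparing with 4 accepts
       only a permitted 64-byte block -- the case the code above requires.  */
    static int zva_block_is_64_bytes (void)
    {
        return (read_dczid_el0 () & 31) == 4;
    }
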
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
new file mode 100644
index 000000000000..f1c711906515
--- /dev/null
+++ b/string/aarch64/stpcpy-mte.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
new file mode 100644
index 000000000000..82dd9717b0a0
--- /dev/null
+++ b/string/aarch64/stpcpy-sve.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-sve.S"
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
new file mode 100644
index 000000000000..4f62aa462389
--- /dev/null
+++ b/string/aarch64/stpcpy.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy.S"
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
new file mode 100644
index 000000000000..dcb0e4625870
--- /dev/null
+++ b/string/aarch64/strchr-mte.S
@@ -0,0 +1,105 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define wtmp2 w3
+#define tmp3 x3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v6
+#define dend d6
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+   bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchr_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
+
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp3
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ /* Tmp1 is an even multiple of 2 if the target character was
+ found first. Otherwise we've found the end of string. */
+ tst tmp1, 2
+ add result, srcin, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+#else
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ /* Tmp1 is an even multiple of 2 if the target character was
+ found first. Otherwise we've found the end of string. */
+ tst tmp1, 2
+ add result, src, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+END (__strchr_aarch64_mte)
+
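
The syndrome here records two facts per byte, so one trailing-zero count tells both where the scan stopped and why: bit 1 of the count is clear for a character match and set when a NUL terminator came first. A scalar sketch of that decision:

    #include <stdint.h>

    /* Assumes the chunk is known to contain c or a NUL (the loop above only
       exits when one of the two has been seen).  */
    static const char *strchr_chunk_model (const char *chunk, char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++) {
            if (chunk[i] == c)
                synd |= (uint64_t) 0x3 << (i * 4);   /* bits 0-1: match */
            if (chunk[i] == 0 || chunk[i] == c)
                synd |= (uint64_t) 0xc << (i * 4);   /* bits 2-3: NUL or match */
        }
        int bit = __builtin_ctzll (synd);
        return (bit & 2) ? 0 : chunk + bit / 4;      /* tst tmp1, 2; csel */
    }
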
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
new file mode 100644
index 000000000000..13ba9f44f9c5
--- /dev/null
+++ b/string/aarch64/strchr-sve.S
@@ -0,0 +1,70 @@
+/*
+ * strchr/strchrnul - find a character in a string
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */
+#ifdef BUILD_STRCHRNUL
+#define FUNC __strchrnul_aarch64_sve
+#else
+#define FUNC __strchr_aarch64_sve
+#endif
+
+ENTRY (FUNC)
+ PTR_ARG (0)
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */
+ b.none 0b
+ decb x0 /* undo speculate */
+
+ /* Found C or 0. */
+1: brka p4.b, p1/z, p4.b /* find first such */
+ sub x0, x0, 1 /* adjust pointer for that byte */
+ incp x0, p4.b
+#ifndef BUILD_STRCHRNUL
+ ptest p4, p2.b /* was first in c? */
+ csel x0, xzr, x0, none /* if there was no c, return null */
+#endif
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */
+ b.any 1b
+
+ /* No C or 0 found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x0, p0.b
+ b 0b
+
+END (FUNC)
+
+#endif
+
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
new file mode 100644
index 000000000000..1063cbfd77aa
--- /dev/null
+++ b/string/aarch64/strchr.S
@@ -0,0 +1,126 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+   off bit 0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+/* Locals and temporaries. */
+
+ENTRY (__strchr_aarch64)
+ PTR_ARG (0)
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(loop)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, L(tail)
+
+ .p2align 4
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ mov tmp1, vend1.d[0]
+L(tail):
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* And counting the leading zeros. */
+	/* Tmp1 is even if the target character was found first. Otherwise
+ we've found the end of string and we weren't looking for NUL. */
+ tst tmp1, #1
+ add result, src, tmp1, lsr #1
+ csel result, result, xzr, eq
+ ret
+
+END (__strchr_aarch64)
+
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
new file mode 100644
index 000000000000..1b0d0a63094c
--- /dev/null
+++ b/string/aarch64/strchrnul-mte.S
@@ -0,0 +1,84 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define tmp2 x3
+#define tmp2w w3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vend v5
+#define dend d5
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchrnul_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov tmp2w, 0xf00f
+ dup vrepmask.8h, tmp2w
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ lsl tmp2, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ add result, srcin, tmp1, lsr 2
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+#ifndef __AARCH64EB__
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ add result, src, tmp1, lsr 2
+ ret
+
+END (__strchrnul_aarch64_mte)
+
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
new file mode 100644
index 000000000000..428ff1a3d008
--- /dev/null
+++ b/string/aarch64/strchrnul-sve.S
@@ -0,0 +1,9 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2018-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STRCHRNUL
+#include "strchr-sve.S"
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
new file mode 100644
index 000000000000..a4230d919b47
--- /dev/null
+++ b/string/aarch64/strchrnul.S
@@ -0,0 +1,114 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ENTRY (__strchrnul_aarch64)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq L(loop)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, L(tail)
+
+ .p2align 4
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.d[0]
+L(tail):
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+END (__strchrnul_aarch64)
+
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
new file mode 100644
index 000000000000..12d1a6b51dd3
--- /dev/null
+++ b/string/aarch64/strcmp-mte.S
@@ -0,0 +1,189 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define src1 x0
+#define src2 x1
+#define result x0
+
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define off1 x5
+#define syndrome x6
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian, early bytes are at the MSB end; on little-endian, at the LSB end.
+   LS_FW means shifting towards the early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
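+
+/* Illustrative only: a scalar C model of the detection step (hypothetical
+   helper, 64-bit unsigned arithmetic assumed):
+
+     static inline int
+     has_nul_byte (unsigned long long x)
+     {
+       return ((x - 0x0101010101010101ULL)
+               & ~(x | 0x7f7f7f7f7f7f7f7fULL)) != 0;
+     }
+
+   The result is non-zero exactly when one of the eight bytes of X is zero;
+   the carry propagation mentioned above only disturbs which bits end up set,
+   not the zero/non-zero answer. */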
+
+
+ENTRY (__strcmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
+ b.ne L(misaligned8)
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
+L(loop_aligned):
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+L(start_realigned):
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ rev data2, data2
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, shift
+ lsl data2, data2, shift
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
+ ret
+
+ .p2align 4
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
+L(do_misaligned):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, 7
+ b.ne L(do_misaligned)
+
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64_mte)
+
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
new file mode 100644
index 000000000000..e6d2da5411ca
--- /dev/null
+++ b/string/aarch64/strcmp-sve.S
@@ -0,0 +1,59 @@
+/*
+ * __strcmp_aarch64_sve - compare two strings
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strcmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ setffr /* initialize FFR */
+ ptrue p1.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+ .p2align 4
+0: ldff1b z0.b, p1/z, [x0, x2]
+ ldff1b z1.b, p1/z, [x1, x2]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x2, all /* skip bytes for next round */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings */
+ cmpne p3.b, p1/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p1/z, p2.b, p3.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p1/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: incp x2, p0.b /* skip bytes for next round */
+ setffr /* re-init FFR for next round */
+ cmpeq p2.b, p0/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p0/z, z0.b, 0
+ nands p2.b, p0/z, p2.b, p3.b
+ b.none 0b
+ b 1b
+
+END (__strcmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
new file mode 100644
index 000000000000..7714ebf5577d
--- /dev/null
+++ b/string/aarch64/strcmp.S
@@ -0,0 +1,173 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+ENTRY (__strcmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+	   the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond a page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of a 4K page. If so, jump
+	   back to the misaligned loop to compare a byte at a time. */
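+	/* Illustrative only: the and/eor/cbz sequence below is equivalent to
+	   the hypothetical C test
+	     ((unsigned long) src2 & 0xff8) == 0xff8
+	   i.e. SRC2 points into the last 8 bytes of a 4K page. */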
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64)
+
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
new file mode 100644
index 000000000000..88c222d61e53
--- /dev/null
+++ b/string/aarch64/strcpy-mte.S
@@ -0,0 +1,161 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define srcin x1
+#define result x0
+
+#define src x2
+#define dst x3
+#define len x4
+#define synd x4
+#define tmp x5
+#define wtmp w5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+#define dataq2 q1
+
+#ifdef BUILD_STPCPY
+# define STRCPY __stpcpy_aarch64_mte
+# define IFSTPCPY(X,...) X,__VA_ARGS__
+#else
+# define STRCPY __strcpy_aarch64_mte
+# define IFSTPCPY(X,...)
+#endif
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
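+
+/* Illustrative only: a scalar little-endian model (hypothetical helper) of
+   the merged syndrome described above: nibble i of the result is 0xf when
+   byte i of the 16-byte chunk is NUL, so a trailing-zero count divided by
+   four yields the byte index.
+
+     static unsigned long long
+     model_syndrome (const unsigned char *chunk)   // 16 bytes
+     {
+       unsigned long long syndrome = 0;
+       for (int i = 0; i < 16; i++)
+         if (chunk[i] == 0)
+           syndrome |= 0xfULL << (4 * i);
+       return syndrome;
+     }
+*/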
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4,,8
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+
+ .p2align 4
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
+ str data1, [dstin]
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(start_loop):
+ sub len, src, srcin
+ ldr dataq2, [srcin]
+ add dst, dstin, len
+ str dataq2, [dstin]
+
+ .p2align 5
+L(loop):
+ str dataq, [dst], 16
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz len, synd
+ lsr len, len, 2
+ sub tmp, len, 15
+ ldr dataq, [src, tmp]
+ str dataq, [dst, tmp]
+ IFSTPCPY (add result, dst, len)
+ ret
+
+END (STRCPY)
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
new file mode 100644
index 000000000000..f515462e09ae
--- /dev/null
+++ b/string/aarch64/strcpy-sve.S
@@ -0,0 +1,71 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */
+#ifdef BUILD_STPCPY
+#define FUNC __stpcpy_aarch64_sve
+#else
+#define FUNC __strcpy_aarch64_sve
+#endif
+
+ENTRY (FUNC)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ setffr /* initialize FFR */
+ ptrue p2.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p2/z, [x1, x2]
+ rdffrs p0.b, p2/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p1.b, p2/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the whole vector and loop. */
+ st1b z0.b, p2, [x0, x2]
+ incb x2, all
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+1: cmpeq p1.b, p0/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the valid portion of the vector and loop. */
+ setffr /* re-init FFR */
+ st1b z0.b, p0, [x0, x2]
+ incp x2, p0.b
+ b 0b
+
+ /* Zero found. Crop the vector to the found zero and finish. */
+2: brka p0.b, p2/z, p1.b
+ st1b z0.b, p0, [x0, x2]
+#ifdef BUILD_STPCPY
+ add x0, x0, x2
+ sub x0, x0, 1
+ incp x0, p0.b
+#endif
+ ret
+
+END (FUNC)
+
+#endif
+
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
new file mode 100644
index 000000000000..6e9ed424b693
--- /dev/null
+++ b/string/aarch64/strcpy.S
@@ -0,0 +1,311 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+#include "../asmdefs.h"
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY __stpcpy_aarch64
+#else
+#define STRCPY __strcpy_aarch64
+#endif
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
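+
+/* Illustrative only: the entry check below is equivalent to the hypothetical
+   C predicate
+
+     static inline int
+     first_fetch_may_cross_page (unsigned long srcin)
+     {
+       return (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
+     }
+
+   i.e. the initial 16-byte fetch can straddle a (minimum-sized) page only
+   when the offset within the page exceeds MIN_PAGE_SIZE - 16. */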
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt L(page_cross)
+
+L(page_cross_ok):
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq L(bulk_entry)
+
+	/* The string is short (<=16 bytes), but we don't yet know exactly
+	   how short. Work out the exact length so that we can quickly
+	   select the optimal copy strategy. */
+L(fp_gt8):
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+L(fp_le8):
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt L(fp_lt4)
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+L(fp_lt2):
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+	/* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+L(bulk_entry):
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b L(entry_no_page_cross)
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+L(main_loop):
+ stp data1, data2, [dst], #16
+L(entry_no_page_cross):
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(main_loop)
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+L(page_cross):
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(page_cross_ok)
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, L(fp_le8)
+ bic has_nul2, tmp3, tmp4
+ b L(fp_gt8)
+
+END (STRCPY)
+
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
new file mode 100644
index 000000000000..7cf41d5c1eac
--- /dev/null
+++ b/string/aarch64/strlen-mte.S
@@ -0,0 +1,80 @@
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define result x0
+
+#define src x1
+#define synd x2
+#define tmp x3
+#define wtmp w3
+#define shift x4
+
+#define data q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strlen_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz result, synd
+ lsr result, result, 2
+ ret
+
+ .p2align 5
+L(loop):
+ ldr data, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ sub result, src, srcin
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
+END (__strlen_aarch64_mte)
+
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
new file mode 100644
index 000000000000..2392493f1a3c
--- /dev/null
+++ b/string/aarch64/strlen-sve.S
@@ -0,0 +1,55 @@
+/*
+ * __strlen_aarch64_sve - compute the length of a string
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strlen_aarch64_sve)
+ PTR_ARG (0)
+ setffr /* initialize FFR */
+ ptrue p2.b /* all ones; loop invariant */
+ mov x1, 0 /* initialize length */
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+ .p2align 4
+0: ldff1b z0.b, p2/z, [x0, x1]
+ rdffrs p0.b, p2/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x1, all /* speculate increment */
+ cmpeq p1.b, p2/z, z0.b, 0 /* loop if no zeros */
+ b.none 0b
+ decb x1, all /* undo speculate */
+
+ /* Zero found. Select the bytes before the first and count them. */
+1: brkb p0.b, p2/z, p1.b
+ incp x1, p0.b
+ mov x0, x1
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p1.b, p0/z, z0.b, 0
+ b.any 1b
+
+ /* No zero found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x1, p0.b
+ b 0b
+
+END (__strlen_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
new file mode 100644
index 000000000000..a1b164a49238
--- /dev/null
+++ b/string/aarch64/strlen.S
@@ -0,0 +1,200 @@
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 32
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address
+   and ignore any characters before the string. If the data contains a NUL
+   character, return the length; if not, continue in the main loop. */
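+
+/* Illustrative only: a rough little-endian C model (hypothetical helper,
+   GCC-style builtin) of how the length is derived from the two 8-byte words
+   once their has_nul values are known:
+
+     static unsigned
+     first16_len (unsigned long long has_nul1, unsigned long long has_nul2)
+     {
+       unsigned long long nul = has_nul1 ? has_nul1 : has_nul2;
+       unsigned base = has_nul1 ? 0 : 8;
+       return base + __builtin_ctzll (nul) / 8;   // lowest marker -> byte index
+     }
+
+   The code below computes the same result branchlessly with rev, clz and
+   conditional selects. */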
+
+ENTRY (__strlen_aarch64)
+ PTR_ARG (0)
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
+ ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and to exit early,
+	   byte-swap the data now so has_nul1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ b.eq L(bytes16_31)
+
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ csel len, xzr, len, cc
+ clz tmp1, has_nul1
+ add len, len, tmp1, lsr 3
+ ret
+
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ b.eq L(loop_entry)
+
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 24
+ rev has_nul1, has_nul1
+ mov tmp3, 16
+ clz tmp1, has_nul1
+ csel len, tmp3, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+L(loop_entry):
+ bic src, srcin, 31
+
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
+#ifdef __AARCH64EB__
+ bic maskv.8h, 0xf0
+#else
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
+END (__strlen_aarch64)
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
new file mode 100644
index 000000000000..c9d6fc8a158b
--- /dev/null
+++ b/string/aarch64/strncmp-mte.S
@@ -0,0 +1,307 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define mask x13
+#define endloop x14
+#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian-dependent shift operations.
+   On big-endian, early bytes are at the MSB end; on little-endian, at the LSB end.
+   LS_FW means shifting towards the early bytes.
+   LS_BK means shifting towards the later bytes. */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+ENTRY (__strncmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit, limit, #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of main loop */
+
+L(full_check):
+#ifndef __AARCH64EB__
+ orr syndrome, diff, has_nul
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
+ rev syndrome, syndrome
+ rev data1, data1
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ cmp limit, pos, lsr #3
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
+ ret
+#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+	lsl	limit, tmp1, #3	/* Bytes -> bits. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+L(end_quick):
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
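+	/* Illustrative only: the adds/csinv pair used below to adjust the
+	   limit is a saturating add, roughly (hypothetical C):
+	     limit = limit + count;
+	     if (limit < count)		// wrapped past ULONG_MAX
+	       limit = ~0UL;  */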
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
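+
+	/* Illustrative only: on little-endian, STEP_A amounts to the
+	   hypothetical C expression (off_bits = (src2 & 7) * 8, non-zero)
+	     data2 = (tmp1 >> off_bits) | (tmp2 << (64 - off_bits));
+	   i.e. one aligned dword of SRC2 is assembled from the two halves
+	   held in tmp1 and tmp2. */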
+L(src1_aligned):
+	/* Calculate the offset from 8-byte alignment to the string start, in
+	   bits. No need to mask the offset since shifts ignore the upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
+ ldr data1, [src1], #8
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
+
+L(ret0):
+ mov result, #0
+ ret
+END (__strncmp_aarch64_mte)
+
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
new file mode 100644
index 000000000000..234190e245b0
--- /dev/null
+++ b/string/aarch64/strncmp-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strncmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x3]
+ ldff1b z1.b, p0/z, [x1, x3]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+ cmpeq p1.b, p0/z, z0.b, z1.b /* compare strings */
+ cmpne p2.b, p0/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p0/z, p1.b, p2.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p0/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p1/z, z0.b, 0
+ nands p2.b, p1/z, p2.b, p3.b
+ b.any 1b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x3, p1.b
+ b 0b
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equal */
+ ret
+
+END (__strncmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
new file mode 100644
index 000000000000..738b6539cab6
--- /dev/null
+++ b/string/aarch64/strncmp.S
@@ -0,0 +1,260 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define limit_wd x13
+#define mask x14
+#define endloop x15
+#define count mask
+
+ENTRY (__strncmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit_wd, limit_wd, #1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of main loop */
+
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, L(not_limit)
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq L(not_limit)
+
+	lsl	limit, limit, #3	/* Bytes -> bits. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
+ orr syndrome, diff, has_nul
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#endif
+ and tmp3, limit_wd, #7
+ lsr limit_wd, limit_wd, #3
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ add tmp3, tmp3, count
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ add limit_wd, limit_wd, tmp3, lsr #3
+ b L(start_realigned)
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ lsr limit_wd, limit, #3
+ cbz count, L(do_misaligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+L(do_misaligned):
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+	   loop, we fetch one fewer dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo L(done_loop)
+L(loop_misaligned):
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, L(page_end_loop)
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+ subs limit_wd, limit_wd, #1
+ b.pl L(loop_misaligned)
+
+L(done_loop):
+	/* We found a difference or a NUL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, L(not_limit)
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+
+L(ret0):
+ mov result, #0
+ ret
+
+END (__strncmp_aarch64)
+
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
new file mode 100644
index 000000000000..5b9ebf7763bc
--- /dev/null
+++ b/string/aarch64/strnlen-sve.S
@@ -0,0 +1,74 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strnlen_aarch64_sve)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ setffr /* initialize FFR */
+ mov x2, 0 /* initialize len */
+ b 1f
+
+ .p2align 4
+ /* We have off + vl <= max, and so may read the whole vector. */
+0: ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p2.b, p0/z, z0.b, 0
+ b.any 8f
+ incb x2
+
+1: whilelo p0.b, x2, x1
+ b.last 0b
+
+	/* We have off + vl > max, so a whole-vector read would pass max.
+	   Test for off == max before proceeding. */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Compare for end-of-string, but there are no more bytes. */
+ cmpeq p2.b, p0/z, z0.b, 0
+
+ /* Found end-of-string or zero. */
+8: brkb p2.b, p0/z, p2.b
+ mov x0, x2
+ incp x0, p2.b
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, 0
+ b.any 8b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x2, p1.b
+ b 1b
+
+ /* End of count. Return max. */
+9: mov x0, x1
+ ret
+
+END (__strnlen_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
new file mode 100644
index 000000000000..48d2495d2082
--- /dev/null
+++ b/string/aarch64/strnlen.S
@@ -0,0 +1,112 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define cntin x1
+#define result x0
+
+#define src x2
+#define synd x3
+#define shift x4
+#define wtmp w4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strnlen_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
+
+L(start_loop):
+ sub tmp, src, srcin
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+	/* Make sure that it won't overread by a 16-byte chunk. */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 5
+L(loop32):
+ ldr qdata, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src], 16
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
+ sub result, src, srcin
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
+
+L(nomatch):
+ mov result, cntin
+ ret
+
+END (__strnlen_aarch64)
+
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
new file mode 100644
index 000000000000..1e4fb1a68f7e
--- /dev/null
+++ b/string/aarch64/strrchr-mte.S
@@ -0,0 +1,127 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp x3
+#define wtmp w3
+#define synd x3
+#define shift x4
+#define src_match x4
+#define nul_match x5
+#define chr_match x6
+
+#define vrepchr v0
+#define vdata v1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v5
+#define dend d5
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value, with
+ four bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bits 0-1 are set if
+ the relevant byte matched the requested character; bits 2-3 are set
+ if the relevant byte matched the NUL end of string. */
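
The tail code below recovers the answer from that syndrome: it masks off character matches that occur after the first NUL and then takes the highest remaining match. A hedged C sketch of that step (names are illustrative, not part of the patch):

  #include <stdint.h>

  /* chr_mask has bits 0-1 set per matching byte, nul_mask has bits 2-3 set
     per NUL byte (four bits per byte, as described above).  (nul_mask - 1)
     covers everything below the first NUL bit, so ANDing keeps only matches
     at or before the terminator; the highest surviving bit is the last one.  */
  static inline int last_match_index (uint64_t chr_mask, uint64_t nul_mask)
  {
    if (nul_mask != 0)
      chr_mask &= nul_mask - 1;
    if (chr_mask == 0)
      return -1;                                    /* no match: return NULL */
    return (63 - __builtin_clzll (chr_mask)) / 4;   /* byte index of last match */
  }
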
+
+ENTRY (__strrchr_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ lsl shift, srcin, 2
+ fmov synd, dend
+ lsr synd, synd, shift
+ lsl synd, synd, shift
+ ands nul_match, synd, 0xcccccccccccccccc
+ bne L(tail)
+ cbnz synd, L(loop2)
+
+ .p2align 5
+L(loop1):
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop1)
+
+ cmeq vhas_nul.16b, vdata.16b, 0
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ ands nul_match, synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+L(tail):
+ sub nul_match, nul_match, 1
+ and chr_match, synd, 0x3333333333333333
+ ands chr_match, chr_match, nul_match
+ sub result, src, 1
+ clz tmp, chr_match
+ sub result, result, tmp, lsr 2
+ csel result, result, xzr, ne
+ ret
+
+ .p2align 4
+L(loop2):
+ cmp synd, 0
+ csel src_match, src, src_match, ne
+ csel chr_match, synd, chr_match, ne
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ tst synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ and nul_match, synd, 0xcccccccccccccccc
+ sub nul_match, nul_match, 1
+ and tmp, synd, 0x3333333333333333
+ ands tmp, tmp, nul_match
+ csel chr_match, tmp, chr_match, ne
+ csel src_match, src, src_match, ne
+ sub src_match, src_match, 1
+ clz tmp, chr_match
+ sub result, src_match, tmp, lsr 2
+ ret
+
+END (__strrchr_aarch64_mte)
+
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
new file mode 100644
index 000000000000..d36d69af37fd
--- /dev/null
+++ b/string/aarch64/strrchr-sve.S
@@ -0,0 +1,84 @@
+/*
+ * strrchr - find the last occurrence of a character in a string
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strrchr_aarch64_sve)
+ PTR_ARG (0)
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+ mov x2, 0 /* no match found so far */
+ pfalse p2.b
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0, all /* skip bytes this round */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ b.any 3f
+
+ cmpeq p3.b, p1/z, z0.b, z1.b /* search for c; no eos */
+ b.none 0b
+
+ mov x2, x0 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparisons only on the valid bytes. */
+1: cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ b.any 2f
+
+ cmpeq p3.b, p0/z, z0.b, z1.b /* search for c; no eos */
+ mov x3, x0
+ incp x0, p0.b /* skip bytes this round */
+ setffr /* re-init FFR */
+ b.none 0b
+
+ addvl x2, x3, 1 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* Found end-of-string. */
+2: incb x0, all /* advance base */
+3: brka p3.b, p1/z, p3.b /* mask after first 0 */
+ cmpeq p3.b, p3/z, z0.b, z1.b /* search for c not after eos */
+ b.any 4f
+
+ /* No C within last vector. Did we have one before? */
+ cbz x2, 5f
+ mov x0, x2 /* restore advanced base */
+ mov p3.b, p2.b /* restore saved search */
+
+ /* Find the *last* match in the predicate. This is slightly
+ more complicated than finding the first match. */
+4: rev p3.b, p3.b /* reverse the bits */
+ brka p3.b, p1/z, p3.b /* find position of last match */
+ decp x0, p3.b /* retard pointer to last match */
+ ret
+
+ /* No C whatsoever. Return NULL. */
+5: mov x0, 0
+ ret
+
+END (__strrchr_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
new file mode 100644
index 000000000000..56185ff534e3
--- /dev/null
+++ b/string/aarch64/strrchr.S
@@ -0,0 +1,149 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+ENTRY (__strrchr_aarch64)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(aligned)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, L(tail)
+
+ .p2align 4
+L(loop):
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+L(aligned):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ uminp vend1.16b, vdata1.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ mov chr_match, vend1.d[1]
+ cbz nul_match, L(loop)
+
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.d[0]
+
+L(tail):
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END (__strrchr_aarch64)
+
diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S
new file mode 100644
index 000000000000..1cff9345e343
--- /dev/null
+++ b/string/arm/check-arch.S
@@ -0,0 +1,10 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__arm__
+# error ARCH setting does not match the compiler.
+#endif
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
new file mode 100644
index 000000000000..3f1ac4df136f
--- /dev/null
+++ b/string/arm/memchr.S
@@ -0,0 +1,132 @@
+/*
+ * memchr - scan memory for a character
+ *
+ * Copyright (c) 2010-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This __memchr_arm routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors. It has a fast path for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
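
A hedged C sketch of the word-at-a-time test the assembly below performs with eor/uadd8/sel: replicate the target byte, XOR so that matching bytes become zero, then derive a 00/ff per-byte flag word that CHARTSTMASK-style tests can probe. The helper name and loop are illustrative only.

  #include <stdint.h>

  static inline uint32_t match_flags (uint32_t word, uint8_t c)
  {
    uint32_t x = word ^ (0x01010101u * c);   /* matching bytes become 0x00 */
    uint32_t flags = 0;
    for (int i = 0; i < 4; i++)              /* the uadd8/sel pair does this without a loop */
      if (((x >> (8 * i)) & 0xff) == 0)
        flags |= 0xffu << (8 * i);           /* 0xff per matching byte, else 00 */
    return flags;                            /* probe with CHARTSTMASK(i) */
  }
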
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memchr_arm
+ .type __memchr_arm,%function
+__memchr_arm:
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+ @ At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
+15:
+ ldmia r0!,{r5,r6}
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 @ chained... bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r2, 40f @ 0 length or hit the end already then not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the end is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
+
+ .size __memchr_arm, . - __memchr_arm
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
new file mode 100644
index 000000000000..86e64938edb1
--- /dev/null
+++ b/string/arm/memcpy.S
@@ -0,0 +1,587 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+
+ */
+
+#include "../asmdefs.h"
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ENTRY (__memcpy_arm)
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bhs L(cpy_not_short)
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+L(tail63unaligned):
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+L(cpy_not_short):
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #7
+ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne L(cpy_notaligned)
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring SRC and DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blo L(tail63aligned)
+
+ cmp tmp2, #512
+ bhs L(cpy_body_long)
+
+L(cpy_body_medium): /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bhs 1b
+ tst tmp2, #0x3f
+ beq L(done)
+
+L(tail63aligned): /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bhs 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+L(tail63aligned): /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+L(done):
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+L(cpy_body_long): /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blo 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bhs 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b L(cpy_body_medium)
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne L(tail63aligned)
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
+
+L(cpy_notaligned):
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrlo tmp2, [sp], #FRAME_SIZE
+ blo L(tail63unaligned)
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ blo 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bhs 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne L(tail63unaligned)
+ bx lr
+
+END (__memcpy_arm)
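
The long-copy loops above keep a full line of data in flight so that loads run well ahead of the stores (in VFP registers via cpy_line_vfp, or in GP register pairs in the LDRD/STRD loop). A rough portable sketch of that software-pipelined structure, assuming 64-byte blocks and using illustrative names:

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Assumes nblocks >= 1 and a multiple-of-64 copy; purely illustrative.  */
  static void copy_pipelined (uint8_t *dst, const uint8_t *src, size_t nblocks)
  {
    uint8_t line[64];
    memcpy (line, src, 64);                      /* prime: first load in flight */
    for (size_t i = 1; i < nblocks; i++)
      {
        uint8_t next[64];
        memcpy (next, src + 64 * i, 64);         /* load ahead ...              */
        memcpy (dst + 64 * (i - 1), line, 64);   /* ... while storing the prior */
        memcpy (line, next, 64);
      }
    memcpy (dst + 64 * (nblocks - 1), line, 64); /* drain the pipeline */
  }
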
diff --git a/string/arm/memset.S b/string/arm/memset.S
new file mode 100644
index 000000000000..11e927368fd1
--- /dev/null
+++ b/string/arm/memset.S
@@ -0,0 +1,98 @@
+/*
+ * memset - fill memory with a constant
+ *
+ * Copyright (c) 2010-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memset routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.
+
+ */
+
+ .syntax unified
+ .arch armv7-a
+
+@ 2011-08-30 david.gilbert@linaro.org
+@ Extracted from local git 2f11b436
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memset_arm
+ .type __memset_arm,%function
+__memset_arm:
+ @ r0 = address
+ @ r1 = character
+ @ r2 = count
+ @ returns original address in r0
+
+ mov r3, r0 @ Leave r0 alone
+ cbz r2, 10f @ Exit if 0 length
+
+ tst r0, #7
+ beq 2f @ Already aligned
+
+ @ Ok, so we're misaligned here
+1:
+ strb r1, [r3], #1
+ subs r2,r2,#1
+ tst r3, #7
+ cbz r2, 10f @ Exit if we hit the end
+ bne 1b @ go round again if still misaligned
+
+2:
+ @ OK, so we're aligned
+ push {r4,r5,r6,r7}
+ bics r4, r2, #15 @ if less than 16 bytes then need to finish it off
+ beq 5f
+
+3:
+ @ POSIX says that ch is cast to an unsigned char. A uxtb is a
+ @ two-byte instruction that takes two cycles, where an AND immediate
+ @ is four bytes but only one cycle.
+ and r1, #0xFF
+ orr r1, r1, r1, lsl#8 @ Same character into all bytes
+ orr r1, r1, r1, lsl#16
+ mov r5,r1
+ mov r6,r1
+ mov r7,r1
+
+4:
+ subs r4,r4,#16
+ stmia r3!,{r1,r5,r6,r7}
+ bne 4b
+ and r2,r2,#15
+
+ @ At this point we're still aligned and have up to 15 bytes left to write;
+ @ we can avoid some of the byte-at-a-time work by testing for bigger chunks
+ tst r2,#8
+ itt ne
+ subne r2,r2,#8
+ stmiane r3!,{r1,r5}
+
+5:
+ pop {r4,r5,r6,r7}
+ cbz r2, 10f
+
+ @ Got to do any last < alignment bytes
+6:
+ subs r2,r2,#1
+ strb r1,[r3],#1
+ bne 6b
+
+10:
+ bx lr @ goodbye
+ .size __memset_arm, . - __memset_arm
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
new file mode 100644
index 000000000000..b75d4143db57
--- /dev/null
+++ b/string/arm/strcmp-armv6m.S
@@ -0,0 +1,117 @@
+/*
+ * strcmp for ARMv6-M (optimized for performance, not size)
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+
+ .thumb_func
+ .syntax unified
+ .arch armv6-m
+
+ .macro DoSub n, label
+ subs r0, r0, r1
+#ifdef __ARM_BIG_ENDIAN
+ lsrs r1, r4, \n
+#else
+ lsls r1, r4, \n
+#endif
+ orrs r1, r0
+ bne \label
+ .endm
+
+ .macro Byte_Test n, label
+ lsrs r0, r2, \n
+ lsrs r1, r3, \n
+ DoSub \n, \label
+ .endm
+
+ENTRY_ALIGN (__strcmp_armv6m, 4)
+ mov r2, r0
+ push {r4, r5, r6, lr}
+ orrs r2, r1
+ lsls r2, r2, #30
+ bne 6f
+ ldr r5, =0x01010101
+ lsls r6, r5, #7
+1:
+ ldmia r0!, {r2}
+ ldmia r1!, {r3}
+ subs r4, r2, r5
+ bics r4, r2
+ ands r4, r6
+ beq 3f
+
+#ifdef __ARM_BIG_ENDIAN
+ Byte_Test #24, 4f
+ Byte_Test #16, 4f
+ Byte_Test #8, 4f
+
+ b 7f
+3:
+ cmp r2, r3
+ beq 1b
+ cmp r2, r3
+#else
+ uxtb r0, r2
+ uxtb r1, r3
+ DoSub #24, 2f
+
+ uxth r0, r2
+ uxth r1, r3
+ DoSub #16, 2f
+
+ lsls r0, r2, #8
+ lsls r1, r3, #8
+ lsrs r0, r0, #8
+ lsrs r1, r1, #8
+ DoSub #8, 2f
+
+ lsrs r0, r2, #24
+ lsrs r1, r3, #24
+ subs r0, r0, r1
+2:
+ pop {r4, r5, r6, pc}
+
+3:
+ cmp r2, r3
+ beq 1b
+ rev r0, r2
+ rev r1, r3
+ cmp r0, r1
+#endif
+
+ bls 5f
+ movs r0, #1
+4:
+ pop {r4, r5, r6, pc}
+5:
+ movs r0, #0
+ mvns r0, r0
+ pop {r4, r5, r6, pc}
+6:
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ bne 7f
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ beq 6b
+7:
+ subs r0, r2, r3
+ pop {r4, r5, r6, pc}
+
+END (__strcmp_armv6m)
+
+#endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
new file mode 100644
index 000000000000..51443e343058
--- /dev/null
+++ b/string/arm/strcmp.S
@@ -0,0 +1,475 @@
+/*
+ * strcmp for ARMv7
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+ available. Use ldrd to support wider loads, provided the data
+ is sufficiently aligned. Use saturating arithmetic to optimize
+ the compares. */
+
+#include "../asmdefs.h"
+
+/* Build Options:
+ STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+ byte in the string. If comparing completely random strings
+ the pre-check will save time, since there is a very high
+ probability of a mismatch in the first character: we save
+ significant overhead if this is the common case. However,
+ if strings are likely to be identical (eg because we're
+ verifying a hit in a hash table), then this check is largely
+ if strings are likely to be identical (e.g. because we're
+
+#define STRCMP_NO_PRECHECK 0
+
+ /* This version uses Thumb-2 code. */
+ .thumb
+ .syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not __ARM_BIG_ENDIAN */
+
+/* Parameters and result. */
+#define src1 r0
+#define src2 r1
+#define result r0 /* Overlaps src1. */
+
+/* Internal variables. */
+#define tmp1 r4
+#define tmp2 r5
+#define const_m1 r12
+
+/* Additional internal variables for 64-bit aligned data. */
+#define data1a r2
+#define data1b r3
+#define data2a r6
+#define data2b r7
+#define syndrome_a tmp1
+#define syndrome_b tmp2
+
+/* Additional internal variables for 32-bit aligned data. */
+#define data1 r2
+#define data2 r3
+#define syndrome tmp2
+
+
+ /* Macro to compute and return the result value for word-aligned
+ cases. */
+ .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+ /* If data1 contains a zero byte, then syndrome will contain a 1 in
+ bit 7 of that byte. Otherwise, the highest set bit in the
+ syndrome will highlight the first different bit. It is therefore
+ sufficient to extract the eight bits starting with the syndrome
+ bit. */
+ clz tmp1, \synd
+ lsl r1, \d2, tmp1
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsl \d1, \d1, tmp1
+ .cfi_remember_state
+ lsr result, \d1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1, lsr #24
+ bx lr
+#else
+ /* To use the big-endian trick we'd have to reverse all three words;
+ that's slower than this approach. */
+ rev \synd, \synd
+ clz tmp1, \synd
+ bic tmp1, tmp1, #7
+ lsr r1, \d2, tmp1
+ .cfi_remember_state
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsr \d1, \d1, tmp1
+ and result, \d1, #255
+ and r1, r1, #255
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1
+
+ bx lr
+#endif
+ .endm
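
The macro above turns the syndrome into a return value with clz and shifts; what it computes is equivalent to scanning the word for the first NUL-or-differing byte and returning the byte difference. A hedged little-endian C sketch of that semantics (illustrative, not the code the macro expands to):

  #include <stdint.h>

  static inline int strcmp_word_result_le (uint32_t data1, uint32_t data2)
  {
    for (int i = 0; i < 4; i++)                /* byte 0 is the lowest-addressed */
      {
        unsigned b1 = (data1 >> (8 * i)) & 0xff;
        unsigned b2 = (data2 >> (8 * i)) & 0xff;
        if (b1 != b2 || b1 == 0)               /* first difference or end of string */
          return (int) b1 - (int) b2;
      }
    return 0;                                  /* whole word equal, no NUL */
  }
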
+
+ .p2align 5
+L(strcmp_start_addr):
+#if STRCMP_NO_PRECHECK == 0
+L(fastpath_exit):
+ sub r0, r2, r3
+ bx lr
+ nop
+#endif
+ENTRY_ALIGN (__strcmp_arm, 0)
+#if STRCMP_NO_PRECHECK == 0
+ ldrb r2, [src1]
+ ldrb r3, [src2]
+ cmp r2, #1
+ it cs
+ cmpcs r2, r3
+ bne L(fastpath_exit)
+#endif
+ strd r4, r5, [sp, #-16]!
+ .cfi_def_cfa_offset 16
+ .cfi_offset 4, -16
+ .cfi_offset 5, -12
+ orr tmp1, src1, src2
+ strd r6, r7, [sp, #8]
+ .cfi_offset 6, -8
+ .cfi_offset 7, -4
+ mvn const_m1, #0
+ lsl r2, tmp1, #29
+ cbz r2, L(loop_aligned8)
+
+L(not_aligned):
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ bne L(misaligned8)
+
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ and tmp1, src1, #7
+ bic src1, src1, #7
+ and tmp2, tmp1, #3
+ bic src2, src2, #7
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ ldrd data1a, data1b, [src1], #16
+ tst tmp1, #4
+ ldrd data2a, data2b, [src2], #16
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp2
+ orn data1a, data1a, tmp1
+ orn data2a, data2a, tmp1
+ beq L(start_realigned8)
+ orn data1b, data1b, tmp1
+ mov data1a, const_m1
+ orn data2b, data2b, tmp1
+ mov data2a, const_m1
+ b L(start_realigned8)
+
+ /* Unroll the inner loop by a factor of 2, giving 16 bytes per
+ pass. */
+ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
+ .p2align 2 /* Always word aligned. */
+L(loop_aligned8):
+ ldrd data1a, data1b, [src1], #16
+ ldrd data2a, data2b, [src2], #16
+L(start_realigned8):
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ cbnz syndrome_a, L(diff_in_a)
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ cbnz syndrome_b, L(diff_in_b)
+
+ ldrd data1a, data1b, [src1, #-8]
+ ldrd data2a, data2b, [src2, #-8]
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ /* Can't use CBZ for backwards branch. */
+ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+ beq L(loop_aligned8)
+
+L(diff_found):
+ cbnz syndrome_a, L(diff_in_a)
+
+L(diff_in_b):
+ strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+L(diff_in_a):
+ .cfi_restore_state
+ strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+ .cfi_restore_state
+L(misaligned8):
+ tst tmp1, #3
+ bne L(misaligned4)
+ ands tmp1, src1, #3
+ bne L(mutual_align4)
+
+ /* Unrolled by a factor of 2, to reduce the number of post-increment
+ operations. */
+L(loop_aligned4):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned4):
+ uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cbnz syndrome, L(aligned4_done)
+ ldr data1, [src1, #-4]
+ ldr data2, [src2, #-4]
+ uadd8 syndrome, data1, const_m1
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cmp syndrome, #0
+ beq L(loop_aligned4)
+
+L(aligned4_done):
+ strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+L(mutual_align4):
+ .cfi_restore_state
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ lsl tmp1, tmp1, #3 /* Bytes -> bits. */
+ bic src1, src1, #3
+ ldr data1, [src1], #8
+ bic src2, src2, #3
+ ldr data2, [src2], #8
+
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp1
+ orn data1, data1, tmp1
+ orn data2, data2, tmp1
+ b L(start_realigned4)
+
+L(misaligned4):
+ ands tmp1, src1, #3
+ beq L(src1_aligned)
+ sub src2, src2, tmp1
+ bic src1, src1, #3
+ lsls tmp1, tmp1, #31
+ ldr data1, [src1], #4
+ beq L(aligned_m2)
+ bcs L(aligned_m1)
+
+#if STRCMP_NO_PRECHECK == 1
+ ldrb data2, [src2, #1]
+ uxtb tmp1, data1, ror #BYTE1_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
+
+L(aligned_m2):
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
+
+L(aligned_m1):
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ add src2, src2, #4
+ cbnz data2, L(src1_aligned)
+#else /* STRCMP_NO_PRECHECK */
+ /* If we've done the pre-check, then we don't need to check the
+ first byte again here. */
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
+
+L(aligned_m2):
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbnz data2, L(aligned_m1)
+#endif
+
+L(misaligned_exit):
+ .cfi_remember_state
+ mov result, tmp1
+ ldr r4, [sp], #16
+ .cfi_restore 4
+ bx lr
+
+#if STRCMP_NO_PRECHECK == 0
+L(aligned_m1):
+ add src2, src2, #4
+#endif
+L(src1_aligned):
+ .cfi_restore_state
+ /* src1 is word aligned, but src2 has no common alignment
+ with it. */
+ ldr data1, [src1], #4
+ lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
+
+ bic src2, src2, #3
+ ldr data2, [src2], #4
+ bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */
+
+ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
+L(overlap3):
+ bic tmp1, data1, #MSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #8
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #24
+ bne 6f
+ ldr data1, [src1], #4
+ b L(overlap3)
+4:
+ S2LO data2, data2, #8
+ b L(strcmp_tail)
+
+5:
+ bics syndrome, syndrome, #MSB
+ bne L(strcmp_done_equal)
+
+ /* We can only get here if the MSB of data1 contains 0, so
+ fast-path the exit. */
+ ldrb result, [src2]
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 Not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ neg result, result
+ bx lr
+
+6:
+ .cfi_restore_state
+ S2LO data1, data1, #24
+ and data2, data2, #LSB
+ b L(strcmp_tail)
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+L(overlap2):
+ and tmp1, data1, const_m1, S2LO #16
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #16
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #16
+ bne 6f
+ ldr data1, [src1], #4
+ b L(overlap2)
+4:
+ S2LO data2, data2, #16
+ b L(strcmp_tail)
+5:
+ ands syndrome, syndrome, const_m1, S2LO #16
+ bne L(strcmp_done_equal)
+
+ ldrh data2, [src2]
+ S2LO data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+ lsl data2, data2, #16
+#endif
+ b L(strcmp_tail)
+
+6:
+ S2LO data1, data1, #16
+ and data2, data2, const_m1, S2LO #16
+ b L(strcmp_tail)
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+L(overlap1):
+ and tmp1, data1, #LSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #24
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #8
+ bne 6f
+ ldr data1, [src1], #4
+ b L(overlap1)
+4:
+ S2LO data2, data2, #24
+ b L(strcmp_tail)
+5:
+ tst syndrome, #LSB
+ bne L(strcmp_done_equal)
+ ldr data2, [src2]
+6:
+ S2LO data1, data1, #8
+ bic data2, data2, #MSB
+ b L(strcmp_tail)
+
+L(strcmp_done_equal):
+ mov result, #0
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ bx lr
+
+L(strcmp_tail):
+ .cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+ rev data1, data1
+ rev data2, data2
+ /* Now everything looks big-endian... */
+#endif
+ uadd8 tmp1, data1, const_m1
+ eor tmp1, data1, data2
+ sel syndrome, tmp1, const_m1
+ clz tmp1, syndrome
+ lsl data1, data1, tmp1
+ lsl data2, data2, tmp1
+ lsr result, data1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ sub result, result, data2, lsr #24
+ bx lr
+
+END (__strcmp_arm)
+
+#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
new file mode 100644
index 000000000000..02cf94ff4be0
--- /dev/null
+++ b/string/arm/strcpy.c
@@ -0,0 +1,133 @@
+/*
+ * strcpy
+ *
+ * Copyright (c) 2008-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if defined (__thumb2__) && !defined (__thumb__)
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcpy
+*/
+
+#ifdef __thumb2__
+#define magic1(REG) "#0x01010101"
+#define magic2(REG) "#0x80808080"
+#else
+#define magic1(REG) #REG
+#define magic2(REG) #REG ", lsl #7"
+#endif
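
The two magic constants implement the usual NUL-in-word test that the inline assembly below performs with sub/bics/tst. A hedged C equivalent, purely illustrative:

  #include <stdint.h>

  /* Nonzero iff some byte of 'word' is 0x00: subtracting 0x01010101 borrows
     out of any zero byte, and masking with ~word & 0x80808080 discards bytes
     whose own top bit was already set, leaving a nonzero result exactly when
     a NUL byte is present.  */
  static inline int word_has_nul (uint32_t word)
  {
    return ((word - 0x01010101u) & ~word & 0x80808080u) != 0;
  }
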
+
+char* __attribute__((naked))
+__strcpy_arm (char* dst, const char* src)
+{
+ __asm__ (
+ "pld [r1, #0]\n\t"
+ "eor r2, r0, r1\n\t"
+ "mov ip, r0\n\t"
+ "tst r2, #3\n\t"
+ "bne 4f\n\t"
+ "tst r1, #3\n\t"
+ "bne 3f\n"
+ "5:\n\t"
+# ifndef __thumb2__
+ "str r5, [sp, #-4]!\n\t"
+ "mov r5, #0x01\n\t"
+ "orr r5, r5, r5, lsl #8\n\t"
+ "orr r5, r5, r5, lsl #16\n\t"
+# endif
+
+ "str r4, [sp, #-4]!\n\t"
+ "tst r1, #4\n\t"
+ "ldr r3, [r1], #4\n\t"
+ "beq 2f\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "streq r3, [ip], #4\n\t"
+ "ldreq r3, [r1], #4\n"
+ "bne 1f\n\t"
+ /* Inner loop. We now know that r1 is 64-bit aligned, so we
+ can safely fetch up to two words. This allows us to avoid
+ load stalls. */
+ ".p2align 2\n"
+ "2:\n\t"
+ "pld [r1, #8]\n\t"
+ "ldr r4, [r1], #4\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "sub r2, r4, "magic1(r5)"\n\t"
+ "bne 1f\n\t"
+ "str r3, [ip], #4\n\t"
+ "bics r2, r2, r4\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "ldreq r3, [r1], #4\n\t"
+ "streq r4, [ip], #4\n\t"
+ "beq 2b\n\t"
+ "mov r3, r4\n"
+ "1:\n\t"
+# ifdef __ARMEB__
+ "rors r3, r3, #24\n\t"
+# endif
+ "strb r3, [ip], #1\n\t"
+ "tst r3, #0xff\n\t"
+# ifdef __ARMEL__
+ "ror r3, r3, #8\n\t"
+# endif
+ "bne 1b\n\t"
+ "ldr r4, [sp], #4\n\t"
+# ifndef __thumb2__
+ "ldr r5, [sp], #4\n\t"
+# endif
+ "BX LR\n"
+
+ /* Strings have the same offset from word alignment, but it's
+ not zero. */
+ "3:\n\t"
+ "tst r1, #1\n\t"
+ "beq 1f\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "it eq\n"
+ "BXEQ LR\n"
+ "1:\n\t"
+ "tst r1, #2\n\t"
+ "beq 5b\n\t"
+ "ldrh r2, [r1], #2\n\t"
+# ifdef __ARMEB__
+ "tst r2, #0xff00\n\t"
+ "iteet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "lsreq r2, r2, #8\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff\n\t"
+# else
+ "tst r2, #0xff\n\t"
+ "itet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff00\n\t"
+# endif
+ "bne 5b\n\t"
+ "BX LR\n"
+
+ /* src and dst do not have a common word alignment. Fall back to
+ byte copying. */
+ "4:\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 4b\n\t"
+ "BX LR");
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
+
+#endif /* defined (__thumb2__) && !defined (__thumb__) */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
new file mode 100644
index 000000000000..5ad30c941586
--- /dev/null
+++ b/string/arm/strlen-armv6t2.S
@@ -0,0 +1,124 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2010-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+
+/*
+ Assumes:
+ ARMv6T2, AArch32
+
+ */
+
+#include "../asmdefs.h"
+
+#ifdef __ARMEB__
+#define S2LO lsl
+#define S2HI lsr
+#else
+#define S2LO lsr
+#define S2HI lsl
+#endif
+
+ /* This code requires Thumb. */
+ .thumb
+ .syntax unified
+
+/* Parameters and result. */
+#define srcin r0
+#define result r0
+
+/* Internal variables. */
+#define src r1
+#define data1a r2
+#define data1b r3
+#define const_m1 r12
+#define const_0 r4
+#define tmp1 r4 /* Overlaps const_0 */
+#define tmp2 r5
+
+ENTRY (__strlen_armv6t2)
+ pld [srcin, #0]
+ strd r4, r5, [sp, #-8]!
+ bic src, srcin, #7
+ mvn const_m1, #0
+ ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
+ pld [src, #32]
+ bne.w L(misaligned8)
+ mov const_0, #0
+ mov result, #-8
+L(loop_aligned):
+ /* Bytes 0-7. */
+ ldrd data1a, data1b, [src]
+ pld [src, #64]
+ add result, result, #8
+L(start_realigned):
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, L(null_found)
+
+ /* Bytes 8-15. */
+ ldrd data1a, data1b, [src, #8]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, L(null_found)
+
+ /* Bytes 16-23. */
+ ldrd data1a, data1b, [src, #16]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, L(null_found)
+
+ /* Bytes 24-31. */
+ ldrd data1a, data1b, [src, #24]
+ add src, src, #32
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cmp data1b, #0
+ beq L(loop_aligned)
+
+L(null_found):
+ cmp data1a, #0
+ itt eq
+ addeq result, result, #4
+ moveq data1a, data1b
+#ifndef __ARMEB__
+ rev data1a, data1a
+#endif
+ clz data1a, data1a
+ ldrd r4, r5, [sp], #8
+ add result, result, data1a, lsr #3 /* Bits -> Bytes. */
+ bx lr
+
+L(misaligned8):
+ ldrd data1a, data1b, [src]
+ and tmp2, tmp1, #3
+ rsb result, tmp1, #0
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ tst tmp1, #4
+ pld [src, #64]
+ S2HI tmp2, const_m1, tmp2
+ orn data1a, data1a, tmp2
+ itt ne
+ ornne data1b, data1b, tmp2
+ movne data1a, const_m1
+ mov const_0, #0
+ b L(start_realigned)
+
+END (__strlen_armv6t2)
+
+#endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */
diff --git a/string/asmdefs.h b/string/asmdefs.h
new file mode 100644
index 000000000000..340b427a505b
--- /dev/null
+++ b/string/asmdefs.h
@@ -0,0 +1,98 @@
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identification support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
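
For reference, the note GNU_PROPERTY emits has the standard ELF note layout; a hedged C view of the bytes (struct and field names are illustrative and not used anywhere in the build):

  #include <stdint.h>

  struct gnu_property_note
  {
    uint32_t namesz;       /* 4: length of "GNU" including the NUL */
    uint32_t descsz;       /* 16: size of the property descriptor below */
    uint32_t type;         /* 5: NT_GNU_PROPERTY_TYPE_0 */
    char     name[4];      /* "GNU" */
    uint32_t pr_type;      /* FEATURE_1_AND */
    uint32_t pr_datasz;    /* 4 */
    uint32_t pr_data;      /* FEATURE_1_BTI | FEATURE_1_PAC */
    uint32_t pr_padding;   /* 0: pad the descriptor to 8-byte alignment */
  };
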
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc;
+
+#endif
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
new file mode 100644
index 000000000000..d5d4ea7e0309
--- /dev/null
+++ b/string/bench/memcpy.c
@@ -0,0 +1,260 @@
+/*
+ * memcpy benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 5000
+#define ITERS2 20000000
+#define ITERS3 500000
+#define MAX_COPIES 8192
+#define SIZE (256*1024)
+
+static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
+} funtab[] =
+{
+ F(memcpy)
+#if __aarch64__
+ F(__memcpy_aarch64)
+# if __ARM_NEON
+ F(__memcpy_aarch64_simd)
+# endif
+#elif __arm__
+ F(__memcpy_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM-1)
+static uint8_t size_arr[SIZE_NUM];
+
+/* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. */
+static freq_data_t size_freq[] =
+{
+{32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035},
+{ 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721},
+{120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460},
+{ 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303},
+{ 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185},
+{192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96},
+{104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68},
+{ 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47},
+{ 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35},
+{ 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22},
+{ 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15},
+{ 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11},
+{ 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9},
+{136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8},
+{273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6},
+{504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3},
+{512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2},
+{ 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2},
+{ 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1},
+{248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1},
+{ 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1},
+{ 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1},
+{ 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1},
+{2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1},
+{ 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1},
+{122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1},
+{984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1},
+{116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1},
+{100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1},
+{488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1},
+{ 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM-1)
+static uint8_t src_align_arr[ALIGN_NUM];
+static uint8_t dst_align_arr[ALIGN_NUM];
+
+/* Source alignment frequency for memcpy based on SPEC2017. */
+static align_data_t src_align_freq[] =
+{
+ {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0}
+};
+
+static align_data_t dst_align_freq[] =
+{
+ {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0}
+};
+
+typedef struct
+{
+ uint64_t src : 24;
+ uint64_t dst : 24;
+ uint64_t len : 16;
+} copy_t;
+
+static copy_t copy[MAX_COPIES];
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+init_copy_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = size_freq[i].freq) != 0; i++)
+ for (j = 0, size = size_freq[i].size; j < freq; j++)
+ size_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = src_align_freq[i].align; j < freq; j++)
+ src_align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+
+ for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = dst_align_freq[i].align; j < freq; j++)
+ dst_align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+}
+
+static size_t
+init_copies (size_t max_size)
+{
+ size_t total = 0;
+ /* Create a random set of copies with the given size and alignment
+ distributions. */
+ for (int i = 0; i < MAX_COPIES; i++)
+ {
+ copy[i].dst = (rand32 (0) & (max_size - 1));
+ copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].src = (rand32 (0) & (max_size - 1));
+ copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
+ total += copy[i].len;
+ }
+
+ return total;
+}
+
+int main (void)
+{
+ init_copy_distribution ();
+
+ memset (a, 1, sizeof (a));
+ memset (b, 2, sizeof (b));
+
+  printf ("Random memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t total = 0;
+ uint64_t tsum = 0;
+ printf ("%22s (B/ns) ", funtab[f].name);
+ rand32 (0x12345678);
+
+ for (int size = 16384; size <= SIZE; size *= 2)
+ {
+ size_t copy_size = init_copies (size) * ITERS;
+
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ t = clock_get_ns () - t;
+ total += copy_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ }
+      printf ("avg %.2f\n", (double)total / tsum);
+ }
+
+ printf ("\nMedium memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 16; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nLarge memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nUnaligned forwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a, a + 256 + (i & 31), size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nUnaligned backwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a + 256 + (i & 31), a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ return 0;
+}
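
Editor's note: the benchmark above draws its copy sizes and alignments by weighted sampling. init_copy_distribution expands each (value, frequency) pair into that many entries of a power-of-two lookup table, and init_copies then draws from the table with a single masked rand32 index. A minimal standalone sketch of the scheme, with an invented three-entry distribution (not SPEC2017 data) and assuming string/include is on the include path:

/* Weighted-sampling sketch: expand (value, freq) pairs into a table whose
   length is a power of two, then draw with rand32 () & TAB_MASK.  The
   frequencies are invented and sum to TAB_NUM.  */
#include <assert.h>
#include <stdint.h>
#include "benchlib.h"

#define TAB_NUM 8
#define TAB_MASK (TAB_NUM - 1)

static uint8_t tab[TAB_NUM];

int
main (void)
{
  static const struct { uint8_t value; uint16_t freq; } dist[] =
    { {16, 5}, {32, 2}, {64, 1}, {0, 0} };

  int n = 0;
  for (int i = 0; dist[i].freq != 0; i++)
    for (int j = 0; j < dist[i].freq; j++)
      tab[n++] = dist[i].value;
  assert (n == TAB_NUM);

  rand32 (0x12345678);
  /* Each draw yields 16 with probability 5/8, 32 with 2/8 and 64 with 1/8.  */
  uint8_t sample = tab[rand32 (0) & TAB_MASK];
  return sample == 0;
}

Because the table length is a power of two, the masked index is uniform over the table, so each value comes out with probability freq / TAB_NUM.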
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 000000000000..cc0f04bee547
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strlen, 0)
+#if __aarch64__
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+ F(__strlen_armv6t2, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017. */
+static freq_data_t strlen_len_freq[] =
+{
+ { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115},
+ { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418},
+ { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79},
+ { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21},
+ { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9},
+ { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5},
+ { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2},
+ { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1},
+ { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1},
+ { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1},
+ { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1},
+ {107, 1}, { 0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017. */
+static align_data_t string_align_freq[] =
+{
+ {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+ for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+ strlen_len_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+ strlen_align_arr[n++] = size;
+ assert (n == ALIGN_NUM);
+}
+
+static void
+init_strlen_tests (void)
+{
+ uint16_t index[MAX_ALIGN];
+
+ memset (a, 'x', sizeof (a));
+
+ /* Create indices for strings at all alignments. */
+ for (int i = 0; i < MAX_ALIGN; i++)
+ {
+ index[i] = i * (MAX_STRLEN + 1);
+ a[index[i] + MAX_STRLEN] = 0;
+ }
+
+  /* Create a random set of strlen input strings using the string length
+     and alignment distributions.  MAX_STRLEN + 1 is 1 modulo MAX_ALIGN, so
+     the offset chosen below gives a string with the sampled alignment and
+     exactly exp_len bytes before its terminator.  */
+ for (int n = 0; n < NUM_STRLEN; n++)
+ {
+ int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+ int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+
+ strlen_tests[n] =
+ index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+ }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+ rand32 (0x12345678);
+ init_strlen_distribution ();
+ init_strlen_tests ();
+
+ printf ("\nRandom strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t res = 0, strlen_size = 0, mask = maskv;
+ printf ("%22s ", funtab[f].name);
+
+ for (int c = 0; c < NUM_STRLEN; c++)
+ strlen_size += funtab[f].fun (a + strlen_tests[c]);
+ strlen_size *= ITERS;
+
+      /* Measure strlen latency: mask is always zero, but the compiler
+         cannot know that, so (res & mask) makes each call depend on the
+         previous result.  */
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_STRLEN; c++)
+ res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+ t = clock_get_ns () - t;
+ printf ("%.2f\n", (double)strlen_size / t);
+ }
+
+ printf ("\nSmall aligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nSmall unaligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ int align = 9;
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a + align, 'x', size);
+ a[align + size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a + align);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nMedium strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 128; size <= 4096; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\n");
+
+ return 0;
+}
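
Editor's note: in init_strlen_tests above, the start offset index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len relies on MAX_STRLEN + 1 = 257 being congruent to 1 modulo MAX_ALIGN: stepping back exp_len bytes from the terminator at index[k] + MAX_STRLEN lands on a pointer congruent to align. A throwaway check of that arithmetic, using the same constants as the benchmark:

/* Throwaway check of the start-offset arithmetic used in init_strlen_tests;
   the constants match the benchmark above.  */
#include <assert.h>

int
main (void)
{
  enum { MAX_ALIGN = 32, MAX_STRLEN = 256 };
  for (int align = 0; align < MAX_ALIGN; align++)
    for (int exp_len = 0; exp_len <= 128; exp_len++)
      {
        int k = (align + exp_len) & (MAX_ALIGN - 1);
        int offset = k * (MAX_STRLEN + 1) + MAX_STRLEN - exp_len;
        /* The terminator sits at k * (MAX_STRLEN + 1) + MAX_STRLEN, so the
           string is exp_len bytes long and starts at the sampled alignment.  */
        assert (offset % MAX_ALIGN == align);
      }
  return 0;
}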
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
new file mode 100644
index 000000000000..0f2ce2eb6bce
--- /dev/null
+++ b/string/include/benchlib.h
@@ -0,0 +1,33 @@
+/*
+ * Benchmark support functions.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <time.h>
+
+/* Fast and accurate timer returning nanoseconds. */
+static inline uint64_t
+clock_get_ns (void)
+{
+ struct timespec ts;
+ clock_gettime (CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
+}
+
+/* Fast 32-bit random number generator. Passing a non-zero seed
+ value resets the internal state. */
+static inline uint32_t
+rand32 (uint32_t seed)
+{
+ static uint64_t state = 0xb707be451df0bb19ULL;
+ if (seed != 0)
+ state = seed;
+ uint32_t res = state >> 32;
+ state = state * 6364136223846793005ULL + 1;
+ return res;
+}
+
+
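
Editor's note: these two helpers are the whole timing harness. rand32 is a truncated 64-bit linear congruential generator (it returns the top 32 bits of the state), so reseeding it with the same non-zero value replays the same sequence, and clock_get_ns brackets a fixed amount of work. A minimal usage sketch with a placeholder workload, assuming string/include is on the include path:

/* Timing-loop sketch using the helpers above.  */
#include <stdio.h>
#include <stdint.h>
#include "benchlib.h"

int
main (void)
{
  rand32 (0x12345678);   /* non-zero seed: reproducible sequence */

  uint64_t sum = 0;
  uint64_t t = clock_get_ns ();
  for (int i = 0; i < 1000000; i++)
    sum += rand32 (0);   /* placeholder workload */
  t = clock_get_ns () - t;

  printf ("%.2f ns/iter (checksum %llu)\n", (double) t / 1000000,
          (unsigned long long) sum);
  return 0;
}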
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
new file mode 100644
index 000000000000..378c3cd2d645
--- /dev/null
+++ b/string/include/stringlib.h
@@ -0,0 +1,69 @@
+/*
+ * Public API.
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stddef.h>
+
+/* restrict is not needed, but it is kept to document the interface contract. */
+#ifndef __restrict
+# define __restrict
+#endif
+
+#if __aarch64__
+void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64 (void *, const void *, size_t);
+void *__memset_aarch64 (void *, int, size_t);
+void *__memchr_aarch64 (const void *, int, size_t);
+void *__memrchr_aarch64 (const void *, int, size_t);
+int __memcmp_aarch64 (const void *, const void *, size_t);
+char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64 (char *__restrict, const char *__restrict);
+int __strcmp_aarch64 (const char *, const char *);
+char *__strchr_aarch64 (const char *, int);
+char *__strrchr_aarch64 (const char *, int);
+char *__strchrnul_aarch64 (const char *, int );
+size_t __strlen_aarch64 (const char *);
+size_t __strnlen_aarch64 (const char *, size_t);
+int __strncmp_aarch64 (const char *, const char *, size_t);
+void * __memchr_aarch64_mte (const void *, int, size_t);
+char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__strchr_aarch64_mte (const char *, int);
+char * __strchrnul_aarch64_mte (const char *, int );
+size_t __strlen_aarch64_mte (const char *);
+char *__strrchr_aarch64_mte (const char *, int);
+int __strcmp_aarch64_mte (const char *, const char *);
+int __strncmp_aarch64_mte (const char *, const char *, size_t);
+#if __ARM_NEON
+void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_simd (void *, const void *, size_t);
+#endif
+# if __ARM_FEATURE_SVE
+void *__memchr_aarch64_sve (const void *, int, size_t);
+int __memcmp_aarch64_sve (const void *, const void *, size_t);
+char *__strchr_aarch64_sve (const char *, int);
+char *__strrchr_aarch64_sve (const char *, int);
+char *__strchrnul_aarch64_sve (const char *, int );
+int __strcmp_aarch64_sve (const char *, const char *);
+char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict);
+size_t __strlen_aarch64_sve (const char *);
+size_t __strnlen_aarch64_sve (const char *, size_t);
+int __strncmp_aarch64_sve (const char *, const char *, size_t);
+# endif
+# if __ARM_FEATURE_MEMORY_TAGGING
+void *__mtag_tag_region (void *, size_t);
+void *__mtag_tag_zero_region (void *, size_t);
+# endif
+#elif __arm__
+void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
+void *__memset_arm (void *, int, size_t);
+void *__memchr_arm (const void *, int, size_t);
+char *__strcpy_arm (char *__restrict, const char *__restrict);
+int __strcmp_arm (const char *, const char *);
+int __strcmp_armv6m (const char *, const char *);
+size_t __strlen_armv6t2 (const char *);
+#endif
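
Editor's note: a caller-side sketch of this header (the copy wrapper below is hypothetical, not part of the library). The arch-specific entry points take the same arguments as their libc counterparts, so a build can select one at compile time and fall back to libc elsewhere; linking against the library itself is assumed on AArch64.

/* Hypothetical wrapper selecting an arch-specific routine at compile time.  */
#include <assert.h>
#include <string.h>
#include "stringlib.h"

static void *
copy (void *dst, const void *src, size_t n)
{
#if __aarch64__
  return __memcpy_aarch64 (dst, src, n);   /* declared in stringlib.h */
#else
  return memcpy (dst, src, n);
#endif
}

int
main (void)
{
  char dst[8];
  copy (dst, "abc", 4);
  assert (strcmp (dst, "abc") == 0);
  return 0;
}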
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
new file mode 100644
index 000000000000..d8c02d92d626
--- /dev/null
+++ b/string/test/__mtag_tag_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+ /* Print tag, untag and quote the context. */
+ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+ untag_buffer (p, len, 1);
+ p = untag_pointer (p);
+ quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+ F(__mtag_tag_region)
+#endif
+ {0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 64
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + 2 * A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a';
+
+ src = tag_buffer (src, len + 2 * A, 1);
+ s = src + salign;
+ /* Use different tag. */
+ s = __arm_mte_increment_tag (s, 1);
+ p = fun->fun (s, len);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got head", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < salign + len; i++)
+ {
+ if (s[i - salign] != 'a')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got body", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < len + 2 * A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got tail", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ untag_buffer (src, len + 2 * A, 1);
+}
+
+int
+main ()
+{
+ if (!mte_enabled ())
+ return 0;
+
+ sbuf = mte_mmap (LEN + 3 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s += 16)
+ {
+ int n;
+ for (n = 0; n < 200; n += 16)
+ {
+ test (funtab + i, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, n);
+ }
+ }
+ printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
+#else
+int
+main ()
+{
+ return 0;
+}
+#endif
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
new file mode 100644
index 000000000000..221c223a2f31
--- /dev/null
+++ b/string/test/__mtag_tag_zero_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_zero_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+ /* Print tag, untag and quote the context. */
+ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+ untag_buffer (p, len, 1);
+ p = untag_pointer (p);
+ quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+ F(__mtag_tag_zero_region)
+#endif
+ {0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 64
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + 2 * A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i % 23;
+
+ src = tag_buffer (src, len + 2 * A, 1);
+ s = src + salign;
+ /* Use different tag. */
+ s = __arm_mte_increment_tag (s, 1);
+ p = fun->fun (s, len);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got head", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < salign + len; i++)
+ {
+ if (s[i - salign] != 0)
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got body", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < len + 2 * A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got tail", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ untag_buffer (src, len + 2 * A, 1);
+}
+
+int
+main ()
+{
+ if (!mte_enabled ())
+ return 0;
+
+ sbuf = mte_mmap (LEN + 3 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s += 16)
+ {
+ int n;
+ for (n = 0; n < 200; n += 16)
+ {
+ test (funtab + i, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, n);
+ }
+ }
+ printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
+#else
+int
+main ()
+{
+ return 0;
+}
+#endif
diff --git a/string/test/memchr.c b/string/test/memchr.c
new file mode 100644
index 000000000000..0ff77f5710bf
--- /dev/null
+++ b/string/test/memchr.c
@@ -0,0 +1,110 @@
+/*
+ * memchr test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (const void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memchr, 0)
+#if __aarch64__
+ F(__memchr_aarch64, 0)
+ F(__memchr_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__memchr_aarch64_sve, 1)
+# endif
+#elif __arm__
+ F(__memchr_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t seekpos, size_t len,
+ size_t maxlen)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos < maxlen ? s + seekpos : NULL;
+ int seekchar = 1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos > LEN || align > ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = seekchar;
+ for (int i = 0; i <= ALIGN; i++)
+ s[len + i] = seekchar;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[seekpos] = seekchar;
+ s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar;
+
+ int mte_len = seekpos != -1 ? seekpos + 1 : maxlen;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ p = fun->fun (s, seekchar, maxlen);
+ untag_buffer (s, mte_len, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
+ seekchar, maxlen, p, f);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < LEN; sp++)
+ test (funtab + i, a, sp, n, n);
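+	    /* Also check a huge maxlen; the first match is just past the
+	       n data bytes.  */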
+ test (funtab + i, a, n, n, SIZE_MAX - a);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
new file mode 100644
index 000000000000..7a7cf9cff35a
--- /dev/null
+++ b/string/test/memcmp.c
@@ -0,0 +1,125 @@
+/*
+ * memcmp test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ int (*fun) (const void *s1, const void *s2, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memcmp, 0)
+#if __aarch64__
+ F(__memcmp_aarch64, 1)
+# if __ARM_FEATURE_SVE
+ F(__memcmp_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *s1buf;
+static unsigned char *s2buf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
+ int delta)
+{
+ unsigned char *src1 = alignup (s1buf);
+ unsigned char *src2 = alignup (s2buf);
+ unsigned char *s1 = src1 + s1align;
+ unsigned char *s2 = src2 + s2align;
+ int r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+
+ s1 = tag_buffer (s1, len, fun->test_mte);
+ s2 = tag_buffer (s2, len, fun->test_mte);
+ r = fun->fun (s1, s2, len);
+ untag_buffer (s1, len, fun->test_mte);
+ untag_buffer (s2, len, fun->test_mte);
+
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
+ s1align, s2align, len, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
+}
+
+int
+main ()
+{
+ s1buf = mte_mmap (LEN + 2 * A);
+ s2buf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0);
+ test (funtab + i, d, s, 1, -1, 0);
+ test (funtab + i, d, s, 1, 0, -1);
+ test (funtab + i, d, s, 1, 0, 1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, 0, -1);
+ test (funtab + i, d, s, n, n - 1, -1);
+ test (funtab + i, d, s, n, n / 2, 1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n / 2, -1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
new file mode 100644
index 000000000000..ce0ceeef5ee8
--- /dev/null
+++ b/string/test/memcpy.c
@@ -0,0 +1,120 @@
+/*
+ * memcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *, const void *, size_t);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memcpy, 0)
+#if __aarch64__
+ F(__memcpy_aarch64, 1)
+# if __ARM_NEON
+ F(__memcpy_aarch64_simd, 1)
+# endif
+#elif __arm__
+ F(__memcpy_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *dbuf;
+static unsigned char *sbuf;
+static unsigned char wbuf[LEN + 2 * A];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = alignup (dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i % 23;
+
+ s = tag_buffer (s, len, fun->test_mte);
+ d = tag_buffer (d, len, fun->test_mte);
+ p = fun->fun (d, s, len);
+ untag_buffer (s, len, fun->test_mte);
+ untag_buffer (d, len, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
+ }
+ }
+}
+
+int
+main ()
+{
+ dbuf = mte_mmap (LEN + 2 * A);
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ test (funtab + i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test (funtab + i, d, s, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memmove.c b/string/test/memmove.c
new file mode 100644
index 000000000000..689b68c98af2
--- /dev/null
+++ b/string/test/memmove.c
@@ -0,0 +1,164 @@
+/*
+ * memmove test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *, const void *, size_t);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memmove, 0)
+#if __aarch64__
+ F(__memmove_aarch64, 1)
+# if __ARM_NEON
+ F(__memmove_aarch64_simd, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *dbuf;
+static unsigned char *sbuf;
+static unsigned char wbuf[LEN + 2 * A];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = alignup (dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i % 23;
+
+ p = fun->fun (d, s, len);
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
+ }
+ }
+}
+
+static void
+test_overlap (const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = src;
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = wbuf + dalign;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = want[i] = '?';
+
+ for (int i = 0; i < len; i++)
+ s[i] = want[salign + i] = 'a' + i % 23;
+ for (int i = 0; i < len; i++)
+ w[i] = s[i];
+
+ s = tag_buffer (s, len, fun->test_mte);
+ d = tag_buffer (d, len, fun->test_mte);
+ p = fun->fun (d, s, len);
+ untag_buffer (s, len, fun->test_mte);
+ untag_buffer (d, len, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (int i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
+ }
+ }
+}
+
+int
+main ()
+{
+ dbuf = mte_mmap (LEN + 2 * A);
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ {
+ test (funtab + i, d, s, n);
+ test_overlap (funtab + i, d, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n);
+ test_overlap (funtab + i, d, s, n);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memrchr.c b/string/test/memrchr.c
new file mode 100644
index 000000000000..adf96f049cc9
--- /dev/null
+++ b/string/test/memrchr.c
@@ -0,0 +1,106 @@
+/*
+ * memrchr test.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (const void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memrchr, 0)
+#if __aarch64__
+ F(__memrchr_aarch64, 1)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t seekpos, size_t len,
+ size_t maxlen)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos < maxlen ? s + seekpos : NULL;
+ int seekchar = 1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos > LEN || align > ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = seekchar;
+ for (int i = 0; i <= ALIGN; i++)
+ s[len + i] = seekchar;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[seekpos] = seekchar;
+ s[((len ^ align) & 1) && seekpos < maxlen ? seekpos - 1 : len] = seekchar;
+
+ s = tag_buffer (s, maxlen, fun->test_mte);
+ p = fun->fun (s, seekchar, maxlen);
+ untag_buffer (s, maxlen, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
+ seekchar, maxlen, p, f);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < LEN; sp++)
+ test (funtab + i, a, sp, n, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memset.c b/string/test/memset.c
new file mode 100644
index 000000000000..f1721442dbaf
--- /dev/null
+++ b/string/test/memset.c
@@ -0,0 +1,129 @@
+/*
+ * memset test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memset, 0)
+#if __aarch64__
+ F(__memset_aarch64, 1)
+#elif __arm__
+ F(__memset_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int c, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i % 23;
+
+ s = tag_buffer (s, len, fun->test_mte);
+ p = fun->fun (s, c, len);
+ untag_buffer (s, len, fun->test_mte);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
+ }
+ }
+ for (; i < salign + len; i++)
+ {
+ if (src[i] != (unsigned char) c)
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
+ }
+ }
+ for (; i < len + A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
+ }
+ }
+}
+
+int
+main ()
+{
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ {
+ test (funtab + i, s, 0, n);
+ test (funtab + i, s, 0x25, n);
+ test (funtab + i, s, 0xaa25, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, 0, n);
+ test (funtab + i, s, 0x25, n);
+ test (funtab + i, s, 0xaa25, n);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/mte.h b/string/test/mte.h
new file mode 100644
index 000000000000..e67cbd9d2d40
--- /dev/null
+++ b/string/test/mte.h
@@ -0,0 +1,142 @@
+/*
+ * Memory tagging testing code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __TEST_MTE_H
+#define __TEST_MTE_H
+
+#include <stdlib.h>
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <arm_acle.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+// These definitions depend on a kernel ABI that has not yet been merged.
+#define PR_SET_TAGGED_ADDR_CTRL 55
+#define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+#define PR_MTE_TCF_SHIFT 1
+#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT)
+#define PR_MTE_TAG_SHIFT 3
+#define PROT_MTE 0x20
+
+#define MTE_GRANULE_SIZE 16
+
+int
+mte_enabled ()
+{
+ static int enabled = -1;
+ if (enabled == -1)
+ {
+ int res = prctl (PR_SET_TAGGED_ADDR_CTRL,
+ PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC
+ | (0xfffe << PR_MTE_TAG_SHIFT),
+ 0, 0, 0);
+ enabled = (res == 0);
+ }
+ return enabled;
+}
+
+static void *
+mte_mmap (size_t size)
+{
+ if (mte_enabled ())
+ {
+ return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ }
+ else
+ {
+ return malloc (size);
+ }
+}
+
+void *
+alignup_mte (void *p)
+{
+ return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1)
+ & ~(MTE_GRANULE_SIZE - 1));
+}
+
+void *
+aligndown_mte (void *p)
+{
+ return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1));
+}
+
+void *
+untag_pointer (void *p)
+{
+ return (void *) ((unsigned long long) p & (~0ULL >> 8));
+}
+
+void
+tag_buffer_helper (void *p, int len)
+{
+ char *ptr = p;
+ char *end = alignup_mte (ptr + len);
+ ptr = aligndown_mte (p);
+ for (; ptr < end; ptr += MTE_GRANULE_SIZE)
+ {
+ __arm_mte_set_tag (ptr);
+ }
+}
+
+void *
+tag_buffer (void *p, int len, int test_mte)
+{
+ if (test_mte && mte_enabled ())
+ {
+ p = __arm_mte_increment_tag (p, 1);
+ tag_buffer_helper (p, len);
+ }
+ return p;
+}
+
+void *
+untag_buffer (void *p, int len, int test_mte)
+{
+ p = untag_pointer (p);
+ if (test_mte && mte_enabled ())
+ {
+ tag_buffer_helper (p, len);
+ }
+ return p;
+}
+
+#else // __ARM_FEATURE_MEMORY_TAGGING
+int
+mte_enabled ()
+{
+ return 0;
+}
+static void *
+mte_mmap (size_t size)
+{
+ return malloc (size);
+}
+void *
+tag_buffer (void *p, int len, int test_mte)
+{
+ (void) len;
+ (void) test_mte;
+ return p;
+}
+void *
+untag_buffer (void *p, int len, int test_mte)
+{
+ (void) len;
+ (void) test_mte;
+ return p;
+}
+void *
+untag_pointer (void *p)
+{
+ return p;
+}
+#endif // __ARM_FEATURE_MEMORY_TAGGING
+
+#endif
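
Editor's note: the string tests in this directory all wrap the routine under test in the same pattern built from these helpers: tag exactly the bytes the routine may access, call it, then restore the buffer tag and strip the tag bits from any returned pointer before checking it, so an out-of-bounds access faults when MTE is enabled. A compressed sketch of that pattern (plain strlen stands in for a routine under test; this header is assumed to be on the include path):

/* Sketch of the tag/call/untag pattern used by the tests.  */
#include <string.h>
#include "mte.h"

int
main (void)
{
  char *buf = mte_mmap (64);
  memcpy (buf, "hello", 6);

  /* Give the 6 accessible bytes a fresh tag ...  */
  char *s = tag_buffer (buf, 6, 1);
  size_t n = strlen (s);
  /* ... then restore the old tag before any further untagged access.  */
  untag_buffer (s, 6, 1);

  return n == 5 ? 0 : 1;
}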
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
new file mode 100644
index 000000000000..1827e68c9a30
--- /dev/null
+++ b/string/test/stpcpy.c
@@ -0,0 +1,125 @@
+/*
+ * stpcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (char *dest, const char *src);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(stpcpy, 0)
+#if __aarch64__
+ F(__stpcpy_aarch64, 0)
+ F(__stpcpy_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__stpcpy_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *dbuf;
+static char *sbuf;
+static char wbuf[LEN + 3 * ALIGN];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup (sbuf);
+ char *dst = alignup (dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
+ abort ();
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + salign) & 1 ? 1 : 0;
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + (i & 31);
+ s[len] = w[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ d = tag_buffer (d, len + 1, fun->test_mte);
+ p = fun->fun (d, s);
+ untag_buffer (s, len + 1, fun->test_mte);
+ untag_buffer (d, len + 1, fun->test_mte);
+
+ if (p != d + len)
+ ERR ("%s (%p,..) returned %p expected %p\n", fun->name, d, p, d + len);
+
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s (align %d, align %d, %d) failed\n",
+ fun->name, dalign, salign, len);
+ quoteat ("got", dst, len + ALIGN, i);
+ quoteat ("want", want, len + ALIGN, i);
+ break;
+ }
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ dbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < ALIGN; d++)
+ for (int s = 0; s < ALIGN; s++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, d, s, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strchr.c b/string/test/strchr.c
new file mode 100644
index 000000000000..f3ae982ef0ad
--- /dev/null
+++ b/string/test/strchr.c
@@ -0,0 +1,121 @@
+/*
+ * strchr test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strchr, 0)
+#if __aarch64__
+ F(__strchr_aarch64, 0)
+ F(__strchr_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strchr_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ if (seekpos != -1 && (len + align) & 1)
+ s[seekpos + 1] = seekchar;
+ s[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, len + 1, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
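+  /* Searching for the terminator must return a pointer to it.  */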
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, f, len);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
new file mode 100644
index 000000000000..6c30ab2123f1
--- /dev/null
+++ b/string/test/strchrnul.c
@@ -0,0 +1,126 @@
+/*
+ * strchrnul test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strchrnul, 0)
+#if __aarch64__
+ F(__strchrnul_aarch64, 0)
+ F(__strchrnul_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strchrnul_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : s + len;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ if (seekpos != -1 && (len + align) & 1)
+ s[seekpos + 1] = seekchar;
+ s[len] = '\0';
+
+ int mte_len = seekpos != -1 ? seekpos + 1 : len + 1;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, mte_len, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, f, len);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
new file mode 100644
index 000000000000..d57b54ed50a8
--- /dev/null
+++ b/string/test/strcmp.c
@@ -0,0 +1,132 @@
+/*
+ * strcmp test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ int (*fun) (const char *s1, const char *s2);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strcmp, 0)
+#if __aarch64__
+ F(__strcmp_aarch64, 0)
+ F(__strcmp_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strcmp_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+ F(__strcmp_arm, 0)
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+ F(__strcmp_armv6m, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static char *s1buf;
+static char *s2buf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
+ int delta)
+{
+ char *src1 = alignup (s1buf);
+ char *src2 = alignup (s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+ s1[len] = s2[len] = '\0';
+
+ s1 = tag_buffer (s1, len + 1, fun->test_mte);
+ s2 = tag_buffer (s2, len + 1, fun->test_mte);
+ r = fun->fun (s1, s2);
+ untag_buffer (s1, len + 1, fun->test_mte);
+ untag_buffer (s2, len + 1, fun->test_mte);
+
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
+ s1align, s2align, len, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
+}
+
+int
+main ()
+{
+ s1buf = mte_mmap (LEN + 2 * A + 1);
+ s2buf = mte_mmap (LEN + 2 * A + 1);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0);
+ test (funtab + i, d, s, 1, -1, 0);
+ test (funtab + i, d, s, 1, 0, 1);
+ test (funtab + i, d, s, 1, 0, -1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n - 1, -1);
+ test (funtab + i, d, s, n, n / 2, 1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n / 2, -1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
new file mode 100644
index 000000000000..e84cace9c8c6
--- /dev/null
+++ b/string/test/strcpy.c
@@ -0,0 +1,123 @@
+/*
+ * strcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (char *dest, const char *src);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strcpy, 0)
+#if __aarch64__
+ F(__strcpy_aarch64, 0)
+ F(__strcpy_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strcpy_aarch64_sve, 1)
+# endif
+#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
+ F(__strcpy_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *dbuf;
+static char *sbuf;
+static char wbuf[LEN + 3 * ALIGN];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup (sbuf);
+ char *dst = alignup (dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
+ abort ();
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + salign) & 1 ? 1 : 0;
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + (i & 31);
+ s[len] = w[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ d = tag_buffer (d, len + 1, fun->test_mte);
+ p = fun->fun (d, s);
+ untag_buffer (s, len + 1, fun->test_mte);
+ untag_buffer (d, len + 1, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s (%p,..) returned %p\n", fun->name, d, p);
+
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s (align %d, align %d, %d) failed\n",
+ fun->name, dalign, salign, len);
+ quoteat ("got", dst, len + ALIGN, i);
+ quoteat ("want", want, len + ALIGN, i);
+ break;
+ }
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ dbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < ALIGN; d++)
+ for (int s = 0; s < ALIGN; s++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, d, s, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
new file mode 100644
index 000000000000..fe855fc21736
--- /dev/null
+++ b/string/test/stringtest.h
@@ -0,0 +1,55 @@
+/*
+ * Common string test code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+/* Accounting errors for a test case. */
+static int err_count;
+#define ERR_LIMIT 10
+#define ERR(...) (err_count++, printf (__VA_ARGS__))
+
+static inline void
+quotechar (unsigned char c)
+{
+ if (isprint (c))
+ putchar (c);
+ else
+ printf ("\\x%02x", c);
+}
+
+/* Print quoted context around position at, or the entire string if at < 0. */
+static void
+quoteat (const char *prefix, const void *p, int len, int at)
+{
+ static const int CTXLEN = 15;
+ int i;
+ const char *pre = "\"";
+ const char *post = "\"";
+ const char *s = p;
+ if (at > CTXLEN)
+ {
+ s += at - CTXLEN;
+ len -= at - CTXLEN;
+ pre = "...\"";
+ }
+ if (at >= 0 && len > 2 * CTXLEN + 1)
+ {
+ len = 2 * CTXLEN + 1;
+ post = "\"...";
+ }
+ printf ("%4s: %s", prefix, pre);
+ for (i = 0; i < len; i++)
+ quotechar (s[i]);
+ printf ("%s\n", post);
+}
+
+static inline void
+quote (const char *prefix, const void *p, int len)
+{
+ quoteat (prefix, p, len, -1);
+}
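
Editor's note: a small sketch of how the tests report a mismatch with these helpers (the got/want buffers are invented, not taken from any test above):

/* Report the first differing byte with surrounding context.  */
#include "stringtest.h"

int
main (void)
{
  const char got[] = "hello worXd";
  const char want[] = "hello world";

  for (int i = 0; i < (int) sizeof (got); i++)
    if (got[i] != want[i])
      {
        ERR ("mismatch at offset %d\n", i);
        quoteat ("got", got, sizeof (got), i);
        quoteat ("want", want, sizeof (want), i);
        break;
      }
  return err_count ? 1 : 0;
}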
diff --git a/string/test/strlen.c b/string/test/strlen.c
new file mode 100644
index 000000000000..6278380f26df
--- /dev/null
+++ b/string/test/strlen.c
@@ -0,0 +1,103 @@
+/*
+ * strlen test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strlen, 0)
+#if __aarch64__
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+ F(__strlen_armv6t2, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ size_t r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + align) & 1 ? 1 : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ r = fun->fun (s);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (r != len)
+ {
+ ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len);
+ quote ("input", src, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, a, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
new file mode 100644
index 000000000000..018a8a431ab8
--- /dev/null
+++ b/string/test/strncmp.c
@@ -0,0 +1,139 @@
+/*
+ * strncmp test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ int (*fun) (const char *, const char *, size_t);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strncmp, 0)
+#if __aarch64__
+ F(__strncmp_aarch64, 0)
+ F(__strncmp_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strncmp_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static char *s1buf;
+static char *s2buf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos,
+ int len, int delta)
+{
+ char *src1 = alignup (s1buf);
+ char *src2 = alignup (s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
+
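+  /* Fill both buffers with '?' padding, write the same repeating pattern
+     into s1 and s2, then inject a single difference of delta at diffpos
+     in s1 (if requested) and terminate both strings.  */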
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+ s1[len] = s2[len] = '\0';
+
+ size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
+ s1 = tag_buffer (s1, mte_len, fun->test_mte);
+ s2 = tag_buffer (s2, mte_len, fun->test_mte);
+ r = fun->fun (s1, s2, maxlen);
+ untag_buffer (s1, mte_len, fun->test_mte);
+ untag_buffer (s2, mte_len, fun->test_mte);
+
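+  /* A difference at or beyond maxlen lies outside the compared region,
+     so the strings must compare equal up to maxlen.  */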
+ if (diffpos >= maxlen)
+ {
+ diffpos = -1;
+ delta = 0;
+ }
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR (
+ "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n",
+ fun->name, s1align, s2align, maxlen, len, diffpos, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
+}
+
+int
+main (void)
+{
+ s1buf = mte_mmap (LEN + 2 * A + 1);
+ s2buf = mte_mmap (LEN + 2 * A + 1);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
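+	    /* Edge cases around empty and one-byte strings first, then
+	       every length up to 100 with a difference at the midpoint,
+	       then doubling lengths up to LEN.  */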
+ test (funtab + i, d, s, 0, -1, 0, 0);
+ test (funtab + i, d, s, 1, -1, 0, 0);
+ test (funtab + i, d, s, 0, -1, 1, 0);
+ test (funtab + i, d, s, 1, -1, 1, 0);
+ test (funtab + i, d, s, 2, -1, 1, 0);
+ test (funtab + i, d, s, 1, 0, 1, 1);
+ test (funtab + i, d, s, 1, 0, 1, -1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, n, 0);
+ test (funtab + i, d, s, n, n / 2, n, 1);
+ test (funtab + i, d, s, n / 2, -1, n, 0);
+ test (funtab + i, d, s, n / 2, n / 2, n, -1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, n, 0);
+ test (funtab + i, d, s, n, n / 2, n, -1);
+ test (funtab + i, d, s, n / 2, -1, n, 0);
+ test (funtab + i, d, s, n / 2, n / 2, n, 1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
new file mode 100644
index 000000000000..0dea00eaf8e3
--- /dev/null
+++ b/string/test/strnlen.c
@@ -0,0 +1,109 @@
+/*
+ * strnlen test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s, size_t m);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strnlen, 0)
+#if __aarch64__
+ F(__strnlen_aarch64, 1)
+# if __ARM_FEATURE_SVE
+ F(__strnlen_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t maxlen, size_t len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ size_t r;
+ size_t e = maxlen < len ? maxlen : len;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + align) & 1 ? 1 : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[len] = 0;
+ if ((len + align) & 1)
+ s[e + 1] = 0;
+
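+  /* Tag only the bytes strnlen may legally access: up to maxlen, or up to
+     and including the terminator, whichever comes first.  */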
+ size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ r = fun->fun (s, maxlen);
+ untag_buffer (s, mte_len, fun->test_mte);
+
+ if (r != e)
+ {
+ ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n",
+ fun->name, s, maxlen, len, r, e);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int maxlen = 0; maxlen < LEN; maxlen++)
+ test (funtab + i, a, maxlen, n);
+ test (funtab + i, a, SIZE_MAX - a, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
new file mode 100644
index 000000000000..fedbdc52fcc1
--- /dev/null
+++ b/string/test/strrchr.c
@@ -0,0 +1,121 @@
+/*
+ * strrchr test.
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strrchr, 0)
+#if __aarch64__
+ F(__strrchr_aarch64, 0)
+ F(__strrchr_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strrchr_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
+
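+  /* Surround the string with seekchar bytes to catch out-of-bounds reads,
+     then place seekchar both at seekpos and at an earlier position so the
+     last occurrence, not the first, must be reported.  */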
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos / 2] = s[seekpos] = seekchar;
+ if (seekpos > 0 && (len + align) & 1)
+ s[seekpos - 1] = seekchar;
+ s[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, len + 1, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
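+  /* Searching for the terminator itself: strrchr (s, 0) must return a
+     pointer to the trailing NUL at s + len.  */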
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, s + len, len);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S
new file mode 100644
index 000000000000..26ade0a0c7db
--- /dev/null
+++ b/string/x86_64/check-arch.S
@@ -0,0 +1,10 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__x86_64__
+# error ARCH setting does not match the compiler.
+#endif