Diffstat (limited to 'string')
-rw-r--r--  string/Dir.mk | 113
-rw-r--r--  string/aarch64/__mtag_tag_region.S | 100
-rw-r--r--  string/aarch64/__mtag_tag_zero_region.S | 100
-rw-r--r--  string/aarch64/check-arch.S | 13
-rw-r--r--  string/aarch64/memchr-mte.S | 116
-rw-r--r--  string/aarch64/memchr-sve.S | 64
-rw-r--r--  string/aarch64/memchr.S | 146
-rw-r--r--  string/aarch64/memcmp-sve.S | 51
-rw-r--r--  string/aarch64/memcmp.S | 137
-rw-r--r--  string/aarch64/memcpy-advsimd.S | 206
-rw-r--r--  string/aarch64/memcpy.S | 243
-rw-r--r--  string/aarch64/memrchr.S | 117
-rw-r--r--  string/aarch64/memset.S | 117
-rw-r--r--  string/aarch64/stpcpy-mte.S | 10
-rw-r--r--  string/aarch64/stpcpy-sve.S | 10
-rw-r--r--  string/aarch64/stpcpy.S | 10
-rw-r--r--  string/aarch64/strchr-mte.S | 105
-rw-r--r--  string/aarch64/strchr-sve.S | 70
-rw-r--r--  string/aarch64/strchr.S | 126
-rw-r--r--  string/aarch64/strchrnul-mte.S | 84
-rw-r--r--  string/aarch64/strchrnul-sve.S | 9
-rw-r--r--  string/aarch64/strchrnul.S | 114
-rw-r--r--  string/aarch64/strcmp-mte.S | 189
-rw-r--r--  string/aarch64/strcmp-sve.S | 59
-rw-r--r--  string/aarch64/strcmp.S | 173
-rw-r--r--  string/aarch64/strcpy-mte.S | 161
-rw-r--r--  string/aarch64/strcpy-sve.S | 71
-rw-r--r--  string/aarch64/strcpy.S | 311
-rw-r--r--  string/aarch64/strlen-mte.S | 80
-rw-r--r--  string/aarch64/strlen-sve.S | 55
-rw-r--r--  string/aarch64/strlen.S | 200
-rw-r--r--  string/aarch64/strncmp-mte.S | 307
-rw-r--r--  string/aarch64/strncmp-sve.S | 69
-rw-r--r--  string/aarch64/strncmp.S | 260
-rw-r--r--  string/aarch64/strnlen-sve.S | 74
-rw-r--r--  string/aarch64/strnlen.S | 112
-rw-r--r--  string/aarch64/strrchr-mte.S | 127
-rw-r--r--  string/aarch64/strrchr-sve.S | 84
-rw-r--r--  string/aarch64/strrchr.S | 149
-rw-r--r--  string/arm/check-arch.S | 10
-rw-r--r--  string/arm/memchr.S | 132
-rw-r--r--  string/arm/memcpy.S | 587
-rw-r--r--  string/arm/memset.S | 98
-rw-r--r--  string/arm/strcmp-armv6m.S | 117
-rw-r--r--  string/arm/strcmp.S | 475
-rw-r--r--  string/arm/strcpy.c | 133
-rw-r--r--  string/arm/strlen-armv6t2.S | 124
-rw-r--r--  string/asmdefs.h | 98
-rw-r--r--  string/bench/memcpy.c | 260
-rw-r--r--  string/bench/strlen.c | 221
-rw-r--r--  string/include/benchlib.h | 33
-rw-r--r--  string/include/stringlib.h | 69
-rw-r--r--  string/test/__mtag_tag_region.c | 147
-rw-r--r--  string/test/__mtag_tag_zero_region.c | 147
-rw-r--r--  string/test/memchr.c | 110
-rw-r--r--  string/test/memcmp.c | 125
-rw-r--r--  string/test/memcpy.c | 120
-rw-r--r--  string/test/memmove.c | 164
-rw-r--r--  string/test/memrchr.c | 106
-rw-r--r--  string/test/memset.c | 129
-rw-r--r--  string/test/mte.h | 142
-rw-r--r--  string/test/stpcpy.c | 125
-rw-r--r--  string/test/strchr.c | 121
-rw-r--r--  string/test/strchrnul.c | 126
-rw-r--r--  string/test/strcmp.c | 132
-rw-r--r--  string/test/strcpy.c | 123
-rw-r--r--  string/test/stringtest.h | 55
-rw-r--r--  string/test/strlen.c | 103
-rw-r--r--  string/test/strncmp.c | 139
-rw-r--r--  string/test/strnlen.c | 109
-rw-r--r--  string/test/strrchr.c | 121
-rw-r--r--  string/x86_64/check-arch.S | 10
72 files changed, 9253 insertions, 0 deletions
diff --git a/string/Dir.mk b/string/Dir.mk
new file mode 100644
index 000000000000..cf3453f7580d
--- /dev/null
+++ b/string/Dir.mk
@@ -0,0 +1,113 @@
+# Makefile fragment - requires GNU make
+#
+# Copyright (c) 2019-2021, Arm Limited.
+# SPDX-License-Identifier: MIT
+
+S := $(srcdir)/string
+B := build/string
+
+ifeq ($(ARCH),)
+all-string bench-string check-string install-string clean-string:
+ @echo "*** Please set ARCH in config.mk. ***"
+ @exit 1
+else
+
+string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-test-srcs := $(wildcard $(S)/test/*.c)
+string-bench-srcs := $(wildcard $(S)/bench/*.c)
+
+string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+
+string-libs := \
+ build/lib/libstringlib.so \
+ build/lib/libstringlib.a \
+
+string-tests := \
+ build/bin/test/memcpy \
+ build/bin/test/memmove \
+ build/bin/test/memset \
+ build/bin/test/memchr \
+ build/bin/test/memrchr \
+ build/bin/test/memcmp \
+ build/bin/test/__mtag_tag_region \
+ build/bin/test/__mtag_tag_zero_region \
+ build/bin/test/strcpy \
+ build/bin/test/stpcpy \
+ build/bin/test/strcmp \
+ build/bin/test/strchr \
+ build/bin/test/strrchr \
+ build/bin/test/strchrnul \
+ build/bin/test/strlen \
+ build/bin/test/strnlen \
+ build/bin/test/strncmp
+
+string-benches := \
+ build/bin/bench/memcpy \
+ build/bin/bench/strlen
+
+string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
+string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
+string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs)))
+
+string-objs := \
+ $(string-lib-objs) \
+ $(string-lib-objs:%.o=%.os) \
+ $(string-test-objs) \
+ $(string-bench-objs)
+
+string-files := \
+ $(string-objs) \
+ $(string-libs) \
+ $(string-tests) \
+ $(string-benches) \
+ $(string-includes) \
+
+all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
+
+$(string-objs): $(string-includes)
+$(string-objs): CFLAGS_ALL += $(string-cflags)
+
+$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
+
+build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os)
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
+
+build/lib/libstringlib.a: $(string-lib-objs)
+ rm -f $@
+ $(AR) rc $@ $^
+ $(RANLIB) $@
+
+build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+
+build/include/%.h: $(S)/include/%.h
+ cp $< $@
+
+build/bin/%.sh: $(S)/test/%.sh
+ cp $< $@
+
+string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out)
+
+build/string/test/%.out: build/bin/test/%
+ $(EMULATOR) $^ | tee $@.tmp
+ mv $@.tmp $@
+
+check-string: $(string-tests-out)
+ ! grep FAIL $^
+
+bench-string: $(string-benches)
+ $(EMULATOR) build/bin/bench/strlen
+ $(EMULATOR) build/bin/bench/memcpy
+
+install-string: \
+ $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
+ $(string-includes:build/include/%=$(DESTDIR)$(includedir)/%)
+
+clean-string:
+ rm -f $(string-files)
+endif
+
+.PHONY: all-string bench-string check-string install-string clean-string
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
new file mode 100644
index 000000000000..84339f73cf23
--- /dev/null
+++ b/string/aarch64/__mtag_tag_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_region - tag memory
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin x0
+#define count x1
+#define dst x2
+#define dstend x3
+#define tmp x4
+#define zva_val x4
+
+ENTRY (__mtag_tag_region)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+
+ tbnz count, 6, L(set96)
+
+ /* Set 0, 16, 32, or 48 bytes. */
+ lsr tmp, count, 5
+ add tmp, dstin, tmp, lsl 4
+ cbz count, L(end)
+ stg dstin, [dstin]
+ stg dstin, [tmp]
+ stg dstin, [dstend, -16]
+L(end):
+ ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ st2g dstin, [dstin]
+ st2g dstin, [dstin, 32]
+ st2g dstin, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Size is > 96 bytes. */
+L(set_long):
+ cmp count, 160
+ b.lo L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ st2g dstin, [dstin]
+ st2g dstin, [dstin, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc gva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ st2g dstin, [dstend, -64]
+ st2g dstin, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub dst, dstin, 32 /* Dst is biased by -32. */
+ sub count, count, 64 /* Adjust count for loop. */
+L(no_zva_loop):
+ st2g dstin, [dst, 32]
+ st2g dstin, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ st2g dstin, [dstend, -64]
+ st2g dstin, [dstend, -32]
+ ret
+
+END (__mtag_tag_region)
+#endif
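
As a behavioural reference only, the interface contract stated in the header comment (16-byte aligned address, size a multiple of 16, the passed pointer returned) can be modelled in C as below. This is a hedged sketch: stg() is a hypothetical stand-in for the STG instruction, and the real routine batches the work with ST2G and DC GVA as shown above.

    #include <stddef.h>

    extern void stg (void *tag_src, void *addr);   /* hypothetical STG wrapper */

    static void *tag_region_model (void *dst, size_t len)
    {
        /* len is a multiple of 16 and dst is 16-byte aligned (caller contract). */
        for (size_t i = 0; i < len; i += 16)
            stg (dst, (char *) dst + i);           /* store dst's tag at each granule */
        return dst;                                /* returns the passed pointer */
    }
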
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
new file mode 100644
index 000000000000..f58364ca6fcb
--- /dev/null
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_zero_region - tag memory and fill it with zero bytes
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin x0
+#define count x1
+#define dst x2
+#define dstend x3
+#define tmp x4
+#define zva_val x4
+
+ENTRY (__mtag_tag_zero_region)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+
+ tbnz count, 6, L(set96)
+
+ /* Set 0, 16, 32, or 48 bytes. */
+ lsr tmp, count, 5
+ add tmp, dstin, tmp, lsl 4
+ cbz count, L(end)
+ stzg dstin, [dstin]
+ stzg dstin, [tmp]
+ stzg dstin, [dstend, -16]
+L(end):
+ ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ stz2g dstin, [dstin]
+ stz2g dstin, [dstin, 32]
+ stz2g dstin, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Size is > 96 bytes. */
+L(set_long):
+ cmp count, 160
+ b.lo L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ stz2g dstin, [dstin]
+ stz2g dstin, [dstin, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc gzva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stz2g dstin, [dstend, -64]
+ stz2g dstin, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub dst, dstin, 32 /* Dst is biased by -32. */
+ sub count, count, 64 /* Adjust count for loop. */
+L(no_zva_loop):
+ stz2g dstin, [dst, 32]
+ stz2g dstin, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stz2g dstin, [dstend, -64]
+ stz2g dstin, [dstend, -32]
+ ret
+
+END (__mtag_tag_zero_region)
+#endif
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
new file mode 100644
index 000000000000..5a54242d7de6
--- /dev/null
+++ b/string/aarch64/check-arch.S
@@ -0,0 +1,13 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__aarch64__
+# error ARCH setting does not match the compiler.
+#endif
+
+/* Include for GNU property notes. */
+#include "../asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
new file mode 100644
index 000000000000..c2e967d1004e
--- /dev/null
+++ b/string/aarch64/memchr-mte.S
@@ -0,0 +1,116 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__memchr_aarch64_mte)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ bic src, srcin, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+
+ rbit synd, synd
+ clz synd, synd
+ add result, srcin, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, src, srcin
+ add tmp, tmp, 16
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, 16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ add tmp, srcin, cntin
+ sub cntrem, tmp, src
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ cmp cntrem, synd, lsr 2
+ add result, src, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (__memchr_aarch64_mte)
+
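
A scalar C sketch of the 4-bits-per-byte syndrome described in the comment above; this is an illustration only, since the real code builds the syndrome with CMEQ/AND/ADDP and transfers it to a general register with FMOV.

    #include <stdint.h>

    /* Nibble i of the syndrome is 0xf if byte i of the chunk matches c. */
    static int first_match_index (const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= (uint64_t) 0xf << (i * 4);
        if (synd == 0)
            return -1;                         /* no match in this chunk */
        return __builtin_ctzll (synd) / 4;     /* trailing zeros -> byte index */
    }
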
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
new file mode 100644
index 000000000000..c22e6596f19b
--- /dev/null
+++ b/string/aarch64/memchr-sve.S
@@ -0,0 +1,64 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__memchr_aarch64_sve)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ dup z1.b, w1 /* duplicate c to a vector */
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+
+ .p2align 4
+0: whilelo p1.b, x3, x2 /* make sure off < max */
+ b.none 9f
+
+ /* Read a vector's worth of bytes, bounded by max,
+ stopping on first fault. */
+ ldff1b z0.b, p1/z, [x0, x3]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector bounded by max is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x3 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ b.none 0b
+ decb x3 /* undo speculate */
+
+ /* Found C. */
+1: brkb p2.b, p1/z, p2.b /* find the first c */
+ add x0, x0, x3 /* form partial pointer */
+ incp x0, p2.b /* form final pointer to c */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b
+ b.any 1b
+
+ /* No C found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x3, p0.b
+ b 0b
+
+ /* Found end of count. */
+9: mov x0, 0 /* return null */
+ ret
+
+END (__memchr_aarch64_sve)
+
+#endif
+
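
The first-fault loop above can be read as the scalar model below, where V stands for the SVE vector length in bytes. It is a simplification that ignores partial first-fault reads, which the real code handles with RDFFRS and the FFR.

    #include <stddef.h>

    static const void *memchr_sve_model (const unsigned char *s, unsigned char c,
                                         size_t n, size_t V)
    {
        for (size_t off = 0; off < n; off += V) {          /* whilelo: off < max */
            size_t chunk = (n - off < V) ? n - off : V;    /* bound the read by max */
            for (size_t i = 0; i < chunk; i++)             /* cmpeq; brkb; incp */
                if (s[off + i] == c)
                    return s + off + i;
        }
        return 0;                                          /* end of count: NULL */
    }
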
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
new file mode 100644
index 000000000000..353f0d1eac53
--- /dev/null
+++ b/string/aarch64/memchr.S
@@ -0,0 +1,146 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte matched.
+ */
+
+ENTRY (__memchr_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq L(loop)
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls L(end)
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, L(loop)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.d[0]
+ /* Only do the clear for the last possible block */
+ b.hs L(tail)
+
+L(masklast):
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+L(tail):
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+
+END (__memchr_aarch64)
+
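
A scalar sketch of the two-bits-per-byte syndrome and of the masking applied to the first, unaligned 32-byte block (soff is the misalignment within the block, as in the code above); the NEON code arrives at the same value via the 0x40100401 mask and ADDP reductions.

    #include <stdint.h>

    static uint64_t syndrome32 (const unsigned char *block, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 32; i++)              /* bit 0 of each 2-bit tuple */
            if (block[i] == c)
                synd |= (uint64_t) 1 << (i * 2);
        return synd;
    }

    /* For the first block, matches in the soff bytes before the real start are
       discarded by shifting the low soff*2 bits out and back in, exactly as the
       lsr/lsl pair does above.  */
    static uint64_t mask_leading (uint64_t synd, unsigned soff)   /* soff < 32 */
    {
        return (synd >> (2 * soff)) << (2 * soff);
    }
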
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
new file mode 100644
index 000000000000..78c5ecaa4cdc
--- /dev/null
+++ b/string/aarch64/memcmp-sve.S
@@ -0,0 +1,51 @@
+/*
+ * memcmp - compare memory
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__memcmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ld1b z0.b, p0/z, [x0, x3] /* read vectors bounded by max. */
+ ld1b z1.b, p0/z, [x1, x3]
+
+ /* Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+
+ cmpne p1.b, p0/z, z0.b, z1.b /* while no inequalities */
+ b.none 0b
+
+ /* Found inequality. */
+1: brkb p1.b, p0/z, p1.b /* find first such */
+ lasta w0, p1, z0.b /* extract each byte */
+ lasta w1, p1, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equality */
+ ret
+
+END (__memcmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
new file mode 100644
index 000000000000..3b1026642eee
--- /dev/null
+++ b/string/aarch64/memcmp.S
@@ -0,0 +1,137 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#include "../asmdefs.h"
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
+
+ENTRY (__memcmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
+ ret
+
+END (__memcmp_aarch64)
+
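
On little-endian, the L(return) sequence byte-reverses both 8-byte words so that a plain unsigned comparison orders them by the first differing byte. A small C sketch of that computation, assuming the 64-bit loads used above:

    #include <stdint.h>

    static int return_value (uint64_t data1, uint64_t data2)
    {
        data1 = __builtin_bswap64 (data1);   /* the rev instructions */
        data2 = __builtin_bswap64 (data2);
        if (data1 == data2)
            return 0;                        /* cset/cneg yield 0 */
        return data1 < data2 ? -1 : 1;       /* ordered by first differing byte */
    }
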
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
new file mode 100644
index 000000000000..f97f2c3047b9
--- /dev/null
+++ b/string/aarch64/memcpy-advsimd.S
@@ -0,0 +1,206 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_simd)
+ENTRY (__memcpy_aarch64_simd)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+ ret
+
+END (__memcpy_aarch64_simd)
+
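
The structure described in the block comment (small, medium and large copies, with the overlap check only on the large path) can be summarised by the C sketch below. The copy_* helpers are hypothetical placeholders for the load/store sequences above, not real functions.

    #include <stddef.h>
    #include <stdint.h>

    extern void copy_0_32 (void *, const void *, size_t);
    extern void copy_33_128 (void *, const void *, size_t);
    extern void copy_large_forwards (void *, const void *, size_t);
    extern void copy_large_backwards (void *, const void *, size_t);

    void *memcpy_dispatch_sketch (void *dst, const void *src, size_t n)
    {
        if (n <= 32)
            copy_0_32 (dst, src, n);             /* overlapping small accesses */
        else if (n <= 128)
            copy_33_128 (dst, src, n);           /* Q-register pairs from both ends */
        else if ((uintptr_t) dst - (uintptr_t) src >= n)
            copy_large_forwards (dst, src, n);   /* 64 bytes per iteration */
        else
            copy_large_backwards (dst, src, n);  /* overlap: copy from the end */
        return dst;
    }
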
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
new file mode 100644
index 000000000000..dd254f6f9929
--- /dev/null
+++ b/string/aarch64/memcpy.S
@@ -0,0 +1,243 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64)
+ENTRY (__memcpy_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+END (__memcpy_aarch64)
+
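
The overlap test at L(copy_long) relies on unsigned wrap-around: one subtract-and-compare decides whether a forward copy would overwrite source bytes that have not been read yet (dst == src is filtered out earlier by the cbz). A one-line C equivalent of the check:

    #include <stddef.h>
    #include <stdint.h>

    /* True when dst lies inside [src, src + n), so the copy must run backwards.
       Matches: sub tmp1, dstin, src; cmp tmp1, count; b.lo L(copy_long_backwards). */
    static int must_copy_backwards (uintptr_t dst, uintptr_t src, size_t n)
    {
        return (uintptr_t) (dst - src) < (uintptr_t) n;
    }
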
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
new file mode 100644
index 000000000000..7b4be847cecb
--- /dev/null
+++ b/string/aarch64/memrchr.S
@@ -0,0 +1,117 @@
+/*
+ * memrchr - find last character in a memory zone.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+#define end x8
+#define endm1 x9
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__memrchr_aarch64)
+ PTR_ARG (0)
+ add end, srcin, cntin
+ sub endm1, end, 1
+ bic src, endm1, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ neg shift, end, lsl 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsl synd, synd, shift
+ cbz synd, L(start_loop)
+
+ clz synd, synd
+ sub result, endm1, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, -16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, -16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+
+ add tmp, src, 15
+#ifdef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ sub tmp, tmp, synd, lsr 2
+ cmp tmp, srcin
+ csel result, tmp, xzr, hs
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (__memrchr_aarch64)
+
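
Because memrchr searches from the end, it masks off syndrome bits past the end of the buffer and counts leading rather than trailing zeros to locate the last match. A scalar sketch using the same 4-bits-per-byte syndrome:

    #include <stdint.h>

    /* Index of the last matching byte in a 16-byte chunk, or -1 if none. */
    static int last_match_index (const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= (uint64_t) 0xf << (i * 4);
        if (synd == 0)
            return -1;
        return 15 - __builtin_clzll (synd) / 4;   /* leading zeros -> last byte */
    }
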
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
new file mode 100644
index 000000000000..9fcd97579913
--- /dev/null
+++ b/string/aarch64/memset.S
@@ -0,0 +1,117 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+
+ENTRY (__memset_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ .p2align 4
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_aarch64)
+
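
Whether the DC ZVA path may be used is decided by reading DCZID_EL0, as in the SKIP_ZVA_CHECK block above. A hedged C sketch of that check; read_dczid_el0() is a hypothetical accessor standing in for the MRS instruction:

    #include <stdint.h>

    extern uint64_t read_dczid_el0 (void);   /* hypothetical: mrs x, dczid_el0 */

    /* DCZID_EL0[3:0] is log2 of the block size in 4-byte words and bit 4 is the
       "zeroing prohibited" flag, so masking with 31 and comparing with 4 accepts
       only a permitted 64-byte block -- the case the code above requires.  */
    static int zva_block_is_64_bytes (void)
    {
        return (read_dczid_el0 () & 31) == 4;
    }
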
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
new file mode 100644
index 000000000000..f1c711906515
--- /dev/null
+++ b/string/aarch64/stpcpy-mte.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
new file mode 100644
index 000000000000..82dd9717b0a0
--- /dev/null
+++ b/string/aarch64/stpcpy-sve.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-sve.S"
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
new file mode 100644
index 000000000000..4f62aa462389
--- /dev/null
+++ b/string/aarch64/stpcpy.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy.S"
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
new file mode 100644
index 000000000000..dcb0e4625870
--- /dev/null
+++ b/string/aarch64/strchr-mte.S
@@ -0,0 +1,105 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define wtmp2 w3
+#define tmp3 x3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v6
+#define dend d6
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+   bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchr_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
+
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp3
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ /* Tmp1 is an even multiple of 2 if the target character was
+ found first. Otherwise we've found the end of string. */
+ tst tmp1, 2
+ add result, srcin, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+#else
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov tmp1, dend
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ /* Tmp1 is an even multiple of 2 if the target character was
+ found first. Otherwise we've found the end of string. */
+ tst tmp1, 2
+ add result, src, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+
+END (__strchr_aarch64_mte)
+
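
The syndrome here records two facts per byte, so one trailing-zero count tells both where the scan stopped and why: bit 1 of the count is clear for a character match and set when a NUL terminator came first. A scalar sketch of that decision:

    #include <stdint.h>

    /* Assumes the chunk is known to contain c or a NUL (the loop above only
       exits when one of the two has been seen).  */
    static const char *strchr_chunk_model (const char *chunk, char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++) {
            if (chunk[i] == c)
                synd |= (uint64_t) 0x3 << (i * 4);   /* bits 0-1: match */
            if (chunk[i] == 0 || chunk[i] == c)
                synd |= (uint64_t) 0xc << (i * 4);   /* bits 2-3: NUL or match */
        }
        int bit = __builtin_ctzll (synd);
        return (bit & 2) ? 0 : chunk + bit / 4;      /* tst tmp1, 2; csel */
    }
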
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
new file mode 100644
index 000000000000..13ba9f44f9c5
--- /dev/null
+++ b/string/aarch64/strchr-sve.S
@@ -0,0 +1,70 @@
+/*
+ * strchr/strchrnul - find a character in a string
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */
+#ifdef BUILD_STRCHRNUL
+#define FUNC __strchrnul_aarch64_sve
+#else
+#define FUNC __strchr_aarch64_sve
+#endif
+
+ENTRY (FUNC)
+ PTR_ARG (0)
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0 /* speculate increment */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */
+ b.none 0b
+ decb x0 /* undo speculate */
+
+ /* Found C or 0. */
+1: brka p4.b, p1/z, p4.b /* find first such */
+ sub x0, x0, 1 /* adjust pointer for that byte */
+ incp x0, p4.b
+#ifndef BUILD_STRCHRNUL
+ ptest p4, p2.b /* was first in c? */
+ csel x0, xzr, x0, none /* if there was no c, return null */
+#endif
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+	   Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */
+ cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */
+ b.any 1b
+
+ /* No C or 0 found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x0, p0.b
+ b 0b
+
+END (FUNC)
+
+#endif
+
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
new file mode 100644
index 000000000000..1063cbfd77aa
--- /dev/null
+++ b/string/aarch64/strchr.S
@@ -0,0 +1,126 @@
+/*
+ * strchr - find a character in a string
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+   off bit 0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+/* Locals and temporaries. */
+
+ENTRY (__strchr_aarch64)
+ PTR_ARG (0)
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(loop)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, L(tail)
+
+ .p2align 4
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ mov tmp1, vend1.d[0]
+L(tail):
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* And counting the leading zeros. */
+	/* Tmp1 is even if the target character was found first. Otherwise
+ we've found the end of string and we weren't looking for NUL. */
+ tst tmp1, #1
+ add result, src, tmp1, lsr #1
+ csel result, result, xzr, eq
+ ret
+
+END (__strchr_aarch64)
+
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
new file mode 100644
index 000000000000..1b0d0a63094c
--- /dev/null
+++ b/string/aarch64/strchrnul-mte.S
@@ -0,0 +1,84 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp1 x1
+#define tmp2 x3
+#define tmp2w w3
+
+#define vrepchr v0
+#define vdata v1
+#define qdata q1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vend v5
+#define dend d5
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strchrnul_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ ld1 {vdata.16b}, [src]
+ mov tmp2w, 0xf00f
+ dup vrepmask.8h, tmp2w
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ lsl tmp2, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
+ cbz tmp1, L(loop)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ add result, srcin, tmp1, lsr 2
+ ret
+
+ .p2align 4
+L(loop):
+ ldr qdata, [src, 16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
+ fmov tmp1, dend
+ cbz tmp1, L(loop)
+
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov tmp1, dend
+#ifndef __AARCH64EB__
+ rbit tmp1, tmp1
+#endif
+ clz tmp1, tmp1
+ add result, src, tmp1, lsr 2
+ ret
+
+END (__strchrnul_aarch64_mte)
+
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
new file mode 100644
index 000000000000..428ff1a3d008
--- /dev/null
+++ b/string/aarch64/strchrnul-sve.S
@@ -0,0 +1,9 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2018-2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STRCHRNUL
+#include "strchr-sve.S"
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
new file mode 100644
index 000000000000..a4230d919b47
--- /dev/null
+++ b/string/aarch64/strchrnul.S
@@ -0,0 +1,114 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ENTRY (__strchrnul_aarch64)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq L(loop)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, L(tail)
+
+ .p2align 4
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.d[0]
+L(tail):
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+END (__strchrnul_aarch64)
+
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
new file mode 100644
index 000000000000..12d1a6b51dd3
--- /dev/null
+++ b/string/aarch64/strcmp-mte.S
@@ -0,0 +1,189 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define src1 x0
+#define src2 x1
+#define result x0
+
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define off1 x5
+#define syndrome x6
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian, early bytes are at the MSB end; on little-endian, at the LSB end.
+   LS_FW means shifting towards the early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
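+
+/* Illustrative only: a scalar C model of the detection step (hypothetical
+   helper, 64-bit unsigned arithmetic assumed):
+
+     static inline int
+     has_nul_byte (unsigned long long x)
+     {
+       return ((x - 0x0101010101010101ULL)
+               & ~(x | 0x7f7f7f7f7f7f7f7fULL)) != 0;
+     }
+
+   The result is non-zero exactly when one of the eight bytes of X is zero;
+   the carry propagation mentioned above only disturbs which bits end up set,
+   not the zero/non-zero answer. */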
+
+
+ENTRY (__strcmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
+ b.ne L(misaligned8)
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
+L(loop_aligned):
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+L(start_realigned):
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ rev data2, data2
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, shift
+ lsl data2, data2, shift
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
+ ret
+
+ .p2align 4
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
+L(do_misaligned):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, 7
+ b.ne L(do_misaligned)
+
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, has_nul
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64_mte)
+
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
new file mode 100644
index 000000000000..e6d2da5411ca
--- /dev/null
+++ b/string/aarch64/strcmp-sve.S
@@ -0,0 +1,59 @@
+/*
+ * __strcmp_aarch64_sve - compare two strings
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strcmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ setffr /* initialize FFR */
+ ptrue p1.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+ .p2align 4
+0: ldff1b z0.b, p1/z, [x0, x2]
+ ldff1b z1.b, p1/z, [x1, x2]
+ rdffrs p0.b, p1/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x2, all /* skip bytes for next round */
+ cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings */
+ cmpne p3.b, p1/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p1/z, p2.b, p3.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p1/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: incp x2, p0.b /* skip bytes for next round */
+ setffr /* re-init FFR for next round */
+ cmpeq p2.b, p0/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p0/z, z0.b, 0
+ nands p2.b, p0/z, p2.b, p3.b
+ b.none 0b
+ b 1b
+
+END (__strcmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
new file mode 100644
index 000000000000..7714ebf5577d
--- /dev/null
+++ b/string/aarch64/strcmp.S
@@ -0,0 +1,173 @@
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+ENTRY (__strcmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+L(end):
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+	   the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond a page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of a 4K page. If so, jump
+	   back to the misaligned loop to compare a byte at a time. */
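+	/* Illustrative only: the and/eor/cbz sequence below is equivalent to
+	   the hypothetical C test
+	     ((unsigned long) src2 & 0xff8) == 0xff8
+	   i.e. SRC2 points into the last 8 bytes of a 4K page. */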
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
+ sub result, data1, data2
+ ret
+
+END (__strcmp_aarch64)
+
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
new file mode 100644
index 000000000000..88c222d61e53
--- /dev/null
+++ b/string/aarch64/strcpy-mte.S
@@ -0,0 +1,161 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define srcin x1
+#define result x0
+
+#define src x2
+#define dst x3
+#define len x4
+#define synd x4
+#define tmp x5
+#define wtmp w5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+#define dataq2 q1
+
+#ifdef BUILD_STPCPY
+# define STRCPY __stpcpy_aarch64_mte
+# define IFSTPCPY(X,...) X,__VA_ARGS__
+#else
+# define STRCPY __strcpy_aarch64_mte
+# define IFSTPCPY(X,...)
+#endif
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
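+
+/* Illustrative only: a scalar little-endian model (hypothetical helper) of
+   the merged syndrome described above: nibble i of the result is 0xf when
+   byte i of the 16-byte chunk is NUL, so a trailing-zero count divided by
+   four yields the byte index.
+
+     static unsigned long long
+     model_syndrome (const unsigned char *chunk)   // 16 bytes
+     {
+       unsigned long long syndrome = 0;
+       for (int i = 0; i < 16; i++)
+         if (chunk[i] == 0)
+           syndrome |= 0xfULL << (4 * i);
+       return syndrome;
+     }
+*/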
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4,,8
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+
+ .p2align 4
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
+ str data1, [dstin]
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(start_loop):
+ sub len, src, srcin
+ ldr dataq2, [srcin]
+ add dst, dstin, len
+ str dataq2, [dstin]
+
+ .p2align 5
+L(loop):
+ str dataq, [dst], 16
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz len, synd
+ lsr len, len, 2
+ sub tmp, len, 15
+ ldr dataq, [src, tmp]
+ str dataq, [dst, tmp]
+ IFSTPCPY (add result, dst, len)
+ ret
+
+END (STRCPY)
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
new file mode 100644
index 000000000000..f515462e09ae
--- /dev/null
+++ b/string/aarch64/strcpy-sve.S
@@ -0,0 +1,71 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */
+#ifdef BUILD_STPCPY
+#define FUNC __stpcpy_aarch64_sve
+#else
+#define FUNC __strcpy_aarch64_sve
+#endif
+
+ENTRY (FUNC)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ setffr /* initialize FFR */
+ ptrue p2.b, all /* all ones; loop invariant */
+ mov x2, 0 /* initialize offset */
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p2/z, [x1, x2]
+ rdffrs p0.b, p2/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+	   Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p1.b, p2/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the whole vector and loop. */
+ st1b z0.b, p2, [x0, x2]
+ incb x2, all
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+1: cmpeq p1.b, p0/z, z0.b, 0 /* search for zeros */
+ b.any 2f
+
+ /* No zero found. Store the valid portion of the vector and loop. */
+ setffr /* re-init FFR */
+ st1b z0.b, p0, [x0, x2]
+ incp x2, p0.b
+ b 0b
+
+ /* Zero found. Crop the vector to the found zero and finish. */
+2: brka p0.b, p2/z, p1.b
+ st1b z0.b, p0, [x0, x2]
+#ifdef BUILD_STPCPY
+ add x0, x0, x2
+ sub x0, x0, 1
+ incp x0, p0.b
+#endif
+ ret
+
+END (FUNC)
+
+#endif
+
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
new file mode 100644
index 000000000000..6e9ed424b693
--- /dev/null
+++ b/string/aarch64/strcpy.S
@@ -0,0 +1,311 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+#include "../asmdefs.h"
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY __stpcpy_aarch64
+#else
+#define STRCPY __strcpy_aarch64
+#endif
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
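+
+/* Illustrative only: the entry check below is equivalent to the hypothetical
+   C predicate
+
+     static inline int
+     first_fetch_may_cross_page (unsigned long srcin)
+     {
+       return (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
+     }
+
+   i.e. the initial 16-byte fetch can straddle a (minimum-sized) page only
+   when the offset within the page exceeds MIN_PAGE_SIZE - 16. */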
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt L(page_cross)
+
+L(page_cross_ok):
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq L(bulk_entry)
+
+	/* The string is short (<=16 bytes), but we don't yet know exactly
+	   how short. Work out the exact length so that we can quickly
+	   select the optimal copy strategy. */
+L(fp_gt8):
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+L(fp_le8):
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt L(fp_lt4)
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+L(fp_lt2):
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+	/* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+L(bulk_entry):
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b L(entry_no_page_cross)
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+L(main_loop):
+ stp data1, data2, [dst], #16
+L(entry_no_page_cross):
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(main_loop)
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+L(page_cross):
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(page_cross_ok)
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, L(fp_le8)
+ bic has_nul2, tmp3, tmp4
+ b L(fp_gt8)
+
+END (STRCPY)
+
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
new file mode 100644
index 000000000000..7cf41d5c1eac
--- /dev/null
+++ b/string/aarch64/strlen-mte.S
@@ -0,0 +1,80 @@
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define result x0
+
+#define src x1
+#define synd x2
+#define tmp x3
+#define wtmp w3
+#define shift x4
+
+#define data q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/* Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strlen_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz result, synd
+ lsr result, result, 2
+ ret
+
+ .p2align 5
+L(loop):
+ ldr data, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ sub result, src, srcin
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
+END (__strlen_aarch64_mte)
+
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
new file mode 100644
index 000000000000..2392493f1a3c
--- /dev/null
+++ b/string/aarch64/strlen-sve.S
@@ -0,0 +1,55 @@
+/*
+ * __strlen_aarch64_sve - compute the length of a string
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strlen_aarch64_sve)
+ PTR_ARG (0)
+ setffr /* initialize FFR */
+ ptrue p2.b /* all ones; loop invariant */
+ mov x1, 0 /* initialize length */
+
+ /* Read a vector's worth of bytes, stopping on first fault. */
+ .p2align 4
+0: ldff1b z0.b, p2/z, [x0, x1]
+ rdffrs p0.b, p2/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x1, all /* speculate increment */
+ cmpeq p1.b, p2/z, z0.b, 0 /* loop if no zeros */
+ b.none 0b
+ decb x1, all /* undo speculate */
+
+ /* Zero found. Select the bytes before the first and count them. */
+1: brkb p0.b, p2/z, p1.b
+ incp x1, p0.b
+ mov x0, x1
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p1.b, p0/z, z0.b, 0
+ b.any 1b
+
+ /* No zero found. Re-init FFR, increment, and loop. */
+ setffr
+ incp x1, p0.b
+ b 0b
+
+END (__strlen_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
new file mode 100644
index 000000000000..a1b164a49238
--- /dev/null
+++ b/string/aarch64/strlen.S
@@ -0,0 +1,200 @@
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 32
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address
+   and ignore any characters before the string. If the data contains a NUL
+   character, return the length; if not, continue in the main loop. */
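+
+/* Illustrative only: a rough little-endian C model (hypothetical helper,
+   GCC-style builtin) of how the length is derived from the two 8-byte words
+   once their has_nul values are known:
+
+     static unsigned
+     first16_len (unsigned long long has_nul1, unsigned long long has_nul2)
+     {
+       unsigned long long nul = has_nul1 ? has_nul1 : has_nul2;
+       unsigned base = has_nul1 ? 0 : 8;
+       return base + __builtin_ctzll (nul) / 8;   // lowest marker -> byte index
+     }
+
+   The code below computes the same result branchlessly with rev, clz and
+   conditional selects. */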
+
+ENTRY (__strlen_aarch64)
+ PTR_ARG (0)
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
+ ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and to exit early,
+	   byte-swap the data now so has_nul1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ b.eq L(bytes16_31)
+
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ csel len, xzr, len, cc
+ clz tmp1, has_nul1
+ add len, len, tmp1, lsr 3
+ ret
+
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ b.eq L(loop_entry)
+
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 24
+ rev has_nul1, has_nul1
+ mov tmp3, 16
+ clz tmp1, has_nul1
+ csel len, tmp3, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+L(loop_entry):
+ bic src, srcin, 31
+
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
+#ifdef __AARCH64EB__
+ bic maskv.8h, 0xf0
+#else
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
+END (__strlen_aarch64)
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
new file mode 100644
index 000000000000..c9d6fc8a158b
--- /dev/null
+++ b/string/aarch64/strncmp-mte.S
@@ -0,0 +1,307 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define mask x13
+#define endloop x14
+#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian-dependent shift operations.
+   On big-endian, early bytes are at the MSB end; on little-endian, at the LSB end.
+   LS_FW means shifting towards the early bytes.
+   LS_BK means shifting towards the later bytes. */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
+
+ENTRY (__strncmp_aarch64_mte)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit, limit, #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of main loop */
+
+L(full_check):
+#ifndef __AARCH64EB__
+ orr syndrome, diff, has_nul
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
+ rev syndrome, syndrome
+ rev data1, data1
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ cmp limit, pos, lsr #3
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
+ ret
+#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+	lsl	limit, tmp1, #3	/* Bytes -> bits. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+L(end_quick):
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
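+	/* Illustrative only: the adds/csinv pair used below to adjust the
+	   limit is a saturating add, roughly (hypothetical C):
+	     limit = limit + count;
+	     if (limit < count)		// wrapped past ULONG_MAX
+	       limit = ~0UL;  */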
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ cbz count, L(src1_aligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
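+
+	/* Illustrative only: on little-endian, STEP_A amounts to the
+	   hypothetical C expression (off_bits = (src2 & 7) * 8, non-zero)
+	     data2 = (tmp1 >> off_bits) | (tmp2 << (64 - off_bits));
+	   i.e. one aligned dword of SRC2 is assembled from the two halves
+	   held in tmp1 and tmp2. */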
+L(src1_aligned):
+	/* Calculate the offset from 8-byte alignment to the string start, in
+	   bits. No need to mask the offset since shifts ignore the upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
+ ldr data1, [src1], #8
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
+
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
+
+L(ret0):
+ mov result, #0
+ ret
+END (__strncmp_aarch64_mte)
+
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
new file mode 100644
index 000000000000..234190e245b0
--- /dev/null
+++ b/string/aarch64/strncmp-sve.S
@@ -0,0 +1,69 @@
+/*
+ * strncmp - compare two strings with limit
+ *
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strncmp_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ setffr /* initialize FFR */
+ mov x3, 0 /* initialize off */
+
+0: whilelo p0.b, x3, x2 /* while off < max */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x3]
+ ldff1b z1.b, p0/z, [x1, x3]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Increment for a whole vector, even if we've only read a partial.
+ This is significantly cheaper than INCP, and since OFF is not
+ used after the loop it is ok to increment OFF past MAX. */
+ incb x3
+ cmpeq p1.b, p0/z, z0.b, z1.b /* compare strings */
+ cmpne p2.b, p0/z, z0.b, 0 /* search for ~zero */
+ nands p2.b, p0/z, p1.b, p2.b /* ~(eq & ~zero) -> ne | zero */
+ b.none 0b
+
+ /* Found end-of-string or inequality. */
+1: brkb p2.b, p0/z, p2.b /* find first such */
+ lasta w0, p2, z0.b /* extract each char */
+ lasta w1, p2, z1.b
+ sub x0, x0, x1 /* return comparison */
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings, as above */
+ cmpne p3.b, p1/z, z0.b, 0
+ nands p2.b, p1/z, p2.b, p3.b
+ b.any 1b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x3, p1.b
+ b 0b
+
+ /* Found end-of-count. */
+9: mov x0, 0 /* return equal */
+ ret
+
+END (__strncmp_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
new file mode 100644
index 000000000000..738b6539cab6
--- /dev/null
+++ b/string/aarch64/strncmp.S
@@ -0,0 +1,260 @@
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include "../asmdefs.h"
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define limit_wd x13
+#define mask x14
+#define endloop x15
+#define count mask
+
+ENTRY (__strncmp_aarch64)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ and count, src1, #7
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ .p2align 4
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit_wd, limit_wd, #1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq L(loop_aligned)
+ /* End of main loop */
+
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, L(not_limit)
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq L(not_limit)
+
+	lsl	limit, limit, #3	/* Bytes -> bits. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
+ orr syndrome, diff, has_nul
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
+#endif
+ and tmp3, limit_wd, #7
+ lsr limit_wd, limit_wd, #3
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, count
+ add tmp3, tmp3, count
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ add limit_wd, limit_wd, tmp3, lsr #3
+ b L(start_realigned)
+
+ .p2align 4
+ /* Don't bother with dwords for up to 16 bytes. */
+L(misaligned8):
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+L(done):
+ sub result, data1, data2
+ ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ lsr limit_wd, limit, #3
+ cbz count, L(do_misaligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+L(do_misaligned):
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+	   loop, we fetch one fewer dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo L(done_loop)
+L(loop_misaligned):
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, L(page_end_loop)
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+ subs limit_wd, limit_wd, #1
+ b.pl L(loop_misaligned)
+
+L(done_loop):
+	/* We found a difference or a NUL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, L(not_limit)
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+
+L(ret0):
+ mov result, #0
+ ret
+
+END (__strncmp_aarch64)
+
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
new file mode 100644
index 000000000000..5b9ebf7763bc
--- /dev/null
+++ b/string/aarch64/strnlen-sve.S
@@ -0,0 +1,74 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strnlen_aarch64_sve)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ setffr /* initialize FFR */
+ mov x2, 0 /* initialize len */
+ b 1f
+
+ .p2align 4
+ /* We have off + vl <= max, and so may read the whole vector. */
+0: ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ cmpeq p2.b, p0/z, z0.b, 0
+ b.any 8f
+ incb x2
+
+1: whilelo p0.b, x2, x1
+ b.last 0b
+
+	/* We have off + vl > max, so a whole-vector read would pass max.
+	   Test for off == max before proceeding. */
+ b.none 9f
+
+ ldff1b z0.b, p0/z, [x0, x2]
+ rdffrs p1.b, p0/z
+ b.nlast 2f
+
+ /* First fault did not fail: the vector up to max is valid.
+ Avoid depending on the contents of FFR beyond the branch.
+ Compare for end-of-string, but there are no more bytes. */
+ cmpeq p2.b, p0/z, z0.b, 0
+
+ /* Found end-of-string or zero. */
+8: brkb p2.b, p0/z, p2.b
+ mov x0, x2
+ incp x0, p2.b
+ ret
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparison only on the valid bytes. */
+2: cmpeq p2.b, p1/z, z0.b, 0
+ b.any 8b
+
+ /* No inequality or zero found. Re-init FFR, incr and loop. */
+ setffr
+ incp x2, p1.b
+ b 1b
+
+ /* End of count. Return max. */
+9: mov x0, x1
+ ret
+
+END (__strnlen_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
new file mode 100644
index 000000000000..48d2495d2082
--- /dev/null
+++ b/string/aarch64/strnlen.S
@@ -0,0 +1,112 @@
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define cntin x1
+#define result x0
+
+#define src x2
+#define synd x3
+#define shift x4
+#define wtmp w4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (__strnlen_aarch64)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
+
+L(start_loop):
+ sub tmp, src, srcin
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+	/* Make sure that it won't overread by a 16-byte chunk. */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 5
+L(loop32):
+ ldr qdata, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src], 16
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
+ sub result, src, srcin
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
+ ret
+
+L(nomatch):
+ mov result, cntin
+ ret
+
+END (__strnlen_aarch64)
+
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
new file mode 100644
index 000000000000..1e4fb1a68f7e
--- /dev/null
+++ b/string/aarch64/strrchr-mte.S
@@ -0,0 +1,127 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp x3
+#define wtmp w3
+#define synd x3
+#define shift x4
+#define src_match x4
+#define nul_match x5
+#define chr_match x6
+
+#define vrepchr v0
+#define vdata v1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v5
+#define dend d5
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value, with
+ four bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bits 0-1 are set if
+ the relevant byte matched the requested character; bits 2-3 are set
+ if the relevant byte matched the NUL end of string. */
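
The tail code below recovers the answer from that syndrome: it masks off character matches that occur after the first NUL and then takes the highest remaining match. A hedged C sketch of that step (names are illustrative, not part of the patch):

  #include <stdint.h>

  /* chr_mask has bits 0-1 set per matching byte, nul_mask has bits 2-3 set
     per NUL byte (four bits per byte, as described above).  (nul_mask - 1)
     covers everything below the first NUL bit, so ANDing keeps only matches
     at or before the terminator; the highest surviving bit is the last one.  */
  static inline int last_match_index (uint64_t chr_mask, uint64_t nul_mask)
  {
    if (nul_mask != 0)
      chr_mask &= nul_mask - 1;
    if (chr_mask == 0)
      return -1;                                    /* no match: return NULL */
    return (63 - __builtin_clzll (chr_mask)) / 4;   /* byte index of last match */
  }
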
+
+ENTRY (__strrchr_aarch64_mte)
+ PTR_ARG (0)
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ lsl shift, srcin, 2
+ fmov synd, dend
+ lsr synd, synd, shift
+ lsl synd, synd, shift
+ ands nul_match, synd, 0xcccccccccccccccc
+ bne L(tail)
+ cbnz synd, L(loop2)
+
+ .p2align 5
+L(loop1):
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop1)
+
+ cmeq vhas_nul.16b, vdata.16b, 0
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ ands nul_match, synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+L(tail):
+ sub nul_match, nul_match, 1
+ and chr_match, synd, 0x3333333333333333
+ ands chr_match, chr_match, nul_match
+ sub result, src, 1
+ clz tmp, chr_match
+ sub result, result, tmp, lsr 2
+ csel result, result, xzr, ne
+ ret
+
+ .p2align 4
+L(loop2):
+ cmp synd, 0
+ csel src_match, src, src_match, ne
+ csel chr_match, synd, chr_match, ne
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ tst synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ and nul_match, synd, 0xcccccccccccccccc
+ sub nul_match, nul_match, 1
+ and tmp, synd, 0x3333333333333333
+ ands tmp, tmp, nul_match
+ csel chr_match, tmp, chr_match, ne
+ csel src_match, src, src_match, ne
+ sub src_match, src_match, 1
+ clz tmp, chr_match
+ sub result, src_match, tmp, lsr 2
+ ret
+
+END (__strrchr_aarch64_mte)
+
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
new file mode 100644
index 000000000000..d36d69af37fd
--- /dev/null
+++ b/string/aarch64/strrchr-sve.S
@@ -0,0 +1,84 @@
+/*
+ * strrchr - find the last occurrence of a character in a string
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_SVE
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * SVE Available.
+ */
+
+ENTRY (__strrchr_aarch64_sve)
+ PTR_ARG (0)
+ dup z1.b, w1 /* replicate byte across vector */
+ setffr /* initialize FFR */
+ ptrue p1.b /* all ones; loop invariant */
+ mov x2, 0 /* no match found so far */
+ pfalse p2.b
+
+ .p2align 4
+ /* Read a vector's worth of bytes, stopping on first fault. */
+0: ldff1b z0.b, p1/z, [x0, xzr]
+ rdffrs p0.b, p1/z
+ b.nlast 1f
+
+ /* First fault did not fail: the whole vector is valid.
+ Avoid depending on the contents of FFR beyond the branch. */
+ incb x0, all /* skip bytes this round */
+ cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */
+ b.any 3f
+
+ cmpeq p3.b, p1/z, z0.b, z1.b /* search for c; no eos */
+ b.none 0b
+
+ mov x2, x0 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* First fault failed: only some of the vector is valid.
+ Perform the comparisons only on the valid bytes. */
+1: cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */
+ b.any 2f
+
+ cmpeq p3.b, p0/z, z0.b, z1.b /* search for c; no eos */
+ mov x3, x0
+ incp x0, p0.b /* skip bytes this round */
+ setffr /* re-init FFR */
+ b.none 0b
+
+ addvl x2, x3, 1 /* save advanced base */
+ mov p2.b, p3.b /* save current search */
+ b 0b
+
+ /* Found end-of-string. */
+2: incb x0, all /* advance base */
+3: brka p3.b, p1/z, p3.b /* mask after first 0 */
+ cmpeq p3.b, p3/z, z0.b, z1.b /* search for c not after eos */
+ b.any 4f
+
+ /* No C within last vector. Did we have one before? */
+ cbz x2, 5f
+ mov x0, x2 /* restore advanced base */
+ mov p3.b, p2.b /* restore saved search */
+
+ /* Find the *last* match in the predicate. This is slightly
+ more complicated than finding the first match. */
+4: rev p3.b, p3.b /* reverse the bits */
+ brka p3.b, p1/z, p3.b /* find position of last match */
+ decp x0, p3.b /* retard pointer to last match */
+ ret
+
+ /* No C whatsoever. Return NULL. */
+5: mov x0, 0
+ ret
+
+END (__strrchr_aarch64_sve)
+
+#endif
+
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
new file mode 100644
index 000000000000..56185ff534e3
--- /dev/null
+++ b/string/aarch64/strrchr.S
@@ -0,0 +1,149 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+ENTRY (__strrchr_aarch64)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(aligned)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, L(tail)
+
+ .p2align 4
+L(loop):
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+L(aligned):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ uminp vend1.16b, vdata1.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ mov chr_match, vend1.d[1]
+ cbz nul_match, L(loop)
+
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.d[0]
+
+L(tail):
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END (__strrchr_aarch64)
+
diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S
new file mode 100644
index 000000000000..1cff9345e343
--- /dev/null
+++ b/string/arm/check-arch.S
@@ -0,0 +1,10 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__arm__
+# error ARCH setting does not match the compiler.
+#endif
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
new file mode 100644
index 000000000000..3f1ac4df136f
--- /dev/null
+++ b/string/arm/memchr.S
@@ -0,0 +1,132 @@
+/*
+ * memchr - scan memory for a character
+ *
+ * Copyright (c) 2010-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This __memchr_arm routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors. It has a fast path for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
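
A hedged C sketch of the word-at-a-time test the assembly below performs with eor/uadd8/sel: replicate the target byte, XOR so that matching bytes become zero, then derive a 00/ff per-byte flag word that CHARTSTMASK-style tests can probe. The helper name and loop are illustrative only.

  #include <stdint.h>

  static inline uint32_t match_flags (uint32_t word, uint8_t c)
  {
    uint32_t x = word ^ (0x01010101u * c);   /* matching bytes become 0x00 */
    uint32_t flags = 0;
    for (int i = 0; i < 4; i++)              /* the uadd8/sel pair does this without a loop */
      if (((x >> (8 * i)) & 0xff) == 0)
        flags |= 0xffu << (8 * i);           /* 0xff per matching byte, else 00 */
    return flags;                            /* probe with CHARTSTMASK(i) */
  }
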
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memchr_arm
+ .type __memchr_arm,%function
+__memchr_arm:
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+ @ At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
+15:
+ ldmia r0!,{r5,r6}
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 @ chained... bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r2, 40f @ 0 length or hit the end already then not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the end is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
+
+ .size __memchr_arm, . - __memchr_arm
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
new file mode 100644
index 000000000000..86e64938edb1
--- /dev/null
+++ b/string/arm/memcpy.S
@@ -0,0 +1,587 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+
+ */
+
+#include "../asmdefs.h"
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ENTRY (__memcpy_arm)
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bhs L(cpy_not_short)
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+L(tail63unaligned):
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+L(cpy_not_short):
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #7
+ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne L(cpy_notaligned)
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring SRC and DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blo L(tail63aligned)
+
+ cmp tmp2, #512
+ bhs L(cpy_body_long)
+
+L(cpy_body_medium): /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bhs 1b
+ tst tmp2, #0x3f
+ beq L(done)
+
+L(tail63aligned): /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bhs 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+L(tail63aligned): /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+L(done):
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+L(cpy_body_long): /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blo 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bhs 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b L(cpy_body_medium)
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne L(tail63aligned)
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
+
+L(cpy_notaligned):
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrlo tmp2, [sp], #FRAME_SIZE
+ blo L(tail63unaligned)
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ blo 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bhs 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne L(tail63unaligned)
+ bx lr
+
+END (__memcpy_arm)
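
The long-copy loops above keep a full line of data in flight so that loads run well ahead of the stores (in VFP registers via cpy_line_vfp, or in GP register pairs in the LDRD/STRD loop). A rough portable sketch of that software-pipelined structure, assuming 64-byte blocks and using illustrative names:

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Assumes nblocks >= 1 and a multiple-of-64 copy; purely illustrative.  */
  static void copy_pipelined (uint8_t *dst, const uint8_t *src, size_t nblocks)
  {
    uint8_t line[64];
    memcpy (line, src, 64);                      /* prime: first load in flight */
    for (size_t i = 1; i < nblocks; i++)
      {
        uint8_t next[64];
        memcpy (next, src + 64 * i, 64);         /* load ahead ...              */
        memcpy (dst + 64 * (i - 1), line, 64);   /* ... while storing the prior */
        memcpy (line, next, 64);
      }
    memcpy (dst + 64 * (nblocks - 1), line, 64); /* drain the pipeline */
  }
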
diff --git a/string/arm/memset.S b/string/arm/memset.S
new file mode 100644
index 000000000000..11e927368fd1
--- /dev/null
+++ b/string/arm/memset.S
@@ -0,0 +1,98 @@
+/*
+ * memset - fill memory with a constant
+ *
+ * Copyright (c) 2010-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memset routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.
+
+ */
+
+ .syntax unified
+ .arch armv7-a
+
+@ 2011-08-30 david.gilbert@linaro.org
+@ Extracted from local git 2f11b436
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global __memset_arm
+ .type __memset_arm,%function
+__memset_arm:
+ @ r0 = address
+ @ r1 = character
+ @ r2 = count
+ @ returns original address in r0
+
+ mov r3, r0 @ Leave r0 alone
+ cbz r2, 10f @ Exit if 0 length
+
+ tst r0, #7
+ beq 2f @ Already aligned
+
+ @ Ok, so we're misaligned here
+1:
+ strb r1, [r3], #1
+ subs r2,r2,#1
+ tst r3, #7
+ cbz r2, 10f @ Exit if we hit the end
+ bne 1b @ go round again if still misaligned
+
+2:
+ @ OK, so we're aligned
+ push {r4,r5,r6,r7}
+ bics r4, r2, #15 @ if less than 16 bytes then need to finish it off
+ beq 5f
+
+3:
+ @ POSIX says that ch is cast to an unsigned char. A uxtb is a
+ @ two-byte instruction that takes two cycles, where an AND immediate
+ @ is four bytes but only one cycle.
+ and r1, #0xFF
+ orr r1, r1, r1, lsl#8 @ Same character into all bytes
+ orr r1, r1, r1, lsl#16
+ mov r5,r1
+ mov r6,r1
+ mov r7,r1
+
+4:
+ subs r4,r4,#16
+ stmia r3!,{r1,r5,r6,r7}
+ bne 4b
+ and r2,r2,#15
+
+ @ At this point we're still aligned and have up to 15 bytes left to write;
+ @ we can avoid some of the byte-at-a-time work by testing for bigger chunks
+ tst r2,#8
+ itt ne
+ subne r2,r2,#8
+ stmiane r3!,{r1,r5}
+
+5:
+ pop {r4,r5,r6,r7}
+ cbz r2, 10f
+
+ @ Got to do any last < alignment bytes
+6:
+ subs r2,r2,#1
+ strb r1,[r3],#1
+ bne 6b
+
+10:
+ bx lr @ goodbye
+ .size __memset_arm, . - __memset_arm
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
new file mode 100644
index 000000000000..b75d4143db57
--- /dev/null
+++ b/string/arm/strcmp-armv6m.S
@@ -0,0 +1,117 @@
+/*
+ * strcmp for ARMv6-M (optimized for performance, not size)
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+
+ .thumb_func
+ .syntax unified
+ .arch armv6-m
+
+ .macro DoSub n, label
+ subs r0, r0, r1
+#ifdef __ARM_BIG_ENDIAN
+ lsrs r1, r4, \n
+#else
+ lsls r1, r4, \n
+#endif
+ orrs r1, r0
+ bne \label
+ .endm
+
+ .macro Byte_Test n, label
+ lsrs r0, r2, \n
+ lsrs r1, r3, \n
+ DoSub \n, \label
+ .endm
+
+ENTRY_ALIGN (__strcmp_armv6m, 4)
+ mov r2, r0
+ push {r4, r5, r6, lr}
+ orrs r2, r1
+ lsls r2, r2, #30
+ bne 6f
+ ldr r5, =0x01010101
+ lsls r6, r5, #7
+1:
+ ldmia r0!, {r2}
+ ldmia r1!, {r3}
+ subs r4, r2, r5
+ bics r4, r2
+ ands r4, r6
+ beq 3f
+
+#ifdef __ARM_BIG_ENDIAN
+ Byte_Test #24, 4f
+ Byte_Test #16, 4f
+ Byte_Test #8, 4f
+
+ b 7f
+3:
+ cmp r2, r3
+ beq 1b
+ cmp r2, r3
+#else
+ uxtb r0, r2
+ uxtb r1, r3
+ DoSub #24, 2f
+
+ uxth r0, r2
+ uxth r1, r3
+ DoSub #16, 2f
+
+ lsls r0, r2, #8
+ lsls r1, r3, #8
+ lsrs r0, r0, #8
+ lsrs r1, r1, #8
+ DoSub #8, 2f
+
+ lsrs r0, r2, #24
+ lsrs r1, r3, #24
+ subs r0, r0, r1
+2:
+ pop {r4, r5, r6, pc}
+
+3:
+ cmp r2, r3
+ beq 1b
+ rev r0, r2
+ rev r1, r3
+ cmp r0, r1
+#endif
+
+ bls 5f
+ movs r0, #1
+4:
+ pop {r4, r5, r6, pc}
+5:
+ movs r0, #0
+ mvns r0, r0
+ pop {r4, r5, r6, pc}
+6:
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ bne 7f
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ beq 6b
+7:
+ subs r0, r2, r3
+ pop {r4, r5, r6, pc}
+
+END (__strcmp_armv6m)
+
+#endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
new file mode 100644
index 000000000000..51443e343058
--- /dev/null
+++ b/string/arm/strcmp.S
@@ -0,0 +1,475 @@
+/*
+ * strcmp for ARMv7
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+ available. Use ldrd to support wider loads, provided the data
+ is sufficiently aligned. Use saturating arithmetic to optimize
+ the compares. */
+
+#include "../asmdefs.h"
+
+/* Build Options:
+ STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+ byte in the string. If comparing completely random strings
+ the pre-check will save time, since there is a very high
+ probability of a mismatch in the first character: we save
+ significant overhead if this is the common case. However,
+ if strings are likely to be identical (eg because we're
+ verifying a hit in a hash table), then this check is largely
+ if strings are likely to be identical (e.g. because we're
+
+#define STRCMP_NO_PRECHECK 0
+
+ /* This version uses Thumb-2 code. */
+ .thumb
+ .syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not __ARM_BIG_ENDIAN */
+
+/* Parameters and result. */
+#define src1 r0
+#define src2 r1
+#define result r0 /* Overlaps src1. */
+
+/* Internal variables. */
+#define tmp1 r4
+#define tmp2 r5
+#define const_m1 r12
+
+/* Additional internal variables for 64-bit aligned data. */
+#define data1a r2
+#define data1b r3
+#define data2a r6
+#define data2b r7
+#define syndrome_a tmp1
+#define syndrome_b tmp2
+
+/* Additional internal variables for 32-bit aligned data. */
+#define data1 r2
+#define data2 r3
+#define syndrome tmp2
+
+
+ /* Macro to compute and return the result value for word-aligned
+ cases. */
+ .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+ /* If data1 contains a zero byte, then syndrome will contain a 1 in
+ bit 7 of that byte. Otherwise, the highest set bit in the
+ syndrome will highlight the first different bit. It is therefore
+ sufficient to extract the eight bits starting with the syndrome
+ bit. */
+ clz tmp1, \synd
+ lsl r1, \d2, tmp1
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsl \d1, \d1, tmp1
+ .cfi_remember_state
+ lsr result, \d1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1, lsr #24
+ bx lr
+#else
+ /* To use the big-endian trick we'd have to reverse all three words;
+ that's slower than this approach. */
+ rev \synd, \synd
+ clz tmp1, \synd
+ bic tmp1, tmp1, #7
+ lsr r1, \d2, tmp1
+ .cfi_remember_state
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsr \d1, \d1, tmp1
+ and result, \d1, #255
+ and r1, r1, #255
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1
+
+ bx lr
+#endif
+ .endm
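
The macro above turns the syndrome into a return value with clz and shifts; what it computes is equivalent to scanning the word for the first NUL-or-differing byte and returning the byte difference. A hedged little-endian C sketch of that semantics (illustrative, not the code the macro expands to):

  #include <stdint.h>

  static inline int strcmp_word_result_le (uint32_t data1, uint32_t data2)
  {
    for (int i = 0; i < 4; i++)                /* byte 0 is the lowest-addressed */
      {
        unsigned b1 = (data1 >> (8 * i)) & 0xff;
        unsigned b2 = (data2 >> (8 * i)) & 0xff;
        if (b1 != b2 || b1 == 0)               /* first difference or end of string */
          return (int) b1 - (int) b2;
      }
    return 0;                                  /* whole word equal, no NUL */
  }
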
+
+ .p2align 5
+L(strcmp_start_addr):
+#if STRCMP_NO_PRECHECK == 0
+L(fastpath_exit):
+ sub r0, r2, r3
+ bx lr
+ nop
+#endif
+ENTRY_ALIGN (__strcmp_arm, 0)
+#if STRCMP_NO_PRECHECK == 0
+ ldrb r2, [src1]
+ ldrb r3, [src2]
+ cmp r2, #1
+ it cs
+ cmpcs r2, r3
+ bne L(fastpath_exit)
+#endif
+ strd r4, r5, [sp, #-16]!
+ .cfi_def_cfa_offset 16
+ .cfi_offset 4, -16
+ .cfi_offset 5, -12
+ orr tmp1, src1, src2
+ strd r6, r7, [sp, #8]
+ .cfi_offset 6, -8
+ .cfi_offset 7, -4
+ mvn const_m1, #0
+ lsl r2, tmp1, #29
+ cbz r2, L(loop_aligned8)
+
+L(not_aligned):
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ bne L(misaligned8)
+
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ and tmp1, src1, #7
+ bic src1, src1, #7
+ and tmp2, tmp1, #3
+ bic src2, src2, #7
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ ldrd data1a, data1b, [src1], #16
+ tst tmp1, #4
+ ldrd data2a, data2b, [src2], #16
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp2
+ orn data1a, data1a, tmp1
+ orn data2a, data2a, tmp1
+ beq L(start_realigned8)
+ orn data1b, data1b, tmp1
+ mov data1a, const_m1
+ orn data2b, data2b, tmp1
+ mov data2a, const_m1
+ b L(start_realigned8)
+
+ /* Unroll the inner loop by a factor of 2, giving 16 bytes per
+ pass. */
+ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
+ .p2align 2 /* Always word aligned. */
+L(loop_aligned8):
+ ldrd data1a, data1b, [src1], #16
+ ldrd data2a, data2b, [src2], #16
+L(start_realigned8):
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ cbnz syndrome_a, L(diff_in_a)
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ cbnz syndrome_b, L(diff_in_b)
+
+ ldrd data1a, data1b, [src1, #-8]
+ ldrd data2a, data2b, [src2, #-8]
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ /* Can't use CBZ for backwards branch. */
+ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+ beq L(loop_aligned8)
+
+L(diff_found):
+ cbnz syndrome_a, L(diff_in_a)
+
+L(diff_in_b):
+ strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+L(diff_in_a):
+ .cfi_restore_state
+ strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+ .cfi_restore_state
+L(misaligned8):
+ tst tmp1, #3
+ bne L(misaligned4)
+ ands tmp1, src1, #3
+ bne L(mutual_align4)
+
+ /* Unrolled by a factor of 2, to reduce the number of post-increment
+ operations. */
+L(loop_aligned4):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned4):
+ uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cbnz syndrome, L(aligned4_done)
+ ldr data1, [src1, #-4]
+ ldr data2, [src2, #-4]
+ uadd8 syndrome, data1, const_m1
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cmp syndrome, #0
+ beq L(loop_aligned4)
+
+L(aligned4_done):
+ strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+L(mutual_align4):
+ .cfi_restore_state
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ lsl tmp1, tmp1, #3 /* Bytes -> bits. */
+ bic src1, src1, #3
+ ldr data1, [src1], #8
+ bic src2, src2, #3
+ ldr data2, [src2], #8
+
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp1
+ orn data1, data1, tmp1
+ orn data2, data2, tmp1
+ b L(start_realigned4)
+
+L(misaligned4):
+ ands tmp1, src1, #3
+ beq L(src1_aligned)
+ sub src2, src2, tmp1
+ bic src1, src1, #3
+ lsls tmp1, tmp1, #31
+ ldr data1, [src1], #4
+ beq L(aligned_m2)
+ bcs L(aligned_m1)
+
+#if STRCMP_NO_PRECHECK == 1
+ ldrb data2, [src2, #1]
+ uxtb tmp1, data1, ror #BYTE1_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
+
+L(aligned_m2):
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
+
+L(aligned_m1):
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ add src2, src2, #4
+ cbnz data2, L(src1_aligned)
+#else /* STRCMP_NO_PRECHECK */
+ /* If we've done the pre-check, then we don't need to check the
+ first byte again here. */
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
+
+L(aligned_m2):
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne L(misaligned_exit)
+ cbnz data2, L(aligned_m1)
+#endif
+
+L(misaligned_exit):
+ .cfi_remember_state
+ mov result, tmp1
+ ldr r4, [sp], #16
+ .cfi_restore 4
+ bx lr
+
+#if STRCMP_NO_PRECHECK == 0
+L(aligned_m1):
+ add src2, src2, #4
+#endif
+L(src1_aligned):
+ .cfi_restore_state
+ /* src1 is word aligned, but src2 has no common alignment
+ with it. */
+ ldr data1, [src1], #4
+ lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
+
+ bic src2, src2, #3
+ ldr data2, [src2], #4
+ bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */
+
+ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
+L(overlap3):
+ bic tmp1, data1, #MSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #8
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #24
+ bne 6f
+ ldr data1, [src1], #4
+ b L(overlap3)
+4:
+ S2LO data2, data2, #8
+ b L(strcmp_tail)
+
+5:
+ bics syndrome, syndrome, #MSB
+ bne L(strcmp_done_equal)
+
+ /* We can only get here if the MSB of data1 contains 0, so
+ fast-path the exit. */
+ ldrb result, [src2]
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 Not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ neg result, result
+ bx lr
+
+6:
+ .cfi_restore_state
+ S2LO data1, data1, #24
+ and data2, data2, #LSB
+ b L(strcmp_tail)
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+L(overlap2):
+ and tmp1, data1, const_m1, S2LO #16
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #16
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #16
+ bne 6f
+ ldr data1, [src1], #4
+ b L(overlap2)
+4:
+ S2LO data2, data2, #16
+ b L(strcmp_tail)
+5:
+ ands syndrome, syndrome, const_m1, S2LO #16
+ bne L(strcmp_done_equal)
+
+ ldrh data2, [src2]
+ S2LO data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+ lsl data2, data2, #16
+#endif
+ b L(strcmp_tail)
+
+6:
+ S2LO data1, data1, #16
+ and data2, data2, const_m1, S2LO #16
+ b L(strcmp_tail)
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+L(overlap1):
+ and tmp1, data1, #LSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #24
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #8
+ bne 6f
+ ldr data1, [src1], #4
+ b L(overlap1)
+4:
+ S2LO data2, data2, #24
+ b L(strcmp_tail)
+5:
+ tst syndrome, #LSB
+ bne L(strcmp_done_equal)
+ ldr data2, [src2]
+6:
+ S2LO data1, data1, #8
+ bic data2, data2, #MSB
+ b L(strcmp_tail)
+
+L(strcmp_done_equal):
+ mov result, #0
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ bx lr
+
+L(strcmp_tail):
+ .cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+ rev data1, data1
+ rev data2, data2
+ /* Now everything looks big-endian... */
+#endif
+ uadd8 tmp1, data1, const_m1
+ eor tmp1, data1, data2
+ sel syndrome, tmp1, const_m1
+ clz tmp1, syndrome
+ lsl data1, data1, tmp1
+ lsl data2, data2, tmp1
+ lsr result, data1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ sub result, result, data2, lsr #24
+ bx lr
+
+END (__strcmp_arm)
+
+#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
new file mode 100644
index 000000000000..02cf94ff4be0
--- /dev/null
+++ b/string/arm/strcpy.c
@@ -0,0 +1,133 @@
+/*
+ * strcpy
+ *
+ * Copyright (c) 2008-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if defined (__thumb2__) && !defined (__thumb__)
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcpy
+*/
+
+#ifdef __thumb2__
+#define magic1(REG) "#0x01010101"
+#define magic2(REG) "#0x80808080"
+#else
+#define magic1(REG) #REG
+#define magic2(REG) #REG ", lsl #7"
+#endif
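
The two magic constants implement the usual NUL-in-word test that the inline assembly below performs with sub/bics/tst. A hedged C equivalent, purely illustrative:

  #include <stdint.h>

  /* Nonzero iff some byte of 'word' is 0x00: subtracting 0x01010101 borrows
     out of any zero byte, and masking with ~word & 0x80808080 discards bytes
     whose own top bit was already set, leaving a nonzero result exactly when
     a NUL byte is present.  */
  static inline int word_has_nul (uint32_t word)
  {
    return ((word - 0x01010101u) & ~word & 0x80808080u) != 0;
  }
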
+
+char* __attribute__((naked))
+__strcpy_arm (char* dst, const char* src)
+{
+ __asm__ (
+ "pld [r1, #0]\n\t"
+ "eor r2, r0, r1\n\t"
+ "mov ip, r0\n\t"
+ "tst r2, #3\n\t"
+ "bne 4f\n\t"
+ "tst r1, #3\n\t"
+ "bne 3f\n"
+ "5:\n\t"
+# ifndef __thumb2__
+ "str r5, [sp, #-4]!\n\t"
+ "mov r5, #0x01\n\t"
+ "orr r5, r5, r5, lsl #8\n\t"
+ "orr r5, r5, r5, lsl #16\n\t"
+# endif
+
+ "str r4, [sp, #-4]!\n\t"
+ "tst r1, #4\n\t"
+ "ldr r3, [r1], #4\n\t"
+ "beq 2f\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "streq r3, [ip], #4\n\t"
+ "ldreq r3, [r1], #4\n"
+ "bne 1f\n\t"
+ /* Inner loop. We now know that r1 is 64-bit aligned, so we
+ can safely fetch up to two words. This allows us to avoid
+ load stalls. */
+ ".p2align 2\n"
+ "2:\n\t"
+ "pld [r1, #8]\n\t"
+ "ldr r4, [r1], #4\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "sub r2, r4, "magic1(r5)"\n\t"
+ "bne 1f\n\t"
+ "str r3, [ip], #4\n\t"
+ "bics r2, r2, r4\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "ldreq r3, [r1], #4\n\t"
+ "streq r4, [ip], #4\n\t"
+ "beq 2b\n\t"
+ "mov r3, r4\n"
+ "1:\n\t"
+# ifdef __ARMEB__
+ "rors r3, r3, #24\n\t"
+# endif
+ "strb r3, [ip], #1\n\t"
+ "tst r3, #0xff\n\t"
+# ifdef __ARMEL__
+ "ror r3, r3, #8\n\t"
+# endif
+ "bne 1b\n\t"
+ "ldr r4, [sp], #4\n\t"
+# ifndef __thumb2__
+ "ldr r5, [sp], #4\n\t"
+# endif
+ "BX LR\n"
+
+ /* Strings have the same offset from word alignment, but it's
+ not zero. */
+ "3:\n\t"
+ "tst r1, #1\n\t"
+ "beq 1f\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "it eq\n"
+ "BXEQ LR\n"
+ "1:\n\t"
+ "tst r1, #2\n\t"
+ "beq 5b\n\t"
+ "ldrh r2, [r1], #2\n\t"
+# ifdef __ARMEB__
+ "tst r2, #0xff00\n\t"
+ "iteet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "lsreq r2, r2, #8\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff\n\t"
+# else
+ "tst r2, #0xff\n\t"
+ "itet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff00\n\t"
+# endif
+ "bne 5b\n\t"
+ "BX LR\n"
+
+ /* src and dst do not have a common word alignment. Fall back to
+ byte copying. */
+ "4:\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 4b\n\t"
+ "BX LR");
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
+
+#endif /* defined (__thumb2__) && !defined (__thumb__) */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
new file mode 100644
index 000000000000..5ad30c941586
--- /dev/null
+++ b/string/arm/strlen-armv6t2.S
@@ -0,0 +1,124 @@
+/*
+ * strlen - calculate the length of a string
+ *
+ * Copyright (c) 2010-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+
+/*
+ Assumes:
+ ARMv6T2, AArch32
+
+ */
+
+#include "../asmdefs.h"
+
+#ifdef __ARMEB__
+#define S2LO lsl
+#define S2HI lsr
+#else
+#define S2LO lsr
+#define S2HI lsl
+#endif
+
+ /* This code requires Thumb. */
+ .thumb
+ .syntax unified
+
+/* Parameters and result. */
+#define srcin r0
+#define result r0
+
+/* Internal variables. */
+#define src r1
+#define data1a r2
+#define data1b r3
+#define const_m1 r12
+#define const_0 r4
+#define tmp1 r4 /* Overlaps const_0 */
+#define tmp2 r5
+
+ENTRY (__strlen_armv6t2)
+ pld [srcin, #0]
+ strd r4, r5, [sp, #-8]!
+ bic src, srcin, #7
+ mvn const_m1, #0
+ ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
+ pld [src, #32]
+ bne.w L(misaligned8)
+ mov const_0, #0
+ mov result, #-8
+L(loop_aligned):
+ /* Bytes 0-7. */
+ ldrd data1a, data1b, [src]
+ pld [src, #64]
+ add result, result, #8
+L(start_realigned):
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, L(null_found)
+
+ /* Bytes 8-15. */
+ ldrd data1a, data1b, [src, #8]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, L(null_found)
+
+ /* Bytes 16-23. */
+ ldrd data1a, data1b, [src, #16]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, L(null_found)
+
+ /* Bytes 24-31. */
+ ldrd data1a, data1b, [src, #24]
+ add src, src, #32
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cmp data1b, #0
+ beq L(loop_aligned)
+
+L(null_found):
+ cmp data1a, #0
+ itt eq
+ addeq result, result, #4
+ moveq data1a, data1b
+#ifndef __ARMEB__
+ rev data1a, data1a
+#endif
+ clz data1a, data1a
+ ldrd r4, r5, [sp], #8
+ add result, result, data1a, lsr #3 /* Bits -> Bytes. */
+ bx lr
+
+L(misaligned8):
+ ldrd data1a, data1b, [src]
+ and tmp2, tmp1, #3
+ rsb result, tmp1, #0
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ tst tmp1, #4
+ pld [src, #64]
+ S2HI tmp2, const_m1, tmp2
+ orn data1a, data1a, tmp2
+ itt ne
+ ornne data1b, data1b, tmp2
+ movne data1a, const_m1
+ mov const_0, #0
+ b L(start_realigned)
+
+END (__strlen_armv6t2)
+
+#endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */
diff --git a/string/asmdefs.h b/string/asmdefs.h
new file mode 100644
index 000000000000..340b427a505b
--- /dev/null
+++ b/string/asmdefs.h
@@ -0,0 +1,98 @@
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identification support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
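
For reference, the note GNU_PROPERTY emits has the standard ELF note layout; a hedged C view of the bytes (struct and field names are illustrative and not used anywhere in the build):

  #include <stdint.h>

  struct gnu_property_note
  {
    uint32_t namesz;       /* 4: length of "GNU" including the NUL */
    uint32_t descsz;       /* 16: size of the property descriptor below */
    uint32_t type;         /* 5: NT_GNU_PROPERTY_TYPE_0 */
    char     name[4];      /* "GNU" */
    uint32_t pr_type;      /* FEATURE_1_AND */
    uint32_t pr_datasz;    /* 4 */
    uint32_t pr_data;      /* FEATURE_1_BTI | FEATURE_1_PAC */
    uint32_t pr_padding;   /* 0: pad the descriptor to 8-byte alignment */
  };
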
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc;
+
+#endif
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
new file mode 100644
index 000000000000..d5d4ea7e0309
--- /dev/null
+++ b/string/bench/memcpy.c
@@ -0,0 +1,260 @@
+/*
+ * memcpy benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 5000
+#define ITERS2 20000000
+#define ITERS3 500000
+#define MAX_COPIES 8192
+#define SIZE (256*1024)
+
+static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
+} funtab[] =
+{
+ F(memcpy)
+#if __aarch64__
+ F(__memcpy_aarch64)
+# if __ARM_NEON
+ F(__memcpy_aarch64_simd)
+# endif
+#elif __arm__
+ F(__memcpy_arm)
+#endif
+#undef F
+ {0, 0}
+};
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM-1)
+static uint8_t size_arr[SIZE_NUM];
+
+/* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. */
+static freq_data_t size_freq[] =
+{
+{32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035},
+{ 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721},
+{120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460},
+{ 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303},
+{ 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185},
+{192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96},
+{104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68},
+{ 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47},
+{ 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35},
+{ 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22},
+{ 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15},
+{ 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11},
+{ 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9},
+{136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8},
+{273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6},
+{504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3},
+{512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2},
+{ 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2},
+{ 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1},
+{248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1},
+{ 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1},
+{ 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1},
+{ 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1},
+{2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1},
+{ 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1},
+{122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1},
+{984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1},
+{116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1},
+{100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1},
+{488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1},
+{ 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM-1)
+static uint8_t src_align_arr[ALIGN_NUM];
+static uint8_t dst_align_arr[ALIGN_NUM];
+
+/* Source alignment frequency for memcpy based on SPEC2017. */
+static align_data_t src_align_freq[] =
+{
+ {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0}
+};
+
+static align_data_t dst_align_freq[] =
+{
+ {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0}
+};
+
+typedef struct
+{
+ uint64_t src : 24;
+ uint64_t dst : 24;
+ uint64_t len : 16;
+} copy_t;
+
+static copy_t copy[MAX_COPIES];
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+init_copy_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = size_freq[i].freq) != 0; i++)
+ for (j = 0, size = size_freq[i].size; j < freq; j++)
+ size_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = src_align_freq[i].align; j < freq; j++)
+ src_align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+
+ for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = dst_align_freq[i].align; j < freq; j++)
+ dst_align_arr[n++] = size - 1;
+ assert (n == ALIGN_NUM);
+}
+
+static size_t
+init_copies (size_t max_size)
+{
+ size_t total = 0;
+ /* Create a random set of copies with the given size and alignment
+ distributions. */
+ for (int i = 0; i < MAX_COPIES; i++)
+ {
+ copy[i].dst = (rand32 (0) & (max_size - 1));
+ copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].src = (rand32 (0) & (max_size - 1));
+ copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
+ copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
+ total += copy[i].len;
+ }
+
+ return total;
+}
+
+int main (void)
+{
+ init_copy_distribution ();
+
+ memset (a, 1, sizeof (a));
+ memset (b, 2, sizeof (b));
+
+  printf ("Random memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t total = 0;
+ uint64_t tsum = 0;
+ printf ("%22s (B/ns) ", funtab[f].name);
+ rand32 (0x12345678);
+
+ for (int size = 16384; size <= SIZE; size *= 2)
+ {
+ size_t copy_size = init_copies (size) * ITERS;
+
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < MAX_COPIES; c++)
+ funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
+ t = clock_get_ns () - t;
+ total += copy_size;
+ tsum += t;
+ printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ }
+      printf ("avg %.2f\n", (double)total / tsum);
+ }
+
+ printf ("\nMedium memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 16; size <= 512; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nLarge memcpy:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (b, a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nUnaligned forwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a, a + 256 + (i & 31), size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nUnaligned backwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a + 256 + (i & 31), a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ return 0;
+}
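
Editor's note: the benchmark above draws its copy sizes and alignments by weighted sampling. init_copy_distribution expands each (value, frequency) pair into that many entries of a power-of-two lookup table, and init_copies then draws from the table with a single masked rand32 index. A minimal standalone sketch of the scheme, with an invented three-entry distribution (not SPEC2017 data) and assuming string/include is on the include path:

/* Weighted-sampling sketch: expand (value, freq) pairs into a table whose
   length is a power of two, then draw with rand32 () & TAB_MASK.  The
   frequencies are invented and sum to TAB_NUM.  */
#include <assert.h>
#include <stdint.h>
#include "benchlib.h"

#define TAB_NUM 8
#define TAB_MASK (TAB_NUM - 1)

static uint8_t tab[TAB_NUM];

int
main (void)
{
  static const struct { uint8_t value; uint16_t freq; } dist[] =
    { {16, 5}, {32, 2}, {64, 1}, {0, 0} };

  int n = 0;
  for (int i = 0; dist[i].freq != 0; i++)
    for (int j = 0; j < dist[i].freq; j++)
      tab[n++] = dist[i].value;
  assert (n == TAB_NUM);

  rand32 (0x12345678);
  /* Each draw yields 16 with probability 5/8, 32 with 2/8 and 64 with 1/8.  */
  uint8_t sample = tab[rand32 (0) & TAB_MASK];
  return sample == 0;
}

Because the table length is a power of two, the masked index is uniform over the table, so each value comes out with probability freq / TAB_NUM.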
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 000000000000..cc0f04bee547
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strlen, 0)
+#if __aarch64__
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+ F(__strlen_armv6t2, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017. */
+static freq_data_t strlen_len_freq[] =
+{
+ { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115},
+ { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418},
+ { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79},
+ { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21},
+ { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9},
+ { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5},
+ { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2},
+ { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1},
+ { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1},
+ { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1},
+ { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1},
+ {107, 1}, { 0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017. */
+static align_data_t string_align_freq[] =
+{
+ {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+ for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+ strlen_len_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+ strlen_align_arr[n++] = size;
+ assert (n == ALIGN_NUM);
+}
+
+static void
+init_strlen_tests (void)
+{
+ uint16_t index[MAX_ALIGN];
+
+ memset (a, 'x', sizeof (a));
+
+ /* Create indices for strings at all alignments. */
+ for (int i = 0; i < MAX_ALIGN; i++)
+ {
+ index[i] = i * (MAX_STRLEN + 1);
+ a[index[i] + MAX_STRLEN] = 0;
+ }
+
+  /* Create a random set of strlen input strings using the string length
+     and alignment distributions.  MAX_STRLEN + 1 is 1 modulo MAX_ALIGN, so
+     the offset chosen below gives a string with the sampled alignment and
+     exactly exp_len bytes before its terminator.  */
+ for (int n = 0; n < NUM_STRLEN; n++)
+ {
+ int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+ int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+
+ strlen_tests[n] =
+ index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+ }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+ rand32 (0x12345678);
+ init_strlen_distribution ();
+ init_strlen_tests ();
+
+ printf ("\nRandom strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t res = 0, strlen_size = 0, mask = maskv;
+ printf ("%22s ", funtab[f].name);
+
+ for (int c = 0; c < NUM_STRLEN; c++)
+ strlen_size += funtab[f].fun (a + strlen_tests[c]);
+ strlen_size *= ITERS;
+
+      /* Measure strlen latency: mask is always zero, but the compiler
+         cannot know that, so (res & mask) makes each call depend on the
+         previous result.  */
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_STRLEN; c++)
+ res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+ t = clock_get_ns () - t;
+ printf ("%.2f\n", (double)strlen_size / t);
+ }
+
+ printf ("\nSmall aligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nSmall unaligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ int align = 9;
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a + align, 'x', size);
+ a[align + size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a + align);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nMedium strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 128; size <= 4096; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\n");
+
+ return 0;
+}
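
Editor's note: in init_strlen_tests above, the start offset index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len relies on MAX_STRLEN + 1 = 257 being congruent to 1 modulo MAX_ALIGN: stepping back exp_len bytes from the terminator at index[k] + MAX_STRLEN lands on a pointer congruent to align. A throwaway check of that arithmetic, using the same constants as the benchmark:

/* Throwaway check of the start-offset arithmetic used in init_strlen_tests;
   the constants match the benchmark above.  */
#include <assert.h>

int
main (void)
{
  enum { MAX_ALIGN = 32, MAX_STRLEN = 256 };
  for (int align = 0; align < MAX_ALIGN; align++)
    for (int exp_len = 0; exp_len <= 128; exp_len++)
      {
        int k = (align + exp_len) & (MAX_ALIGN - 1);
        int offset = k * (MAX_STRLEN + 1) + MAX_STRLEN - exp_len;
        /* The terminator sits at k * (MAX_STRLEN + 1) + MAX_STRLEN, so the
           string is exp_len bytes long and starts at the sampled alignment.  */
        assert (offset % MAX_ALIGN == align);
      }
  return 0;
}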
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
new file mode 100644
index 000000000000..0f2ce2eb6bce
--- /dev/null
+++ b/string/include/benchlib.h
@@ -0,0 +1,33 @@
+/*
+ * Benchmark support functions.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <time.h>
+
+/* Fast and accurate timer returning nanoseconds. */
+static inline uint64_t
+clock_get_ns (void)
+{
+ struct timespec ts;
+ clock_gettime (CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
+}
+
+/* Fast 32-bit random number generator. Passing a non-zero seed
+ value resets the internal state. */
+static inline uint32_t
+rand32 (uint32_t seed)
+{
+ static uint64_t state = 0xb707be451df0bb19ULL;
+ if (seed != 0)
+ state = seed;
+ uint32_t res = state >> 32;
+ state = state * 6364136223846793005ULL + 1;
+ return res;
+}
+
+
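
Editor's note: these two helpers are the whole timing harness. rand32 is a truncated 64-bit linear congruential generator (it returns the top 32 bits of the state), so reseeding it with the same non-zero value replays the same sequence, and clock_get_ns brackets a fixed amount of work. A minimal usage sketch with a placeholder workload, assuming string/include is on the include path:

/* Timing-loop sketch using the helpers above.  */
#include <stdio.h>
#include <stdint.h>
#include "benchlib.h"

int
main (void)
{
  rand32 (0x12345678);   /* non-zero seed: reproducible sequence */

  uint64_t sum = 0;
  uint64_t t = clock_get_ns ();
  for (int i = 0; i < 1000000; i++)
    sum += rand32 (0);   /* placeholder workload */
  t = clock_get_ns () - t;

  printf ("%.2f ns/iter (checksum %llu)\n", (double) t / 1000000,
          (unsigned long long) sum);
  return 0;
}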
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
new file mode 100644
index 000000000000..378c3cd2d645
--- /dev/null
+++ b/string/include/stringlib.h
@@ -0,0 +1,69 @@
+/*
+ * Public API.
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stddef.h>
+
+/* restrict is not needed, but it is kept to document the interface contract. */
+#ifndef __restrict
+# define __restrict
+#endif
+
+#if __aarch64__
+void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64 (void *, const void *, size_t);
+void *__memset_aarch64 (void *, int, size_t);
+void *__memchr_aarch64 (const void *, int, size_t);
+void *__memrchr_aarch64 (const void *, int, size_t);
+int __memcmp_aarch64 (const void *, const void *, size_t);
+char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64 (char *__restrict, const char *__restrict);
+int __strcmp_aarch64 (const char *, const char *);
+char *__strchr_aarch64 (const char *, int);
+char *__strrchr_aarch64 (const char *, int);
+char *__strchrnul_aarch64 (const char *, int );
+size_t __strlen_aarch64 (const char *);
+size_t __strnlen_aarch64 (const char *, size_t);
+int __strncmp_aarch64 (const char *, const char *, size_t);
+void * __memchr_aarch64_mte (const void *, int, size_t);
+char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__strchr_aarch64_mte (const char *, int);
+char * __strchrnul_aarch64_mte (const char *, int );
+size_t __strlen_aarch64_mte (const char *);
+char *__strrchr_aarch64_mte (const char *, int);
+int __strcmp_aarch64_mte (const char *, const char *);
+int __strncmp_aarch64_mte (const char *, const char *, size_t);
+#if __ARM_NEON
+void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_simd (void *, const void *, size_t);
+#endif
+# if __ARM_FEATURE_SVE
+void *__memchr_aarch64_sve (const void *, int, size_t);
+int __memcmp_aarch64_sve (const void *, const void *, size_t);
+char *__strchr_aarch64_sve (const char *, int);
+char *__strrchr_aarch64_sve (const char *, int);
+char *__strchrnul_aarch64_sve (const char *, int );
+int __strcmp_aarch64_sve (const char *, const char *);
+char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict);
+size_t __strlen_aarch64_sve (const char *);
+size_t __strnlen_aarch64_sve (const char *, size_t);
+int __strncmp_aarch64_sve (const char *, const char *, size_t);
+# endif
+# if __ARM_FEATURE_MEMORY_TAGGING
+void *__mtag_tag_region (void *, size_t);
+void *__mtag_tag_zero_region (void *, size_t);
+# endif
+#elif __arm__
+void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
+void *__memset_arm (void *, int, size_t);
+void *__memchr_arm (const void *, int, size_t);
+char *__strcpy_arm (char *__restrict, const char *__restrict);
+int __strcmp_arm (const char *, const char *);
+int __strcmp_armv6m (const char *, const char *);
+size_t __strlen_armv6t2 (const char *);
+#endif
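
Editor's note: a caller-side sketch of this header (the copy wrapper below is hypothetical, not part of the library). The arch-specific entry points take the same arguments as their libc counterparts, so a build can select one at compile time and fall back to libc elsewhere; linking against the library itself is assumed on AArch64.

/* Hypothetical wrapper selecting an arch-specific routine at compile time.  */
#include <assert.h>
#include <string.h>
#include "stringlib.h"

static void *
copy (void *dst, const void *src, size_t n)
{
#if __aarch64__
  return __memcpy_aarch64 (dst, src, n);   /* declared in stringlib.h */
#else
  return memcpy (dst, src, n);
#endif
}

int
main (void)
{
  char dst[8];
  copy (dst, "abc", 4);
  assert (strcmp (dst, "abc") == 0);
  return 0;
}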
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
new file mode 100644
index 000000000000..d8c02d92d626
--- /dev/null
+++ b/string/test/__mtag_tag_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+ /* Print tag, untag and quote the context. */
+ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+ untag_buffer (p, len, 1);
+ p = untag_pointer (p);
+ quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+ F(__mtag_tag_region)
+#endif
+ {0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 64
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + 2 * A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a';
+
+ src = tag_buffer (src, len + 2 * A, 1);
+ s = src + salign;
+ /* Use different tag. */
+ s = __arm_mte_increment_tag (s, 1);
+ p = fun->fun (s, len);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got head", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < salign + len; i++)
+ {
+ if (s[i - salign] != 'a')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got body", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < len + 2 * A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got tail", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ untag_buffer (src, len + 2 * A, 1);
+}
+
+int
+main ()
+{
+ if (!mte_enabled ())
+ return 0;
+
+ sbuf = mte_mmap (LEN + 3 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s += 16)
+ {
+ int n;
+ for (n = 0; n < 200; n += 16)
+ {
+ test (funtab + i, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, n);
+ }
+ }
+ printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
+#else
+int
+main ()
+{
+ return 0;
+}
+#endif
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
new file mode 100644
index 000000000000..221c223a2f31
--- /dev/null
+++ b/string/test/__mtag_tag_zero_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_zero_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+ /* Print tag, untag and quote the context. */
+ printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+ untag_buffer (p, len, 1);
+ p = untag_pointer (p);
+ quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+ F(__mtag_tag_zero_region)
+#endif
+ {0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 64
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + 2 * A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i % 23;
+
+ src = tag_buffer (src, len + 2 * A, 1);
+ s = src + salign;
+ /* Use different tag. */
+ s = __arm_mte_increment_tag (s, 1);
+ p = fun->fun (s, len);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got head", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < salign + len; i++)
+ {
+ if (s[i - salign] != 0)
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got body", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ for (; i < len + 2 * A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+ mtag_quoteat ("got tail", src, len + 2 * A, i);
+ return;
+ }
+ }
+
+ untag_buffer (src, len + 2 * A, 1);
+}
+
+int
+main ()
+{
+ if (!mte_enabled ())
+ return 0;
+
+ sbuf = mte_mmap (LEN + 3 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s += 16)
+ {
+ int n;
+ for (n = 0; n < 200; n += 16)
+ {
+ test (funtab + i, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, n);
+ }
+ }
+ printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
+#else
+int
+main ()
+{
+ return 0;
+}
+#endif
diff --git a/string/test/memchr.c b/string/test/memchr.c
new file mode 100644
index 000000000000..0ff77f5710bf
--- /dev/null
+++ b/string/test/memchr.c
@@ -0,0 +1,110 @@
+/*
+ * memchr test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (const void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memchr, 0)
+#if __aarch64__
+ F(__memchr_aarch64, 0)
+ F(__memchr_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__memchr_aarch64_sve, 1)
+# endif
+#elif __arm__
+ F(__memchr_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t seekpos, size_t len,
+ size_t maxlen)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos < maxlen ? s + seekpos : NULL;
+ int seekchar = 1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos > LEN || align > ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = seekchar;
+ for (int i = 0; i <= ALIGN; i++)
+ s[len + i] = seekchar;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[seekpos] = seekchar;
+ s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar;
+
+ int mte_len = seekpos != -1 ? seekpos + 1 : maxlen;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ p = fun->fun (s, seekchar, maxlen);
+ untag_buffer (s, mte_len, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
+ seekchar, maxlen, p, f);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < LEN; sp++)
+ test (funtab + i, a, sp, n, n);
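+	    /* Also check a huge maxlen; the first match is just past the
+	       n data bytes.  */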
+ test (funtab + i, a, n, n, SIZE_MAX - a);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
new file mode 100644
index 000000000000..7a7cf9cff35a
--- /dev/null
+++ b/string/test/memcmp.c
@@ -0,0 +1,125 @@
+/*
+ * memcmp test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ int (*fun) (const void *s1, const void *s2, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memcmp, 0)
+#if __aarch64__
+ F(__memcmp_aarch64, 1)
+# if __ARM_FEATURE_SVE
+ F(__memcmp_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *s1buf;
+static unsigned char *s2buf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
+ int delta)
+{
+ unsigned char *src1 = alignup (s1buf);
+ unsigned char *src2 = alignup (s2buf);
+ unsigned char *s1 = src1 + s1align;
+ unsigned char *s2 = src2 + s2align;
+ int r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+
+ s1 = tag_buffer (s1, len, fun->test_mte);
+ s2 = tag_buffer (s2, len, fun->test_mte);
+ r = fun->fun (s1, s2, len);
+ untag_buffer (s1, len, fun->test_mte);
+ untag_buffer (s2, len, fun->test_mte);
+
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
+ s1align, s2align, len, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
+}
+
+int
+main ()
+{
+ s1buf = mte_mmap (LEN + 2 * A);
+ s2buf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0);
+ test (funtab + i, d, s, 1, -1, 0);
+ test (funtab + i, d, s, 1, 0, -1);
+ test (funtab + i, d, s, 1, 0, 1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, 0, -1);
+ test (funtab + i, d, s, n, n - 1, -1);
+ test (funtab + i, d, s, n, n / 2, 1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n / 2, -1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
new file mode 100644
index 000000000000..ce0ceeef5ee8
--- /dev/null
+++ b/string/test/memcpy.c
@@ -0,0 +1,120 @@
+/*
+ * memcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *, const void *, size_t);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memcpy, 0)
+#if __aarch64__
+ F(__memcpy_aarch64, 1)
+# if __ARM_NEON
+ F(__memcpy_aarch64_simd, 1)
+# endif
+#elif __arm__
+ F(__memcpy_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *dbuf;
+static unsigned char *sbuf;
+static unsigned char wbuf[LEN + 2 * A];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = alignup (dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i % 23;
+
+ s = tag_buffer (s, len, fun->test_mte);
+ d = tag_buffer (d, len, fun->test_mte);
+ p = fun->fun (d, s, len);
+ untag_buffer (s, len, fun->test_mte);
+ untag_buffer (d, len, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
+ }
+ }
+}
+
+int
+main ()
+{
+ dbuf = mte_mmap (LEN + 2 * A);
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ test (funtab + i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test (funtab + i, d, s, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memmove.c b/string/test/memmove.c
new file mode 100644
index 000000000000..689b68c98af2
--- /dev/null
+++ b/string/test/memmove.c
@@ -0,0 +1,164 @@
+/*
+ * memmove test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *, const void *, size_t);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memmove, 0)
+#if __aarch64__
+ F(__memmove_aarch64, 1)
+# if __ARM_NEON
+ F(__memmove_aarch64_simd, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *dbuf;
+static unsigned char *sbuf;
+static unsigned char wbuf[LEN + 2 * A];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = alignup (dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i % 23;
+
+ p = fun->fun (d, s, len);
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
+ }
+ }
+}
+
+static void
+test_overlap (const struct fun *fun, int dalign, int salign, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *dst = src;
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = wbuf + dalign;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= A || salign >= A)
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src[i] = want[i] = '?';
+
+ for (int i = 0; i < len; i++)
+ s[i] = want[salign + i] = 'a' + i % 23;
+ for (int i = 0; i < len; i++)
+ w[i] = s[i];
+
+ s = tag_buffer (s, len, fun->test_mte);
+ d = tag_buffer (d, len, fun->test_mte);
+ p = fun->fun (d, s, len);
+ untag_buffer (s, len, fun->test_mte);
+ untag_buffer (d, len, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (int i = 0; i < len + A; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
+ len);
+ quoteat ("got", dst, len + A, i);
+ quoteat ("want", want, len + A, i);
+ break;
+ }
+ }
+}
+
+int
+main ()
+{
+ dbuf = mte_mmap (LEN + 2 * A);
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ {
+ test (funtab + i, d, s, n);
+ test_overlap (funtab + i, d, s, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n);
+ test_overlap (funtab + i, d, s, n);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memrchr.c b/string/test/memrchr.c
new file mode 100644
index 000000000000..adf96f049cc9
--- /dev/null
+++ b/string/test/memrchr.c
@@ -0,0 +1,106 @@
+/*
+ * memrchr test.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (const void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memrchr, 0)
+#if __aarch64__
+ F(__memrchr_aarch64, 1)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t seekpos, size_t len,
+ size_t maxlen)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos < maxlen ? s + seekpos : NULL;
+ int seekchar = 1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos > LEN || align > ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = seekchar;
+ for (int i = 0; i <= ALIGN; i++)
+ s[len + i] = seekchar;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[seekpos] = seekchar;
+ s[((len ^ align) & 1) && seekpos < maxlen ? seekpos - 1 : len] = seekchar;
+
+ s = tag_buffer (s, maxlen, fun->test_mte);
+ p = fun->fun (s, seekchar, maxlen);
+ untag_buffer (s, maxlen, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
+ seekchar, maxlen, p, f);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < LEN; sp++)
+ test (funtab + i, a, sp, n, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/memset.c b/string/test/memset.c
new file mode 100644
index 000000000000..f1721442dbaf
--- /dev/null
+++ b/string/test/memset.c
@@ -0,0 +1,129 @@
+/*
+ * memset test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ void *(*fun) (void *s, int c, size_t n);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(memset, 0)
+#if __aarch64__
+ F(__memset_aarch64, 1)
+#elif __arm__
+ F(__memset_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static unsigned char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int salign, int c, int len)
+{
+ unsigned char *src = alignup (sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || salign >= A)
+ abort ();
+ for (i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i % 23;
+
+ s = tag_buffer (s, len, fun->test_mte);
+ p = fun->fun (s, c, len);
+ untag_buffer (s, len, fun->test_mte);
+
+ if (p != s)
+ ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+
+ for (i = 0; i < salign; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
+ }
+ }
+ for (; i < salign + len; i++)
+ {
+ if (src[i] != (unsigned char) c)
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
+ }
+ }
+ for (; i < len + A; i++)
+ {
+ if (src[i] != '?')
+ {
+ ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
+ quoteat ("got", src, len + A, i);
+ return;
+ }
+ }
+}
+
+int
+main ()
+{
+ sbuf = mte_mmap (LEN + 2 * A);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ for (n = 0; n < 100; n++)
+ {
+ test (funtab + i, s, 0, n);
+ test (funtab + i, s, 0x25, n);
+ test (funtab + i, s, 0xaa25, n);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, s, 0, n);
+ test (funtab + i, s, 0x25, n);
+ test (funtab + i, s, 0xaa25, n);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/mte.h b/string/test/mte.h
new file mode 100644
index 000000000000..e67cbd9d2d40
--- /dev/null
+++ b/string/test/mte.h
@@ -0,0 +1,142 @@
+/*
+ * Memory tagging testing code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __TEST_MTE_H
+#define __TEST_MTE_H
+
+#include <stdlib.h>
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <arm_acle.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+// These definitions depend on a kernel ABI that has not yet been merged.
+#define PR_SET_TAGGED_ADDR_CTRL 55
+#define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+#define PR_MTE_TCF_SHIFT 1
+#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT)
+#define PR_MTE_TAG_SHIFT 3
+#define PROT_MTE 0x20
+
+#define MTE_GRANULE_SIZE 16
+
+int
+mte_enabled ()
+{
+ static int enabled = -1;
+ if (enabled == -1)
+ {
+ int res = prctl (PR_SET_TAGGED_ADDR_CTRL,
+ PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC
+ | (0xfffe << PR_MTE_TAG_SHIFT),
+ 0, 0, 0);
+ enabled = (res == 0);
+ }
+ return enabled;
+}
+
+static void *
+mte_mmap (size_t size)
+{
+ if (mte_enabled ())
+ {
+ return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ }
+ else
+ {
+ return malloc (size);
+ }
+}
+
+void *
+alignup_mte (void *p)
+{
+ return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1)
+ & ~(MTE_GRANULE_SIZE - 1));
+}
+
+void *
+aligndown_mte (void *p)
+{
+ return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1));
+}
+
+void *
+untag_pointer (void *p)
+{
+ return (void *) ((unsigned long long) p & (~0ULL >> 8));
+}
+
+void
+tag_buffer_helper (void *p, int len)
+{
+ char *ptr = p;
+ char *end = alignup_mte (ptr + len);
+ ptr = aligndown_mte (p);
+ for (; ptr < end; ptr += MTE_GRANULE_SIZE)
+ {
+ __arm_mte_set_tag (ptr);
+ }
+}
+
+void *
+tag_buffer (void *p, int len, int test_mte)
+{
+ if (test_mte && mte_enabled ())
+ {
+ p = __arm_mte_increment_tag (p, 1);
+ tag_buffer_helper (p, len);
+ }
+ return p;
+}
+
+void *
+untag_buffer (void *p, int len, int test_mte)
+{
+ p = untag_pointer (p);
+ if (test_mte && mte_enabled ())
+ {
+ tag_buffer_helper (p, len);
+ }
+ return p;
+}
+
+#else // __ARM_FEATURE_MEMORY_TAGGING
+int
+mte_enabled ()
+{
+ return 0;
+}
+static void *
+mte_mmap (size_t size)
+{
+ return malloc (size);
+}
+void *
+tag_buffer (void *p, int len, int test_mte)
+{
+ (void) len;
+ (void) test_mte;
+ return p;
+}
+void *
+untag_buffer (void *p, int len, int test_mte)
+{
+ (void) len;
+ (void) test_mte;
+ return p;
+}
+void *
+untag_pointer (void *p)
+{
+ return p;
+}
+#endif // __ARM_FEATURE_MEMORY_TAGGING
+
+#endif
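
Editor's note: the string tests in this directory all wrap the routine under test in the same pattern built from these helpers: tag exactly the bytes the routine may access, call it, then restore the buffer tag and strip the tag bits from any returned pointer before checking it, so an out-of-bounds access faults when MTE is enabled. A compressed sketch of that pattern (plain strlen stands in for a routine under test; this header is assumed to be on the include path):

/* Sketch of the tag/call/untag pattern used by the tests.  */
#include <string.h>
#include "mte.h"

int
main (void)
{
  char *buf = mte_mmap (64);
  memcpy (buf, "hello", 6);

  /* Give the 6 accessible bytes a fresh tag ...  */
  char *s = tag_buffer (buf, 6, 1);
  size_t n = strlen (s);
  /* ... then restore the old tag before any further untagged access.  */
  untag_buffer (s, 6, 1);

  return n == 5 ? 0 : 1;
}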
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
new file mode 100644
index 000000000000..1827e68c9a30
--- /dev/null
+++ b/string/test/stpcpy.c
@@ -0,0 +1,125 @@
+/*
+ * stpcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (char *dest, const char *src);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(stpcpy, 0)
+#if __aarch64__
+ F(__stpcpy_aarch64, 0)
+ F(__stpcpy_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__stpcpy_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *dbuf;
+static char *sbuf;
+static char wbuf[LEN + 3 * ALIGN];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup (sbuf);
+ char *dst = alignup (dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
+ abort ();
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + salign) & 1 ? 1 : 0;
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + (i & 31);
+ s[len] = w[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ d = tag_buffer (d, len + 1, fun->test_mte);
+ p = fun->fun (d, s);
+ untag_buffer (s, len + 1, fun->test_mte);
+ untag_buffer (d, len + 1, fun->test_mte);
+
+ if (p != d + len)
+ ERR ("%s (%p,..) returned %p expected %p\n", fun->name, d, p, d + len);
+
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s (align %d, align %d, %d) failed\n",
+ fun->name, dalign, salign, len);
+ quoteat ("got", dst, len + ALIGN, i);
+ quoteat ("want", want, len + ALIGN, i);
+ break;
+ }
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ dbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < ALIGN; d++)
+ for (int s = 0; s < ALIGN; s++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, d, s, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strchr.c b/string/test/strchr.c
new file mode 100644
index 000000000000..f3ae982ef0ad
--- /dev/null
+++ b/string/test/strchr.c
@@ -0,0 +1,121 @@
+/*
+ * strchr test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strchr, 0)
+#if __aarch64__
+ F(__strchr_aarch64, 0)
+ F(__strchr_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strchr_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ if (seekpos != -1 && (len + align) & 1)
+ s[seekpos + 1] = seekchar;
+ s[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, len + 1, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
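+  /* Searching for the terminator must return a pointer to it.  */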
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, f, len);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
new file mode 100644
index 000000000000..6c30ab2123f1
--- /dev/null
+++ b/string/test/strchrnul.c
@@ -0,0 +1,126 @@
+/*
+ * strchrnul test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strchrnul, 0)
+#if __aarch64__
+ F(__strchrnul_aarch64, 0)
+ F(__strchrnul_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strchrnul_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : s + len;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ if (seekpos != -1 && (len + align) & 1)
+ s[seekpos + 1] = seekchar;
+ s[len] = '\0';
+
+ int mte_len = seekpos != -1 ? seekpos + 1 : len + 1;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, mte_len, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, f, len);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
new file mode 100644
index 000000000000..d57b54ed50a8
--- /dev/null
+++ b/string/test/strcmp.c
@@ -0,0 +1,132 @@
+/*
+ * strcmp test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ int (*fun) (const char *s1, const char *s2);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strcmp, 0)
+#if __aarch64__
+ F(__strcmp_aarch64, 0)
+ F(__strcmp_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strcmp_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+ F(__strcmp_arm, 0)
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+ F(__strcmp_armv6m, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static char *s1buf;
+static char *s2buf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
+ int delta)
+{
+ char *src1 = alignup (s1buf);
+ char *src2 = alignup (s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
+
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+ s1[len] = s2[len] = '\0';
+
+ s1 = tag_buffer (s1, len + 1, fun->test_mte);
+ s2 = tag_buffer (s2, len + 1, fun->test_mte);
+ r = fun->fun (s1, s2);
+ untag_buffer (s1, len + 1, fun->test_mte);
+ untag_buffer (s2, len + 1, fun->test_mte);
+
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
+ s1align, s2align, len, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
+}
+
+int
+main ()
+{
+ s1buf = mte_mmap (LEN + 2 * A + 1);
+ s2buf = mte_mmap (LEN + 2 * A + 1);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
+ test (funtab + i, d, s, 0, -1, 0);
+ test (funtab + i, d, s, 1, -1, 0);
+ test (funtab + i, d, s, 1, 0, 1);
+ test (funtab + i, d, s, 1, 0, -1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n - 1, -1);
+ test (funtab + i, d, s, n, n / 2, 1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, 0);
+ test (funtab + i, d, s, n, n / 2, -1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
new file mode 100644
index 000000000000..e84cace9c8c6
--- /dev/null
+++ b/string/test/strcpy.c
@@ -0,0 +1,123 @@
+/*
+ * strcpy test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (char *dest, const char *src);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strcpy, 0)
+#if __aarch64__
+ F(__strcpy_aarch64, 0)
+ F(__strcpy_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strcpy_aarch64_sve, 1)
+# endif
+#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
+ F(__strcpy_arm, 0)
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *dbuf;
+static char *sbuf;
+static char wbuf[LEN + 3 * ALIGN];
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int dalign, int salign, int len)
+{
+ char *src = alignup (sbuf);
+ char *dst = alignup (dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
+ abort ();
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + salign) & 1 ? 1 : 0;
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + (i & 31);
+ s[len] = w[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ d = tag_buffer (d, len + 1, fun->test_mte);
+ p = fun->fun (d, s);
+ untag_buffer (s, len + 1, fun->test_mte);
+ untag_buffer (d, len + 1, fun->test_mte);
+
+ if (p != d)
+ ERR ("%s (%p,..) returned %p\n", fun->name, d, p);
+
+ for (i = 0; i < len + ALIGN; i++)
+ {
+ if (dst[i] != want[i])
+ {
+ ERR ("%s (align %d, align %d, %d) failed\n",
+ fun->name, dalign, salign, len);
+ quoteat ("got", dst, len + ALIGN, i);
+ quoteat ("want", want, len + ALIGN, i);
+ break;
+ }
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ dbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < ALIGN; d++)
+ for (int s = 0; s < ALIGN; s++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, d, s, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
new file mode 100644
index 000000000000..fe855fc21736
--- /dev/null
+++ b/string/test/stringtest.h
@@ -0,0 +1,55 @@
+/*
+ * Common string test code.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+/* Accounting errors for a test case. */
+static int err_count;
+#define ERR_LIMIT 10
+#define ERR(...) (err_count++, printf (__VA_ARGS__))
+
+static inline void
+quotechar (unsigned char c)
+{
+ if (isprint (c))
+ putchar (c);
+ else
+ printf ("\\x%02x", c);
+}
+
+/* Print quoted context around position at, or the entire string if at < 0. */
+static void
+quoteat (const char *prefix, const void *p, int len, int at)
+{
+ static const int CTXLEN = 15;
+ int i;
+ const char *pre = "\"";
+ const char *post = "\"";
+ const char *s = p;
+ if (at > CTXLEN)
+ {
+ s += at - CTXLEN;
+ len -= at - CTXLEN;
+ pre = "...\"";
+ }
+ if (at >= 0 && len > 2 * CTXLEN + 1)
+ {
+ len = 2 * CTXLEN + 1;
+ post = "\"...";
+ }
+ printf ("%4s: %s", prefix, pre);
+ for (i = 0; i < len; i++)
+ quotechar (s[i]);
+ printf ("%s\n", post);
+}
+
+static inline void
+quote (const char *prefix, const void *p, int len)
+{
+ quoteat (prefix, p, len, -1);
+}
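
Editor's note: a small sketch of how the tests report a mismatch with these helpers (the got/want buffers are invented, not taken from any test above):

/* Report the first differing byte with surrounding context.  */
#include "stringtest.h"

int
main (void)
{
  const char got[] = "hello worXd";
  const char want[] = "hello world";

  for (int i = 0; i < (int) sizeof (got); i++)
    if (got[i] != want[i])
      {
        ERR ("mismatch at offset %d\n", i);
        quoteat ("got", got, sizeof (got), i);
        quoteat ("want", want, sizeof (want), i);
        break;
      }
  return err_count ? 1 : 0;
}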
diff --git a/string/test/strlen.c b/string/test/strlen.c
new file mode 100644
index 000000000000..6278380f26df
--- /dev/null
+++ b/string/test/strlen.c
@@ -0,0 +1,103 @@
+/*
+ * strlen test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strlen, 0)
+#if __aarch64__
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+ F(__strlen_armv6t2, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ size_t r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + align) & 1 ? 1 : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ r = fun->fun (s);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (r != len)
+ {
+ ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len);
+ quote ("input", src, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ test (funtab + i, a, n);
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
new file mode 100644
index 000000000000..018a8a431ab8
--- /dev/null
+++ b/string/test/strncmp.c
@@ -0,0 +1,139 @@
+/*
+ * strncmp test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ int (*fun) (const char *, const char *, size_t);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strncmp, 0)
+#if __aarch64__
+ F(__strncmp_aarch64, 0)
+ F(__strncmp_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strncmp_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define A 32
+#define LEN 250000
+static char *s1buf;
+static char *s2buf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+
+static void
+test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos,
+ int len, int delta)
+{
+ char *src1 = alignup (s1buf);
+ char *src2 = alignup (s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort ();
+ if (diffpos >= len)
+ abort ();
+ if ((diffpos < 0) != (delta == 0))
+ abort ();
+
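+  /* Fill both buffers with '?' padding, write the same repeating pattern
+     into s1 and s2, then inject a single difference of delta at diffpos
+     in s1 (if requested) and terminate both strings.  */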
+ for (int i = 0; i < len + A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i % 23;
+ if (delta)
+ s1[diffpos] += delta;
+ s1[len] = s2[len] = '\0';
+
+ size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
+ s1 = tag_buffer (s1, mte_len, fun->test_mte);
+ s2 = tag_buffer (s2, mte_len, fun->test_mte);
+ r = fun->fun (s1, s2, maxlen);
+ untag_buffer (s1, mte_len, fun->test_mte);
+ untag_buffer (s2, mte_len, fun->test_mte);
+
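+  /* A difference at or beyond maxlen lies outside the compared region,
+     so the strings must compare equal up to maxlen.  */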
+ if (diffpos >= maxlen)
+ {
+ diffpos = -1;
+ delta = 0;
+ }
+ if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
+ {
+ ERR (
+ "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n",
+ fun->name, s1align, s2align, maxlen, len, diffpos, r);
+ quoteat ("src1", src1, len + A, diffpos);
+ quoteat ("src2", src2, len + A, diffpos);
+ }
+}
+
+int
+main (void)
+{
+ s1buf = mte_mmap (LEN + 2 * A + 1);
+ s2buf = mte_mmap (LEN + 2 * A + 1);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++)
+ {
+ int n;
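+	    /* Edge cases around empty and one-byte strings first, then
+	       every length up to 100 with a difference at the midpoint,
+	       then doubling lengths up to LEN.  */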
+ test (funtab + i, d, s, 0, -1, 0, 0);
+ test (funtab + i, d, s, 1, -1, 0, 0);
+ test (funtab + i, d, s, 0, -1, 1, 0);
+ test (funtab + i, d, s, 1, -1, 1, 0);
+ test (funtab + i, d, s, 2, -1, 1, 0);
+ test (funtab + i, d, s, 1, 0, 1, 1);
+ test (funtab + i, d, s, 1, 0, 1, -1);
+ for (n = 2; n < 100; n++)
+ {
+ test (funtab + i, d, s, n, -1, n, 0);
+ test (funtab + i, d, s, n, n / 2, n, 1);
+ test (funtab + i, d, s, n / 2, -1, n, 0);
+ test (funtab + i, d, s, n / 2, n / 2, n, -1);
+ }
+ for (; n < LEN; n *= 2)
+ {
+ test (funtab + i, d, s, n, -1, n, 0);
+ test (funtab + i, d, s, n, n / 2, n, -1);
+ test (funtab + i, d, s, n / 2, -1, n, 0);
+ test (funtab + i, d, s, n / 2, n / 2, n, 1);
+ }
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
new file mode 100644
index 000000000000..0dea00eaf8e3
--- /dev/null
+++ b/string/test/strnlen.c
@@ -0,0 +1,109 @@
+/*
+ * strnlen test.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s, size_t m);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strnlen, 0)
+#if __aarch64__
+ F(__strnlen_aarch64, 1)
+# if __ARM_FEATURE_SVE
+ F(__strnlen_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, size_t maxlen, size_t len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ size_t r;
+ size_t e = maxlen < len ? maxlen : len;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || align >= ALIGN)
+ abort ();
+
+ for (int i = 0; src + i < s; i++)
+ src[i] = 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (len + align) & 1 ? 1 : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ s[len] = 0;
+ if ((len + align) & 1)
+ s[e + 1] = 0;
+
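+  /* Tag only the bytes strnlen may legally access: up to maxlen, or up to
+     and including the terminator, whichever comes first.  */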
+ size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
+ s = tag_buffer (s, mte_len, fun->test_mte);
+ r = fun->fun (s, maxlen);
+ untag_buffer (s, mte_len, fun->test_mte);
+
+ if (r != e)
+ {
+ ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n",
+ fun->name, s, maxlen, len, r, e);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int maxlen = 0; maxlen < LEN; maxlen++)
+ test (funtab + i, a, maxlen, n);
+ test (funtab + i, a, SIZE_MAX - a, n);
+ }
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
new file mode 100644
index 000000000000..fedbdc52fcc1
--- /dev/null
+++ b/string/test/strrchr.c
@@ -0,0 +1,121 @@
+/*
+ * strrchr test.
+ *
+ * Copyright (c) 2019-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ char *(*fun) (const char *s, int c);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strrchr, 0)
+#if __aarch64__
+ F(__strrchr_aarch64, 0)
+ F(__strrchr_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strrchr_aarch64_sve, 1)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+#define ALIGN 32
+#define LEN 512
+static char *sbuf;
+
+static void *
+alignup (void *p)
+{
+ return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+}
+
+static void
+test (const struct fun *fun, int align, int seekpos, int len)
+{
+ char *src = alignup (sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
+
+ if (err_count >= ERR_LIMIT)
+ return;
+ if (len > LEN || seekpos >= len || align >= ALIGN)
+ abort ();
+
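+  /* Surround the string with seekchar bytes to catch out-of-bounds reads,
+     then place seekchar both at seekpos and at an earlier position so the
+     last occurrence, not the first, must be reported.  */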
+ for (int i = 0; src + i < s; i++)
+ src[i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 1; i <= ALIGN; i++)
+ s[len + i] = (i + len) & 1 ? seekchar : 0;
+ for (int i = 0; i < len; i++)
+ s[i] = 'a' + (i & 31);
+ if (seekpos != -1)
+ s[seekpos / 2] = s[seekpos] = seekchar;
+ if (seekpos > 0 && (len + align) & 1)
+ s[seekpos - 1] = seekchar;
+ s[len] = '\0';
+
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, seekchar);
+ untag_buffer (s, len + 1, fun->test_mte);
+ p = untag_pointer (p);
+
+ if (p != f)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, seekchar, len, p, f, seekpos);
+ quote ("input", s, len);
+ }
+
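+  /* Searching for the terminator itself: strrchr (s, 0) must return a
+     pointer to the trailing NUL at s + len.  */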
+ s = tag_buffer (s, len + 1, fun->test_mte);
+ p = fun->fun (s, 0);
+ untag_buffer (s, len + 1, fun->test_mte);
+
+ if (p != s + len)
+ {
+ ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
+ fun->name, s, 0, len, p, s + len, len);
+ quote ("input", s, len);
+ }
+}
+
+int
+main (void)
+{
+ sbuf = mte_mmap (LEN + 3 * ALIGN);
+ int r = 0;
+ for (int i = 0; funtab[i].name; i++)
+ {
+ err_count = 0;
+ for (int a = 0; a < ALIGN; a++)
+ for (int n = 0; n < LEN; n++)
+ {
+ for (int sp = 0; sp < n; sp++)
+ test (funtab + i, a, sp, n);
+ test (funtab + i, a, -1, n);
+ }
+
+ char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
+ printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
+ if (err_count)
+ r = -1;
+ }
+ return r;
+}
diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S
new file mode 100644
index 000000000000..26ade0a0c7db
--- /dev/null
+++ b/string/x86_64/check-arch.S
@@ -0,0 +1,10 @@
+/*
+ * check ARCH setting.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if !__x86_64__
+# error ARCH setting does not match the compiler.
+#endif