Diffstat (limited to 'string')
72 files changed, 9253 insertions, 0 deletions
diff --git a/string/Dir.mk b/string/Dir.mk new file mode 100644 index 000000000000..cf3453f7580d --- /dev/null +++ b/string/Dir.mk @@ -0,0 +1,113 @@ +# Makefile fragment - requires GNU make +# +# Copyright (c) 2019-2021, Arm Limited. +# SPDX-License-Identifier: MIT + +S := $(srcdir)/string +B := build/string + +ifeq ($(ARCH),) +all-string bench-string check-string install-string clean-string: + @echo "*** Please set ARCH in config.mk. ***" + @exit 1 +else + +string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS]) +string-test-srcs := $(wildcard $(S)/test/*.c) +string-bench-srcs := $(wildcard $(S)/bench/*.c) + +string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) + +string-libs := \ + build/lib/libstringlib.so \ + build/lib/libstringlib.a \ + +string-tests := \ + build/bin/test/memcpy \ + build/bin/test/memmove \ + build/bin/test/memset \ + build/bin/test/memchr \ + build/bin/test/memrchr \ + build/bin/test/memcmp \ + build/bin/test/__mtag_tag_region \ + build/bin/test/__mtag_tag_zero_region \ + build/bin/test/strcpy \ + build/bin/test/stpcpy \ + build/bin/test/strcmp \ + build/bin/test/strchr \ + build/bin/test/strrchr \ + build/bin/test/strchrnul \ + build/bin/test/strlen \ + build/bin/test/strnlen \ + build/bin/test/strncmp + +string-benches := \ + build/bin/bench/memcpy \ + build/bin/bench/strlen + +string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs))) +string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs))) +string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs))) + +string-objs := \ + $(string-lib-objs) \ + $(string-lib-objs:%.o=%.os) \ + $(string-test-objs) \ + $(string-bench-objs) + +string-files := \ + $(string-objs) \ + $(string-libs) \ + $(string-tests) \ + $(string-benches) \ + $(string-includes) \ + +all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes) + +$(string-objs): $(string-includes) +$(string-objs): CFLAGS_ALL += $(string-cflags) + +$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE + +build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os) + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ + +build/lib/libstringlib.a: $(string-lib-objs) + rm -f $@ + $(AR) rc $@ $^ + $(RANLIB) $@ + +build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/include/%.h: $(S)/include/%.h + cp $< $@ + +build/bin/%.sh: $(S)/test/%.sh + cp $< $@ + +string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out) + +build/string/test/%.out: build/bin/test/% + $(EMULATOR) $^ | tee $@.tmp + mv $@.tmp $@ + +check-string: $(string-tests-out) + ! grep FAIL $^ + +bench-string: $(string-benches) + $(EMULATOR) build/bin/bench/strlen + $(EMULATOR) build/bin/bench/memcpy + +install-string: \ + $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ + $(string-includes:build/include/%=$(DESTDIR)$(includedir)/%) + +clean-string: + rm -f $(string-files) +endif + +.PHONY: all-string bench-string check-string install-string clean-string diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S new file mode 100644 index 000000000000..84339f73cf23 --- /dev/null +++ b/string/aarch64/__mtag_tag_region.S @@ -0,0 +1,100 @@ +/* + * __mtag_tag_region - tag memory + * + * Copyright (c) 2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, MTE, LP64 ABI. + * + * Interface contract: + * Address is 16 byte aligned and size is multiple of 16. + * Returns the passed pointer. + * The memory region may remain untagged if tagging is not enabled. + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_MEMORY_TAGGING + +#define dstin x0 +#define count x1 +#define dst x2 +#define dstend x3 +#define tmp x4 +#define zva_val x4 + +ENTRY (__mtag_tag_region) + PTR_ARG (0) + SIZE_ARG (1) + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + + tbnz count, 6, L(set96) + + /* Set 0, 16, 32, or 48 bytes. */ + lsr tmp, count, 5 + add tmp, dstin, tmp, lsl 4 + cbz count, L(end) + stg dstin, [dstin] + stg dstin, [tmp] + stg dstin, [dstend, -16] +L(end): + ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + st2g dstin, [dstin] + st2g dstin, [dstin, 32] + st2g dstin, [dstend, -32] + ret + + .p2align 4 + /* Size is > 96 bytes. */ +L(set_long): + cmp count, 160 + b.lo L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + st2g dstin, [dstin] + st2g dstin, [dstin, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc gva, dst + subs count, count, 64 + b.hi L(zva_loop) + st2g dstin, [dstend, -64] + st2g dstin, [dstend, -32] + ret + +L(no_zva): + sub dst, dstin, 32 /* Dst is biased by -32. */ + sub count, count, 64 /* Adjust count for loop. */ +L(no_zva_loop): + st2g dstin, [dst, 32] + st2g dstin, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + st2g dstin, [dstend, -64] + st2g dstin, [dstend, -32] + ret + +END (__mtag_tag_region) +#endif diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S new file mode 100644 index 000000000000..f58364ca6fcb --- /dev/null +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -0,0 +1,100 @@ +/* + * __mtag_tag_zero_region - tag memory and fill it with zero bytes + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, MTE, LP64 ABI. + * + * Interface contract: + * Address is 16 byte aligned and size is multiple of 16. + * Returns the passed pointer. + * The memory region may remain untagged if tagging is not enabled. + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_MEMORY_TAGGING + +#define dstin x0 +#define count x1 +#define dst x2 +#define dstend x3 +#define tmp x4 +#define zva_val x4 + +ENTRY (__mtag_tag_zero_region) + PTR_ARG (0) + SIZE_ARG (1) + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + + tbnz count, 6, L(set96) + + /* Set 0, 16, 32, or 48 bytes. */ + lsr tmp, count, 5 + add tmp, dstin, tmp, lsl 4 + cbz count, L(end) + stzg dstin, [dstin] + stzg dstin, [tmp] + stzg dstin, [dstend, -16] +L(end): + ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + stz2g dstin, [dstin] + stz2g dstin, [dstin, 32] + stz2g dstin, [dstend, -32] + ret + + .p2align 4 + /* Size is > 96 bytes. */ +L(set_long): + cmp count, 160 + b.lo L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. 
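Editorial note: both __mtag_tag_region and __mtag_tag_zero_region take the DC GVA/GZVA block path only after the DCZID_EL0 check guarded by SKIP_ZVA_CHECK above. A minimal C sketch of an equivalent check follows, offered purely as an illustration and not part of this patch; the helper name is invented.

    #include <stdint.h>

    /* Illustrative only: mirrors "mrs zva_val, dczid_el0; and zva_val, zva_val, 31;
       cmp zva_val, 4".  Bits 0-3 (BS) of DCZID_EL0 give log2 of the block size in
       4-byte words, so BS == 4 means 64 bytes; a set DZP bit (bit 4) also makes the
       masked value differ from 4 and so disables the fast path.  */
    static int
    zva_block_is_64_bytes (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      return (dczid & 31) == 4;
    }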
*/ + b.ne L(no_zva) +#endif + stz2g dstin, [dstin] + stz2g dstin, [dstin, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc gzva, dst + subs count, count, 64 + b.hi L(zva_loop) + stz2g dstin, [dstend, -64] + stz2g dstin, [dstend, -32] + ret + +L(no_zva): + sub dst, dstin, 32 /* Dst is biased by -32. */ + sub count, count, 64 /* Adjust count for loop. */ +L(no_zva_loop): + stz2g dstin, [dst, 32] + stz2g dstin, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stz2g dstin, [dstend, -64] + stz2g dstin, [dstend, -32] + ret + +END (__mtag_tag_zero_region) +#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S new file mode 100644 index 000000000000..5a54242d7de6 --- /dev/null +++ b/string/aarch64/check-arch.S @@ -0,0 +1,13 @@ +/* + * check ARCH setting. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if !__aarch64__ +# error ARCH setting does not match the compiler. +#endif + +/* Include for GNU property notes. */ +#include "../asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S new file mode 100644 index 000000000000..c2e967d1004e --- /dev/null +++ b/string/aarch64/memchr-mte.S @@ -0,0 +1,116 @@ +/* + * memchr - find a character in a memory zone + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define cntin x2 +#define result x0 + +#define src x3 +#define cntrem x4 +#define synd x5 +#define shift x6 +#define tmp x7 +#define wtmp w7 + +#define vrepchr v0 +#define qdata q1 +#define vdata v1 +#define vhas_chr v2 +#define vrepmask v3 +#define vend v4 +#define dend d4 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__memchr_aarch64_mte) + PTR_ARG (0) + SIZE_ARG (2) + bic src, srcin, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + lsl shift, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) + + rbit synd, synd + clz synd, synd + add result, srcin, synd, lsr 2 + cmp cntin, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(start_loop): + sub tmp, src, srcin + add tmp, tmp, 16 + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 4 +L(loop32): + ldr qdata, [src, 16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) + +L(loop32_2): + ldr qdata, [src, 16]! 
+ subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + add tmp, srcin, cntin + sub cntrem, tmp, src +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + cmp cntrem, synd, lsr 2 + add result, src, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(nomatch): + mov result, 0 + ret + +END (__memchr_aarch64_mte) + diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S new file mode 100644 index 000000000000..c22e6596f19b --- /dev/null +++ b/string/aarch64/memchr-sve.S @@ -0,0 +1,64 @@ +/* + * memchr - find a character in a memory zone + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__memchr_aarch64_sve) + PTR_ARG (0) + SIZE_ARG (2) + dup z1.b, w1 /* duplicate c to a vector */ + setffr /* initialize FFR */ + mov x3, 0 /* initialize off */ + + .p2align 4 +0: whilelo p1.b, x3, x2 /* make sure off < max */ + b.none 9f + + /* Read a vector's worth of bytes, bounded by max, + stopping on first fault. */ + ldff1b z0.b, p1/z, [x0, x3] + rdffrs p0.b, p1/z + b.nlast 2f + + /* First fault did not fail: the vector bounded by max is valid. + Avoid depending on the contents of FFR beyond the branch. */ + incb x3 /* speculate increment */ + cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */ + b.none 0b + decb x3 /* undo speculate */ + + /* Found C. */ +1: brkb p2.b, p1/z, p2.b /* find the first c */ + add x0, x0, x3 /* form partial pointer */ + incp x0, p2.b /* form final pointer to c */ + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparision only on the valid bytes. */ +2: cmpeq p2.b, p0/z, z0.b, z1.b + b.any 1b + + /* No C found. Re-init FFR, increment, and loop. */ + setffr + incp x3, p0.b + b 0b + + /* Found end of count. */ +9: mov x0, 0 /* return null */ + ret + +END (__memchr_aarch64_sve) + +#endif + diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S new file mode 100644 index 000000000000..353f0d1eac53 --- /dev/null +++ b/string/aarch64/memchr.S @@ -0,0 +1,146 @@ +/* + * memchr - find a character in a memory zone + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include "../asmdefs.h" + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define src x3 +#define tmp x4 +#define wtmp2 w5 +#define synd x6 +#define soff x9 +#define cntrem x10 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_chr1 v3 +#define vhas_chr2 v4 +#define vrepmask v5 +#define vend v6 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits + * per byte. For each tuple, bit 0 is set if the relevant byte matched the + * requested character and bit 1 is not used (faster than using a 32bit + * syndrome). Since the bits in the syndrome reflect exactly the order in which + * things occur in the original string, counting trailing zeros allows to + * identify exactly which byte has matched. + */ + +ENTRY (__memchr_aarch64) + PTR_ARG (0) + SIZE_ARG (2) + /* Do not dereference srcin if no bytes to compare. 
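Editorial note: the syndrome schemes described above for __memchr_aarch64_mte (four bits per byte) and __memchr_aarch64 (two bits per byte) can be pictured with Neon intrinsics. Below is a sketch of the four-bit variant for one aligned 16-byte chunk on little-endian, an illustration only and not the full routine; the helper name is invented.

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Returns the offset of the first byte equal to c in a 16-byte chunk,
       or -1 if there is no match.  */
    static ptrdiff_t
    chunk_index_of (const uint8_t *p, uint8_t c)
    {
      uint8x16_t data = vld1q_u8 (p);
      uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c));     /* 0xff per matching byte */
      /* The 0xf00f mask keeps bits 0-3 in even bytes and bits 4-7 in odd bytes.  */
      uint8x16_t masked = vandq_u8 (eq, vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f)));
      /* Pairwise add merges adjacent bytes: 16 data bytes -> 8 syndrome bytes.  */
      uint8x16_t folded = vpaddq_u8 (masked, masked);
      uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (folded), 0);
      if (synd == 0)
        return -1;
      /* Four syndrome bits per input byte, so ctz/4 is the byte index.  */
      return __builtin_ctzll (synd) >> 2;
    }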
*/ + cbz cntin, L(zero_length) + /* + * Magic constant 0x40100401 allows us to identify which lane matches + * the requested byte. + */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + dup vrepmask.4s, wtmp2 + ands soff, srcin, #31 + and cntrem, cntin, #31 + b.eq L(loop) + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + sub tmp, soff, #32 + adds cntin, cntin, tmp + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.d[0] + /* Clear the soff*2 lower bits */ + lsl tmp, soff, #1 + lsr synd, synd, tmp + lsl synd, synd, tmp + /* The first block can also be the last */ + b.ls L(masklast) + /* Have we found something already? */ + cbnz synd, L(tail) + +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + subs cntin, cntin, #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* If we're out of data we finish regardless of the result */ + b.ls L(end) + /* Use a fast check for the termination condition */ + orr vend.16b, vhas_chr1.16b, vhas_chr2.16b + addp vend.2d, vend.2d, vend.2d + mov synd, vend.d[0] + /* We're not out of data, loop if we haven't found the character */ + cbz synd, L(loop) + +L(end): + /* Termination condition found, let's calculate the syndrome value */ + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.d[0] + /* Only do the clear for the last possible block */ + b.hs L(tail) + +L(masklast): + /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ + add tmp, cntrem, soff + and tmp, tmp, #31 + sub tmp, tmp, #32 + neg tmp, tmp, lsl #1 + lsl synd, synd, tmp + lsr synd, synd, tmp + +L(tail): + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Check that we have found a character */ + cmp synd, #0 + /* And count the leading zeros */ + clz synd, synd + /* Compute the potential result */ + add result, src, synd, lsr #1 + /* Select result or NULL */ + csel result, xzr, result, eq + ret + +L(zero_length): + mov result, #0 + ret + +END (__memchr_aarch64) + diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S new file mode 100644 index 000000000000..78c5ecaa4cdc --- /dev/null +++ b/string/aarch64/memcmp-sve.S @@ -0,0 +1,51 @@ +/* + * memcmp - compare memory + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__memcmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + mov x3, 0 /* initialize off */ + +0: whilelo p0.b, x3, x2 /* while off < max */ + b.none 9f + + ld1b z0.b, p0/z, [x0, x3] /* read vectors bounded by max. */ + ld1b z1.b, p0/z, [x1, x3] + + /* Increment for a whole vector, even if we've only read a partial. 
+ This is significantly cheaper than INCP, and since OFF is not + used after the loop it is ok to increment OFF past MAX. */ + incb x3 + + cmpne p1.b, p0/z, z0.b, z1.b /* while no inequalities */ + b.none 0b + + /* Found inequality. */ +1: brkb p1.b, p0/z, p1.b /* find first such */ + lasta w0, p1, z0.b /* extract each byte */ + lasta w1, p1, z1.b + sub x0, x0, x1 /* return comparison */ + ret + + /* Found end-of-count. */ +9: mov x0, 0 /* return equality */ + ret + +END (__memcmp_aarch64_sve) + +#endif + diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S new file mode 100644 index 000000000000..3b1026642eee --- /dev/null +++ b/string/aarch64/memcmp.S @@ -0,0 +1,137 @@ +/* memcmp - compare memory + * + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ + +#include "../asmdefs.h" + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 + +ENTRY (__memcmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
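Editorial note: the L(return) path of __memcmp_aarch64 above byte-reverses the differing words on little-endian so that a single unsigned comparison orders them by their first differing byte. A scalar C rendering of that step, as an illustration only (the helper name is invented):

    #include <stdint.h>

    /* After a byte reverse on little-endian, the first differing byte ends up
       in the most significant position, so an unsigned compare gives the sign
       of the result directly (cset/cneg in the assembly).  */
    static int
    final_compare (uint64_t data1, uint64_t data2)
    {
    #ifndef __AARCH64EB__
      data1 = __builtin_bswap64 (data1);
      data2 = __builtin_bswap64 (data2);
    #endif
      if (data1 == data2)
        return 0;
      return data1 < data2 ? -1 : 1;
    }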
*/ + b.eq L(byte_loop) + sub result, data1w, data2w + ret + +END (__memcmp_aarch64) + diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S new file mode 100644 index 000000000000..f97f2c3047b9 --- /dev/null +++ b/string/aarch64/memcpy-advsimd.S @@ -0,0 +1,206 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +#include "../asmdefs.h" + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_lw w10 +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_simd) +ENTRY (__memcpy_aarch64_simd) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. 
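Editorial note: the single unsigned compare used in L(copy_long) above to choose between the forward and backward loops is worth spelling out. A hedged C equivalent, illustration only, with an invented helper name:

    #include <stdint.h>
    #include <stddef.h>

    /* Mirrors "sub tmp1, dstin, src; cmp tmp1, count; b.lo L(copy_long_backwards)":
       the unsigned difference dst - src is below count exactly when dst lies inside
       [src, src + count), i.e. a forward copy would overwrite source bytes that
       still need to be read.  */
    static int
    needs_backward_copy (uintptr_t dst, uintptr_t src, size_t count)
    {
      return (uintptr_t) (dst - src) < count;
    }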
*/ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(copy0) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] + ret + +END (__memcpy_aarch64_simd) + diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S new file mode 100644 index 000000000000..dd254f6f9929 --- /dev/null +++ b/string/aarch64/memcpy.S @@ -0,0 +1,243 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#include "../asmdefs.h" + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64) +ENTRY (__memcpy_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. 
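Editorial note: the small-copy cases above avoid length-dependent loops by loading from both ends of the buffer and letting the two accesses overlap. A portable C sketch of the 16..32-byte case, illustration only, helper name invented:

    #include <string.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Copy 16..32 bytes with one 16-byte access from each end; for counts below
       32 the two accesses overlap in the middle, which is harmless.  Loading
       both halves before storing also keeps the sequence memmove-safe.  */
    static void
    copy_16_to_32 (void *dst, const void *src, size_t count)
    {
      uint8_t head[16], tail[16];
      memcpy (head, src, 16);
      memcpy (tail, (const uint8_t *) src + count - 16, 16);
      memcpy (dst, head, 16);
      memcpy ((uint8_t *) dst + count - 16, tail, 16);
    }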
*/ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
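Editorial note: L(copy4) above covers 1..3 bytes without branches by touching offsets 0, count/2 and count-1. In C, as a sketch only (helper name invented):

    #include <stdint.h>
    #include <stddef.h>

    /* Branchless 1..3 byte copy: for count == 1 all three offsets coincide, for
       count == 2 they are 0, 1, 1 and for count == 3 they are 0, 1, 2.  The
       zero-length case is filtered out by the cbz before L(copy4).  */
    static void
    copy_1_to_3 (uint8_t *dst, const uint8_t *src, size_t count)
    {
      uint8_t a = src[0];
      uint8_t c = src[count - 1];
      uint8_t b = src[count >> 1];
      dst[0] = a;
      dst[count >> 1] = b;
      dst[count - 1] = c;
    }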
*/ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + +END (__memcpy_aarch64) + diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S new file mode 100644 index 000000000000..7b4be847cecb --- /dev/null +++ b/string/aarch64/memrchr.S @@ -0,0 +1,117 @@ +/* + * memrchr - find last character in a memory zone. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define cntin x2 +#define result x0 + +#define src x3 +#define cntrem x4 +#define synd x5 +#define shift x6 +#define tmp x7 +#define wtmp w7 +#define end x8 +#define endm1 x9 + +#define vrepchr v0 +#define qdata q1 +#define vdata v1 +#define vhas_chr v2 +#define vrepmask v3 +#define vend v4 +#define dend d4 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__memrchr_aarch64) + PTR_ARG (0) + add end, srcin, cntin + sub endm1, end, 1 + bic src, endm1, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + neg shift, end, lsl 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsl synd, synd, shift + cbz synd, L(start_loop) + + clz synd, synd + sub result, endm1, synd, lsr 2 + cmp cntin, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(start_loop): + sub tmp, end, src + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 4 +L(loop32): + ldr qdata, [src, -16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) + +L(loop32_2): + ldr qdata, [src, -16]! + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + + add tmp, src, 15 +#ifdef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + sub tmp, tmp, synd, lsr 2 + cmp tmp, srcin + csel result, tmp, xzr, hs + ret + +L(nomatch): + mov result, 0 + ret + +END (__memrchr_aarch64) + diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S new file mode 100644 index 000000000000..9fcd97579913 --- /dev/null +++ b/string/aarch64/memset.S @@ -0,0 +1,117 @@ +/* + * memset - fill memory with a constant byte + * + * Copyright (c) 2012-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +#include "../asmdefs.h" + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 + +ENTRY (__memset_aarch64) + PTR_ARG (0) + SIZE_ARG (2) + + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + .p2align 4 +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] + ret + + .p2align 4 +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +END (__memset_aarch64) + diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S new file mode 100644 index 000000000000..f1c711906515 --- /dev/null +++ b/string/aarch64/stpcpy-mte.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S new file mode 100644 index 000000000000..82dd9717b0a0 --- /dev/null +++ b/string/aarch64/stpcpy-sve.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy-sve.S" diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S new file mode 100644 index 000000000000..4f62aa462389 --- /dev/null +++ b/string/aarch64/stpcpy.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy.S" diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S new file mode 100644 index 000000000000..dcb0e4625870 --- /dev/null +++ b/string/aarch64/strchr-mte.S @@ -0,0 +1,105 @@ +/* + * strchr - find a character in a string + * + * Copyright (c) 2020, Arm Limited. 
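Editorial note: in __memset_aarch64 above, the DC ZVA path is only entered for long, zero-valued fills ("cmp count, 160; ccmp valw, 0, 0, hs; b.ne L(no_zva)"). A C restatement of that gate, illustration only, helper name invented:

    #include <stdint.h>
    #include <stddef.h>

    /* DC ZVA zeroes a whole 64-byte block, so it is only usable when the fill
       byte is zero; the 160-byte threshold keeps short fills on the plain store
       path.  The DCZID_EL0 block-size check still follows this test.  */
    static int
    memset_may_use_dc_zva (uint8_t value, size_t count)
    {
      return count >= 160 && value == 0;
    }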
+ * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define tmp1 x1 +#define wtmp2 w3 +#define tmp3 x3 + +#define vrepchr v0 +#define vdata v1 +#define qdata q1 +#define vhas_nul v2 +#define vhas_chr v3 +#define vrepmask v4 +#define vrepmask2 v5 +#define vend v6 +#define dend d6 + +/* Core algorithm. + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-1 are set if the relevant byte matched the + requested character, bits 2-3 are set if the byte is NUL (or matched), and + bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd + bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits + in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__strchr_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + dup vrepchr.16b, chrin + ld1 {vdata.16b}, [src] + mov wtmp2, 0x3003 + dup vrepmask.8h, wtmp2 + cmeq vhas_nul.16b, vdata.16b, 0 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp2, 0xf00f + dup vrepmask2.8h, wtmp2 + + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + lsl tmp3, srcin, 2 + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + + fmov tmp1, dend + lsr tmp1, tmp1, tmp3 + cbz tmp1, L(loop) + + rbit tmp1, tmp1 + clz tmp1, tmp1 + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ + tst tmp1, 2 + add result, srcin, tmp1, lsr 2 + csel result, result, xzr, eq + ret + + .p2align 4 +L(loop): + ldr qdata, [src, 16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbz tmp1, L(loop) + +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov tmp1, dend +#else + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov tmp1, dend + rbit tmp1, tmp1 +#endif + clz tmp1, tmp1 + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ + tst tmp1, 2 + add result, src, tmp1, lsr 2 + csel result, result, xzr, eq + ret + +END (__strchr_aarch64_mte) + diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S new file mode 100644 index 000000000000..13ba9f44f9c5 --- /dev/null +++ b/string/aarch64/strchr-sve.S @@ -0,0 +1,70 @@ +/* + * strchr/strchrnul - find a character in a string + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */ +#ifdef BUILD_STRCHRNUL +#define FUNC __strchrnul_aarch64_sve +#else +#define FUNC __strchr_aarch64_sve +#endif + +ENTRY (FUNC) + PTR_ARG (0) + dup z1.b, w1 /* replicate byte across vector */ + setffr /* initialize FFR */ + ptrue p1.b /* all ones; loop invariant */ + + .p2align 4 + /* Read a vector's worth of bytes, stopping on first fault. 
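Editorial note: __strchr_aarch64_mte above interleaves "matched c" and "is NUL" information in each syndrome nibble, so after counting trailing zeros, bit 1 of the count says whether the NUL came first and count/4 is the byte offset. A sketch of that final decision in C, illustration only, helper name invented:

    #include <stddef.h>

    /* tz stands for the trailing-zero count of the syndrome (clz of the
       bit-reversed value in the assembly).  Bits 0-1 of each nibble mark a
       character match and bits 2-3 mark a NUL, hence the test of bit 1.  */
    static const char *
    strchr_pick_result (const char *chunk, unsigned tz)
    {
      if (tz & 2)                  /* NUL reached before the character.  */
        return NULL;
      return chunk + (tz >> 2);    /* Four syndrome bits per byte.  */
    }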
*/ +0: ldff1b z0.b, p1/z, [x0, xzr] + rdffrs p0.b, p1/z + b.nlast 2f + + /* First fault did not fail: the whole vector is valid. + Avoid depending on the contents of FFR beyond the branch. */ + incb x0 /* speculate increment */ + cmpeq p2.b, p1/z, z0.b, z1.b /* search for c */ + cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */ + orrs p4.b, p1/z, p2.b, p3.b /* c | 0 */ + b.none 0b + decb x0 /* undo speculate */ + + /* Found C or 0. */ +1: brka p4.b, p1/z, p4.b /* find first such */ + sub x0, x0, 1 /* adjust pointer for that byte */ + incp x0, p4.b +#ifndef BUILD_STRCHRNUL + ptest p4, p2.b /* was first in c? */ + csel x0, xzr, x0, none /* if there was no c, return null */ +#endif + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparision only on the valid bytes. */ +2: cmpeq p2.b, p0/z, z0.b, z1.b /* search for c */ + cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */ + orrs p4.b, p0/z, p2.b, p3.b /* c | 0 */ + b.any 1b + + /* No C or 0 found. Re-init FFR, increment, and loop. */ + setffr + incp x0, p0.b + b 0b + +END (FUNC) + +#endif + diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S new file mode 100644 index 000000000000..1063cbfd77aa --- /dev/null +++ b/string/aarch64/strchr.S @@ -0,0 +1,126 @@ +/* + * strchr - find a character in a string + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include "../asmdefs.h" + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +/* Locals and temporaries. */ + +ENTRY (__strchr_aarch64) + PTR_ARG (0) + /* Magic constant 0xc0300c03 to allow us to identify which lane + matches the requested byte. Even bits are set if the character + matches, odd bits if either the char is NUL or matches. */ + mov wtmp2, 0x0c03 + movk wtmp2, 0xc030, lsl 16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq L(loop) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. 
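Editorial note: the masking step described just above computes the syndrome for the whole aligned 32-byte block and then clears the bits that belong to bytes before the real start of the string. With two syndrome bits per byte that is, as a sketch only (helper name invented):

    #include <stdint.h>

    /* offset = srcin & 31 is non-zero on this path; the low 2*offset syndrome
       bits describe padding bytes that precede the string and are cleared.  */
    static uint64_t
    drop_padding_bits (uint64_t synd, unsigned offset)
    {
      uint64_t padding = ~0ULL >> (64 - 2 * offset);
      return synd & ~padding;
    }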
*/ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b + lsl tmp1, tmp1, #1 + addp vend1.16b, vend1.16b, vend2.16b // 256->128 + mov tmp3, #~0 + addp vend1.16b, vend1.16b, vend2.16b // 128->64 + lsr tmp1, tmp3, tmp1 + + mov tmp3, vend1.d[0] + bic tmp1, tmp3, tmp1 // Mask padding bits. + cbnz tmp1, L(tail) + + .p2align 4 +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) + + /* Termination condition found. Now need to establish exactly why + we terminated. */ + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b + addp vend1.16b, vend1.16b, vend2.16b // 256->128 + addp vend1.16b, vend1.16b, vend2.16b // 128->64 + mov tmp1, vend1.d[0] +L(tail): + /* Count the trailing zeros, by bit reversing... */ + rbit tmp1, tmp1 + /* Re-bias source. */ + sub src, src, #32 + clz tmp1, tmp1 /* And counting the leading zeros. */ + /* Tmp1 is even if the target charager was found first. Otherwise + we've found the end of string and we weren't looking for NUL. */ + tst tmp1, #1 + add result, src, tmp1, lsr #1 + csel result, result, xzr, eq + ret + +END (__strchr_aarch64) + diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S new file mode 100644 index 000000000000..1b0d0a63094c --- /dev/null +++ b/string/aarch64/strchrnul-mte.S @@ -0,0 +1,84 @@ +/* + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define tmp1 x1 +#define tmp2 x3 +#define tmp2w w3 + +#define vrepchr v0 +#define vdata v1 +#define qdata q1 +#define vhas_nul v2 +#define vhas_chr v3 +#define vrepmask v4 +#define vend v5 +#define dend d5 + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. 
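Editorial note: in the entry sequence and loop that follow, __strchrnul_aarch64_mte (like the __strchr_aarch64_mte loop above) folds the character and NUL tests into one mask with a cmeq followed by cmhs: the 0xff produced for a match compares higher-or-same against any byte, while a zero byte is the only value that 0x00 compares higher-or-same against. Per byte the effect is, as an illustration only:

    #include <stdint.h>

    /* Scalar model of "cmeq vhas_chr, vdata, vrepchr; cmhs vhas_chr, vhas_chr,
       vdata": the result is all-ones when the byte equals c or is NUL.  */
    static uint8_t
    match_or_nul (uint8_t byte, uint8_t c)
    {
      uint8_t eq = (byte == c) ? 0xff : 0x00;
      return (eq >= byte) ? 0xff : 0x00;
    }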
*/ + +ENTRY (__strchrnul_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + dup vrepchr.16b, chrin + ld1 {vdata.16b}, [src] + mov tmp2w, 0xf00f + dup vrepmask.8h, tmp2w + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + lsl tmp2, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov tmp1, dend + lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ + cbz tmp1, L(loop) + + rbit tmp1, tmp1 + clz tmp1, tmp1 + add result, srcin, tmp1, lsr 2 + ret + + .p2align 4 +L(loop): + ldr qdata, [src, 16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbz tmp1, L(loop) + + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov tmp1, dend +#ifndef __AARCH64EB__ + rbit tmp1, tmp1 +#endif + clz tmp1, tmp1 + add result, src, tmp1, lsr 2 + ret + +END (__strchrnul_aarch64_mte) + diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S new file mode 100644 index 000000000000..428ff1a3d008 --- /dev/null +++ b/string/aarch64/strchrnul-sve.S @@ -0,0 +1,9 @@ +/* + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STRCHRNUL +#include "strchr-sve.S" diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S new file mode 100644 index 000000000000..a4230d919b47 --- /dev/null +++ b/string/aarch64/strchrnul.S @@ -0,0 +1,114 @@ +/* + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include "../asmdefs.h" + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask v7 +#define vend1 v16 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character or nul. Since the + bits in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination. */ + +/* Locals and temporaries. */ + +ENTRY (__strchrnul_aarch64) + PTR_ARG (0) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the termination condition. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask.4s, wtmp2 + ands tmp1, srcin, #31 + b.eq L(loop) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. 
*/ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b + lsl tmp1, tmp1, #1 + addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + mov tmp3, #~0 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + lsr tmp1, tmp3, tmp1 + + mov tmp3, vend1.d[0] + bic tmp1, tmp3, tmp1 // Mask padding bits. + cbnz tmp1, L(tail) + + .p2align 4 +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) + + /* Termination condition found. Now need to establish exactly why + we terminated. */ + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b + addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + + mov tmp1, vend1.d[0] +L(tail): + /* Count the trailing zeros, by bit reversing... */ + rbit tmp1, tmp1 + /* Re-bias source. */ + sub src, src, #32 + clz tmp1, tmp1 /* ... and counting the leading zeros. */ + /* tmp1 is twice the offset into the fragment. */ + add result, src, tmp1, lsr #1 + ret + +END (__strchrnul_aarch64) + diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S new file mode 100644 index 000000000000..12d1a6b51dd3 --- /dev/null +++ b/string/aarch64/strcmp-mte.S @@ -0,0 +1,189 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (__strcmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. 
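Editorial note: the NUL-detection identity quoted above, (X - 1) & ~(X | 0x7f...7f) being non-zero exactly when some byte of X is zero, is the core of the aligned loop. A direct scalar C rendering, illustration only, helper name invented:

    #include <stdint.h>

    /* Matches "sub has_nul, data1, zeroones; orr tmp, data1, REP8_7f;
       bics has_nul, has_nul, tmp": non-zero exactly when x contains a zero byte.
       The borrow only propagates towards more significant bytes, so the lowest
       set bit sits in the first NUL byte on little-endian; the big-endian path
       re-derives this from byte-reversed data because that borrow can also mark
       0x01 bytes in front of the NUL.  */
    static uint64_t
    has_nul_byte (uint64_t x)
    {
      const uint64_t rep_01 = 0x0101010101010101ULL;   /* REP8_01 */
      const uint64_t rep_7f = 0x7f7f7f7f7f7f7f7fULL;   /* REP8_7f */
      return (x - rep_01) & ~(x | rep_7f);
    }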
*/ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (__strcmp_aarch64_mte) + diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S new file mode 100644 index 000000000000..e6d2da5411ca --- /dev/null +++ b/string/aarch64/strcmp-sve.S @@ -0,0 +1,59 @@ +/* + * __strcmp_aarch64_sve - compare two strings + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__strcmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + setffr /* initialize FFR */ + ptrue p1.b, all /* all ones; loop invariant */ + mov x2, 0 /* initialize offset */ + + /* Read a vector's worth of bytes, stopping on first fault. */ + .p2align 4 +0: ldff1b z0.b, p1/z, [x0, x2] + ldff1b z1.b, p1/z, [x1, x2] + rdffrs p0.b, p1/z + b.nlast 2f + + /* First fault did not fail: the whole vector is valid. + Avoid depending on the contents of FFR beyond the branch. 
*/ + incb x2, all /* skip bytes for next round */ + cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings */ + cmpne p3.b, p1/z, z0.b, 0 /* search for ~zero */ + nands p2.b, p1/z, p2.b, p3.b /* ~(eq & ~zero) -> ne | zero */ + b.none 0b + + /* Found end-of-string or inequality. */ +1: brkb p2.b, p1/z, p2.b /* find first such */ + lasta w0, p2, z0.b /* extract each char */ + lasta w1, p2, z1.b + sub x0, x0, x1 /* return comparison */ + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparison only on the valid bytes. */ +2: incp x2, p0.b /* skip bytes for next round */ + setffr /* re-init FFR for next round */ + cmpeq p2.b, p0/z, z0.b, z1.b /* compare strings, as above */ + cmpne p3.b, p0/z, z0.b, 0 + nands p2.b, p0/z, p2.b, p3.b + b.none 0b + b 1b + +END (__strcmp_aarch64_sve) + +#endif + diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S new file mode 100644 index 000000000000..7714ebf5577d --- /dev/null +++ b/string/aarch64/strcmp.S @@ -0,0 +1,173 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define result x0 + +/* Internal variables. */ +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define syndrome x6 +#define tmp1 x7 +#define tmp2 x8 +#define tmp3 x9 +#define zeroones x10 +#define pos x11 + + /* Start of performance-critical section -- one 64B cache line. */ +ENTRY (__strcmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne L(misaligned8) + ands tmp1, src1, #7 + b.ne L(mutual_align) + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, L(loop_aligned) + /* End of performance-critical section -- one 64B cache line. */ + +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. 
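+      Roughly, in C (an illustrative sketch, with the GCC/Clang builtin
+      __builtin_bswap64 standing in for REV):
+        uint64_t r = __builtin_bswap64 (data1);
+        uint64_t has_nul = (r - 0x0101010101010101ULL)
+                           & ~(r | 0x7f7f7f7f7f7f7f7fULL);
+        has_nul = __builtin_bswap64 (has_nul);
+      Reversing first means any spurious marker from borrow propagation can
+      only land on bytes later in the string than the NUL, so the leading
+      marker picked out by CLZ below is exact. 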
*/ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that preceed the start point. */ + bic src1, src1, #7 + bic src2, src2, #7 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + ldr data1, [src1], #8 + neg tmp1, tmp1 /* Bits to alignment -64. */ + ldr data2, [src2], #8 + mov tmp2, #~0 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, #7 + b.ne L(do_misaligned) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (__strcmp_aarch64) + diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S new file mode 100644 index 000000000000..88c222d61e53 --- /dev/null +++ b/string/aarch64/strcpy-mte.S @@ -0,0 +1,161 @@ +/* + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define len x4 +#define synd x4 +#define tmp x5 +#define wtmp w5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 +#define dataq2 q1 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64_mte +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64_mte +# define IFSTPCPY(X,...) +#endif + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. 
For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4,,8 +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 + + .p2align 4 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] + str data1, [dstin] + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(start_loop): + sub len, src, srcin + ldr dataq2, [srcin] + add dst, dstin, len + str dataq2, [dstin] + + .p2align 5 +L(loop): + str dataq, [dst], 16 + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz len, synd + lsr len, len, 2 + sub tmp, len, 15 + ldr dataq, [src, tmp] + str dataq, [dst, tmp] + IFSTPCPY (add result, dst, len) + ret + +END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S new file mode 100644 index 000000000000..f515462e09ae --- /dev/null +++ b/string/aarch64/strcpy-sve.S @@ -0,0 +1,71 @@ +/* + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */ +#ifdef BUILD_STPCPY +#define FUNC __stpcpy_aarch64_sve +#else +#define FUNC __strcpy_aarch64_sve +#endif + +ENTRY (FUNC) + PTR_ARG (0) + PTR_ARG (1) + setffr /* initialize FFR */ + ptrue p2.b, all /* all ones; loop invariant */ + mov x2, 0 /* initialize offset */ + + .p2align 4 + /* Read a vector's worth of bytes, stopping on first fault. */ +0: ldff1b z0.b, p2/z, [x1, x2] + rdffrs p0.b, p2/z + b.nlast 1f + + /* First fault did not fail: the whole vector is valid. 
+ Avoid depending on the contexts of FFR beyond the branch. */ + cmpeq p1.b, p2/z, z0.b, 0 /* search for zeros */ + b.any 2f + + /* No zero found. Store the whole vector and loop. */ + st1b z0.b, p2, [x0, x2] + incb x2, all + b 0b + + /* First fault failed: only some of the vector is valid. + Perform the comparison only on the valid bytes. */ +1: cmpeq p1.b, p0/z, z0.b, 0 /* search for zeros */ + b.any 2f + + /* No zero found. Store the valid portion of the vector and loop. */ + setffr /* re-init FFR */ + st1b z0.b, p0, [x0, x2] + incp x2, p0.b + b 0b + + /* Zero found. Crop the vector to the found zero and finish. */ +2: brka p0.b, p2/z, p1.b + st1b z0.b, p0, [x0, x2] +#ifdef BUILD_STPCPY + add x0, x0, x2 + sub x0, x0, 1 + incp x0, p0.b +#endif + ret + +END (FUNC) + +#endif + diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S new file mode 100644 index 000000000000..6e9ed424b693 --- /dev/null +++ b/string/aarch64/strcpy.S @@ -0,0 +1,311 @@ +/* + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + */ + +#include "../asmdefs.h" + +/* To build as stpcpy, define BUILD_STPCPY before compiling this file. + + To test the page crossing code path more thoroughly, compile with + -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower + entry path. This option is not intended for production use. */ + +/* Arguments and results. */ +#define dstin x0 +#define srcin x1 + +/* Locals and temporaries. */ +#define src x2 +#define dst x3 +#define data1 x4 +#define data1w w4 +#define data2 x5 +#define data2w w5 +#define has_nul1 x6 +#define has_nul2 x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define tmp4 x11 +#define zeroones x12 +#define data1a x13 +#define data2a x14 +#define pos x15 +#define len x16 +#define to_align x17 + +#ifdef BUILD_STPCPY +#define STRCPY __stpcpy_aarch64 +#else +#define STRCPY __strcpy_aarch64 +#endif + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + + /* AArch64 systems have a minimum page size of 4k. We can do a quick + page size check for crossing this boundary on entry and if we + do not, then we can short-circuit much of the entry code. We + expect early page-crossing strings to be rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite + predictable, even with random strings. + + We don't bother checking for larger page sizes, the cost of setting + up the correct page size is just not worth the extra gain from + a small reduction in the cases taking the slow path. Note that + we only care about whether the first fetch, which may be + misaligned, crosses a page boundary - after that we move to aligned + fetches for the remainder of the string. */ + +#ifdef STRCPY_TEST_PAGE_CROSS + /* Make everything that isn't Qword aligned look like a page cross. */ +#define MIN_PAGE_P2 4 +#else +#define MIN_PAGE_P2 12 +#endif + +#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + /* For moderately short strings, the fastest way to do the copy is to + calculate the length of the string in the same way as strlen, then + essentially do a memcpy of the result. 
This avoids the need for + multiple byte copies and further means that by the time we + reach the bulk copy loop we know we can always use DWord + accesses. We expect __strcpy_aarch64 to rarely be called repeatedly + with the same source string, so branch prediction is likely to + always be difficult - we mitigate against this by preferring + conditional select operations over branches whenever this is + feasible. */ + and tmp2, srcin, #(MIN_PAGE_SIZE - 1) + mov zeroones, #REP8_01 + and to_align, srcin, #15 + cmp tmp2, #(MIN_PAGE_SIZE - 16) + neg tmp1, to_align + /* The first fetch will straddle a (possible) page boundary iff + srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte + aligned string will never fail the page align check, so will + always take the fast path. */ + b.gt L(page_cross) + +L(page_cross_ok): + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* Because we expect the end to be found within 16 characters + (profiling shows this is the most common case), it's worth + swapping the bytes now to save having to recalculate the + termination syndrome later. We preserve data1 and data2 + so that we can re-use the values later on. */ + rev tmp2, data1 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + rev tmp4, data2 + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f +#endif + bics has_nul2, tmp3, tmp4 + b.eq L(bulk_entry) + + /* The string is short (<=16 bytes). We don't know exactly how + short though, yet. Work out the exact length so that we can + quickly select the optimal copy strategy. */ +L(fp_gt8): + rev has_nul2, has_nul2 + clz pos, has_nul2 + mov tmp2, #56 + add dst, dstin, pos, lsr #3 /* Bits to bytes. */ + sub pos, tmp2, pos +#ifdef __AARCH64EB__ + lsr data2, data2, pos +#else + lsl data2, data2, pos +#endif + str data2, [dst, #1] + str data1, [dstin] +#ifdef BUILD_STPCPY + add dstin, dst, #8 +#endif + ret + +L(fp_le8): + rev has_nul1, has_nul1 + clz pos, has_nul1 + add dst, dstin, pos, lsr #3 /* Bits to bytes. */ + subs tmp2, pos, #24 /* Pos in bits. */ + b.lt L(fp_lt4) +#ifdef __AARCH64EB__ + mov tmp2, #56 + sub pos, tmp2, pos + lsr data2, data1, pos + lsr data1, data1, #32 +#else + lsr data2, data1, tmp2 +#endif + /* 4->7 bytes to copy. */ + str data2w, [dst, #-3] + str data1w, [dstin] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif + ret +L(fp_lt4): + cbz pos, L(fp_lt2) + /* 2->3 bytes to copy. */ +#ifdef __AARCH64EB__ + lsr data1, data1, #48 +#endif + strh data1w, [dstin] + /* Fall-through, one byte (max) to go. */ +L(fp_lt2): + /* Null-terminated string. Last character must be zero! */ + strb wzr, [dst] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif + ret + + .p2align 6 + /* Aligning here ensures that the entry code and main loop all lies + within one 64-byte cache line. */ +L(bulk_entry): + sub to_align, to_align, #16 + stp data1, data2, [dstin] + sub src, srcin, to_align + sub dst, dstin, to_align + b L(entry_no_page_cross) + + /* The inner loop deals with two Dwords at a time. This has a + slightly higher start-up cost, but we should win quite quickly, + especially on cores with a high number of issue slots per + cycle, as we get much better parallelism out of the operations. 
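+     One iteration of the loop below, as a rough C sketch (illustrative
+     only; d1/d2 are stand-in names for the pair of dwords copied by the
+     previous iteration, src/dst have already been adjusted by
+     L(bulk_entry), and REP8_01/REP8_7f are the constants defined above):
+       memcpy (dst, &d1, 8);  memcpy (dst + 8, &d2, 8);  dst += 16;
+       memcpy (&d1, src, 8);  memcpy (&d2, src + 8, 8);  src += 16;
+       uint64_t nul1 = (d1 - REP8_01) & ~(d1 | REP8_7f);
+       uint64_t nul2 = (d2 - REP8_01) & ~(d2 | REP8_7f);
+       if (nul1 | nul2)
+         break;                        // NUL somewhere in the 16 new bytes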
*/ +L(main_loop): + stp data1, data2, [dst], #16 +L(entry_no_page_cross): + ldp data1, data2, [src], #16 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(main_loop) + + /* Since we know we are copying at least 16 bytes, the fastest way + to deal with the tail is to determine the location of the + trailing NUL, then (re)copy the 16 bytes leading up to that. */ + cmp has_nul1, #0 +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. */ + csel data1, data1, data2, ne + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, ne +#endif + rev has_nul1, has_nul1 + clz pos, has_nul1 + add tmp1, pos, #72 + add pos, pos, #8 + csel pos, pos, tmp1, ne + add src, src, pos, lsr #3 + add dst, dst, pos, lsr #3 + ldp data1, data2, [src, #-32] + stp data1, data2, [dst, #-16] +#ifdef BUILD_STPCPY + sub dstin, dst, #1 +#endif + ret + +L(page_cross): + bic src, srcin, #15 + /* Start by loading two words at [srcin & ~15], then forcing the + bytes that precede srcin to 0xff. This means they never look + like termination bytes. */ + ldp data1, data2, [src] + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + tst to_align, #7 + csetm tmp2, ne +#ifdef __AARCH64EB__ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + cmp to_align, #8 + csinv data1, data1, xzr, lt + csel data2, data2, data2a, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(page_cross_ok) + /* We now need to make data1 and data2 look like they've been + loaded directly from srcin. Do a rotate on the 128-bit value. */ + lsl tmp1, to_align, #3 /* Bytes->bits. */ + neg tmp2, to_align, lsl #3 +#ifdef __AARCH64EB__ + lsl data1a, data1, tmp1 + lsr tmp4, data2, tmp2 + lsl data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + rev tmp2, data1 + rev tmp4, data2 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + lsr data1a, data1, tmp1 + lsl tmp4, data2, tmp2 + lsr data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f +#endif + bic has_nul1, tmp1, tmp2 + cbnz has_nul1, L(fp_le8) + bic has_nul2, tmp3, tmp4 + b L(fp_gt8) + +END (STRCPY) + diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S new file mode 100644 index 000000000000..7cf41d5c1eac --- /dev/null +++ b/string/aarch64/strlen-mte.S @@ -0,0 +1,80 @@ +/* + * strlen - calculate the length of a string. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. 
+ */ + +#include "../asmdefs.h" + +#define srcin x0 +#define result x0 + +#define src x1 +#define synd x2 +#define tmp x3 +#define wtmp w3 +#define shift x4 + +#define data q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__strlen_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz result, synd + lsr result, result, 2 + ret + + .p2align 5 +L(loop): + ldr data, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub result, src, srcin + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add result, result, tmp, lsr 2 + ret + +END (__strlen_aarch64_mte) + diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S new file mode 100644 index 000000000000..2392493f1a3c --- /dev/null +++ b/string/aarch64/strlen-sve.S @@ -0,0 +1,55 @@ +/* + * __strlen_aarch64_sve - compute the length of a string + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__strlen_aarch64_sve) + PTR_ARG (0) + setffr /* initialize FFR */ + ptrue p2.b /* all ones; loop invariant */ + mov x1, 0 /* initialize length */ + + /* Read a vector's worth of bytes, stopping on first fault. */ + .p2align 4 +0: ldff1b z0.b, p2/z, [x0, x1] + rdffrs p0.b, p2/z + b.nlast 2f + + /* First fault did not fail: the whole vector is valid. + Avoid depending on the contents of FFR beyond the branch. */ + incb x1, all /* speculate increment */ + cmpeq p1.b, p2/z, z0.b, 0 /* loop if no zeros */ + b.none 0b + decb x1, all /* undo speculate */ + + /* Zero found. Select the bytes before the first and count them. */ +1: brkb p0.b, p2/z, p1.b + incp x1, p0.b + mov x0, x1 + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparison only on the valid bytes. */ +2: cmpeq p1.b, p0/z, z0.b, 0 + b.any 1b + + /* No zero found. Re-init FFR, increment, and loop. */ + setffr + incp x1, p0.b + b 0b + +END (__strlen_aarch64_sve) + +#endif + diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S new file mode 100644 index 000000000000..a1b164a49238 --- /dev/null +++ b/string/aarch64/strlen.S @@ -0,0 +1,200 @@ +/* + * strlen - calculate the length of a string. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * Not MTE compatible. 
+ */ + +#include "../asmdefs.h" + +#define srcin x0 +#define len x0 + +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define maskv v0 +#define maskd d0 +#define dataq1 q1 +#define dataq2 q2 +#define datav1 v1 +#define datav2 v2 +#define tmp x2 +#define tmpw w2 +#define synd x3 +#define shift x4 + +/* For the first 32 bytes, NUL detection works on the principle that + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ + +#ifdef TEST_PAGE_CROSS +# define MIN_PAGE_SIZE 32 +#else +# define MIN_PAGE_SIZE 4096 +#endif + +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. + + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. + + If the page cross check fails, we read 32 bytes from an aligned address, + and ignore any characters before the string. If it contains a NUL + character, return the length, if not, continue in the main loop. */ + +ENTRY (__strlen_aarch64) + PTR_ARG (0) + and tmp1, srcin, MIN_PAGE_SIZE - 1 + cmp tmp1, MIN_PAGE_SIZE - 32 + b.hi L(page_cross) + + /* Look for a NUL byte in the first 16 bytes. */ + ldp data1, data2, [srcin] + mov zeroones, REP8_01 + +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. + Since we expect strings to be small and early-exit, + byte-swap the data now so has_null1/2 will be correct. */ + rev data1, data1 + rev data2, data2 +#endif + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + b.eq L(bytes16_31) + + /* Find the exact offset of the first NUL byte in the first 16 bytes + from the string start. Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + csel len, xzr, len, cc + clz tmp1, has_nul1 + add len, len, tmp1, lsr 3 + ret + + .p2align 3 + /* Look for a NUL byte at offset 16..31 in the string. */ +L(bytes16_31): + ldp data1, data2, [srcin, 16] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + b.eq L(loop_entry) + + /* Find the exact offset of the first NUL byte at offset 16..31 from + the string start. Enter with C = has_nul1 == 0. 
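+     In C this amounts to (an illustrative sketch; on big-endian the data
+     words were already byte-swapped above, so they behave as little-endian
+     here):
+       uint64_t nul = has_nul1 ? has_nul1 : has_nul2;
+       size_t len   = has_nul1 ? 16 : 24;
+       len += __builtin_clzll (__builtin_bswap64 (nul)) >> 3;  // byte index
+       return len;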
*/ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 24 + rev has_nul1, has_nul1 + mov tmp3, 16 + clz tmp1, has_nul1 + csel len, tmp3, len, cc + add len, len, tmp1, lsr 3 + ret + +L(loop_entry): + bic src, srcin, 31 + + .p2align 5 +L(loop): + ldp dataq1, dataq2, [src, 32]! + uminp maskv.16b, datav1.16b, datav2.16b + uminp maskv.16b, maskv.16b, maskv.16b + cmeq maskv.8b, maskv.8b, 0 + fmov synd, maskd + cbz synd, L(loop) + + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ + cmeq maskv.16b, datav1.16b, 0 + sub len, src, srcin + tst synd, 0xffffffff + b.ne 1f + cmeq maskv.16b, datav2.16b, 0 + add len, len, 16 +1: + /* Generate a bitmask and compute correct byte offset. */ +#ifdef __AARCH64EB__ + bic maskv.8h, 0xf0 +#else + bic maskv.8h, 0x0f, lsl 8 +#endif + umaxp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add len, len, tmp, lsr 2 + ret + + .p2align 4 + +L(page_cross): + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret + +END (__strlen_aarch64) diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S new file mode 100644 index 000000000000..c9d6fc8a158b --- /dev/null +++ b/string/aarch64/strncmp-mte.S @@ -0,0 +1,307 @@ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define mask x13 +#define endloop x14 +#define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif + +ENTRY (__strncmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
*/ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + +L(full_check): +#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ + rev syndrome, syndrome + rev data1, data1 + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + csel result, result, xzr, hi + ret +#else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ +L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. 
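+      A rough C sketch of this step (illustrative only; count is src1 % 8 as
+      computed at function entry, and the limit is at least 16 on this path,
+      so these at most 7 byte compares cannot overrun it):
+        size_t head = (-count) & 7;        // bytes until src1 is aligned
+        limit -= head;
+        while (head--)
+        {
+          unsigned char c1 = *src1++, c2 = *src2++;
+          if (c1 == 0 || c1 != c2)
+            return c1 - c2;
+        }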
*/ +L(try_misaligned_words): + cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset + ldr data1, [src1], #8 + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) + +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones + eor diff, data1, data2 /* Non-zero if differences found. */ + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. 
*/ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif + +L(ret0): + mov result, #0 + ret +END(__strncmp_aarch64_mte) + diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S new file mode 100644 index 000000000000..234190e245b0 --- /dev/null +++ b/string/aarch64/strncmp-sve.S @@ -0,0 +1,69 @@ +/* + * strncmp - compare two strings with limit + * + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__strncmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + setffr /* initialize FFR */ + mov x3, 0 /* initialize off */ + +0: whilelo p0.b, x3, x2 /* while off < max */ + b.none 9f + + ldff1b z0.b, p0/z, [x0, x3] + ldff1b z1.b, p0/z, [x1, x3] + rdffrs p1.b, p0/z + b.nlast 2f + + /* First fault did not fail: the vector up to max is valid. + Avoid depending on the contents of FFR beyond the branch. + Increment for a whole vector, even if we've only read a partial. + This is significantly cheaper than INCP, and since OFF is not + used after the loop it is ok to increment OFF past MAX. */ + incb x3 + cmpeq p1.b, p0/z, z0.b, z1.b /* compare strings */ + cmpne p2.b, p0/z, z0.b, 0 /* search for ~zero */ + nands p2.b, p0/z, p1.b, p2.b /* ~(eq & ~zero) -> ne | zero */ + b.none 0b + + /* Found end-of-string or inequality. */ +1: brkb p2.b, p0/z, p2.b /* find first such */ + lasta w0, p2, z0.b /* extract each char */ + lasta w1, p2, z1.b + sub x0, x0, x1 /* return comparison */ + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparison only on the valid bytes. */ +2: cmpeq p2.b, p1/z, z0.b, z1.b /* compare strings, as above */ + cmpne p3.b, p1/z, z0.b, 0 + nands p2.b, p1/z, p2.b, p3.b + b.any 1b + + /* No inequality or zero found. Re-init FFR, incr and loop. */ + setffr + incp x3, p1.b + b 0b + + /* Found end-of-count. */ +9: mov x0, 0 /* return equal */ + ret + +END (__strncmp_aarch64_sve) + +#endif + diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S new file mode 100644 index 000000000000..738b6539cab6 --- /dev/null +++ b/string/aarch64/strncmp.S @@ -0,0 +1,260 @@ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define limit_wd x13 +#define mask x14 +#define endloop x15 +#define count mask + +ENTRY (__strncmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. 
*/ + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + + /* Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, L(not_limit) + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq L(not_limit) + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +#ifdef __AARCH64EB__ + lsr mask, mask, limit +#else + lsl mask, mask, limit +#endif + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): + orr syndrome, diff, has_nul + +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#else + /* Little-endian. Early bytes are at LSB. 
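+      The mask built by the shift below is, roughly in C (illustrative only;
+      count is the shared byte offset within the dword, 1..7 on this path):
+        uint64_t junk = ~0ULL >> ((64 - 8 * count) & 63);  // low count bytes
+        data1 |= junk;
+        data2 |= junk;
+      forcing the bytes that precede the start of the strings to 0xff so
+      they can never compare as a NUL or as a difference. 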
*/ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#endif + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ + add limit, limit, count + add tmp3, tmp3, count + orr data1, data1, tmp2 + orr data2, data2, tmp2 + add limit_wd, limit_wd, tmp3, lsr #3 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + lsr limit_wd, limit, #3 + cbz count, L(do_misaligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + lsr limit_wd, limit, #3 + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) + + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) + +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + +L(ret0): + mov result, #0 + ret + +END ( __strncmp_aarch64) + diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S new file mode 100644 index 000000000000..5b9ebf7763bc --- /dev/null +++ b/string/aarch64/strnlen-sve.S @@ -0,0 +1,74 @@ +/* + * strnlen - calculate the length of a string with limit. + * + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__strnlen_aarch64_sve) + PTR_ARG (0) + SIZE_ARG (1) + setffr /* initialize FFR */ + mov x2, 0 /* initialize len */ + b 1f + + .p2align 4 + /* We have off + vl <= max, and so may read the whole vector. */ +0: ldff1b z0.b, p0/z, [x0, x2] + rdffrs p1.b, p0/z + b.nlast 2f + + /* First fault did not fail: the whole vector is valid. + Avoid depending on the contents of FFR beyond the branch. */ + cmpeq p2.b, p0/z, z0.b, 0 + b.any 8f + incb x2 + +1: whilelo p0.b, x2, x1 + b.last 0b + + /* We have off + vl < max. Test for off == max before proceeding. 
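+      In C terms the test below is roughly (illustrative only):
+        if (off >= max)        // WHILELO produced an all-false predicate
+          return max;
+      otherwise the partial predicate restricts the following load and
+      compare to the max - off bytes that remain. 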
*/ + b.none 9f + + ldff1b z0.b, p0/z, [x0, x2] + rdffrs p1.b, p0/z + b.nlast 2f + + /* First fault did not fail: the vector up to max is valid. + Avoid depending on the contents of FFR beyond the branch. + Compare for end-of-string, but there are no more bytes. */ + cmpeq p2.b, p0/z, z0.b, 0 + + /* Found end-of-string or zero. */ +8: brkb p2.b, p0/z, p2.b + mov x0, x2 + incp x0, p2.b + ret + + /* First fault failed: only some of the vector is valid. + Perform the comparison only on the valid bytes. */ +2: cmpeq p2.b, p1/z, z0.b, 0 + b.any 8b + + /* No inequality or zero found. Re-init FFR, incr and loop. */ + setffr + incp x2, p1.b + b 1b + + /* End of count. Return max. */ +9: mov x0, x1 + ret + +END (__strnlen_aarch64_sve) + +#endif + diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S new file mode 100644 index 000000000000..48d2495d2082 --- /dev/null +++ b/string/aarch64/strnlen.S @@ -0,0 +1,112 @@ +/* + * strnlen - calculate the length of a string with limit. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define cntin x1 +#define result x0 + +#define src x2 +#define synd x3 +#define shift x4 +#define wtmp w4 +#define tmp x4 +#define cntrem x5 + +#define qdata q0 +#define vdata v0 +#define vhas_chr v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. 
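+   As a C intrinsics sketch of this syndrome construction for one aligned
+   16-byte chunk (illustrative only, little-endian, assuming <arm_neon.h>
+   and <stdint.h>; nul_offset16 is a stand-in name, not part of this code):
+     static inline size_t nul_offset16 (const uint8_t *p)
+     {
+       uint8x16_t v    = vld1q_u8 (p);
+       uint8x16_t nul  = vceqq_u8 (v, vdupq_n_u8 (0));   // 0xff per NUL byte
+       uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
+       uint8x16_t nib  = vandq_u8 (nul, mask);           // 4 bits per byte
+       uint8x16_t fold = vpaddq_u8 (nib, nib);           // like ADDP: 16 -> 8
+       uint64_t   synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (fold), 0);
+       return synd ? __builtin_ctzll (synd) >> 2 : 16;   // byte index or none
+     }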
*/ + +ENTRY (__strnlen_aarch64) + PTR_ARG (0) + SIZE_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src], 16 + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) +L(finish): + rbit synd, synd + clz synd, synd + lsr result, synd, 2 + cmp cntin, result + csel result, cntin, result, ls + ret + +L(start_loop): + sub tmp, src, srcin + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 5 +L(loop32): + ldr qdata, [src], 16 + cmeq vhas_chr.16b, vdata.16b, 0 + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) +L(loop32_2): + ldr qdata, [src], 16 + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, 0 + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) + +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 + mov synd, vend.d[0] + sub result, src, srcin +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + add result, result, synd, lsr 2 + cmp cntin, result + csel result, cntin, result, ls + ret + +L(nomatch): + mov result, cntin + ret + +END (__strnlen_aarch64) + diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S new file mode 100644 index 000000000000..1e4fb1a68f7e --- /dev/null +++ b/string/aarch64/strrchr-mte.S @@ -0,0 +1,127 @@ +/* + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define tmp x3 +#define wtmp w3 +#define synd x3 +#define shift x4 +#define src_match x4 +#define nul_match x5 +#define chr_match x6 + +#define vrepchr v0 +#define vdata v1 +#define vhas_nul v2 +#define vhas_chr v3 +#define vrepmask v4 +#define vrepmask2 v5 +#define vend v5 +#define dend d5 + +/* Core algorithm. + + For each 16-byte chunk we calculate a 64-bit syndrome value, with + four bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bits 0-1 are set if + the relevant byte matched the requested character; bits 2-3 are set + if the relevant byte matched the NUL end of string. 
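+   Given such a syndrome for the 16-byte chunk that contains the NUL, the
+   tail code recovers the last match roughly like this C sketch
+   (illustrative only; base and synd are stand-in names, base pointing at
+   the start of the chunk):
+     uint64_t nul_bits = synd & 0xccccccccccccccccULL;   // NUL markers
+     uint64_t chr_bits = synd & 0x3333333333333333ULL;   // character markers
+     chr_bits &= nul_bits - 1;        // keep only matches at or before the NUL
+     char *last = chr_bits
+                  ? (char *) base + 15 - (__builtin_clzll (chr_bits) >> 2)
+                  : NULL;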
*/ + +ENTRY (__strrchr_aarch64_mte) + PTR_ARG (0) + bic src, srcin, 15 + dup vrepchr.16b, chrin + mov wtmp, 0x3003 + dup vrepmask.8h, wtmp + tst srcin, 15 + beq L(loop1) + + ld1 {vdata.16b}, [src], 16 + cmeq vhas_nul.16b, vdata.16b, 0 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp, 0xf00f + dup vrepmask2.8h, wtmp + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + lsl shift, srcin, 2 + fmov synd, dend + lsr synd, synd, shift + lsl synd, synd, shift + ands nul_match, synd, 0xcccccccccccccccc + bne L(tail) + cbnz synd, L(loop2) + + .p2align 5 +L(loop1): + ld1 {vdata.16b}, [src], 16 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop1) + + cmeq vhas_nul.16b, vdata.16b, 0 + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + ands nul_match, synd, 0xcccccccccccccccc + beq L(loop2) + +L(tail): + sub nul_match, nul_match, 1 + and chr_match, synd, 0x3333333333333333 + ands chr_match, chr_match, nul_match + sub result, src, 1 + clz tmp, chr_match + sub result, result, tmp, lsr 2 + csel result, result, xzr, ne + ret + + .p2align 4 +L(loop2): + cmp synd, 0 + csel src_match, src, src_match, ne + csel chr_match, synd, chr_match, ne + ld1 {vdata.16b}, [src], 16 + cmeq vhas_nul.16b, vdata.16b, 0 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + tst synd, 0xcccccccccccccccc + beq L(loop2) + + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + and nul_match, synd, 0xcccccccccccccccc + sub nul_match, nul_match, 1 + and tmp, synd, 0x3333333333333333 + ands tmp, tmp, nul_match + csel chr_match, tmp, chr_match, ne + csel src_match, src, src_match, ne + sub src_match, src_match, 1 + clz tmp, chr_match + sub result, src_match, tmp, lsr 2 + ret + +END (__strrchr_aarch64_mte) + diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S new file mode 100644 index 000000000000..d36d69af37fd --- /dev/null +++ b/string/aarch64/strrchr-sve.S @@ -0,0 +1,84 @@ +/* + * strrchr - find the last of a character in a string + * + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_SVE +/* Assumptions: + * + * ARMv8-a, AArch64 + * SVE Available. + */ + +ENTRY (__strrchr_aarch64_sve) + PTR_ARG (0) + dup z1.b, w1 /* replicate byte across vector */ + setffr /* initialize FFR */ + ptrue p1.b /* all ones; loop invariant */ + mov x2, 0 /* no match found so far */ + pfalse p2.b + + .p2align 4 + /* Read a vector's worth of bytes, stopping on first fault. */ +0: ldff1b z0.b, p1/z, [x0, xzr] + rdffrs p0.b, p1/z + b.nlast 1f + + /* First fault did not fail: the whole vector is valid. + Avoid depending on the contents of FFR beyond the branch. */ + incb x0, all /* skip bytes this round */ + cmpeq p3.b, p1/z, z0.b, 0 /* search for 0 */ + b.any 3f + + cmpeq p3.b, p1/z, z0.b, z1.b /* search for c; no eos */ + b.none 0b + + mov x2, x0 /* save advanced base */ + mov p2.b, p3.b /* save current search */ + b 0b + + /* First fault failed: only some of the vector is valid. + Perform the comparisions only on the valid bytes. 
*/ +1: cmpeq p3.b, p0/z, z0.b, 0 /* search for 0 */ + b.any 2f + + cmpeq p3.b, p0/z, z0.b, z1.b /* search for c; no eos */ + mov x3, x0 + incp x0, p0.b /* skip bytes this round */ + setffr /* re-init FFR */ + b.none 0b + + addvl x2, x3, 1 /* save advanced base */ + mov p2.b, p3.b /* save current search */ + b 0b + + /* Found end-of-string. */ +2: incb x0, all /* advance base */ +3: brka p3.b, p1/z, p3.b /* mask after first 0 */ + cmpeq p3.b, p3/z, z0.b, z1.b /* search for c not after eos */ + b.any 4f + + /* No C within last vector. Did we have one before? */ + cbz x2, 5f + mov x0, x2 /* restore advanced base */ + mov p3.b, p2.b /* restore saved search */ + + /* Find the *last* match in the predicate. This is slightly + more complicated than finding the first match. */ +4: rev p3.b, p3.b /* reverse the bits */ + brka p3.b, p1/z, p3.b /* find position of last match */ + decp x0, p3.b /* retard pointer to last match */ + ret + + /* No C whatsoever. Return NULL. */ +5: mov x0, 0 + ret + +END (__strrchr_aarch64_sve) + +#endif + diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S new file mode 100644 index 000000000000..56185ff534e3 --- /dev/null +++ b/string/aarch64/strrchr.S @@ -0,0 +1,149 @@ +/* + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include "../asmdefs.h" + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 +#define src_match x6 +#define src_offset x7 +#define const_m1 x8 +#define tmp4 x9 +#define nul_match x10 +#define chr_match x11 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +ENTRY (__strrchr_aarch64) + PTR_ARG (0) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the requested byte. Magic constant 0x80200802 used + similarly for NUL termination. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + mov src_offset, #0 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq L(aligned) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. 
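+
+   Illustrative sketch only (not from the original sources), assuming
+   <stdint.h>: with k = srcin & 31 leading padding bytes and two syndrome
+   bits per byte (LSB = lowest address), the masking amounts to:
+
+     static inline uint64_t mask_padding (uint64_t synd, unsigned k)
+     {
+       // Only reached when k != 0 (the fully aligned case branches away).
+       uint64_t pad = ~0ULL >> (64 - 2 * k);   // low 2*k bits = padding bytes
+       return synd & ~pad;                     // drop their syndrome bits
+     }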
*/ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + lsl tmp1, tmp1, #1 + mov const_m1, #~0 + lsr tmp3, const_m1, tmp1 + mov chr_match, vend1.d[1] + + bic nul_match, nul_match, tmp3 // Mask padding bits. + bic chr_match, chr_match, tmp3 // Mask padding bits. + cbnz nul_match, L(tail) + + .p2align 4 +L(loop): + cmp chr_match, #0 + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne +L(aligned): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + uminp vend1.16b, vdata1.16b, vdata2.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + cmeq vend1.16b, vend1.16b, 0 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vend1.d[1] + cbz nul_match, L(loop) + + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_nul2.16b, vdata2.16b, #0 + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b + mov nul_match, vhas_nul1.d[0] + +L(tail): + /* Work out exactly where the string ends. */ + sub tmp4, nul_match, #1 + eor tmp4, tmp4, nul_match + ands chr_match, chr_match, tmp4 + /* And pick the values corresponding to the last match. */ + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne + + /* Count down from the top of the syndrome to find the last match. */ + clz tmp3, src_offset + /* Src_match points beyond the word containing the match, so we can + simply subtract half the bit-offset into the syndrome. Because + we are counting down, we need to go back one more character. */ + add tmp3, tmp3, #2 + sub result, src_match, tmp3, lsr #1 + /* But if the syndrome shows no match was found, then return NULL. */ + cmp src_offset, #0 + csel result, result, xzr, ne + + ret + +END (__strrchr_aarch64) + diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S new file mode 100644 index 000000000000..1cff9345e343 --- /dev/null +++ b/string/arm/check-arch.S @@ -0,0 +1,10 @@ +/* + * check ARCH setting. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if !__arm__ +# error ARCH setting does not match the compiler. +#endif diff --git a/string/arm/memchr.S b/string/arm/memchr.S new file mode 100644 index 000000000000..3f1ac4df136f --- /dev/null +++ b/string/arm/memchr.S @@ -0,0 +1,132 @@ +/* + * memchr - scan memory for a character + * + * Copyright (c) 2010-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* + Written by Dave Gilbert <david.gilbert@linaro.org> + + This __memchr_arm routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. 
It has a fast past for short sizes, and has + an optimised path for large data sets; the worst case is finding the + match early in a large data set. + + */ + +@ 2011-02-07 david.gilbert@linaro.org +@ Extracted from local git a5b438d861 +@ 2011-07-14 david.gilbert@linaro.org +@ Import endianness fix from local git ea786f1b +@ 2011-12-07 david.gilbert@linaro.org +@ Removed unneeded cbz from align loop + + .syntax unified + .arch armv7-a + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + .thumb + +@ --------------------------------------------------------------------------- + .thumb_func + .align 2 + .p2align 4,,15 + .global __memchr_arm + .type __memchr_arm,%function +__memchr_arm: + @ r0 = start of memory to scan + @ r1 = character to look for + @ r2 = length + @ returns r0 = pointer to character or NULL if not found + and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char + + cmp r2,#16 @ If it's short don't bother with anything clever + blt 20f + + tst r0, #7 @ If it's already aligned skip the next bit + beq 10f + + @ Work up to an aligned point +5: + ldrb r3, [r0],#1 + subs r2, r2, #1 + cmp r3, r1 + beq 50f @ If it matches exit found + tst r0, #7 + bne 5b @ If not aligned yet then do next byte + +10: + @ At this point, we are aligned, we know we have at least 8 bytes to work with + push {r4,r5,r6,r7} + orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes + orr r1, r1, r1, lsl #16 + bic r4, r2, #7 @ Number of double words to work with + mvns r7, #0 @ all F's + movs r3, #0 + +15: + ldmia r0!,{r5,r6} + subs r4, r4, #8 + eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target + eor r6,r6, r1 + uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r5, r3, r7 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cbnz r6, 60f + bne 15b @ (Flags from the subs above) If not run out of bytes then go around again + + pop {r4,r5,r6,r7} + and r1,r1,#0xff @ Get r1 back to a single character from the expansion above + and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done + +20: + cbz r2, 40f @ 0 length or hit the end already then not found + +21: @ Post aligned section, or just a short call + ldrb r3,[r0],#1 + subs r2,r2,#1 + eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub + cbz r3, 50f + bne 21b @ on r2 flags + +40: + movs r0,#0 @ not found + bx lr + +50: + subs r0,r0,#1 @ found + bx lr + +60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was + @ r0 points to the start of the double word after the one that was tested + @ r5 has the 00/ff pattern for the first word, r6 has the chained value + cmp r5, #0 + itte eq + moveq r5, r6 @ the end is in the 2nd word + subeq r0,r0,#3 @ Points to 2nd byte of 2nd word + subne r0,r0,#7 @ or 2nd byte of 1st word + + @ r0 currently points to the 3rd byte of the word containing the hit + tst r5, # CHARTSTMASK(0) @ 1st character + bne 61f + adds r0,r0,#1 + tst r5, # CHARTSTMASK(1) @ 2nd character + ittt eq + addeq r0,r0,#1 + tsteq r5, # (3<<15) @ 2nd & 3rd character + @ If not the 3rd must be the last one + addeq r0,r0,#1 + +61: + pop {r4,r5,r6,r7} 
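+ @ r0 is one byte past the matching byte here; step back so we return its address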
+ subs r0,r0,#1 + bx lr + + .size __memchr_arm, . - __memchr_arm diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S new file mode 100644 index 000000000000..86e64938edb1 --- /dev/null +++ b/string/arm/memcpy.S @@ -0,0 +1,587 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* + This memcpy routine is optimised for Cortex-A15 cores and takes advantage + of VFP or NEON when built with the appropriate flags. + + Assumptions: + + ARMv6 (ARMv7-a if using Neon) + ARM state + Unaligned accesses + + */ + +#include "../asmdefs.h" + + .syntax unified + /* This implementation requires ARM state. */ + .arm + +#ifdef __ARM_NEON__ + + .fpu neon + .arch armv7-a +# define FRAME_SIZE 4 +# define USE_VFP +# define USE_NEON + +#elif !defined (__SOFTFP__) + + .arch armv6 + .fpu vfpv2 +# define FRAME_SIZE 32 +# define USE_VFP + +#else + .arch armv6 +# define FRAME_SIZE 32 + +#endif + +/* Old versions of GAS incorrectly implement the NEON align semantics. */ +#ifdef BROKEN_ASM_NEON_ALIGN +#define ALIGN(addr, align) addr,:align +#else +#define ALIGN(addr, align) addr:align +#endif + +#define PC_OFFSET 8 /* PC pipeline compensation. */ +#define INSN_SIZE 4 + +/* Call parameters. */ +#define dstin r0 +#define src r1 +#define count r2 + +/* Locals. */ +#define tmp1 r3 +#define dst ip +#define tmp2 r10 + +#ifndef USE_NEON +/* For bulk copies using GP registers. */ +#define A_l r2 /* Call-clobbered. */ +#define A_h r3 /* Call-clobbered. */ +#define B_l r4 +#define B_h r5 +#define C_l r6 +#define C_h r7 +#define D_l r8 +#define D_h r9 +#endif + +/* Number of lines ahead to pre-fetch data. If you change this the code + below will need adjustment to compensate. */ + +#define prefetch_lines 5 + +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm + + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif + +ENTRY (__memcpy_arm) + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bhs L(cpy_not_short) + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +L(tail63unaligned): +#ifdef USE_NEON + and tmp1, count, #0x38 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + vld1.8 {d0}, [src]! /* 14 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 12 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 10 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 8 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 6 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 4 words to go. 
*/ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 2 words to go. */ + vst1.8 {d0}, [dst]! + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) + /* Jump directly into the sequence below at the correct offset. */ + add pc, pc, tmp1, lsl #1 + + ldr tmp1, [src, #-60] /* 15 words to go. */ + str tmp1, [dst, #-60] + + ldr tmp1, [src, #-56] /* 14 words to go. */ + str tmp1, [dst, #-56] + ldr tmp1, [src, #-52] + str tmp1, [dst, #-52] + + ldr tmp1, [src, #-48] /* 12 words to go. */ + str tmp1, [dst, #-48] + ldr tmp1, [src, #-44] + str tmp1, [dst, #-44] + + ldr tmp1, [src, #-40] /* 10 words to go. */ + str tmp1, [dst, #-40] + ldr tmp1, [src, #-36] + str tmp1, [dst, #-36] + + ldr tmp1, [src, #-32] /* 8 words to go. */ + str tmp1, [dst, #-32] + ldr tmp1, [src, #-28] + str tmp1, [dst, #-28] + + ldr tmp1, [src, #-24] /* 6 words to go. */ + str tmp1, [dst, #-24] + ldr tmp1, [src, #-20] + str tmp1, [dst, #-20] + + ldr tmp1, [src, #-16] /* 4 words to go. */ + str tmp1, [dst, #-16] + ldr tmp1, [src, #-12] + str tmp1, [dst, #-12] + + ldr tmp1, [src, #-8] /* 2 words to go. */ + str tmp1, [dst, #-8] + ldr tmp1, [src, #-4] + str tmp1, [dst, #-4] +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +L(cpy_not_short): + /* At least 64 bytes to copy, but don't know the alignment yet. */ + str tmp2, [sp, #-FRAME_SIZE]! + and tmp2, src, #7 + and tmp1, dst, #7 + cmp tmp1, tmp2 + bne L(cpy_notaligned) + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 64-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring SRC and DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 + +1: + subs tmp2, count, #64 /* Use tmp2 for count. */ + blo L(tail63aligned) + + cmp tmp2, #512 + bhs L(cpy_body_long) + +L(cpy_body_medium): /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bhs 1b + tst tmp2, #0x3f + beq L(done) + +L(tail63aligned): /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + + vldr d0, [src, #-56] /* 14 words to go. */ + vstr d0, [dst, #-56] + vldr d0, [src, #-48] /* 12 words to go. */ + vstr d0, [dst, #-48] + vldr d0, [src, #-40] /* 10 words to go. */ + vstr d0, [dst, #-40] + vldr d0, [src, #-32] /* 8 words to go. */ + vstr d0, [dst, #-32] + vldr d0, [src, #-24] /* 6 words to go. 
*/ + vstr d0, [dst, #-24] + vldr d0, [src, #-16] /* 4 words to go. */ + vstr d0, [dst, #-16] + vldr d0, [src, #-8] /* 2 words to go. */ + vstr d0, [dst, #-8] +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bhs 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + bx lr +1: + add src, src, #8 + add dst, dst, #8 + +L(tail63aligned): /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 64-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ + strd A_l, A_h, [dst, #-56] + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ + strd A_l, A_h, [dst, #-48] + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ + strd A_l, A_h, [dst, #-40] + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ + strd A_l, A_h, [dst, #-32] + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ + strd A_l, A_h, [dst, #-24] + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ + strd A_l, A_h, [dst, #-16] + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ + strd A_l, A_h, [dst, #-8] + +#endif + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +L(done): + ldr tmp2, [sp], #FRAME_SIZE + bx lr + +L(cpy_body_long): /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. */ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blo 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bhs 1b + +2: + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b L(cpy_body_medium) +#else + /* Long copy. 
Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + tst tmp2, #0x3f + bne L(tail63aligned) + ldr tmp2, [sp], #FRAME_SIZE + bx lr +#endif + +L(cpy_notaligned): + pld [src] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. */ + lsls tmp2, dst, #29 + pld [src, #(2 * 64)] + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 +1: + pld [src, #(3 * 64)] + subs count, count, #64 + ldrlo tmp2, [sp], #FRAME_SIZE + blo L(tail63unaligned) + pld [src, #(4 * 64)] + +#ifdef USE_NEON + vld1.8 {d0-d3}, [src]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + blo 2f +1: + pld [src, #(4 * 64)] + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vld1.8 {d0-d3}, [src]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bhs 1b +2: + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + ands count, count, #0x3f +#else + /* Use an SMS style loop to maximize the I/O bandwidth. */ + sub src, src, #4 + sub dst, dst, #8 + subs tmp2, count, #64 /* Use tmp2 for count. */ + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [sp, #24] + ldr D_l, [src, #28] + ldr D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] + bcs 2b + + /* Save the remaining bytes and restore the callee-saved regs. 
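+
+   (Illustrative aside, not part of the original sources: the "SMS style"
+   loops in this file are software-pipelined -- each iteration stores the
+   chunk loaded by the previous one, so loads and stores overlap.  A plain
+   C sketch of that shape, assuming <stdint.h>/<string.h>, with
+   unsigned char *src, *dst and size_t len a non-zero multiple of 8:
+
+     uint64_t cur, next;
+     memcpy (&cur, src, 8);  src += 8;  len -= 8;   // prologue: load ahead
+     while (len)
+       {
+         memcpy (&next, src, 8);  src += 8;         // next load...
+         memcpy (dst, &cur, 8);   dst += 8;         // ...overlaps this store
+         cur = next;  len -= 8;
+       }
+     memcpy (dst, &cur, 8);                         // epilogue: final store
+
+   The real loops do the same with four register pairs per iteration plus
+   PLD hints or pre-loaded VFP registers.)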
*/ + strd A_l, A_h, [dst, #40] + add src, src, #36 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + ands count, tmp2, #0x3f +#endif + ldr tmp2, [sp], #FRAME_SIZE + bne L(tail63unaligned) + bx lr + +END (__memcpy_arm) diff --git a/string/arm/memset.S b/string/arm/memset.S new file mode 100644 index 000000000000..11e927368fd1 --- /dev/null +++ b/string/arm/memset.S @@ -0,0 +1,98 @@ +/* + * memset - fill memory with a constant + * + * Copyright (c) 2010-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* + Written by Dave Gilbert <david.gilbert@linaro.org> + + This memset routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. + + */ + + .syntax unified + .arch armv7-a + +@ 2011-08-30 david.gilbert@linaro.org +@ Extracted from local git 2f11b436 + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + .thumb + +@ --------------------------------------------------------------------------- + .thumb_func + .align 2 + .p2align 4,,15 + .global __memset_arm + .type __memset_arm,%function +__memset_arm: + @ r0 = address + @ r1 = character + @ r2 = count + @ returns original address in r0 + + mov r3, r0 @ Leave r0 alone + cbz r2, 10f @ Exit if 0 length + + tst r0, #7 + beq 2f @ Already aligned + + @ Ok, so we're misaligned here +1: + strb r1, [r3], #1 + subs r2,r2,#1 + tst r3, #7 + cbz r2, 10f @ Exit if we hit the end + bne 1b @ go round again if still misaligned + +2: + @ OK, so we're aligned + push {r4,r5,r6,r7} + bics r4, r2, #15 @ if less than 16 bytes then need to finish it off + beq 5f + +3: + @ POSIX says that ch is cast to an unsigned char. A uxtb is one + @ byte and takes two cycles, where an AND is four bytes but one + @ cycle. + and r1, #0xFF + orr r1, r1, r1, lsl#8 @ Same character into all bytes + orr r1, r1, r1, lsl#16 + mov r5,r1 + mov r6,r1 + mov r7,r1 + +4: + subs r4,r4,#16 + stmia r3!,{r1,r5,r6,r7} + bne 4b + and r2,r2,#15 + + @ At this point we're still aligned and we have upto align-1 bytes left to right + @ we can avoid some of the byte-at-a time now by testing for some big chunks + tst r2,#8 + itt ne + subne r2,r2,#8 + stmiane r3!,{r1,r5} + +5: + pop {r4,r5,r6,r7} + cbz r2, 10f + + @ Got to do any last < alignment bytes +6: + subs r2,r2,#1 + strb r1,[r3],#1 + bne 6b + +10: + bx lr @ goodbye + .size __memset_arm, . - __memset_arm diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S new file mode 100644 index 000000000000..b75d4143db57 --- /dev/null +++ b/string/arm/strcmp-armv6m.S @@ -0,0 +1,117 @@ +/* + * strcmp for ARMv6-M (optimized for performance, not size) + * + * Copyright (c) 2014-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 + + .thumb_func + .syntax unified + .arch armv6-m + + .macro DoSub n, label + subs r0, r0, r1 +#ifdef __ARM_BIG_ENDIAN + lsrs r1, r4, \n +#else + lsls r1, r4, \n +#endif + orrs r1, r0 + bne \label + .endm + + .macro Byte_Test n, label + lsrs r0, r2, \n + lsrs r1, r3, \n + DoSub \n, \label + .endm + +ENTRY_ALIGN (__strcmp_armv6m, 4) + mov r2, r0 + push {r4, r5, r6, lr} + orrs r2, r1 + lsls r2, r2, #30 + bne 6f + ldr r5, =0x01010101 + lsls r6, r5, #7 +1: + ldmia r0!, {r2} + ldmia r1!, {r3} + subs r4, r2, r5 + bics r4, r2 + ands r4, r6 + beq 3f + +#ifdef __ARM_BIG_ENDIAN + Byte_Test #24, 4f + Byte_Test #16, 4f + Byte_Test #8, 4f + + b 7f +3: + cmp r2, r3 + beq 1b + cmp r2, r3 +#else + uxtb r0, r2 + uxtb r1, r3 + DoSub #24, 2f + + uxth r0, r2 + uxth r1, r3 + DoSub #16, 2f + + lsls r0, r2, #8 + lsls r1, r3, #8 + lsrs r0, r0, #8 + lsrs r1, r1, #8 + DoSub #8, 2f + + lsrs r0, r2, #24 + lsrs r1, r3, #24 + subs r0, r0, r1 +2: + pop {r4, r5, r6, pc} + +3: + cmp r2, r3 + beq 1b + rev r0, r2 + rev r1, r3 + cmp r0, r1 +#endif + + bls 5f + movs r0, #1 +4: + pop {r4, r5, r6, pc} +5: + movs r0, #0 + mvns r0, r0 + pop {r4, r5, r6, pc} +6: + ldrb r2, [r0, #0] + ldrb r3, [r1, #0] + adds r0, #1 + adds r1, #1 + cmp r2, #0 + beq 7f + cmp r2, r3 + bne 7f + ldrb r2, [r0, #0] + ldrb r3, [r1, #0] + adds r0, #1 + adds r1, #1 + cmp r2, #0 + beq 7f + cmp r2, r3 + beq 6b +7: + subs r0, r2, r3 + pop {r4, r5, r6, pc} + +END (__strcmp_armv6m) + +#endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */ diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S new file mode 100644 index 000000000000..51443e343058 --- /dev/null +++ b/string/arm/strcmp.S @@ -0,0 +1,475 @@ +/* + * strcmp for ARMv7 + * + * Copyright (c) 2012-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 + +/* Implementation of strcmp for ARMv7 when DSP instructions are + available. Use ldrd to support wider loads, provided the data + is sufficiently aligned. Use saturating arithmetic to optimize + the compares. */ + +#include "../asmdefs.h" + +/* Build Options: + STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first + byte in the string. If comparing completely random strings + the pre-check will save time, since there is a very high + probability of a mismatch in the first character: we save + significant overhead if this is the common case. However, + if strings are likely to be identical (eg because we're + verifying a hit in a hash table), then this check is largely + redundant. */ + +#define STRCMP_NO_PRECHECK 0 + + /* This version uses Thumb-2 code. */ + .thumb + .syntax unified + +#ifdef __ARM_BIG_ENDIAN +#define S2LO lsl +#define S2LOEQ lsleq +#define S2HI lsr +#define MSB 0x000000ff +#define LSB 0xff000000 +#define BYTE0_OFFSET 24 +#define BYTE1_OFFSET 16 +#define BYTE2_OFFSET 8 +#define BYTE3_OFFSET 0 +#else /* not __ARM_BIG_ENDIAN */ +#define S2LO lsr +#define S2LOEQ lsreq +#define S2HI lsl +#define BYTE0_OFFSET 0 +#define BYTE1_OFFSET 8 +#define BYTE2_OFFSET 16 +#define BYTE3_OFFSET 24 +#define MSB 0xff000000 +#define LSB 0x000000ff +#endif /* not __ARM_BIG_ENDIAN */ + +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define tmp1 r4 +#define tmp2 r5 +#define const_m1 r12 + +/* Additional internal variables for 64-bit aligned data. 
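+
+   Illustrative aside, not part of the original sources: the saturating
+   "uadd8 + sel" compare used in the loops below can be modelled in C
+   (assuming <stdint.h>; the helper name is made up):
+
+     // Per byte: where data1's byte is non-zero keep data1 ^ data2,
+     // otherwise force 0xff.  Zero result <=> words equal and NUL-free.
+     static inline uint32_t cmp_syndrome (uint32_t data1, uint32_t data2)
+     {
+       uint32_t synd = 0;
+       for (int i = 0; i < 4; i++)
+         {
+           uint32_t b1 = (data1 >> (8 * i)) & 0xff;
+           uint32_t b2 = (data2 >> (8 * i)) & 0xff;
+           uint32_t s = b1 ? (b1 ^ b2) : 0xff;   // GE bits from uadd8 drive sel
+           synd |= s << (8 * i);
+         }
+       return synd;
+     }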
*/ +#define data1a r2 +#define data1b r3 +#define data2a r6 +#define data2b r7 +#define syndrome_a tmp1 +#define syndrome_b tmp2 + +/* Additional internal variables for 32-bit aligned data. */ +#define data1 r2 +#define data2 r3 +#define syndrome tmp2 + + + /* Macro to compute and return the result value for word-aligned + cases. */ + .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 +#ifdef __ARM_BIG_ENDIAN + /* If data1 contains a zero byte, then syndrome will contain a 1 in + bit 7 of that byte. Otherwise, the highest set bit in the + syndrome will highlight the first different bit. It is therefore + sufficient to extract the eight bits starting with the syndrome + bit. */ + clz tmp1, \synd + lsl r1, \d2, tmp1 + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsl \d1, \d1, tmp1 + .cfi_remember_state + lsr result, \d1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1, lsr #24 + bx lr +#else + /* To use the big-endian trick we'd have to reverse all three words. + that's slower than this approach. */ + rev \synd, \synd + clz tmp1, \synd + bic tmp1, tmp1, #7 + lsr r1, \d2, tmp1 + .cfi_remember_state + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsr \d1, \d1, tmp1 + and result, \d1, #255 + and r1, r1, #255 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1 + + bx lr +#endif + .endm + + .p2align 5 +L(strcmp_start_addr): +#if STRCMP_NO_PRECHECK == 0 +L(fastpath_exit): + sub r0, r2, r3 + bx lr + nop +#endif +ENTRY_ALIGN (__strcmp_arm, 0) +#if STRCMP_NO_PRECHECK == 0 + ldrb r2, [src1] + ldrb r3, [src2] + cmp r2, #1 + it cs + cmpcs r2, r3 + bne L(fastpath_exit) +#endif + strd r4, r5, [sp, #-16]! + .cfi_def_cfa_offset 16 + .cfi_offset 4, -16 + .cfi_offset 5, -12 + orr tmp1, src1, src2 + strd r6, r7, [sp, #8] + .cfi_offset 6, -8 + .cfi_offset 7, -4 + mvn const_m1, #0 + lsl r2, tmp1, #29 + cbz r2, L(loop_aligned8) + +L(not_aligned): + eor tmp1, src1, src2 + tst tmp1, #7 + bne L(misaligned8) + + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + and tmp1, src1, #7 + bic src1, src1, #7 + and tmp2, tmp1, #3 + bic src2, src2, #7 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + ldrd data1a, data1b, [src1], #16 + tst tmp1, #4 + ldrd data2a, data2b, [src2], #16 + /* In thumb code we can't use MVN with a register shift, but + we do have ORN. */ + S2HI tmp1, const_m1, tmp2 + orn data1a, data1a, tmp1 + orn data2a, data2a, tmp1 + beq L(start_realigned8) + orn data1b, data1b, tmp1 + mov data1a, const_m1 + orn data2b, data2b, tmp1 + mov data2a, const_m1 + b L(start_realigned8) + + /* Unwind the inner loop by a factor of 2, giving 16 bytes per + pass. */ + .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ + .p2align 2 /* Always word aligned. */ +L(loop_aligned8): + ldrd data1a, data1b, [src1], #16 + ldrd data2a, data2b, [src2], #16 +L(start_realigned8): + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + cbnz syndrome_a, L(diff_in_a) + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. 
*/ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + cbnz syndrome_b, L(diff_in_b) + + ldrd data1a, data1b, [src1, #-8] + ldrd data2a, data2b, [src2, #-8] + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + /* Can't use CBZ for backwards branch. */ + orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ + beq L(loop_aligned8) + +L(diff_found): + cbnz syndrome_a, L(diff_in_a) + +L(diff_in_b): + strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 + +L(diff_in_a): + .cfi_restore_state + strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 + + .cfi_restore_state +L(misaligned8): + tst tmp1, #3 + bne L(misaligned4) + ands tmp1, src1, #3 + bne L(mutual_align4) + + /* Unrolled by a factor of 2, to reduce the number of post-increment + operations. */ +L(loop_aligned4): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned4): + uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cbnz syndrome, L(aligned4_done) + ldr data1, [src1, #-4] + ldr data2, [src2, #-4] + uadd8 syndrome, data1, const_m1 + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cmp syndrome, #0 + beq L(loop_aligned4) + +L(aligned4_done): + strcmp_epilogue_aligned syndrome, data1, data2, 0 + +L(mutual_align4): + .cfi_restore_state + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + lsl tmp1, tmp1, #3 /* Bytes -> bits. */ + bic src1, src1, #3 + ldr data1, [src1], #8 + bic src2, src2, #3 + ldr data2, [src2], #8 + + /* In thumb code we can't use MVN with a register shift, but + we do have ORN. */ + S2HI tmp1, const_m1, tmp1 + orn data1, data1, tmp1 + orn data2, data2, tmp1 + b L(start_realigned4) + +L(misaligned4): + ands tmp1, src1, #3 + beq L(src1_aligned) + sub src2, src2, tmp1 + bic src1, src1, #3 + lsls tmp1, tmp1, #31 + ldr data1, [src1], #4 + beq L(aligned_m2) + bcs L(aligned_m1) + +#if STRCMP_NO_PRECHECK == 1 + ldrb data2, [src2, #1] + uxtb tmp1, data1, ror #BYTE1_OFFSET + subs tmp1, tmp1, data2 + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) + +L(aligned_m2): + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) + +L(aligned_m1): + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne L(misaligned_exit) + add src2, src2, #4 + cbnz data2, L(src1_aligned) +#else /* STRCMP_NO_PRECHECK */ + /* If we've done the pre-check, then we don't need to check the + first byte again here. */ + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) + +L(aligned_m2): + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne L(misaligned_exit) + cbnz data2, L(aligned_m1) +#endif + +L(misaligned_exit): + .cfi_remember_state + mov result, tmp1 + ldr r4, [sp], #16 + .cfi_restore 4 + bx lr + +#if STRCMP_NO_PRECHECK == 0 +L(aligned_m1): + add src2, src2, #4 +#endif +L(src1_aligned): + .cfi_restore_state + /* src1 is word aligned, but src2 has no common alignment + with it. */ + ldr data1, [src1], #4 + lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. 
*/ + + bic src2, src2, #3 + ldr data2, [src2], #4 + bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ + + /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ +L(overlap3): + bic tmp1, data1, #MSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #8 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #24 + bne 6f + ldr data1, [src1], #4 + b L(overlap3) +4: + S2LO data2, data2, #8 + b L(strcmp_tail) + +5: + bics syndrome, syndrome, #MSB + bne L(strcmp_done_equal) + + /* We can only get here if the MSB of data1 contains 0, so + fast-path the exit. */ + ldrb result, [src2] + .cfi_remember_state + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 Not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + neg result, result + bx lr + +6: + .cfi_restore_state + S2LO data1, data1, #24 + and data2, data2, #LSB + b L(strcmp_tail) + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ +L(overlap2): + and tmp1, data1, const_m1, S2LO #16 + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #16 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #16 + bne 6f + ldr data1, [src1], #4 + b L(overlap2) +4: + S2LO data2, data2, #16 + b L(strcmp_tail) +5: + ands syndrome, syndrome, const_m1, S2LO #16 + bne L(strcmp_done_equal) + + ldrh data2, [src2] + S2LO data1, data1, #16 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #16 +#endif + b L(strcmp_tail) + +6: + S2LO data1, data1, #16 + and data2, data2, const_m1, S2LO #16 + b L(strcmp_tail) + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ +L(overlap1): + and tmp1, data1, #LSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #24 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #8 + bne 6f + ldr data1, [src1], #4 + b L(overlap1) +4: + S2LO data2, data2, #24 + b L(strcmp_tail) +5: + tst syndrome, #LSB + bne L(strcmp_done_equal) + ldr data2, [src2] +6: + S2LO data1, data1, #8 + bic data2, data2, #MSB + b L(strcmp_tail) + +L(strcmp_done_equal): + mov result, #0 + .cfi_remember_state + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + bx lr + +L(strcmp_tail): + .cfi_restore_state +#ifndef __ARM_BIG_ENDIAN + rev data1, data1 + rev data2, data2 + /* Now everything looks big-endian... */ +#endif + uadd8 tmp1, data1, const_m1 + eor tmp1, data1, data2 + sel syndrome, tmp1, const_m1 + clz tmp1, syndrome + lsl data1, data1, tmp1 + lsl data2, data2, tmp1 + lsr result, data1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + sub result, result, data2, lsr #24 + bx lr + +END (__strcmp_arm) + +#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */ diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c new file mode 100644 index 000000000000..02cf94ff4be0 --- /dev/null +++ b/string/arm/strcpy.c @@ -0,0 +1,133 @@ +/* + * strcpy + * + * Copyright (c) 2008-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#if defined (__thumb2__) && !defined (__thumb__) + +/* For GLIBC: +#include <string.h> +#include <memcopy.h> + +#undef strcmp +*/ + +#ifdef __thumb2__ +#define magic1(REG) "#0x01010101" +#define magic2(REG) "#0x80808080" +#else +#define magic1(REG) #REG +#define magic2(REG) #REG ", lsl #7" +#endif + +char* __attribute__((naked)) +__strcpy_arm (char* dst, const char* src) +{ + __asm__ ( + "pld [r1, #0]\n\t" + "eor r2, r0, r1\n\t" + "mov ip, r0\n\t" + "tst r2, #3\n\t" + "bne 4f\n\t" + "tst r1, #3\n\t" + "bne 3f\n" + "5:\n\t" +# ifndef __thumb2__ + "str r5, [sp, #-4]!\n\t" + "mov r5, #0x01\n\t" + "orr r5, r5, r5, lsl #8\n\t" + "orr r5, r5, r5, lsl #16\n\t" +# endif + + "str r4, [sp, #-4]!\n\t" + "tst r1, #4\n\t" + "ldr r3, [r1], #4\n\t" + "beq 2f\n\t" + "sub r2, r3, "magic1(r5)"\n\t" + "bics r2, r2, r3\n\t" + "tst r2, "magic2(r5)"\n\t" + "itt eq\n\t" + "streq r3, [ip], #4\n\t" + "ldreq r3, [r1], #4\n" + "bne 1f\n\t" + /* Inner loop. We now know that r1 is 64-bit aligned, so we + can safely fetch up to two words. This allows us to avoid + load stalls. */ + ".p2align 2\n" + "2:\n\t" + "pld [r1, #8]\n\t" + "ldr r4, [r1], #4\n\t" + "sub r2, r3, "magic1(r5)"\n\t" + "bics r2, r2, r3\n\t" + "tst r2, "magic2(r5)"\n\t" + "sub r2, r4, "magic1(r5)"\n\t" + "bne 1f\n\t" + "str r3, [ip], #4\n\t" + "bics r2, r2, r4\n\t" + "tst r2, "magic2(r5)"\n\t" + "itt eq\n\t" + "ldreq r3, [r1], #4\n\t" + "streq r4, [ip], #4\n\t" + "beq 2b\n\t" + "mov r3, r4\n" + "1:\n\t" +# ifdef __ARMEB__ + "rors r3, r3, #24\n\t" +# endif + "strb r3, [ip], #1\n\t" + "tst r3, #0xff\n\t" +# ifdef __ARMEL__ + "ror r3, r3, #8\n\t" +# endif + "bne 1b\n\t" + "ldr r4, [sp], #4\n\t" +# ifndef __thumb2__ + "ldr r5, [sp], #4\n\t" +# endif + "BX LR\n" + + /* Strings have the same offset from word alignment, but it's + not zero. */ + "3:\n\t" + "tst r1, #1\n\t" + "beq 1f\n\t" + "ldrb r2, [r1], #1\n\t" + "strb r2, [ip], #1\n\t" + "cmp r2, #0\n\t" + "it eq\n" + "BXEQ LR\n" + "1:\n\t" + "tst r1, #2\n\t" + "beq 5b\n\t" + "ldrh r2, [r1], #2\n\t" +# ifdef __ARMEB__ + "tst r2, #0xff00\n\t" + "iteet ne\n\t" + "strneh r2, [ip], #2\n\t" + "lsreq r2, r2, #8\n\t" + "streqb r2, [ip]\n\t" + "tstne r2, #0xff\n\t" +# else + "tst r2, #0xff\n\t" + "itet ne\n\t" + "strneh r2, [ip], #2\n\t" + "streqb r2, [ip]\n\t" + "tstne r2, #0xff00\n\t" +# endif + "bne 5b\n\t" + "BX LR\n" + + /* src and dst do not have a common word-alignement. Fall back to + byte copying. */ + "4:\n\t" + "ldrb r2, [r1], #1\n\t" + "strb r2, [ip], #1\n\t" + "cmp r2, #0\n\t" + "bne 4b\n\t" + "BX LR"); +} +/* For GLIBC: libc_hidden_builtin_def (strcpy) */ + +#endif /* defined (__thumb2__) && !defined (__thumb__) */ diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S new file mode 100644 index 000000000000..5ad30c941586 --- /dev/null +++ b/string/arm/strlen-armv6t2.S @@ -0,0 +1,124 @@ +/* + * strlen - calculate the length of a string + * + * Copyright (c) 2010-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + +/* + Assumes: + ARMv6T2, AArch32 + + */ + +#include "../asmdefs.h" + +#ifdef __ARMEB__ +#define S2LO lsl +#define S2HI lsr +#else +#define S2LO lsr +#define S2HI lsl +#endif + + /* This code requires Thumb. */ + .thumb + .syntax unified + +/* Parameters and result. */ +#define srcin r0 +#define result r0 + +/* Internal variables. 
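+
+   (Illustrative aside, not part of the original sources: the preceding
+   strcpy detects a NUL byte in a word with the classic 0x01010101 /
+   0x80808080 trick, while this file gets the same per-byte answer from
+   uadd8/sel.  In C, assuming <stdint.h>, the strcpy form is:
+
+     // Non-zero iff some byte of x is 0x00.
+     static inline uint32_t has_nul (uint32_t x)
+     {
+       return (x - 0x01010101u) & ~x & 0x80808080u;
+     }
+   )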
*/ +#define src r1 +#define data1a r2 +#define data1b r3 +#define const_m1 r12 +#define const_0 r4 +#define tmp1 r4 /* Overlaps const_0 */ +#define tmp2 r5 + +ENTRY (__strlen_armv6t2) + pld [srcin, #0] + strd r4, r5, [sp, #-8]! + bic src, srcin, #7 + mvn const_m1, #0 + ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ + pld [src, #32] + bne.w L(misaligned8) + mov const_0, #0 + mov result, #-8 +L(loop_aligned): + /* Bytes 0-7. */ + ldrd data1a, data1b, [src] + pld [src, #64] + add result, result, #8 +L(start_realigned): + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cbnz data1b, L(null_found) + + /* Bytes 8-15. */ + ldrd data1a, data1b, [src, #8] + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + add result, result, #8 + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cbnz data1b, L(null_found) + + /* Bytes 16-23. */ + ldrd data1a, data1b, [src, #16] + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + add result, result, #8 + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cbnz data1b, L(null_found) + + /* Bytes 24-31. */ + ldrd data1a, data1b, [src, #24] + add src, src, #32 + uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ + add result, result, #8 + sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ + uadd8 data1b, data1b, const_m1 + sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ + cmp data1b, #0 + beq L(loop_aligned) + +L(null_found): + cmp data1a, #0 + itt eq + addeq result, result, #4 + moveq data1a, data1b +#ifndef __ARMEB__ + rev data1a, data1a +#endif + clz data1a, data1a + ldrd r4, r5, [sp], #8 + add result, result, data1a, lsr #3 /* Bits -> Bytes. */ + bx lr + +L(misaligned8): + ldrd data1a, data1b, [src] + and tmp2, tmp1, #3 + rsb result, tmp1, #0 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + tst tmp1, #4 + pld [src, #64] + S2HI tmp2, const_m1, tmp2 + orn data1a, data1a, tmp2 + itt ne + ornne data1b, data1b, tmp2 + movne data1a, const_m1 + mov const_0, #0 + b L(start_realigned) + +END (__strlen_armv6t2) + +#endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */ diff --git a/string/asmdefs.h b/string/asmdefs.h new file mode 100644 index 000000000000..340b427a505b --- /dev/null +++ b/string/asmdefs.h @@ -0,0 +1,98 @@ +/* + * Macros for asm code. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +#if defined(__aarch64__) + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. 
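+   It defaults to 1 below, so a build can opt out by pre-defining it
+   (e.g. passing -DWANT_GNU_PROPERTY=0 to the compiler); the emitted
+   note can be inspected with readelf -n on the resulting object.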
*/ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#else + +#define END_FILE + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; + +#endif + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +#endif diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c new file mode 100644 index 000000000000..d5d4ea7e0309 --- /dev/null +++ b/string/bench/memcpy.c @@ -0,0 +1,260 @@ +/* + * memcpy benchmark. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 500000 +#define MAX_COPIES 8192 +#define SIZE (256*1024) + +static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun)(void *, const void *, size_t); +} funtab[] = +{ + F(memcpy) +#if __aarch64__ + F(__memcpy_aarch64) +# if __ARM_NEON + F(__memcpy_aarch64_simd) +# endif +#elif __arm__ + F(__memcpy_arm) +#endif +#undef F + {0, 0} +}; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t size_arr[SIZE_NUM]; + +/* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. 
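+
+   Each {size, freq} entry is expanded into freq copies of that size in
+   size_arr by init_copy_distribution (the frequencies must sum to
+   SIZE_NUM, which it asserts), so the draw further below follows the
+   profiled distribution:
+
+     copy[i].len = size_arr[rand32 (0) & SIZE_MASK];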
*/ +static freq_data_t size_freq[] = +{ +{32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035}, +{ 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721}, +{120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460}, +{ 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303}, +{ 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185}, +{192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96}, +{104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68}, +{ 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47}, +{ 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35}, +{ 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22}, +{ 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15}, +{ 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11}, +{ 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9}, +{136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8}, +{273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6}, +{504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3}, +{512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2}, +{ 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2}, +{ 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1}, +{248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1}, +{ 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1}, +{ 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1}, +{ 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1}, +{2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1}, +{ 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1}, +{122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1}, +{984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1}, +{116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1}, +{100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1}, +{488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1}, +{ 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM-1) +static uint8_t src_align_arr[ALIGN_NUM]; +static uint8_t dst_align_arr[ALIGN_NUM]; + +/* Source alignment frequency for memcpy based on SPEC2017. */ +static align_data_t src_align_freq[] = +{ + {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0} +}; + +static align_data_t dst_align_freq[] = +{ + {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0} +}; + +typedef struct +{ + uint64_t src : 24; + uint64_t dst : 24; + uint64_t len : 16; +} copy_t; + +static copy_t copy[MAX_COPIES]; + +typedef char *(*proto_t) (char *, const char *, size_t); + +static void +init_copy_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = size_freq[i].freq) != 0; i++) + for (j = 0, size = size_freq[i].size; j < freq; j++) + size_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++) + for (j = 0, size = src_align_freq[i].align; j < freq; j++) + src_align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); + + for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++) + for (j = 0, size = dst_align_freq[i].align; j < freq; j++) + dst_align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_copies (size_t max_size) +{ + size_t total = 0; + /* Create a random set of copies with the given size and alignment + distributions. 
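+   The *_align_arr tables hold (alignment - 1), so the "&= ~..." lines
+   below align each randomly drawn offset down to the drawn alignment,
+   e.g. offset 0x1235 with a drawn alignment of 8 (mask 7) becomes 0x1230.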
*/ + for (int i = 0; i < MAX_COPIES; i++) + { + copy[i].dst = (rand32 (0) & (max_size - 1)); + copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].src = (rand32 (0) & (max_size - 1)); + copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += copy[i].len; + } + + return total; +} + +int main (void) +{ + init_copy_distribution (); + + memset (a, 1, sizeof (a)); + memset (b, 2, sizeof (b)); + + printf("Random memcpy:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t total = 0; + uint64_t tsum = 0; + printf ("%22s (B/ns) ", funtab[f].name); + rand32 (0x12345678); + + for (int size = 16384; size <= SIZE; size *= 2) + { + size_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %.2f\n", (double)total / tsum); + } + + printf ("\nMedium memcpy:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 16; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (b, a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nLarge memcpy:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (b, a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("\nUnaligned forwards memmove:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, a + 256 + (i & 31), size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + + printf ("\nUnaligned backwards memmove:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a + 256 + (i & 31), a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + return 0; +} diff --git a/string/bench/strlen.c b/string/bench/strlen.c new file mode 100644 index 000000000000..cc0f04bee547 --- /dev/null +++ b/string/bench/strlen.c @@ -0,0 +1,221 @@ +/* + * strlen benchmark. + * + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 2000 +#define ITERS2 20000000 +#define ITERS3 2000000 +#define NUM_STRLEN 16384 + +#define MAX_ALIGN 32 +#define MAX_STRLEN 256 + +static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + size_t (*fun) (const char *s); + int test_mte; +} funtab[] = { + // clang-format off + F(strlen, 0) +#if __aarch64__ + F(__strlen_aarch64, 0) + F(__strlen_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strlen_aarch64_sve, 1) +# endif +#elif __arm__ +# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + F(__strlen_armv6t2, 0) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +static uint16_t strlen_tests[NUM_STRLEN]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM - 1) +static uint8_t strlen_len_arr[SIZE_NUM]; + +/* Frequency data for strlen sizes up to 128 based on SPEC2017. */ +static freq_data_t strlen_len_freq[] = +{ + { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115}, + { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418}, + { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79}, + { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21}, + { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9}, + { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5}, + { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2}, + { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1}, + { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1}, + { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1}, + { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1}, + {107, 1}, { 0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM - 1) +static uint8_t strlen_align_arr[ALIGN_NUM]; + +/* Alignment data for strlen based on SPEC2017. */ +static align_data_t string_align_freq[] = +{ + {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0} +}; + +static void +init_strlen_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++) + for (j = 0, size = strlen_len_freq[i].size; j < freq; j++) + strlen_len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++) + for (j = 0, size = string_align_freq[i].align; j < freq; j++) + strlen_align_arr[n++] = size; + assert (n == ALIGN_NUM); +} + +static void +init_strlen_tests (void) +{ + uint16_t index[MAX_ALIGN]; + + memset (a, 'x', sizeof (a)); + + /* Create indices for strings at all alignments. */ + for (int i = 0; i < MAX_ALIGN; i++) + { + index[i] = i * (MAX_STRLEN + 1); + a[index[i] + MAX_STRLEN] = 0; + } + + /* Create a random set of strlen input strings using the string length + and alignment distributions. 
*/ + for (int n = 0; n < NUM_STRLEN; n++) + { + int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; + int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; + + strlen_tests[n] = + index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len; + } +} + +static volatile size_t maskv = 0; + +int main (void) +{ + rand32 (0x12345678); + init_strlen_distribution (); + init_strlen_tests (); + + printf ("\nRandom strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t res = 0, strlen_size = 0, mask = maskv; + printf ("%22s ", funtab[f].name); + + for (int c = 0; c < NUM_STRLEN; c++) + strlen_size += funtab[f].fun (a + strlen_tests[c]); + strlen_size *= ITERS; + + /* Measure latency of strlen result with (res & mask). */ + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_STRLEN; c++) + res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); + t = clock_get_ns () - t; + printf ("%.2f\n", (double)strlen_size / t); + } + + printf ("\nSmall aligned strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1; size <= 64; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nSmall unaligned strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + int align = 9; + for (int size = 1; size <= 64; size *= 2) + { + memset (a + align, 'x', size); + a[align + size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a + align); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nMedium strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 128; size <= 4096; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("\n"); + + return 0; +} diff --git a/string/include/benchlib.h b/string/include/benchlib.h new file mode 100644 index 000000000000..0f2ce2eb6bce --- /dev/null +++ b/string/include/benchlib.h @@ -0,0 +1,33 @@ +/* + * Benchmark support functions. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <time.h> + +/* Fast and accurate timer returning nanoseconds. */ +static inline uint64_t +clock_get_ns (void) +{ + struct timespec ts; + clock_gettime (CLOCK_MONOTONIC, &ts); + return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; +} + +/* Fast 32-bit random number generator. Passing a non-zero seed + value resets the internal state. 
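The "Random strlen" loop above measures latency rather than peak throughput: each call's starting address depends on the previous result through (res & mask), and because mask is loaded from a volatile the compiler cannot prove it is zero and break the dependency chain, so successive calls cannot overlap. A stripped-down sketch of the same idiom (do_work is a stand-in, not part of the benchmark):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static volatile size_t maskv = 0; /* reads as 0 at run time, opaque to the optimiser */

/* Stand-in for the routine being measured. */
static size_t
do_work (const char *p)
{
  return (size_t) (uintptr_t) p & 0xff;
}

int
main (void)
{
  static char buf[4096];
  size_t res = 0, mask = maskv;

  /* Each iteration's input depends on the previous result, serialising the calls. */
  for (int i = 0; i < 1000000; i++)
    res = do_work (buf + (res & mask));

  printf ("%zu\n", res); /* keep the result live */
  return 0;
}

Dropping the (res & mask) term turns the same loop into a throughput measurement, since independent calls can then be pipelined.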
*/ +static inline uint32_t +rand32 (uint32_t seed) +{ + static uint64_t state = 0xb707be451df0bb19ULL; + if (seed != 0) + state = seed; + uint32_t res = state >> 32; + state = state * 6364136223846793005ULL + 1; + return res; +} + + diff --git a/string/include/stringlib.h b/string/include/stringlib.h new file mode 100644 index 000000000000..378c3cd2d645 --- /dev/null +++ b/string/include/stringlib.h @@ -0,0 +1,69 @@ +/* + * Public API. + * + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stddef.h> + +/* restrict is not needed, but kept for documenting the interface contract. */ +#ifndef __restrict +# define __restrict +#endif + +#if __aarch64__ +void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64 (void *, const void *, size_t); +void *__memset_aarch64 (void *, int, size_t); +void *__memchr_aarch64 (const void *, int, size_t); +void *__memrchr_aarch64 (const void *, int, size_t); +int __memcmp_aarch64 (const void *, const void *, size_t); +char *__strcpy_aarch64 (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64 (char *__restrict, const char *__restrict); +int __strcmp_aarch64 (const char *, const char *); +char *__strchr_aarch64 (const char *, int); +char *__strrchr_aarch64 (const char *, int); +char *__strchrnul_aarch64 (const char *, int ); +size_t __strlen_aarch64 (const char *); +size_t __strnlen_aarch64 (const char *, size_t); +int __strncmp_aarch64 (const char *, const char *, size_t); +void * __memchr_aarch64_mte (const void *, int, size_t); +char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); +char *__strchr_aarch64_mte (const char *, int); +char * __strchrnul_aarch64_mte (const char *, int ); +size_t __strlen_aarch64_mte (const char *); +char *__strrchr_aarch64_mte (const char *, int); +int __strcmp_aarch64_mte (const char *, const char *); +int __strncmp_aarch64_mte (const char *, const char *, size_t); +#if __ARM_NEON +void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_simd (void *, const void *, size_t); +#endif +# if __ARM_FEATURE_SVE +void *__memchr_aarch64_sve (const void *, int, size_t); +int __memcmp_aarch64_sve (const void *, const void *, size_t); +char *__strchr_aarch64_sve (const char *, int); +char *__strrchr_aarch64_sve (const char *, int); +char *__strchrnul_aarch64_sve (const char *, int ); +int __strcmp_aarch64_sve (const char *, const char *); +char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict); +size_t __strlen_aarch64_sve (const char *); +size_t __strnlen_aarch64_sve (const char *, size_t); +int __strncmp_aarch64_sve (const char *, const char *, size_t); +# endif +# if __ARM_FEATURE_MEMORY_TAGGING +void *__mtag_tag_region (void *, size_t); +void *__mtag_tag_zero_region (void *, size_t); +# endif +#elif __arm__ +void *__memcpy_arm (void *__restrict, const void *__restrict, size_t); +void *__memset_arm (void *, int, size_t); +void *__memchr_arm (const void *, int, size_t); +char *__strcpy_arm (char *__restrict, const char *__restrict); +int __strcmp_arm (const char *, const char *); +int __strcmp_armv6m (const char *, const char *); +size_t __strlen_armv6t2 (const char *); +#endif diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c new file mode 100644 index 000000000000..d8c02d92d626 --- /dev/null 
+++ b/string/test/__mtag_tag_region.c @@ -0,0 +1,147 @@ +/* + * __mtag_tag_region test. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +static void +mtag_quoteat (const char *prefix, void *p, int len, int at) +{ + /* Print tag, untag and quote the context. */ + printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); + untag_buffer (p, len, 1); + p = untag_pointer (p); + quoteat (prefix, p, len, at); +} + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, size_t n); +} funtab[] = { +// clang-format off +#if __aarch64__ + F(__mtag_tag_region) +#endif + {0, 0} + // clang-format on +}; +#undef F + +#define A 64 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + 2 * A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a'; + + src = tag_buffer (src, len + 2 * A, 1); + s = src + salign; + /* Use different tag. */ + s = __arm_mte_increment_tag (s, 1); + p = fun->fun (s, len); + + if (p != s) + ERR ("%s(%p,..) returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got head", src, len + 2 * A, i); + return; + } + } + + for (; i < salign + len; i++) + { + if (s[i - salign] != 'a') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got body", src, len + 2 * A, i); + return; + } + } + + for (; i < len + 2 * A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got tail", src, len + 2 * A, i); + return; + } + } + + untag_buffer (src, len + 2 * A, 1); +} + +int +main () +{ + if (!mte_enabled ()) + return 0; + + sbuf = mte_mmap (LEN + 3 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s += 16) + { + int n; + for (n = 0; n < 200; n += 16) + { + test (funtab + i, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, n); + } + } + printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); + if (err_count) + r = -1; + } + return r; +} +#else +int +main () +{ + return 0; +} +#endif diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c new file mode 100644 index 000000000000..221c223a2f31 --- /dev/null +++ b/string/test/__mtag_tag_zero_region.c @@ -0,0 +1,147 @@ +/* + * __mtag_tag_zero_region test. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +static void +mtag_quoteat (const char *prefix, void *p, int len, int at) +{ + /* Print tag, untag and quote the context. 
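The test above follows a pattern reused by the other region tests: surround the target with '?' guard bytes, give the target a different MTE tag than its neighbours, run the routine, then verify the head guard, the body and the tail guard separately, so both data corruption and writes outside the tagged granules are caught. A minimal, MTE-free sketch of the same head/body/tail check for a hypothetical fill routine:

#include <stdio.h>
#include <string.h>

#define GUARD 16

/* Hypothetical routine under test: fill len bytes with 'x'. */
static void
fill_x (unsigned char *p, int len)
{
  memset (p, 'x', len);
}

static int
check (int len)
{
  unsigned char buf[GUARD + 256 + GUARD];
  unsigned char *s = buf + GUARD;

  memset (buf, '?', sizeof (buf)); /* guards on both sides */
  fill_x (s, len);

  for (int i = 0; i < GUARD; i++) /* head untouched */
    if (buf[i] != '?') return 1;
  for (int i = 0; i < len; i++) /* body transformed */
    if (s[i] != 'x') return 1;
  for (int i = GUARD + len; i < (int) sizeof (buf); i++) /* tail untouched */
    if (buf[i] != '?') return 1;
  return 0;
}

int
main (void)
{
  for (int len = 0; len <= 256; len++)
    if (check (len))
      {
        printf ("FAIL len %d\n", len);
        return 1;
      }
  printf ("PASS\n");
  return 0;
}

The real test additionally maps the buffer with PROT_MTE and gives the target a tag one higher than its surroundings, so an out-of-bounds access faults synchronously instead of merely corrupting a guard byte.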
*/ + printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); + untag_buffer (p, len, 1); + p = untag_pointer (p); + quoteat (prefix, p, len, at); +} + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, size_t n); +} funtab[] = { +// clang-format off +#if __aarch64__ + F(__mtag_tag_zero_region) +#endif + {0, 0} + // clang-format on +}; +#undef F + +#define A 64 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + 2 * A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a' + i % 23; + + src = tag_buffer (src, len + 2 * A, 1); + s = src + salign; + /* Use different tag. */ + s = __arm_mte_increment_tag (s, 1); + p = fun->fun (s, len); + + if (p != s) + ERR ("%s(%p,..) returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got head", src, len + 2 * A, i); + return; + } + } + + for (; i < salign + len; i++) + { + if (s[i - salign] != 0) + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got body", src, len + 2 * A, i); + return; + } + } + + for (; i < len + 2 * A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got tail", src, len + 2 * A, i); + return; + } + } + + untag_buffer (src, len + 2 * A, 1); +} + +int +main () +{ + if (!mte_enabled ()) + return 0; + + sbuf = mte_mmap (LEN + 3 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s += 16) + { + int n; + for (n = 0; n < 200; n += 16) + { + test (funtab + i, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, n); + } + } + printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); + if (err_count) + r = -1; + } + return r; +} +#else +int +main () +{ + return 0; +} +#endif diff --git a/string/test/memchr.c b/string/test/memchr.c new file mode 100644 index 000000000000..0ff77f5710bf --- /dev/null +++ b/string/test/memchr.c @@ -0,0 +1,110 @@ +/* + * memchr test. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + void *(*fun) (const void *s, int c, size_t n); + int test_mte; +} funtab[] = { + // clang-format off + F(memchr, 0) +#if __aarch64__ + F(__memchr_aarch64, 0) + F(__memchr_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__memchr_aarch64_sve, 1) +# endif +#elif __arm__ + F(__memchr_arm, 0) +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, size_t seekpos, size_t len, + size_t maxlen) +{ + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos < maxlen ? 
s + seekpos : NULL; + int seekchar = 1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos > LEN || align > ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = seekchar; + for (int i = 0; i <= ALIGN; i++) + s[len + i] = seekchar; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[seekpos] = seekchar; + s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar; + + int mte_len = seekpos != -1 ? seekpos + 1 : maxlen; + s = tag_buffer (s, mte_len, fun->test_mte); + p = fun->fun (s, seekchar, maxlen); + untag_buffer (s, mte_len, fun->test_mte); + p = untag_pointer (p); + + if (p != f) + { + ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s, + seekchar, maxlen, p, f); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < LEN; sp++) + test (funtab + i, a, sp, n, n); + test (funtab + i, a, n, n, SIZE_MAX - a); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/memcmp.c b/string/test/memcmp.c new file mode 100644 index 000000000000..7a7cf9cff35a --- /dev/null +++ b/string/test/memcmp.c @@ -0,0 +1,125 @@ +/* + * memcmp test. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + int (*fun) (const void *s1, const void *s2, size_t n); + int test_mte; +} funtab[] = { + // clang-format off + F(memcmp, 0) +#if __aarch64__ + F(__memcmp_aarch64, 1) +# if __ARM_FEATURE_SVE + F(__memcmp_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define A 32 +#define LEN 250000 +static unsigned char *s1buf; +static unsigned char *s2buf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int s1align, int s2align, int len, int diffpos, + int delta) +{ + unsigned char *src1 = alignup (s1buf); + unsigned char *src2 = alignup (s2buf); + unsigned char *s1 = src1 + s1align; + unsigned char *s2 = src2 + s2align; + int r; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || s1align >= A || s2align >= A) + abort (); + if (diffpos >= len) + abort (); + if ((diffpos < 0) != (delta == 0)) + abort (); + + for (int i = 0; i < len + A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i % 23; + if (delta) + s1[diffpos] += delta; + + s1 = tag_buffer (s1, len, fun->test_mte); + s2 = tag_buffer (s2, len, fun->test_mte); + r = fun->fun (s1, s2, len); + untag_buffer (s1, len, fun->test_mte); + untag_buffer (s2, len, fun->test_mte); + + if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) + { + ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name, + s1align, s2align, len, r); + quoteat ("src1", src1, len + A, diffpos); + quoteat ("src2", src2, len + A, diffpos); + } +} + +int +main () +{ + s1buf = mte_mmap (LEN + 2 * A); + s2buf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; 
d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + test (funtab + i, d, s, 0, -1, 0); + test (funtab + i, d, s, 1, -1, 0); + test (funtab + i, d, s, 1, 0, -1); + test (funtab + i, d, s, 1, 0, 1); + for (n = 2; n < 100; n++) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, 0, -1); + test (funtab + i, d, s, n, n - 1, -1); + test (funtab + i, d, s, n, n / 2, 1); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, n / 2, -1); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/memcpy.c b/string/test/memcpy.c new file mode 100644 index 000000000000..ce0ceeef5ee8 --- /dev/null +++ b/string/test/memcpy.c @@ -0,0 +1,120 @@ +/* + * memcpy test. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *, const void *, size_t); + int test_mte; +} funtab[] = { + // clang-format off + F(memcpy, 0) +#if __aarch64__ + F(__memcpy_aarch64, 1) +# if __ARM_NEON + F(__memcpy_aarch64_simd, 1) +# endif +#elif __arm__ + F(__memcpy_arm, 0) +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define A 32 +#define LEN 250000 +static unsigned char *dbuf; +static unsigned char *sbuf; +static unsigned char wbuf[LEN + 2 * A]; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int dalign, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *dst = alignup (dbuf); + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = want + dalign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= A || salign >= A) + abort (); + for (i = 0; i < len + A; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + i % 23; + + s = tag_buffer (s, len, fun->test_mte); + d = tag_buffer (d, len, fun->test_mte); + p = fun->fun (d, s, len); + untag_buffer (s, len, fun->test_mte); + untag_buffer (d, len, fun->test_mte); + + if (p != d) + ERR ("%s(%p,..) returned %p\n", fun->name, d, p); + for (i = 0; i < len + A; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, + len); + quoteat ("got", dst, len + A, i); + quoteat ("want", want, len + A, i); + break; + } + } +} + +int +main () +{ + dbuf = mte_mmap (LEN + 2 * A); + sbuf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + for (n = 0; n < 100; n++) + test (funtab + i, d, s, n); + for (; n < LEN; n *= 2) + test (funtab + i, d, s, n); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/memmove.c b/string/test/memmove.c new file mode 100644 index 000000000000..689b68c98af2 --- /dev/null +++ b/string/test/memmove.c @@ -0,0 +1,164 @@ +/* + * memmove test. 
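The memcpy test just shown (and the memmove, stpcpy and strcpy tests that follow) takes a slightly different approach from piecewise guard checks: it builds a want[] image containing the same '*' guards plus the expected body, runs the routine into dst, and then compares the whole window byte by byte, reporting the first mismatching offset. A compact sketch of that expected-image idea, again using plain libc memcpy:

#include <stdio.h>
#include <string.h>

#define GUARD 8
#define MAXLEN 64

int
main (void)
{
  unsigned char src[MAXLEN + 2 * GUARD];
  unsigned char dst[MAXLEN + 2 * GUARD];
  unsigned char want[MAXLEN + 2 * GUARD];

  for (int len = 0; len <= MAXLEN; len++)
    {
      memset (src, '?', sizeof (src));
      memset (dst, '*', sizeof (dst));
      memset (want, '*', sizeof (want));
      for (int i = 0; i < len; i++)
        src[GUARD + i] = want[GUARD + i] = 'a' + i % 23;

      memcpy (dst + GUARD, src + GUARD, len);

      /* Compare the whole window, guards included, and report the first diff. */
      for (int i = 0; i < (int) sizeof (dst); i++)
        if (dst[i] != want[i])
          {
            printf ("FAIL len %d at offset %d: got %#x want %#x\n",
                    len, i, dst[i], want[i]);
            return 1;
          }
    }
  printf ("PASS\n");
  return 0;
}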
+ * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *, const void *, size_t); + int test_mte; +} funtab[] = { + // clang-format off + F(memmove, 0) +#if __aarch64__ + F(__memmove_aarch64, 1) +# if __ARM_NEON + F(__memmove_aarch64_simd, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define A 32 +#define LEN 250000 +static unsigned char *dbuf; +static unsigned char *sbuf; +static unsigned char wbuf[LEN + 2 * A]; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int dalign, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *dst = alignup (dbuf); + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = want + dalign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= A || salign >= A) + abort (); + for (i = 0; i < len + A; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + i % 23; + + p = fun->fun (d, s, len); + if (p != d) + ERR ("%s(%p,..) returned %p\n", fun->name, d, p); + for (i = 0; i < len + A; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, + len); + quoteat ("got", dst, len + A, i); + quoteat ("want", want, len + A, i); + break; + } + } +} + +static void +test_overlap (const struct fun *fun, int dalign, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *dst = src; + unsigned char *want = wbuf; + unsigned char *s = src + salign; + unsigned char *d = dst + dalign; + unsigned char *w = wbuf + dalign; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= A || salign >= A) + abort (); + + for (int i = 0; i < len + A; i++) + src[i] = want[i] = '?'; + + for (int i = 0; i < len; i++) + s[i] = want[salign + i] = 'a' + i % 23; + for (int i = 0; i < len; i++) + w[i] = s[i]; + + s = tag_buffer (s, len, fun->test_mte); + d = tag_buffer (d, len, fun->test_mte); + p = fun->fun (d, s, len); + untag_buffer (s, len, fun->test_mte); + untag_buffer (d, len, fun->test_mte); + + if (p != d) + ERR ("%s(%p,..) returned %p\n", fun->name, d, p); + for (int i = 0; i < len + A; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, + len); + quoteat ("got", dst, len + A, i); + quoteat ("want", want, len + A, i); + break; + } + } +} + +int +main () +{ + dbuf = mte_mmap (LEN + 2 * A); + sbuf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + for (n = 0; n < 100; n++) + { + test (funtab + i, d, s, n); + test_overlap (funtab + i, d, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n); + test_overlap (funtab + i, d, s, n); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? 
"FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/memrchr.c b/string/test/memrchr.c new file mode 100644 index 000000000000..adf96f049cc9 --- /dev/null +++ b/string/test/memrchr.c @@ -0,0 +1,106 @@ +/* + * memchr test. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + void *(*fun) (const void *s, int c, size_t n); + int test_mte; +} funtab[] = { + // clang-format off + F(memrchr, 0) +#if __aarch64__ + F(__memrchr_aarch64, 1) +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, size_t seekpos, size_t len, + size_t maxlen) +{ + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos < maxlen ? s + seekpos : NULL; + int seekchar = 1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos > LEN || align > ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = seekchar; + for (int i = 0; i <= ALIGN; i++) + s[len + i] = seekchar; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[seekpos] = seekchar; + s[((len ^ align) & 1) && seekpos < maxlen ? seekpos - 1 : len] = seekchar; + + s = tag_buffer (s, maxlen, fun->test_mte); + p = fun->fun (s, seekchar, maxlen); + untag_buffer (s, maxlen, fun->test_mte); + p = untag_pointer (p); + + if (p != f) + { + ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s, + seekchar, maxlen, p, f); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < LEN; sp++) + test (funtab + i, a, sp, n, n); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/memset.c b/string/test/memset.c new file mode 100644 index 000000000000..f1721442dbaf --- /dev/null +++ b/string/test/memset.c @@ -0,0 +1,129 @@ +/* + * memset test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, int c, size_t n); + int test_mte; +} funtab[] = { + // clang-format off + F(memset, 0) +#if __aarch64__ + F(__memset_aarch64, 1) +#elif __arm__ + F(__memset_arm, 0) +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define A 32 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int c, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a' + i % 23; + + s = tag_buffer (s, len, fun->test_mte); + p = fun->fun (s, c, len); + untag_buffer (s, len, fun->test_mte); + + if (p != s) + ERR ("%s(%p,..) returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); + quoteat ("got", src, len + A, i); + return; + } + } + for (; i < salign + len; i++) + { + if (src[i] != (unsigned char) c) + { + ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); + quoteat ("got", src, len + A, i); + return; + } + } + for (; i < len + A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len); + quoteat ("got", src, len + A, i); + return; + } + } +} + +int +main () +{ + sbuf = mte_mmap (LEN + 2 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s++) + { + int n; + for (n = 0; n < 100; n++) + { + test (funtab + i, s, 0, n); + test (funtab + i, s, 0x25, n); + test (funtab + i, s, 0xaa25, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, 0, n); + test (funtab + i, s, 0x25, n); + test (funtab + i, s, 0xaa25, n); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/mte.h b/string/test/mte.h new file mode 100644 index 000000000000..e67cbd9d2d40 --- /dev/null +++ b/string/test/mte.h @@ -0,0 +1,142 @@ +/* + * Memory tagging testing code. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef __TEST_MTE_H +#define __TEST_MTE_H + +#include <stdlib.h> + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include <arm_acle.h> +#include <sys/mman.h> +#include <sys/prctl.h> + +// These depend on a not yet merged kernel ABI. 
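mte.h, which continues below, is what lets the same test binaries run both with and without MTE: mte_enabled() asks the kernel via prctl for synchronous tag-check faults, mte_mmap() falls back to plain malloc when that fails, and tag_buffer()/untag_buffer() become no-ops unless the function table entry sets test_mte. A sketch of how a test typically drives these helpers (check_somehow is a placeholder, and the include assumes string/test is on the include path):

/* Illustrative only: mirrors how the tests above use the mte.h helpers. */
#include <stdio.h>
#include <string.h>
#include "mte.h" /* assumes string/test is on the include path */

#define LEN 64

static int
check_somehow (unsigned char *p, int len)
{
  /* Placeholder check: the real tests compare against a want[] image. */
  for (int i = 0; i < len; i++)
    if (p[i] != 'a')
      return 1;
  return 0;
}

int
main (void)
{
  int test_mte = 1; /* as set per entry in a test's funtab[] */
  unsigned char *buf = mte_mmap (LEN); /* PROT_MTE mapping, or malloc fallback */

  memset (buf, 'a', LEN);

  /* Re-tag the granules covering buf[0..LEN) before handing it to the routine. */
  unsigned char *s = tag_buffer (buf, LEN, test_mte);
  /* ... call the routine under test on s here ... */
  untag_buffer (s, LEN, test_mte); /* restore tag 0 so untagged accesses work again */

  printf ("%s\n", check_somehow (buf, LEN) ? "FAIL" : "PASS");
  return 0;
}

Because tag-check faults are requested in synchronous mode, any access the routine makes outside the tagged window stops the test at the faulting instruction rather than silently corrupting memory.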
+#define PR_SET_TAGGED_ADDR_CTRL 55 +#define PR_TAGGED_ADDR_ENABLE (1UL << 0) +#define PR_MTE_TCF_SHIFT 1 +#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) +#define PR_MTE_TAG_SHIFT 3 +#define PROT_MTE 0x20 + +#define MTE_GRANULE_SIZE 16 + +int +mte_enabled () +{ + static int enabled = -1; + if (enabled == -1) + { + int res = prctl (PR_SET_TAGGED_ADDR_CTRL, + PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC + | (0xfffe << PR_MTE_TAG_SHIFT), + 0, 0, 0); + enabled = (res == 0); + } + return enabled; +} + +static void * +mte_mmap (size_t size) +{ + if (mte_enabled ()) + { + return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + } + else + { + return malloc (size); + } +} + +void * +alignup_mte (void *p) +{ + return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1) + & ~(MTE_GRANULE_SIZE - 1)); +} + +void * +aligndown_mte (void *p) +{ + return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1)); +} + +void * +untag_pointer (void *p) +{ + return (void *) ((unsigned long long) p & (~0ULL >> 8)); +} + +void +tag_buffer_helper (void *p, int len) +{ + char *ptr = p; + char *end = alignup_mte (ptr + len); + ptr = aligndown_mte (p); + for (; ptr < end; ptr += MTE_GRANULE_SIZE) + { + __arm_mte_set_tag (ptr); + } +} + +void * +tag_buffer (void *p, int len, int test_mte) +{ + if (test_mte && mte_enabled ()) + { + p = __arm_mte_increment_tag (p, 1); + tag_buffer_helper (p, len); + } + return p; +} + +void * +untag_buffer (void *p, int len, int test_mte) +{ + p = untag_pointer (p); + if (test_mte && mte_enabled ()) + { + tag_buffer_helper (p, len); + } + return p; +} + +#else // __ARM_FEATURE_MEMORY_TAGGING +int +mte_enabled () +{ + return 0; +} +static void * +mte_mmap (size_t size) +{ + return malloc (size); +} +void * +tag_buffer (void *p, int len, int test_mte) +{ + (void) len; + (void) test_mte; + return p; +} +void * +untag_buffer (void *p, int len, int test_mte) +{ + (void) len; + (void) test_mte; + return p; +} +void * +untag_pointer (void *p) +{ + return p; +} +#endif // __ARM_FEATURE_MEMORY_TAGGING + +#endif diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c new file mode 100644 index 000000000000..1827e68c9a30 --- /dev/null +++ b/string/test/stpcpy.c @@ -0,0 +1,125 @@ +/* + * stpcpy test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + char *(*fun) (char *dest, const char *src); + int test_mte; +} funtab[] = { + // clang-format off + F(stpcpy, 0) +#if __aarch64__ + F(__stpcpy_aarch64, 0) + F(__stpcpy_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__stpcpy_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *dbuf; +static char *sbuf; +static char wbuf[LEN + 3 * ALIGN]; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int dalign, int salign, int len) +{ + char *src = alignup (sbuf); + char *dst = alignup (dbuf); + char *want = wbuf; + char *s = src + salign; + char *d = dst + dalign; + char *w = want + dalign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= ALIGN || salign >= ALIGN) + abort (); + for (i = 0; i < len + ALIGN; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + salign) & 1 ? 1 : 0; + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + (i & 31); + s[len] = w[len] = '\0'; + + s = tag_buffer (s, len + 1, fun->test_mte); + d = tag_buffer (d, len + 1, fun->test_mte); + p = fun->fun (d, s); + untag_buffer (s, len + 1, fun->test_mte); + untag_buffer (d, len + 1, fun->test_mte); + + if (p != d + len) + ERR ("%s (%p,..) returned %p expected %p\n", fun->name, d, p, d + len); + + for (i = 0; i < len + ALIGN; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s (align %d, align %d, %d) failed\n", + fun->name, dalign, salign, len); + quoteat ("got", dst, len + ALIGN, i); + quoteat ("want", want, len + ALIGN, i); + break; + } + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + dbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < ALIGN; d++) + for (int s = 0; s < ALIGN; s++) + for (int n = 0; n < LEN; n++) + test (funtab + i, d, s, n); + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strchr.c b/string/test/strchr.c new file mode 100644 index 000000000000..f3ae982ef0ad --- /dev/null +++ b/string/test/strchr.c @@ -0,0 +1,121 @@ +/* + * strchr test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + char *(*fun) (const char *s, int c); + int test_mte; +} funtab[] = { + // clang-format off + F(strchr, 0) +#if __aarch64__ + F(__strchr_aarch64, 0) + F(__strchr_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strchr_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, int seekpos, int len) +{ + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : 0; + int seekchar = 0x1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos >= len || align >= ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = (i + len) & 1 ? seekchar : 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (i + len) & 1 ? seekchar : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + if (seekpos != -1) + s[seekpos] = seekchar; + if (seekpos != -1 && (len + align) & 1) + s[seekpos + 1] = seekchar; + s[len] = '\0'; + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, seekchar); + untag_buffer (s, len + 1, fun->test_mte); + p = untag_pointer (p); + + if (p != f) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, seekchar, len, p, f, seekpos); + quote ("input", s, len); + } + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, 0); + untag_buffer (s, len + 1, fun->test_mte); + + if (p != s + len) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, 0, len, p, f, len); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < n; sp++) + test (funtab + i, a, sp, n); + test (funtab + i, a, -1, n); + } + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c new file mode 100644 index 000000000000..6c30ab2123f1 --- /dev/null +++ b/string/test/strchrnul.c @@ -0,0 +1,126 @@ +/* + * strchrnul test. + * + * Copyright (c) 2019-2020, Arm Limited. 
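The strchr test just shown (and strchrnul/strrchr below) alternates the seek character and NUL bytes in the padding before the string and after its terminator, so a routine that keeps scanning past the NUL, or starts before the buffer, returns a pointer the test can distinguish from the expected one; it also always re-checks that searching for '\0' lands exactly on the terminator. A small sketch showing how a poisoned tail exposes a deliberately buggy strchr (buggy_strchr is invented for illustration):

#include <stdio.h>
#include <string.h>

/* Deliberately wrong: ignores the NUL terminator and scans a fixed window. */
static char *
buggy_strchr (const char *s, int c)
{
  for (int i = 0; i < 64; i++)
    if (s[i] == (char) c)
      return (char *) s + i;
  return NULL;
}

int
main (void)
{
  char buf[64];
  int seekchar = 0x1;

  memset (buf, seekchar, sizeof (buf)); /* poison everything ... */
  memcpy (buf, "abc", 4); /* ... then place "abc\0" at the start */

  char *good = strchr (buf, seekchar);
  char *bad = buggy_strchr (buf, seekchar);

  /* strchr must stop at the NUL and report "not found"; the buggy
     version runs into the poisoned tail and returns &buf[4]. */
  printf ("strchr: %s, buggy_strchr: %s\n",
          good == NULL ? "PASS" : "FAIL",
          bad == buf + 4 ? "caught over-read" : "unexpected");
  return 0;
}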
+ * SPDX-License-Identifier: MIT + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + char *(*fun) (const char *s, int c); + int test_mte; +} funtab[] = { + // clang-format off + F(strchrnul, 0) +#if __aarch64__ + F(__strchrnul_aarch64, 0) + F(__strchrnul_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strchrnul_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, int seekpos, int len) +{ + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : s + len; + int seekchar = 0x1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos >= len || align >= ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = (i + len) & 1 ? seekchar : 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (i + len) & 1 ? seekchar : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + if (seekpos != -1) + s[seekpos] = seekchar; + if (seekpos != -1 && (len + align) & 1) + s[seekpos + 1] = seekchar; + s[len] = '\0'; + + int mte_len = seekpos != -1 ? seekpos + 1 : len + 1; + s = tag_buffer (s, mte_len, fun->test_mte); + p = fun->fun (s, seekchar); + untag_buffer (s, mte_len, fun->test_mte); + p = untag_pointer (p); + + if (p != f) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, seekchar, len, p, f, seekpos); + quote ("input", s, len); + } + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, 0); + untag_buffer (s, len + 1, fun->test_mte); + + if (p != s + len) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, 0, len, p, f, len); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < n; sp++) + test (funtab + i, a, sp, n); + test (funtab + i, a, -1, n); + } + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strcmp.c b/string/test/strcmp.c new file mode 100644 index 000000000000..d57b54ed50a8 --- /dev/null +++ b/string/test/strcmp.c @@ -0,0 +1,132 @@ +/* + * strcmp test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + int (*fun) (const char *s1, const char *s2); + int test_mte; +} funtab[] = { + // clang-format off + F(strcmp, 0) +#if __aarch64__ + F(__strcmp_aarch64, 0) + F(__strcmp_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strcmp_aarch64_sve, 1) +# endif +#elif __arm__ +# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 + F(__strcmp_arm, 0) +# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 + F(__strcmp_armv6m, 0) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define A 32 +#define LEN 250000 +static char *s1buf; +static char *s2buf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int s1align, int s2align, int len, int diffpos, + int delta) +{ + char *src1 = alignup (s1buf); + char *src2 = alignup (s2buf); + char *s1 = src1 + s1align; + char *s2 = src2 + s2align; + int r; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || s1align >= A || s2align >= A) + abort (); + if (diffpos >= len) + abort (); + if ((diffpos < 0) != (delta == 0)) + abort (); + + for (int i = 0; i < len + A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i % 23; + if (delta) + s1[diffpos] += delta; + s1[len] = s2[len] = '\0'; + + s1 = tag_buffer (s1, len + 1, fun->test_mte); + s2 = tag_buffer (s2, len + 1, fun->test_mte); + r = fun->fun (s1, s2); + untag_buffer (s1, len + 1, fun->test_mte); + untag_buffer (s2, len + 1, fun->test_mte); + + if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) + { + ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name, + s1align, s2align, len, r); + quoteat ("src1", src1, len + A, diffpos); + quoteat ("src2", src2, len + A, diffpos); + } +} + +int +main () +{ + s1buf = mte_mmap (LEN + 2 * A + 1); + s2buf = mte_mmap (LEN + 2 * A + 1); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + test (funtab + i, d, s, 0, -1, 0); + test (funtab + i, d, s, 1, -1, 0); + test (funtab + i, d, s, 1, 0, 1); + test (funtab + i, d, s, 1, 0, -1); + for (n = 2; n < 100; n++) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, n - 1, -1); + test (funtab + i, d, s, n, n / 2, 1); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n, -1, 0); + test (funtab + i, d, s, n, n / 2, -1); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strcpy.c b/string/test/strcpy.c new file mode 100644 index 000000000000..e84cace9c8c6 --- /dev/null +++ b/string/test/strcpy.c @@ -0,0 +1,123 @@ +/* + * strcpy test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + char *(*fun) (char *dest, const char *src); + int test_mte; +} funtab[] = { + // clang-format off + F(strcpy, 0) +#if __aarch64__ + F(__strcpy_aarch64, 0) + F(__strcpy_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strcpy_aarch64_sve, 1) +# endif +#elif __arm__ && defined (__thumb2__) && !defined (__thumb__) + F(__strcpy_arm, 0) +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *dbuf; +static char *sbuf; +static char wbuf[LEN + 3 * ALIGN]; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int dalign, int salign, int len) +{ + char *src = alignup (sbuf); + char *dst = alignup (dbuf); + char *want = wbuf; + char *s = src + salign; + char *d = dst + dalign; + char *w = want + dalign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || dalign >= ALIGN || salign >= ALIGN) + abort (); + for (i = 0; i < len + ALIGN; i++) + { + src[i] = '?'; + want[i] = dst[i] = '*'; + } + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + salign) & 1 ? 1 : 0; + for (i = 0; i < len; i++) + s[i] = w[i] = 'a' + (i & 31); + s[len] = w[len] = '\0'; + + s = tag_buffer (s, len + 1, fun->test_mte); + d = tag_buffer (d, len + 1, fun->test_mte); + p = fun->fun (d, s); + untag_buffer (s, len + 1, fun->test_mte); + untag_buffer (d, len + 1, fun->test_mte); + + if (p != d) + ERR ("%s (%p,..) returned %p\n", fun->name, d, p); + + for (i = 0; i < len + ALIGN; i++) + { + if (dst[i] != want[i]) + { + ERR ("%s (align %d, align %d, %d) failed\n", + fun->name, dalign, salign, len); + quoteat ("got", dst, len + ALIGN, i); + quoteat ("want", want, len + ALIGN, i); + break; + } + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + dbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < ALIGN; d++) + for (int s = 0; s < ALIGN; s++) + for (int n = 0; n < LEN; n++) + test (funtab + i, d, s, n); + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/stringtest.h b/string/test/stringtest.h new file mode 100644 index 000000000000..fe855fc21736 --- /dev/null +++ b/string/test/stringtest.h @@ -0,0 +1,55 @@ +/* + * Common string test code. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <ctype.h> +#include <stdio.h> + +/* Accounting errors for a test case. */ +static int err_count; +#define ERR_LIMIT 10 +#define ERR(...) (err_count++, printf (__VA_ARGS__)) + +static inline void +quotechar (unsigned char c) +{ + if (isprint (c)) + putchar (c); + else + printf ("\\x%02x", c); +} + +/* quoted print around at or the entire string if at < 0. 
*/ +static void +quoteat (const char *prefix, const void *p, int len, int at) +{ + static const int CTXLEN = 15; + int i; + const char *pre = "\""; + const char *post = "\""; + const char *s = p; + if (at > CTXLEN) + { + s += at - CTXLEN; + len -= at - CTXLEN; + pre = "...\""; + } + if (at >= 0 && len > 2 * CTXLEN + 1) + { + len = 2 * CTXLEN + 1; + post = "\"..."; + } + printf ("%4s: %s", prefix, pre); + for (i = 0; i < len; i++) + quotechar (s[i]); + printf ("%s\n", post); +} + +static inline void +quote (const char *prefix, const void *p, int len) +{ + quoteat (prefix, p, len, -1); +} diff --git a/string/test/strlen.c b/string/test/strlen.c new file mode 100644 index 000000000000..6278380f26df --- /dev/null +++ b/string/test/strlen.c @@ -0,0 +1,103 @@ +/* + * strlen test. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + size_t (*fun) (const char *s); + int test_mte; +} funtab[] = { + // clang-format off + F(strlen, 0) +#if __aarch64__ + F(__strlen_aarch64, 0) + F(__strlen_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strlen_aarch64_sve, 1) +# endif +#elif __arm__ +# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + F(__strlen_armv6t2, 0) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, int len) +{ + char *src = alignup (sbuf); + char *s = src + align; + size_t r; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || align >= ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + align) & 1 ? 1 : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[len] = '\0'; + + s = tag_buffer (s, len + 1, fun->test_mte); + r = fun->fun (s); + untag_buffer (s, len + 1, fun->test_mte); + + if (r != len) + { + ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len); + quote ("input", src, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + test (funtab + i, a, n); + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strncmp.c b/string/test/strncmp.c new file mode 100644 index 000000000000..018a8a431ab8 --- /dev/null +++ b/string/test/strncmp.c @@ -0,0 +1,139 @@ +/* + * strncmp test. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + int (*fun) (const char *, const char *, size_t); + int test_mte; +} funtab[] = { + // clang-format off + F(strncmp, 0) +#if __aarch64__ + F(__strncmp_aarch64, 0) + F(__strncmp_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strncmp_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define A 32 +#define LEN 250000 +static char *s1buf; +static char *s2buf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, + int len, int delta) +{ + char *src1 = alignup (s1buf); + char *src2 = alignup (s2buf); + char *s1 = src1 + s1align; + char *s2 = src2 + s2align; + int r; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || s1align >= A || s2align >= A) + abort (); + if (diffpos >= len) + abort (); + if ((diffpos < 0) != (delta == 0)) + abort (); + + for (int i = 0; i < len + A; i++) + src1[i] = src2[i] = '?'; + for (int i = 0; i < len; i++) + s1[i] = s2[i] = 'a' + i % 23; + if (delta) + s1[diffpos] += delta; + s1[len] = s2[len] = '\0'; + + size_t mte_len = maxlen < len + 1 ? maxlen : len + 1; + s1 = tag_buffer (s1, mte_len, fun->test_mte); + s2 = tag_buffer (s2, mte_len, fun->test_mte); + r = fun->fun (s1, s2, maxlen); + untag_buffer (s1, mte_len, fun->test_mte); + untag_buffer (s2, mte_len, fun->test_mte); + + if (diffpos >= maxlen) + { + diffpos = -1; + delta = 0; + } + if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0)) + { + ERR ( + "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n", + fun->name, s1align, s2align, maxlen, len, diffpos, r); + quoteat ("src1", src1, len + A, diffpos); + quoteat ("src2", src2, len + A, diffpos); + } +} + +int +main () +{ + s1buf = mte_mmap (LEN + 2 * A + 1); + s2buf = mte_mmap (LEN + 2 * A + 1); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int d = 0; d < A; d++) + for (int s = 0; s < A; s++) + { + int n; + test (funtab + i, d, s, 0, -1, 0, 0); + test (funtab + i, d, s, 1, -1, 0, 0); + test (funtab + i, d, s, 0, -1, 1, 0); + test (funtab + i, d, s, 1, -1, 1, 0); + test (funtab + i, d, s, 2, -1, 1, 0); + test (funtab + i, d, s, 1, 0, 1, 1); + test (funtab + i, d, s, 1, 0, 1, -1); + for (n = 2; n < 100; n++) + { + test (funtab + i, d, s, n, -1, n, 0); + test (funtab + i, d, s, n, n / 2, n, 1); + test (funtab + i, d, s, n / 2, -1, n, 0); + test (funtab + i, d, s, n / 2, n / 2, n, -1); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, d, s, n, -1, n, 0); + test (funtab + i, d, s, n, n / 2, n, -1); + test (funtab + i, d, s, n / 2, -1, n, 0); + test (funtab + i, d, s, n / 2, n / 2, n, 1); + } + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strnlen.c b/string/test/strnlen.c new file mode 100644 index 000000000000..0dea00eaf8e3 --- /dev/null +++ b/string/test/strnlen.c @@ -0,0 +1,109 @@ +/* + * strnlen test. + * + * Copyright (c) 2019-2020, Arm Limited. 
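Two details in the strncmp test above are worth spelling out: a difference planted at or beyond maxlen must not affect the result, which is why the test clears diffpos and delta when diffpos >= maxlen, and the MTE-tagged window is clamped to min(maxlen, len + 1), so an implementation that reads past either limit faults under MTE. A couple of lines illustrating the first point with libc's strncmp:

#include <assert.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* The strings differ only at index 3, which is outside the 3-byte limit. */
  assert (strncmp ("abcX", "abcY", 3) == 0);

  /* Raise the limit and the difference becomes visible again. */
  assert (strncmp ("abcX", "abcY", 4) != 0);

  puts ("strncmp maxlen semantics OK");
  return 0;
}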
+ * SPDX-License-Identifier: MIT + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + size_t (*fun) (const char *s, size_t m); + int test_mte; +} funtab[] = { + // clang-format off + F(strnlen, 0) +#if __aarch64__ + F(__strnlen_aarch64, 1) +# if __ARM_FEATURE_SVE + F(__strnlen_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, size_t maxlen, size_t len) +{ + char *src = alignup (sbuf); + char *s = src + align; + size_t r; + size_t e = maxlen < len ? maxlen : len; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || align >= ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (len + align) & 1 ? 1 : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + s[len] = 0; + if ((len + align) & 1) + s[e + 1] = 0; + + size_t mte_len = maxlen < len + 1 ? maxlen : len + 1; + s = tag_buffer (s, mte_len, fun->test_mte); + r = fun->fun (s, maxlen); + untag_buffer (s, mte_len, fun->test_mte); + + if (r != e) + { + ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n", + fun->name, s, maxlen, len, r, e); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int maxlen = 0; maxlen < LEN; maxlen++) + test (funtab + i, a, maxlen, n); + test (funtab + i, a, SIZE_MAX - a, n); + } + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/test/strrchr.c b/string/test/strrchr.c new file mode 100644 index 000000000000..fedbdc52fcc1 --- /dev/null +++ b/string/test/strrchr.c @@ -0,0 +1,121 @@ +/* + * strrchr test. + * + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + char *(*fun) (const char *s, int c); + int test_mte; +} funtab[] = { + // clang-format off + F(strrchr, 0) +#if __aarch64__ + F(__strrchr_aarch64, 0) + F(__strrchr_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strrchr_aarch64_sve, 1) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +#define ALIGN 32 +#define LEN 512 +static char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN); +} + +static void +test (const struct fun *fun, int align, int seekpos, int len) +{ + char *src = alignup (sbuf); + char *s = src + align; + char *f = seekpos != -1 ? s + seekpos : 0; + int seekchar = 0x1; + void *p; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || seekpos >= len || align >= ALIGN) + abort (); + + for (int i = 0; src + i < s; i++) + src[i] = (i + len) & 1 ? 
seekchar : 0; + for (int i = 1; i <= ALIGN; i++) + s[len + i] = (i + len) & 1 ? seekchar : 0; + for (int i = 0; i < len; i++) + s[i] = 'a' + (i & 31); + if (seekpos != -1) + s[seekpos / 2] = s[seekpos] = seekchar; + if (seekpos > 0 && (len + align) & 1) + s[seekpos - 1] = seekchar; + s[len] = '\0'; + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, seekchar); + untag_buffer (s, len + 1, fun->test_mte); + p = untag_pointer (p); + + if (p != f) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, seekchar, len, p, f, seekpos); + quote ("input", s, len); + } + + s = tag_buffer (s, len + 1, fun->test_mte); + p = fun->fun (s, 0); + untag_buffer (s, len + 1, fun->test_mte); + + if (p != s + len) + { + ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", + fun->name, s, 0, len, p, s + len, len); + quote ("input", s, len); + } +} + +int +main (void) +{ + sbuf = mte_mmap (LEN + 3 * ALIGN); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int a = 0; a < ALIGN; a++) + for (int n = 0; n < LEN; n++) + { + for (int sp = 0; sp < n; sp++) + test (funtab + i, a, sp, n); + test (funtab + i, a, -1, n); + } + + char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS"; + printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name); + if (err_count) + r = -1; + } + return r; +} diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S new file mode 100644 index 000000000000..26ade0a0c7db --- /dev/null +++ b/string/x86_64/check-arch.S @@ -0,0 +1,10 @@ +/* + * check ARCH setting. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if !__x86_64__ +# error ARCH setting does not match the compiler. +#endif |