diff options
Diffstat (limited to 'contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S')
| -rw-r--r-- | contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S | 69 | 
1 files changed, 36 insertions, 33 deletions
| diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S index f97f2c3047b9..cbf4c581500e 100644 --- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S +++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@  /*   * memcpy - copy memory area   * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception   */  /* Assumptions: @@ -11,7 +11,7 @@   *   */ -#include "../asmdefs.h" +#include "asmdefs.h"  #define dstin	x0  #define src	x1 @@ -52,15 +52,13 @@  ENTRY_ALIAS (__memmove_aarch64_simd)  ENTRY (__memcpy_aarch64_simd) -	PTR_ARG (0) -	PTR_ARG (1) -	SIZE_ARG (2)  	add	srcend, src, count -	add	dstend, dstin, count  	cmp	count, 128  	b.hi	L(copy_long) +	add	dstend, dstin, count  	cmp	count, 32  	b.hi	L(copy32_128) +	nop  	/* Small copies: 0..32 bytes.  */  	cmp	count, 16 @@ -71,6 +69,18 @@ ENTRY (__memcpy_aarch64_simd)  	str	B_q, [dstend, -16]  	ret +	.p2align 4 +	/* Medium copies: 33..128 bytes.  */ +L(copy32_128): +	ldp	A_q, B_q, [src] +	ldp	C_q, D_q, [srcend, -32] +	cmp	count, 64 +	b.hi	L(copy128) +	stp	A_q, B_q, [dstin] +	stp	C_q, D_q, [dstend, -32] +	ret + +	.p2align 4  	/* Copy 8-15 bytes.  */  L(copy16):  	tbz	count, 3, L(copy8) @@ -80,7 +90,6 @@ L(copy16):  	str	A_h, [dstend, -8]  	ret -	.p2align 3  	/* Copy 4-7 bytes.  */  L(copy8):  	tbz	count, 2, L(copy4) @@ -90,31 +99,6 @@ L(copy8):  	str	B_lw, [dstend, -4]  	ret -	/* Copy 0..3 bytes using a branchless sequence.  */ -L(copy4): -	cbz	count, L(copy0) -	lsr	tmp1, count, 1 -	ldrb	A_lw, [src] -	ldrb	C_lw, [srcend, -1] -	ldrb	B_lw, [src, tmp1] -	strb	A_lw, [dstin] -	strb	B_lw, [dstin, tmp1] -	strb	C_lw, [dstend, -1] -L(copy0): -	ret - -	.p2align 4 -	/* Medium copies: 33..128 bytes.  */ -L(copy32_128): -	ldp	A_q, B_q, [src] -	ldp	C_q, D_q, [srcend, -32] -	cmp	count, 64 -	b.hi	L(copy128) -	stp	A_q, B_q, [dstin] -	stp	C_q, D_q, [dstend, -32] -	ret - -	.p2align 4  	/* Copy 65..128 bytes.  */  L(copy128):  	ldp	E_q, F_q, [src, 32] @@ -128,8 +112,24 @@ L(copy96):  	stp	C_q, D_q, [dstend, -32]  	ret +	/* Copy 0..3 bytes using a branchless sequence.  */ +L(copy4): +	cbz	count, L(copy0) +	lsr	tmp1, count, 1 +	ldrb	A_lw, [src] +	ldrb	C_lw, [srcend, -1] +	ldrb	B_lw, [src, tmp1] +	strb	A_lw, [dstin] +	strb	B_lw, [dstin, tmp1] +	strb	C_lw, [dstend, -1] +L(copy0): +	ret + +	.p2align 3  	/* Copy more than 128 bytes.  */  L(copy_long): +	add	dstend, dstin, count +  	/* Use backwards copy if there is an overlap.  */  	sub	tmp1, dstin, src  	cmp	tmp1, count @@ -166,6 +166,9 @@ L(copy64_from_end):  	stp	A_q, B_q, [dstend, -32]  	ret +	.p2align 4 +	nop +  	/* Large backwards copy for overlapping copies.  	   Copy 16 bytes and then align srcend to 16-byte alignment.  */  L(copy_long_backwards): | 
