Diffstat (limited to 'contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S')
-rw-r--r-- | contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S | 69
1 file changed, 36 insertions, 33 deletions
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
index f97f2c3047b9..cbf4c581500e 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
 /* Assumptions:
@@ -11,7 +11,7 @@
  *
  */
 
-#include "../asmdefs.h"
+#include "asmdefs.h"
 
 #define dstin	x0
 #define src	x1
@@ -52,15 +52,13 @@
 
 ENTRY_ALIAS (__memmove_aarch64_simd)
 ENTRY (__memcpy_aarch64_simd)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	add	srcend, src, count
-	add	dstend, dstin, count
 	cmp	count, 128
 	b.hi	L(copy_long)
+	add	dstend, dstin, count
 	cmp	count, 32
 	b.hi	L(copy32_128)
+	nop
 
 	/* Small copies: 0..32 bytes.  */
 	cmp	count, 16
@@ -71,6 +69,18 @@ ENTRY (__memcpy_aarch64_simd)
 	str	B_q, [dstend, -16]
 	ret
 
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	.p2align 4
 	/* Copy 8-15 bytes.  */
 L(copy16):
 	tbz	count, 3, L(copy8)
@@ -80,7 +90,6 @@ L(copy16):
 	str	A_h, [dstend, -8]
 	ret
 
-	.p2align 3
 	/* Copy 4-7 bytes.  */
 L(copy8):
 	tbz	count, 2, L(copy4)
@@ -90,31 +99,6 @@ L(copy8):
 	str	B_lw, [dstend, -4]
 	ret
 
-	/* Copy 0..3 bytes using a branchless sequence.  */
-L(copy4):
-	cbz	count, L(copy0)
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	C_lw, [srcend, -1]
-	ldrb	B_lw, [src, tmp1]
-	strb	A_lw, [dstin]
-	strb	B_lw, [dstin, tmp1]
-	strb	C_lw, [dstend, -1]
-L(copy0):
-	ret
-
-	.p2align 4
-	/* Medium copies: 33..128 bytes.  */
-L(copy32_128):
-	ldp	A_q, B_q, [src]
-	ldp	C_q, D_q, [srcend, -32]
-	cmp	count, 64
-	b.hi	L(copy128)
-	stp	A_q, B_q, [dstin]
-	stp	C_q, D_q, [dstend, -32]
-	ret
-
-	.p2align 4
 	/* Copy 65..128 bytes.  */
 L(copy128):
 	ldp	E_q, F_q, [src, 32]
@@ -128,8 +112,24 @@ L(copy96):
 	stp	C_q, D_q, [dstend, -32]
 	ret
 
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 3
 	/* Copy more than 128 bytes.  */
 L(copy_long):
+	add	dstend, dstin, count
+
 	/* Use backwards copy if there is an overlap.  */
 	sub	tmp1, dstin, src
 	cmp	tmp1, count
@@ -166,6 +166,9 @@ L(copy64_from_end):
 	stp	A_q, B_q, [dstend, -32]
 	ret
 
+	.p2align 4
+	nop
+
 	/* Large backwards copy for overlapping copies.
 	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
 L(copy_long_backwards):
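The relocated blocks above rely on two ideas that are easy to miss when reading the assembly: small and medium copies load from both ends of the buffer before any store, so overlapping accesses cover the whole range without a length-dependent loop, and the 0..3-byte case is handled branchlessly by letting the three byte accesses collapse onto the same addresses for counts of 1 and 2. The following C sketch illustrates both; it is not part of the commit or of memcpy-advsimd.S, and the function names are invented here for illustration.

/* Illustrative sketch only; mirrors the logic of L(copy4)/L(copy0) and the
   33..64-byte half of L(copy32_128), not the actual assembly.  */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* 0..3 bytes, branchless: for count == 1 or 2 some of the three byte
   accesses alias, so only the zero check needs a branch.  */
static void copy0_3(unsigned char *dst, const unsigned char *src, size_t count)
{
	if (count == 0)
		return;
	unsigned char a = src[0];
	unsigned char b = src[count >> 1];
	unsigned char c = src[count - 1];
	dst[0] = a;
	dst[count >> 1] = b;
	dst[count - 1] = c;
}

/* 33..64 bytes: load 32 bytes from each end before storing; the two halves
   overlap in the middle, so the whole range is covered without a loop.
   (The 65..128-byte path adds one more pair of 32-byte accesses.)  */
static void copy33_64(unsigned char *dst, const unsigned char *src, size_t count)
{
	unsigned char head[32], tail[32];
	memcpy(head, src, 32);
	memcpy(tail, src + count - 32, 32);
	memcpy(dst, head, 32);
	memcpy(dst + count - 32, tail, 32);
}

int main(void)
{
	unsigned char src[64], dst[64];
	for (size_t i = 0; i < sizeof src; i++)
		src[i] = (unsigned char)i;

	memset(dst, 0, sizeof dst);
	copy0_3(dst, src, 3);
	printf("copy0_3: %d %d %d\n", dst[0], dst[1], dst[2]);

	memset(dst, 0, sizeof dst);
	copy33_64(dst, src, 50);
	printf("copy33_64 matches: %d\n", memcmp(dst, src, 50) == 0);
	return 0;
}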