Diffstat (limited to 'contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S')
-rw-r--r--  contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S  69
1 file changed, 36 insertions, 33 deletions
diff --git a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
index f97f2c3047b9..cbf4c581500e 100644
--- a/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
+++ b/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define src x1
@@ -52,15 +52,13 @@
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
add srcend, src, count
- add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
+ add dstend, dstin, count
cmp count, 32
b.hi L(copy32_128)
+ nop
/* Small copies: 0..32 bytes. */
cmp count, 16
@@ -71,6 +69,18 @@ ENTRY (__memcpy_aarch64_simd)
str B_q, [dstend, -16]
ret
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
@@ -80,7 +90,6 @@ L(copy16):
str A_h, [dstend, -8]
ret
- .p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
@@ -90,31 +99,6 @@ L(copy8):
str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb C_lw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb C_lw, [dstend, -1]
-L(copy0):
- ret
-
- .p2align 4
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
- cmp count, 64
- b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
- ret
-
- .p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
@@ -128,8 +112,24 @@ L(copy96):
stp C_q, D_q, [dstend, -32]
ret
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 3
/* Copy more than 128 bytes. */
L(copy_long):
+ add dstend, dstin, count
+
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
@@ -166,6 +166,9 @@ L(copy64_from_end):
stp A_q, B_q, [dstend, -32]
ret
+ .p2align 4
+ nop
+
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
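
For reference, the branchless 0..3-byte copy at L(copy4) and the overlap test that guards L(copy_long_backwards) can be sketched in C roughly as follows. This is an illustrative sketch only, not part of the patch; the function names are hypothetical.

/* Illustrative C sketch of two ideas from the routine above; the
   function names are hypothetical and not part of the patch. */
#include <stddef.h>
#include <stdint.h>

/* L(copy4)/L(copy0): 0..3 bytes copied without branching on the exact
   size.  One byte is taken from each end and one from the middle; for
   count == 1 or 2 the accesses simply overlap. */
static void copy0_3(unsigned char *dst, const unsigned char *src, size_t count)
{
    if (count == 0)                        /* cbz   count, L(copy0)      */
        return;
    size_t half = count >> 1;              /* lsr   tmp1, count, 1       */
    unsigned char a = src[0];              /* ldrb  A_lw, [src]          */
    unsigned char c = src[count - 1];      /* ldrb  C_lw, [srcend, -1]   */
    unsigned char b = src[half];           /* ldrb  B_lw, [src, tmp1]    */
    dst[0] = a;                            /* strb  A_lw, [dstin]        */
    dst[half] = b;                         /* strb  B_lw, [dstin, tmp1]  */
    dst[count - 1] = c;                    /* strb  C_lw, [dstend, -1]   */
}

/* L(copy_long): a single unsigned compare of (dst - src) against count
   detects a forward overlap, in which case the backward copy is used. */
static int needs_backward_copy(const unsigned char *dst,
                               const unsigned char *src, size_t count)
{
    return (uintptr_t)(dst - src) < count; /* sub/cmp/b.lo sequence      */
}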