author     Ed Maste <emaste@FreeBSD.org>        2022-08-15 21:24:08 +0000
committer  Ed Maste <emaste@FreeBSD.org>        2022-08-16 17:24:08 +0000
commit     d77520609f5240f5fad18fa1fd2275ac1de7cbb5 (patch)
tree       0d4c3ec4a774bc05892f5f5e9f40715682bd11bc
download   src-vendor/bionic-x86_64-string.tar.gz
           src-vendor/bionic-x86_64-string.zip

Import bionic's x86_64 string routines to vendor        (vendor/bionic-x86_64-string)

Obtained from   https://android.googlesource.com/platform/bionic
                libc/arch-x86_64/string/ at commit
                919fb7f2e0e0c877dd5e9bbaa71d4c4a73e50ad3
Requested by:   mjg
Sponsored by:   The FreeBSD Foundation
-rw-r--r--   avx2-memset-kbl.S       160
-rw-r--r--   cache.h                  36
-rw-r--r--   sse2-memmove-slm.S      518
-rw-r--r--   sse2-memset-slm.S       149
-rw-r--r--   sse2-stpcpy-slm.S        33
-rw-r--r--   sse2-stpncpy-slm.S       34
-rw-r--r--   sse2-strcat-slm.S        87
-rw-r--r--   sse2-strcpy-slm.S      1921
-rw-r--r--   sse2-strlen-slm.S       294
-rw-r--r--   sse2-strncat-slm.S       33
-rw-r--r--   sse2-strncpy-slm.S       33
-rw-r--r--   sse4-memcmp-slm.S      1799
-rw-r--r--   ssse3-strcmp-slm.S     1925
-rw-r--r--   ssse3-strncmp-slm.S      33
14 files changed, 7055 insertions, 0 deletions
diff --git a/avx2-memset-kbl.S b/avx2-memset-kbl.S
new file mode 100644
index 000000000000..09dd07dd1655
--- /dev/null
+++ b/avx2-memset-kbl.S
@@ -0,0 +1,160 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <private/bionic_asm.h>
+
+#include "cache.h"
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+ .section .text.avx2,"ax",@progbits
+
+ENTRY(__memset_chk_avx2)
+ # %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
+ cmp %rcx, %rdx
+ ja __memset_chk_fail
+ // Fall through to memset...
+END(__memset_chk_avx2)
+
+ENTRY(memset_avx2)
+ movq %rdi, %rax
+ and $0xff, %rsi
+ mov $0x0101010101010101, %rcx
+ imul %rsi, %rcx
+ cmpq $16, %rdx
+ jae L(16bytesormore)
+ testb $8, %dl
+ jnz L(8_15bytes)
+ testb $4, %dl
+ jnz L(4_7bytes)
+ testb $2, %dl
+ jnz L(2_3bytes)
+ testb $1, %dl
+ jz L(return)
+ movb %cl, (%rdi)
+L(return):
+ ret
+
+L(8_15bytes):
+ movq %rcx, (%rdi)
+ movq %rcx, -8(%rdi, %rdx)
+ ret
+
+L(4_7bytes):
+ movl %ecx, (%rdi)
+ movl %ecx, -4(%rdi, %rdx)
+ ret
+
+L(2_3bytes):
+ movw %cx, (%rdi)
+ movw %cx, -2(%rdi, %rdx)
+ ret
+
+ ALIGN (4)
+L(16bytesormore):
+ movd %rcx, %xmm0
+ pshufd $0, %xmm0, %xmm0
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm0, -16(%rdi, %rdx)
+ cmpq $32, %rdx
+ jbe L(32bytesless)
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm0, -32(%rdi, %rdx)
+ cmpq $64, %rdx
+ jbe L(64bytesless)
+ movdqu %xmm0, 32(%rdi)
+ movdqu %xmm0, 48(%rdi)
+ movdqu %xmm0, -64(%rdi, %rdx)
+ movdqu %xmm0, -48(%rdi, %rdx)
+ cmpq $128, %rdx
+ jbe L(128bytesless)
+ vpbroadcastb %xmm0, %ymm0
+ vmovdqu %ymm0, 64(%rdi)
+ vmovdqu %ymm0, 96(%rdi)
+ vmovdqu %ymm0, -128(%rdi, %rdx)
+ vmovdqu %ymm0, -96(%rdi, %rdx)
+ cmpq $256, %rdx
+ ja L(256bytesmore)
+L(32bytesless):
+L(64bytesless):
+L(128bytesless):
+ ret
+
+ ALIGN (4)
+L(256bytesmore):
+ leaq 128(%rdi), %rcx
+ andq $-128, %rcx
+ movq %rdx, %r8
+ addq %rdi, %rdx
+ andq $-128, %rdx
+ cmpq %rcx, %rdx
+ je L(return)
+
+#ifdef SHARED_CACHE_SIZE
+ cmp $SHARED_CACHE_SIZE, %r8
+#else
+ cmp __x86_64_shared_cache_size(%rip), %r8
+#endif
+ ja L(256bytesmore_nt)
+
+ ALIGN (4)
+L(256bytesmore_normal):
+ vmovdqa %ymm0, (%rcx)
+ vmovdqa %ymm0, 32(%rcx)
+ vmovdqa %ymm0, 64(%rcx)
+ vmovdqa %ymm0, 96(%rcx)
+ addq $128, %rcx
+ cmpq %rcx, %rdx
+ jne L(256bytesmore_normal)
+ ret
+
+ ALIGN (4)
+L(256bytesmore_nt):
+ movntdq %xmm0, (%rcx)
+ movntdq %xmm0, 16(%rcx)
+ movntdq %xmm0, 32(%rcx)
+ movntdq %xmm0, 48(%rcx)
+ movntdq %xmm0, 64(%rcx)
+ movntdq %xmm0, 80(%rcx)
+ movntdq %xmm0, 96(%rcx)
+ movntdq %xmm0, 112(%rcx)
+ leaq 128(%rcx), %rcx
+ cmpq %rcx, %rdx
+ jne L(256bytesmore_nt)
+ sfence
+ ret
+
+END(memset_avx2)
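
The routine above relies on two tricks that are easier to see in C: multiplying the zero-extended fill byte by 0x0101010101010101 (the imul near the top of memset_avx2) replicates it into every byte of a 64-bit word, and each small size class is finished with a pair of overlapping stores at the start and end of the buffer instead of a byte loop. Below is a minimal, hypothetical C sketch of just that small-size path; it is not part of the imported file, and memset_small is a made-up name.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Mirrors the n < 16 path of memset_avx2; callers must pass n < 16. */
static void *memset_small(void *dst, int byte, size_t n)
{
    /* broadcast the byte into all eight lanes of a 64-bit word */
    uint64_t pattern = (uint64_t)(byte & 0xff) * 0x0101010101010101ULL;
    unsigned char *d = dst;

    if (n >= 8) {              /* 8..15: two overlapping 8-byte stores */
        memcpy(d, &pattern, 8);
        memcpy(d + n - 8, &pattern, 8);
    } else if (n >= 4) {       /* 4..7: two overlapping 4-byte stores  */
        memcpy(d, &pattern, 4);
        memcpy(d + n - 4, &pattern, 4);
    } else if (n >= 2) {       /* 2..3: two overlapping 2-byte stores  */
        memcpy(d, &pattern, 2);
        memcpy(d + n - 2, &pattern, 2);
    } else if (n == 1) {
        *d = (unsigned char)byte;
    }
    return dst;
}

int main(void)
{
    char buf[16] = "xxxxxxxxxxxxxxx";
    memset_small(buf, 'A', 7);
    printf("%s\n", buf);       /* prints AAAAAAAxxxxxxxx */
    return 0;
}

The 16..256-byte cases in the assembly use the same overlapping-store idea with 16- and 32-byte registers, which is why no alignment work is needed until the buffer exceeds 256 bytes.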
diff --git a/cache.h b/cache.h
new file mode 100644
index 000000000000..4131509fbbc9
--- /dev/null
+++ b/cache.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Values are optimized for Core Architecture */
+#define SHARED_CACHE_SIZE (4096*1024) /* Core Architecture L2 Cache */
+#define DATA_CACHE_SIZE (24*1024) /* Core Architecture L1 Data Cache */
+
+#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
+#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
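
These constants feed the one heuristic the copy and fill loops below need: once the amount of data rivals the shared cache, the code switches from ordinary cached stores (movdqa/vmovdqa) to non-temporal stores (movntdq) followed by an sfence, so a huge memset or memmove does not evict everything else from the cache. The memmove below compares the length against SHARED_CACHE_SIZE_HALF, while the memset variants compare against the full SHARED_CACHE_SIZE (or against a runtime __x86_64_shared_cache_size value when the macro is not defined). A minimal C sketch of that decision follows; the helper names are made up and do not appear in the import.

#include <stddef.h>
#include <string.h>

#define SHARED_CACHE_SIZE      (4096*1024)
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)

/* stand-ins for the two store strategies used in the assembly */
static void copy_cached(void *dst, const void *src, size_t n)    { memcpy(dst, src, n); }
static void copy_streaming(void *dst, const void *src, size_t n) { memcpy(dst, src, n); }

void copy_large(void *dst, const void *src, size_t n)
{
    if (n >= SHARED_CACHE_SIZE_HALF)   /* working set rivals the shared cache:  */
        copy_streaming(dst, src, n);   /* stream past it (movntdq, then sfence) */
    else
        copy_cached(dst, src, n);      /* small enough: ordinary cached stores  */
}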
diff --git a/sse2-memmove-slm.S b/sse2-memmove-slm.S
new file mode 100644
index 000000000000..739502888b97
--- /dev/null
+++ b/sse2-memmove-slm.S
@@ -0,0 +1,518 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef ALIAS_SYMBOL
+# define ALIAS_SYMBOL(alias, original) \
+ .globl alias; \
+ .equ alias, original
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) push REG;
+#define POP(REG) pop REG;
+
+#define ENTRANCE PUSH (%rbx);
+#define RETURN_END POP (%rbx); ret
+#define RETURN RETURN_END;
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+ ENTRANCE
+ mov %rdi, %rax
+
+/* Check whether we should copy backward or forward. */
+ cmp %rsi, %rdi
+ je L(mm_return)
+ jg L(mm_len_0_or_more_backward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+ separately. */
+ cmp $16, %rdx
+ jbe L(mm_len_0_16_bytes_forward)
+
+ cmp $32, %rdx
+ ja L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_32_or_more_forward):
+ cmp $64, %rdx
+ ja L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu -16(%rsi, %rdx), %xmm2
+ movdqu -32(%rsi, %rdx), %xmm3
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, -16(%rdi, %rdx)
+ movdqu %xmm3, -32(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_64_or_more_forward):
+ cmp $128, %rdx
+ ja L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqu -64(%rsi, %rdx), %xmm4
+ movdqu -48(%rsi, %rdx), %xmm5
+ movdqu -32(%rsi, %rdx), %xmm6
+ movdqu -16(%rsi, %rdx), %xmm7
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm4, -64(%rdi, %rdx)
+ movdqu %xmm5, -48(%rdi, %rdx)
+ movdqu %xmm6, -32(%rdi, %rdx)
+ movdqu %xmm7, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_128_or_more_forward):
+/* Aligning the address of destination. */
+/* save first unaligned 64 bytes */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+
+ lea 64(%rdi), %r8
+ and $-64, %r8 /* r8 now aligned to next 64 byte boundary */
+ sub %rdi, %rsi /* rsi = src - dst = diff */
+
+ movdqu (%r8, %rsi), %xmm4
+ movdqu 16(%r8, %rsi), %xmm5
+ movdqu 32(%r8, %rsi), %xmm6
+ movdqu 48(%r8, %rsi), %xmm7
+
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqa %xmm4, (%r8)
+ movaps %xmm5, 16(%r8)
+ movaps %xmm6, 32(%r8)
+ movaps %xmm7, 48(%r8)
+ add $64, %r8
+
+ lea (%rdi, %rdx), %rbx
+ and $-64, %rbx
+ cmp %r8, %rbx
+ jbe L(mm_copy_remaining_forward)
+
+ cmp $SHARED_CACHE_SIZE_HALF, %rdx
+ jae L(mm_large_page_loop_forward)
+
+ .p2align 4
+L(mm_main_loop_forward):
+
+ prefetcht0 128(%r8, %rsi)
+
+ movdqu (%r8, %rsi), %xmm0
+ movdqu 16(%r8, %rsi), %xmm1
+ movdqu 32(%r8, %rsi), %xmm2
+ movdqu 48(%r8, %rsi), %xmm3
+ movdqa %xmm0, (%r8)
+ movaps %xmm1, 16(%r8)
+ movaps %xmm2, 32(%r8)
+ movaps %xmm3, 48(%r8)
+ lea 64(%r8), %r8
+ cmp %r8, %rbx
+ ja L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+ add %rdi, %rdx
+ sub %r8, %rdx
+/* We copied all up till %rdi position in the dst.
+ In %rdx now is how many bytes are left to copy.
+ Now we need to advance %r8. */
+ lea (%r8, %rsi), %r9
+
+L(mm_remaining_0_64_bytes_forward):
+ cmp $32, %rdx
+ ja L(mm_remaining_33_64_bytes_forward)
+ cmp $16, %rdx
+ ja L(mm_remaining_17_32_bytes_forward)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(mm_return)
+
+ cmpb $8, %dl
+ ja L(mm_remaining_9_16_bytes_forward)
+ cmpb $4, %dl
+ .p2align 4,,5
+ ja L(mm_remaining_5_8_bytes_forward)
+ cmpb $2, %dl
+ .p2align 4,,1
+ ja L(mm_remaining_3_4_bytes_forward)
+ movzbl -1(%r9,%rdx), %esi
+ movzbl (%r9), %ebx
+ movb %sil, -1(%r8,%rdx)
+ movb %bl, (%r8)
+ jmp L(mm_return)
+
+L(mm_remaining_33_64_bytes_forward):
+ movdqu (%r9), %xmm0
+ movdqu 16(%r9), %xmm1
+ movdqu -32(%r9, %rdx), %xmm2
+ movdqu -16(%r9, %rdx), %xmm3
+ movdqu %xmm0, (%r8)
+ movdqu %xmm1, 16(%r8)
+ movdqu %xmm2, -32(%r8, %rdx)
+ movdqu %xmm3, -16(%r8, %rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_17_32_bytes_forward):
+ movdqu (%r9), %xmm0
+ movdqu -16(%r9, %rdx), %xmm1
+ movdqu %xmm0, (%r8)
+ movdqu %xmm1, -16(%r8, %rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_5_8_bytes_forward):
+ movl (%r9), %esi
+ movl -4(%r9,%rdx), %ebx
+ movl %esi, (%r8)
+ movl %ebx, -4(%r8,%rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_9_16_bytes_forward):
+ mov (%r9), %rsi
+ mov -8(%r9, %rdx), %rbx
+ mov %rsi, (%r8)
+ mov %rbx, -8(%r8, %rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_3_4_bytes_forward):
+ movzwl -2(%r9,%rdx), %esi
+ movzwl (%r9), %ebx
+ movw %si, -2(%r8,%rdx)
+ movw %bx, (%r8)
+ jmp L(mm_return)
+
+L(mm_len_0_16_bytes_forward):
+ testb $24, %dl
+ jne L(mm_len_9_16_bytes_forward)
+ testb $4, %dl
+ .p2align 4,,5
+ jne L(mm_len_5_8_bytes_forward)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(mm_return)
+ testb $2, %dl
+ .p2align 4,,1
+ jne L(mm_len_2_4_bytes_forward)
+ movzbl -1(%rsi,%rdx), %ebx
+ movzbl (%rsi), %esi
+ movb %bl, -1(%rdi,%rdx)
+ movb %sil, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+ movzwl -2(%rsi,%rdx), %ebx
+ movzwl (%rsi), %esi
+ movw %bx, -2(%rdi,%rdx)
+ movw %si, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+ movl (%rsi), %ebx
+ movl -4(%rsi,%rdx), %esi
+ movl %ebx, (%rdi)
+ movl %esi, -4(%rdi,%rdx)
+ jmp L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+ mov (%rsi), %rbx
+ mov -8(%rsi, %rdx), %rsi
+ mov %rbx, (%rdi)
+ mov %rsi, -8(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_recalc_len):
+/* Compute in %rdx how many bytes are left to copy after
+ the main loop stops. */
+ mov %rbx, %rdx
+ sub %rdi, %rdx
+/* The code for copying backwards. */
+L(mm_len_0_or_more_backward):
+
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+ separately. */
+ cmp $16, %rdx
+ jbe L(mm_len_0_16_bytes_backward)
+
+ cmp $32, %rdx
+ ja L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_32_or_more_backward):
+ cmp $64, %rdx
+ ja L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu -16(%rsi, %rdx), %xmm2
+ movdqu -32(%rsi, %rdx), %xmm3
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, -16(%rdi, %rdx)
+ movdqu %xmm3, -32(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_64_or_more_backward):
+ cmp $128, %rdx
+ ja L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqu -64(%rsi, %rdx), %xmm4
+ movdqu -48(%rsi, %rdx), %xmm5
+ movdqu -32(%rsi, %rdx), %xmm6
+ movdqu -16(%rsi, %rdx), %xmm7
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm4, -64(%rdi, %rdx)
+ movdqu %xmm5, -48(%rdi, %rdx)
+ movdqu %xmm6, -32(%rdi, %rdx)
+ movdqu %xmm7, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_128_or_more_backward):
+/* Aligning the address of destination. We need to save
+   the last 64 bytes of the source so they are not overwritten. */
+ movdqu -16(%rsi, %rdx), %xmm0
+ movdqu -32(%rsi, %rdx), %xmm1
+ movdqu -48(%rsi, %rdx), %xmm2
+ movdqu -64(%rsi, %rdx), %xmm3
+
+ lea (%rdi, %rdx), %r9
+ and $-64, %r9 /* r9 = aligned dst */
+
+ mov %rsi, %r8
+ sub %rdi, %r8 /* r8 = src - dst, diff */
+
+ movdqu -16(%r9, %r8), %xmm4
+ movdqu -32(%r9, %r8), %xmm5
+ movdqu -48(%r9, %r8), %xmm6
+ movdqu -64(%r9, %r8), %xmm7
+
+ movdqu %xmm0, -16(%rdi, %rdx)
+ movdqu %xmm1, -32(%rdi, %rdx)
+ movdqu %xmm2, -48(%rdi, %rdx)
+ movdqu %xmm3, -64(%rdi, %rdx)
+ movdqa %xmm4, -16(%r9)
+ movaps %xmm5, -32(%r9)
+ movaps %xmm6, -48(%r9)
+ movaps %xmm7, -64(%r9)
+ lea -64(%r9), %r9
+
+ lea 64(%rdi), %rbx
+ and $-64, %rbx
+
+ cmp %r9, %rbx
+ jae L(mm_recalc_len)
+
+ cmp $SHARED_CACHE_SIZE_HALF, %rdx
+ jae L(mm_large_page_loop_backward)
+
+ .p2align 4
+L(mm_main_loop_backward):
+
+ prefetcht0 -128(%r9, %r8)
+
+ movdqu -64(%r9, %r8), %xmm0
+ movdqu -48(%r9, %r8), %xmm1
+ movdqu -32(%r9, %r8), %xmm2
+ movdqu -16(%r9, %r8), %xmm3
+ movdqa %xmm0, -64(%r9)
+ movaps %xmm1, -48(%r9)
+ movaps %xmm2, -32(%r9)
+ movaps %xmm3, -16(%r9)
+ lea -64(%r9), %r9
+ cmp %r9, %rbx
+ jb L(mm_main_loop_backward)
+ jmp L(mm_recalc_len)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+ testb $24, %dl
+ jnz L(mm_len_9_16_bytes_backward)
+ testb $4, %dl
+ .p2align 4,,5
+ jnz L(mm_len_5_8_bytes_backward)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(mm_return)
+ testb $2, %dl
+ .p2align 4,,1
+ jne L(mm_len_3_4_bytes_backward)
+ movzbl -1(%rsi,%rdx), %ebx
+ movzbl (%rsi), %ecx
+ movb %bl, -1(%rdi,%rdx)
+ movb %cl, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+ movzwl -2(%rsi,%rdx), %ebx
+ movzwl (%rsi), %ecx
+ movw %bx, -2(%rdi,%rdx)
+ movw %cx, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+ movl -4(%rsi,%rdx), %ebx
+ movl -8(%rsi,%rdx), %ecx
+ movl %ebx, -4(%rdi,%rdx)
+ movl %ecx, -8(%rdi,%rdx)
+ sub $8, %rdx
+ jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+ movl (%rsi), %ebx
+ movl -4(%rsi,%rdx), %ecx
+ movl %ebx, (%rdi)
+ movl %ecx, -4(%rdi,%rdx)
+
+L(mm_return):
+ RETURN
+
+/* Big length copy forward part. */
+
+ .p2align 4
+L(mm_large_page_loop_forward):
+ movdqu (%r8, %rsi), %xmm0
+ movdqu 16(%r8, %rsi), %xmm1
+ movdqu 32(%r8, %rsi), %xmm2
+ movdqu 48(%r8, %rsi), %xmm3
+ movntdq %xmm0, (%r8)
+ movntdq %xmm1, 16(%r8)
+ movntdq %xmm2, 32(%r8)
+ movntdq %xmm3, 48(%r8)
+ lea 64(%r8), %r8
+ cmp %r8, %rbx
+ ja L(mm_large_page_loop_forward)
+ sfence
+ jmp L(mm_copy_remaining_forward)
+
+/* Big length copy backward part. */
+ .p2align 4
+L(mm_large_page_loop_backward):
+ movdqu -64(%r9, %r8), %xmm0
+ movdqu -48(%r9, %r8), %xmm1
+ movdqu -32(%r9, %r8), %xmm2
+ movdqu -16(%r9, %r8), %xmm3
+ movntdq %xmm0, -64(%r9)
+ movntdq %xmm1, -48(%r9)
+ movntdq %xmm2, -32(%r9)
+ movntdq %xmm3, -16(%r9)
+ lea -64(%r9), %r9
+ cmp %r9, %rbx
+ jb L(mm_large_page_loop_backward)
+ sfence
+ jmp L(mm_recalc_len)
+
+END (MEMMOVE)
+
+ALIAS_SYMBOL(memcpy, MEMMOVE)
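
Everything above hinges on a single comparison at the entry: if the destination starts above the source, the copy runs from the end backwards, otherwise from the start forwards, so an overlapping region is never read after it has been written. That property is what makes the routine a correct memmove, and it is also why memcpy can simply be aliased to it on the last line. A hedged, byte-at-a-time C equivalent of just that direction choice follows (simple_memmove is a made-up name; the assembly of course moves data in 16- and 64-byte blocks).

#include <stddef.h>

void *simple_memmove(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    if (d == s || n == 0)
        return dst;
    if (d < s) {                       /* forward: never clobbers unread bytes */
        for (size_t i = 0; i < n; i++)
            d[i] = s[i];
    } else {                           /* dst above src: copy from the end     */
        for (size_t i = n; i-- > 0; )
            d[i] = s[i];
    }
    return dst;
}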
diff --git a/sse2-memset-slm.S b/sse2-memset-slm.S
new file mode 100644
index 000000000000..cceadd2977d1
--- /dev/null
+++ b/sse2-memset-slm.S
@@ -0,0 +1,149 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <private/bionic_asm.h>
+
+#include "cache.h"
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+
+ENTRY(__memset_chk_generic)
+ # %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
+ cmp %rcx, %rdx
+ ja __memset_chk_fail
+ // Fall through to memset...
+END(__memset_chk_generic)
+
+
+ .section .text.sse2,"ax",@progbits
+ENTRY(memset_generic)
+ movq %rdi, %rax
+ and $0xff, %rsi
+ mov $0x0101010101010101, %rcx
+ imul %rsi, %rcx
+ cmpq $16, %rdx
+ jae L(16bytesormore)
+ testb $8, %dl
+ jnz L(8_15bytes)
+ testb $4, %dl
+ jnz L(4_7bytes)
+ testb $2, %dl
+ jnz L(2_3bytes)
+ testb $1, %dl
+ jz L(return)
+ movb %cl, (%rdi)
+L(return):
+ ret
+
+L(8_15bytes):
+ movq %rcx, (%rdi)
+ movq %rcx, -8(%rdi, %rdx)
+ ret
+
+L(4_7bytes):
+ movl %ecx, (%rdi)
+ movl %ecx, -4(%rdi, %rdx)
+ ret
+
+L(2_3bytes):
+ movw %cx, (%rdi)
+ movw %cx, -2(%rdi, %rdx)
+ ret
+
+ ALIGN (4)
+L(16bytesormore):
+ movd %rcx, %xmm0
+ pshufd $0, %xmm0, %xmm0
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm0, -16(%rdi, %rdx)
+ cmpq $32, %rdx
+ jbe L(32bytesless)
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm0, -32(%rdi, %rdx)
+ cmpq $64, %rdx
+ jbe L(64bytesless)
+ movdqu %xmm0, 32(%rdi)
+ movdqu %xmm0, 48(%rdi)
+ movdqu %xmm0, -64(%rdi, %rdx)
+ movdqu %xmm0, -48(%rdi, %rdx)
+ cmpq $128, %rdx
+ ja L(128bytesmore)
+L(32bytesless):
+L(64bytesless):
+ ret
+
+ ALIGN (4)
+L(128bytesmore):
+ leaq 64(%rdi), %rcx
+ andq $-64, %rcx
+ movq %rdx, %r8
+ addq %rdi, %rdx
+ andq $-64, %rdx
+ cmpq %rcx, %rdx
+ je L(return)
+
+#ifdef SHARED_CACHE_SIZE
+ cmp $SHARED_CACHE_SIZE, %r8
+#else
+ cmp __x86_64_shared_cache_size(%rip), %r8
+#endif
+ ja L(128bytesmore_nt)
+
+ ALIGN (4)
+L(128bytesmore_normal):
+ movdqa %xmm0, (%rcx)
+ movaps %xmm0, 0x10(%rcx)
+ movaps %xmm0, 0x20(%rcx)
+ movaps %xmm0, 0x30(%rcx)
+ addq $64, %rcx
+ cmpq %rcx, %rdx
+ jne L(128bytesmore_normal)
+ ret
+
+ ALIGN (4)
+L(128bytesmore_nt):
+ movntdq %xmm0, (%rcx)
+ movntdq %xmm0, 0x10(%rcx)
+ movntdq %xmm0, 0x20(%rcx)
+ movntdq %xmm0, 0x30(%rcx)
+ leaq 64(%rcx), %rcx
+ cmpq %rcx, %rdx
+ jne L(128bytesmore_nt)
+ sfence
+ ret
+
+END(memset_generic)
diff --git a/sse2-stpcpy-slm.S b/sse2-stpcpy-slm.S
new file mode 100644
index 000000000000..0ad2d44cf8ba
--- /dev/null
+++ b/sse2-stpcpy-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY stpcpy
+#include "sse2-strcpy-slm.S"
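
This wrapper shows how the import builds its whole str*/stp* family from one template: sse2-strcpy-slm.S is assembled again with USE_AS_STPCPY defined and STRCPY renamed, and only the length bookkeeping and the return value change. For reference, the semantic difference stpcpy adds over strcpy, as a hypothetical C sketch (my_stpcpy is a made-up name, not part of the import):

#include <string.h>

char *my_stpcpy(char *dst, const char *src)
{
    size_t len = strlen(src);
    memcpy(dst, src, len + 1);   /* copy the terminating NUL as well */
    return dst + len;            /* strcpy would return dst instead  */
}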
diff --git a/sse2-stpncpy-slm.S b/sse2-stpncpy-slm.S
new file mode 100644
index 000000000000..30666850bea2
--- /dev/null
+++ b/sse2-stpncpy-slm.S
@@ -0,0 +1,34 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY stpncpy
+#include "sse2-strcpy-slm.S"
diff --git a/sse2-strcat-slm.S b/sse2-strcat-slm.S
new file mode 100644
index 000000000000..dd8207ff54c4
--- /dev/null
+++ b/sse2-strcat-slm.S
@@ -0,0 +1,87 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRCAT
+# define STRCAT strcat
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+ mov %rdi, %r9
+#ifdef USE_AS_STRNCAT
+ mov %rdx, %r8
+#endif
+
+#define RETURN jmp L(Strcpy)
+#include "sse2-strlen-slm.S"
+
+#undef RETURN
+#define RETURN ret
+
+L(Strcpy):
+ lea (%r9, %rax), %rdi
+ mov %rsi, %rcx
+ mov %r9, %rax /* save result */
+
+#ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(ExitZero)
+# define USE_AS_STRNCPY
+#endif
+#include "sse2-strcpy-slm.S"
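
The strcat entry point above is a composition rather than a separate algorithm: it saves the destination in %r9, runs the included sse2-strlen-slm.S with RETURN redefined so it falls through, advances %rdi by the length found, and then runs the included sse2-strcpy-slm.S body, finally returning the saved destination. In C terms the whole file amounts to the sketch below (my_strcat is a made-up name; the real code never leaves assembly or makes out-of-line calls).

#include <string.h>

char *my_strcat(char *dst, const char *src)
{
    size_t dst_len = strlen(dst);   /* the included sse2-strlen-slm.S        */
    strcpy(dst + dst_len, src);     /* the included sse2-strcpy-slm.S body   */
    return dst;                     /* the destination saved in %r9 on entry */
}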
diff --git a/sse2-strcpy-slm.S b/sse2-strcpy-slm.S
new file mode 100644
index 000000000000..3e146bfbcfa5
--- /dev/null
+++ b/sse2-strcpy-slm.S
@@ -0,0 +1,1921 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+
+# ifndef L
+# define L(label) .L##label
+# endif
+
+# ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+# endif
+
+# ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+# endif
+
+# ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+# endif
+
+#endif
+
+#define JMPTBL(I, B) I - B
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ lea (%r11, %rcx), %rcx; \
+ jmp *%rcx
+
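
The macro above is the dispatch mechanism used throughout this file: a table of 32-bit offsets, relative to the table itself so the code stays position independent, is indexed by the number of bytes left (or by the position of the terminating NUL), and the resulting address is jumped to, landing on one of the hand-unrolled L(ExitN)/L(StrncpyExitN) tails further down. A portable, hypothetical C analogue of the same pattern is sketched here (copy_tail is a made-up name); a compiler typically lowers a dense switch like this into a very similar jump table.

#include <string.h>

void copy_tail(char *dst, const char *src, unsigned bytes_left)
{
    switch (bytes_left) {          /* BRANCH_TO_JMPTBL_ENTRY(table, index, 4) */
    case 0:  break;
    case 1:  memcpy(dst, src, 1); break;   /* cf. L(Exit1) */
    case 2:  memcpy(dst, src, 2); break;   /* cf. L(Exit2) */
    case 3:  memcpy(dst, src, 3); break;   /* cf. L(Exit3) */
    /* ...the assembly continues with one label per length, up to 32... */
    default: memcpy(dst, src, bytes_left); break;
    }
}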
+#ifndef USE_AS_STRCAT
+
+# define RETURN ret
+
+.text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8
+ test %r8, %r8
+ jz L(ExitZero)
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+#endif
+ and $63, %rcx
+ cmp $32, %rcx
+ jbe L(SourceStringAlignmentLess32)
+
+ and $-16, %rsi
+ and $15, %rcx
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%rsi), %xmm1
+ pmovmskb %xmm1, %rdx
+ shr %cl, %rdx
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ mov $16, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# else
+ mov $17, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm0, %rdx
+#ifdef USE_AS_STRNCPY
+ add $16, %r10
+ cmp %r10, %r8
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ sub %rcx, %rdi
+#ifdef USE_AS_STRNCPY
+ add %rcx, %r8
+#endif
+ mov $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $48, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm4
+ movdqu %xmm3, (%rdi, %rcx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm1
+ movdqu %xmm4, (%rdi, %rcx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqu %xmm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea 16(%rsi, %rcx), %rsi
+ and $-0x40, %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+#ifdef USE_AS_STRNCPY
+ lea 128(%r8, %rdx), %r8
+#endif
+L(Unaligned64Loop):
+ movaps (%rsi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%rsi), %xmm5
+ movaps 32(%rsi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rsi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+#ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+ add $64, %rdi
+ add $64, %rsi
+ movdqu %xmm4, -64(%rdi)
+ movaps (%rsi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%rdi)
+ movaps 16(%rsi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%rsi), %xmm3
+ movdqu %xmm6, -32(%rdi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%rdi)
+ movaps 48(%rsi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+#ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %rcx, %rcx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm6, 32(%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm7, 48(%rdi)
+ add $15, %r8
+ sub %rdx, %r8
+ lea 49(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $48, %rsi
+ add $48, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLess32):
+ pxor %xmm0, %xmm0
+ movdqu (%rsi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb %xmm2, %xmm0
+ movdqu %xmm1, (%rdi)
+ pmovmskb %xmm0, %rdx
+
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes1)
+
+ and $15, %rcx
+ and $-16, %rsi
+
+ jmp L(Unalign16Both)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+#if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %rsi
+ add $16, %rdi
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+#endif
+L(CopyFrom1To16BytesTail1):
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ bsf %rdx, %rdx
+ add %rcx, %rsi
+ add $16, %rdx
+ sub %rcx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ movdqu %xmm4, (%rdi)
+ add $63, %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm5, 16(%rdi)
+ add $47, %r8
+ sub %rdx, %r8
+ lea 17(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $16, %rsi
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %rdx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm6, 32(%rdi)
+ add $31, %r8
+ sub %rdx, %r8
+ lea 33(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $32, %rsi
+ add $32, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+#ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add $16, %rdx
+ sub %rcx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32BytesCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %rdi
+ add $16, %rsi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+#endif
+
+/*------------End of labels for copying 1-16 bytes and 1-32 bytes------------*/
+
+ .p2align 4
+L(Exit1):
+ mov %dh, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov %dh, 2(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ mov (%rsi), %ecx
+ mov %dh, 4(%rdi)
+ mov %ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $5, %r8
+ lea 5(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $6, %r8
+ lea 6(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $7, %r8
+ lea 7(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $8, %r8
+ lea 8(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ mov (%rsi), %rcx
+ mov %dh, 8(%rdi)
+ mov %rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $9, %r8
+ lea 9(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $10, %r8
+ lea 10(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $11, %r8
+ lea 11(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $12, %r8
+ lea 12(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $13, %r8
+ lea 13(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $14, %r8
+ lea 14(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $15, %r8
+ lea 15(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+ lea 16(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit17):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ mov %dh, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $17, %r8
+ lea 17(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $18, %r8
+ lea 18(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $19, %r8
+ lea 19(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $20, %r8
+ lea 20(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dh, 20(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $21, %r8
+ lea 21(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $22, %r8
+ lea 22(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $23, %r8
+ lea 23(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $24, %r8
+ lea 24(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ mov %dh, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $25, %r8
+ lea 25(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $26, %r8
+ lea 26(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $27, %r8
+ lea 27(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $28, %r8
+ lea 28(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $29, %r8
+ lea 29(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $30, %r8
+ lea 30(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $31, %r8
+ lea 31(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $32, %r8
+ lea 32(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+#ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit0):
+#ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, (%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit1):
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 1(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 2(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit3):
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 3(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 4(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit5):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 5(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 6(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 7(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 8(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit9):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 9(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 10(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 11(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 12(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 13(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 14(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 15(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 16(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 17(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 18(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 19(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 20(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 21(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 22(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 23(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 24(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 25(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 26(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 27(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 28(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 29(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 30(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 31(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 32(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 33(%rdi)
+#endif
+ RETURN
+
+#ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ mov %edx, -1(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ mov %edx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ mov %edx, (%rdi)
+ mov %dl, 4(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ mov %edx, (%rdi)
+ mov %dx, 4(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ mov %rdx, -1(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rdi)
+ mov %dl, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rdi)
+ mov %dx, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rdi)
+ mov %edx, 7(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rdi)
+ mov %edx, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rdi)
+ mov %rdx, 5(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rdi)
+ mov %rdx, 6(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%rdi)
+ RETURN
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%rdi, %rcx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %rdx, %rdx
+ add $15, %r8
+ add %rcx, %rdi
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %rdx, %rdx
+ sub $16, %r8
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%rdi)
+ add $16, %rdi
+
+ mov %rdi, %rsi
+ and $0xf, %rsi
+ sub %rsi, %rdi
+ add %rsi, %r8
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ movdqa %xmm0, 32(%rdi)
+ movdqa %xmm0, 48(%rdi)
+ add $64, %rdi
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %r8
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ add $32, %rdi
+ sub $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLess32):
+ add $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):
+ add $16, %r8
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+#endif
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%r8), %rcx
+ and $-16, %rcx
+ add $48, %r8
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 64(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 64(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %rcx, %rcx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $48, %r8
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+#ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm4, (%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+#ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm5, 16(%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+#ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm6, 32(%rdi)
+ lea 16(%rdi, %rcx), %rdi
+ lea 16(%rsi, %rcx), %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(ExitZero):
+#ifndef USE_AS_STRCAT
+ mov %rdi, %rax
+#endif
+ RETURN
+
+#endif
+
+#ifndef USE_AS_STRCAT
+END (STRCPY)
+#else
+END (STRCAT)
+#endif
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+#ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+ .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+# endif
+#endif
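
The ExitTable, ExitStrncpyTable and FillTable blocks above dispatch on the number of remaining bytes through 32-bit offsets stored relative to the table itself (the JMPTBL()/BRANCH_TO_JMPTBL_ENTRY macros). A rough, hypothetical C analogue of that pattern, written with GCC's labels-as-values extension, is sketched below; copy_tail and its tail* labels are invented names, and only four of the 30-odd table entries are shown, so this illustrates the dispatch idea rather than the imported code.

#include <stddef.h>
#include <string.h>

/* Toy tail-copy dispatcher: "left" picks one of the exit stubs, much as
 * BRANCH_TO_JMPTBL_ENTRY indexes L(ExitStrncpyTable) with the byte count. */
size_t copy_tail(char *dst, const char *src, size_t left)
{
    /* Offsets of each stub relative to the first, mirroring
     * ".int JMPTBL(L(StrncpyExitN), L(ExitStrncpyTable))". */
    static const int table[] = {
        &&tail0 - &&tail0,
        &&tail1 - &&tail0,
        &&tail2 - &&tail0,
        &&tail3 - &&tail0,
    };

    goto *(&&tail0 + table[left]);  /* left must be 0..3 in this toy */

tail0:
    return 0;
tail1:
    dst[0] = src[0];
    return 1;
tail2:
    memcpy(dst, src, 2);
    return 2;
tail3:
    memcpy(dst, src, 3);
    return 3;
}

Storing 4-byte label offsets instead of absolute addresses keeps the tables compact and position-independent: one RIP-relative lea of the table base suffices, and the entries need no load-time relocations.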
diff --git a/sse2-strlen-slm.S b/sse2-strlen-slm.S
new file mode 100644
index 000000000000..3772fe77043a
--- /dev/null
+++ b/sse2-strlen-slm.S
@@ -0,0 +1,294 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+#define RETURN ret
+ .section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
+/* end ifndef USE_AS_STRCAT */
+#endif
+ xor %rax, %rax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
+L(next):
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ pmovmskb %xmm0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm1
+ add $16, %rax
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm2
+ add $16, %rax
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm3
+ add $16, %rax
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $16, %rax
+ .p2align 4
+L(align64_loop):
+ movaps (%rax), %xmm4
+ pminub 16(%rax), %xmm4
+ movaps 32(%rax), %xmm5
+ pminub 48(%rax), %xmm5
+ add $64, %rax
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm0, %xmm5
+ pmovmskb %xmm5, %edx
+ test %edx, %edx
+ jz L(align64_loop)
+
+
+ pcmpeqb -64(%rax), %xmm0
+ sub $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+ RETURN
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ RETURN
+ .p2align 4
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $16, %rax
+ RETURN
+ .p2align 4
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $32, %rax
+ RETURN
+ .p2align 4
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $48, %rax
+ RETURN
+ .p2align 4
+L(exit64):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+#ifndef USE_AS_STRCAT
+ RETURN
+
+END (STRLEN)
+#endif
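
The strlen above scans in 16-byte chunks: pcmpeqb compares a block against zero, pmovmskb collapses the comparison into a 16-bit mask, and bsf of the first non-zero mask gives the position of the terminating NUL; the main loop additionally merges four blocks with pminub so only one pmovmskb is needed per 64 bytes. A minimal C sketch of that idea follows (an illustration only, not the imported routine; strlen_sse2_sketch is an invented name, __builtin_ctz is the GCC/Clang builtin, and the 64-byte unrolling is omitted).

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

size_t strlen_sse2_sketch(const char *s)
{
    const __m128i zero = _mm_setzero_si128();
    uintptr_t addr = (uintptr_t)s;
    const char *p = (const char *)(addr & ~(uintptr_t)15);
    unsigned misalign = (unsigned)(addr & 15);

    /* First aligned block: mask off matches before the start of the string.
     * Like the assembly, this may read a few bytes before s; an aligned
     * 16-byte load cannot cross a page boundary, though this is outside
     * strict ISO C. */
    unsigned m = (unsigned)_mm_movemask_epi8(
        _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
    m &= 0xffffu << misalign;
    if (m != 0)
        return (size_t)__builtin_ctz(m) - misalign;

    /* Then one aligned 16-byte block per iteration. */
    for (;;) {
        p += 16;
        m = (unsigned)_mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
        if (m != 0)
            return (size_t)(p - s) + (size_t)__builtin_ctz(m);
    }
}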
diff --git a/sse2-strncat-slm.S b/sse2-strncat-slm.S
new file mode 100644
index 000000000000..6b4a430846b0
--- /dev/null
+++ b/sse2-strncat-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCAT
+#define STRCAT strncat
+#include "sse2-strcat-slm.S"
diff --git a/sse2-strncpy-slm.S b/sse2-strncpy-slm.S
new file mode 100644
index 000000000000..594e78f74479
--- /dev/null
+++ b/sse2-strncpy-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "sse2-strcpy-slm.S"
diff --git a/sse4-memcmp-slm.S b/sse4-memcmp-slm.S
new file mode 100644
index 000000000000..8a8b180a29e5
--- /dev/null
+++ b/sse4-memcmp-slm.S
@@ -0,0 +1,1799 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) (I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ add %r11, %rcx; \
+ jmp *%rcx; \
+ ud2
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+#ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+#endif
+ pxor %xmm0, %xmm0
+ cmp $79, %rdx
+ ja L(79bytesormore)
+#ifndef USE_AS_WMEMCMP
+ cmp $1, %rdx
+ je L(firstbyte)
+#endif
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+#ifndef USE_AS_WMEMCMP
+ ALIGN (4)
+L(firstbyte):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ sub %ecx, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(79bytesormore):
+ movdqu (%rsi), %xmm1
+ movdqu (%rdi), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+ mov %rsi, %rcx
+ and $-16, %rsi
+ add $16, %rsi
+ sub %rsi, %rcx
+
+ sub %rcx, %rdi
+ add %rcx, %rdx
+ test $0xf, %rdi
+ jz L(2aligned)
+
+ cmp $128, %rdx
+ ja L(128bytesormore)
+L(less128bytes):
+ sub $64, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):
+ cmp $512, %rdx
+ ja L(512bytesormore)
+ cmp $256, %rdx
+ ja L(less512bytes)
+L(less256bytes):
+ sub $128, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):
+ sub $256, %rdx
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqu 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqu 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqu 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqu 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqu 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqu 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqu 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqu 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytes)
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_unaglined)
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loop):
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_unaligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This case is for machines which are sensitive to unaligned memory accesses.
+ */
+ ALIGN (4)
+L(2aligned):
+ cmp $128, %rdx
+ ja L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+ sub $64, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64in2alinged)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64in2alinged):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(128bytesormorein2aligned):
+ cmp $512, %rdx
+ ja L(512bytesormorein2aligned)
+ cmp $256, %rdx
+ ja L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+ sub $128, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128in2aligned)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(256bytesormorein2aligned):
+
+ sub $256, %rdx
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqa 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqa 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqa 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqa 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqa 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqa 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqa 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqa 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytesin2alinged)
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256in2alinged)
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256in2alinged):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_aglined)
+
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loopin2aligned):
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loopin2aligned)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_aligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+ ALIGN (4)
+L(64bytesormore_loop_end):
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm2, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm3, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm4, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ jmp L(16bytes)
+
+L(256bytesin256):
+ add $256, %rdi
+ add $256, %rsi
+ jmp L(16bytes)
+L(240bytesin256):
+ add $240, %rdi
+ add $240, %rsi
+ jmp L(16bytes)
+L(224bytesin256):
+ add $224, %rdi
+ add $224, %rsi
+ jmp L(16bytes)
+L(208bytesin256):
+ add $208, %rdi
+ add $208, %rsi
+ jmp L(16bytes)
+L(192bytesin256):
+ add $192, %rdi
+ add $192, %rsi
+ jmp L(16bytes)
+L(176bytesin256):
+ add $176, %rdi
+ add $176, %rsi
+ jmp L(16bytes)
+L(160bytesin256):
+ add $160, %rdi
+ add $160, %rsi
+ jmp L(16bytes)
+L(144bytesin256):
+ add $144, %rdi
+ add $144, %rsi
+ jmp L(16bytes)
+L(128bytesin256):
+ add $128, %rdi
+ add $128, %rsi
+ jmp L(16bytes)
+L(112bytesin256):
+ add $112, %rdi
+ add $112, %rsi
+ jmp L(16bytes)
+L(96bytesin256):
+ add $96, %rdi
+ add $96, %rsi
+ jmp L(16bytes)
+L(80bytesin256):
+ add $80, %rdi
+ add $80, %rsi
+ jmp L(16bytes)
+L(64bytesin256):
+ add $64, %rdi
+ add $64, %rsi
+ jmp L(16bytes)
+L(48bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(32bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytes):
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(8bytes):
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(12bytes):
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(4bytes):
+ mov -4(%rsi), %ecx
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(0bytes):
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* cases that cannot occur for wmemcmp (the byte length is a multiple of 4) */
+ ALIGN (4)
+L(65bytes):
+ movdqu -65(%rdi), %xmm1
+ movdqu -65(%rsi), %xmm2
+ mov $-65, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(49bytes):
+ movdqu -49(%rdi), %xmm1
+ movdqu -49(%rsi), %xmm2
+ mov $-49, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%rdi), %xmm1
+ movdqu -33(%rsi), %xmm2
+ mov $-33, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%rdi), %rax
+ mov -17(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(9bytes):
+ mov -9(%rdi), %rax
+ mov -9(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(13bytes):
+ mov -13(%rdi), %rax
+ mov -13(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(5bytes):
+ mov -5(%rdi), %eax
+ mov -5(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(66bytes):
+ movdqu -66(%rdi), %xmm1
+ movdqu -66(%rsi), %xmm2
+ mov $-66, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(50bytes):
+ movdqu -50(%rdi), %xmm1
+ movdqu -50(%rsi), %xmm2
+ mov $-50, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ movdqu -34(%rdi), %xmm1
+ movdqu -34(%rsi), %xmm2
+ mov $-34, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%rdi), %rax
+ mov -18(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(10bytes):
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(14bytes):
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(6bytes):
+ mov -6(%rdi), %eax
+ mov -6(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(2bytes):
+ movzwl -2(%rsi), %ecx
+ movzwl -2(%rdi), %eax
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(67bytes):
+ movdqu -67(%rdi), %xmm2
+ movdqu -67(%rsi), %xmm1
+ mov $-67, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(51bytes):
+ movdqu -51(%rdi), %xmm2
+ movdqu -51(%rsi), %xmm1
+ mov $-51, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ movdqu -35(%rsi), %xmm1
+ movdqu -35(%rdi), %xmm2
+ mov $-35, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ mov -19(%rdi), %rax
+ mov -19(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(11bytes):
+ mov -11(%rdi), %rax
+ mov -11(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(15bytes):
+ mov -15(%rdi), %rax
+ mov -15(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(7bytes):
+ mov -7(%rdi), %eax
+ mov -7(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(3bytes):
+ movzwl -3(%rdi), %eax
+ movzwl -3(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin2bytes)
+L(1bytes):
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %ecx
+ sub %ecx, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(68bytes):
+ movdqu -68(%rdi), %xmm2
+ movdqu -68(%rsi), %xmm1
+ mov $-68, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(52bytes):
+ movdqu -52(%rdi), %xmm2
+ movdqu -52(%rsi), %xmm1
+ mov $-52, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%rdi), %xmm2
+ movdqu -36(%rsi), %xmm1
+ mov $-36, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%rdi), %xmm2
+ movdqu -20(%rsi), %xmm1
+ mov $-20, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* cases that cannot occur for wmemcmp */
+ ALIGN (4)
+L(69bytes):
+ movdqu -69(%rsi), %xmm1
+ movdqu -69(%rdi), %xmm2
+ mov $-69, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(53bytes):
+ movdqu -53(%rsi), %xmm1
+ movdqu -53(%rdi), %xmm2
+ mov $-53, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ movdqu -37(%rsi), %xmm1
+ movdqu -37(%rdi), %xmm2
+ mov $-37, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ movdqu -21(%rsi), %xmm1
+ movdqu -21(%rdi), %xmm2
+ mov $-21, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(70bytes):
+ movdqu -70(%rsi), %xmm1
+ movdqu -70(%rdi), %xmm2
+ mov $-70, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(54bytes):
+ movdqu -54(%rsi), %xmm1
+ movdqu -54(%rdi), %xmm2
+ mov $-54, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ movdqu -38(%rsi), %xmm1
+ movdqu -38(%rdi), %xmm2
+ mov $-38, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ movdqu -22(%rsi), %xmm1
+ movdqu -22(%rdi), %xmm2
+ mov $-22, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(71bytes):
+ movdqu -71(%rsi), %xmm1
+ movdqu -71(%rdi), %xmm2
+ mov $-71, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(55bytes):
+ movdqu -55(%rdi), %xmm2
+ movdqu -55(%rsi), %xmm1
+ mov $-55, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ movdqu -39(%rdi), %xmm2
+ movdqu -39(%rsi), %xmm1
+ mov $-39, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ movdqu -23(%rdi), %xmm2
+ movdqu -23(%rsi), %xmm1
+ mov $-23, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(72bytes):
+ movdqu -72(%rsi), %xmm1
+ movdqu -72(%rdi), %xmm2
+ mov $-72, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(56bytes):
+ movdqu -56(%rdi), %xmm2
+ movdqu -56(%rsi), %xmm1
+ mov $-56, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ movdqu -40(%rdi), %xmm2
+ movdqu -40(%rsi), %xmm1
+ mov $-40, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ movdqu -24(%rdi), %xmm2
+ movdqu -24(%rsi), %xmm1
+ mov $-24, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* cases that cannot occur for wmemcmp */
+ ALIGN (4)
+L(73bytes):
+ movdqu -73(%rsi), %xmm1
+ movdqu -73(%rdi), %xmm2
+ mov $-73, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(57bytes):
+ movdqu -57(%rdi), %xmm2
+ movdqu -57(%rsi), %xmm1
+ mov $-57, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ movdqu -41(%rdi), %xmm2
+ movdqu -41(%rsi), %xmm1
+ mov $-41, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ movdqu -25(%rdi), %xmm2
+ movdqu -25(%rsi), %xmm1
+ mov $-25, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%rdi), %rax
+ mov -9(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(74bytes):
+ movdqu -74(%rsi), %xmm1
+ movdqu -74(%rdi), %xmm2
+ mov $-74, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(58bytes):
+ movdqu -58(%rdi), %xmm2
+ movdqu -58(%rsi), %xmm1
+ mov $-58, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ movdqu -42(%rdi), %xmm2
+ movdqu -42(%rsi), %xmm1
+ mov $-42, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ movdqu -26(%rdi), %xmm2
+ movdqu -26(%rsi), %xmm1
+ mov $-26, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ jmp L(diffin2bytes)
+
+ ALIGN (4)
+L(75bytes):
+ movdqu -75(%rsi), %xmm1
+ movdqu -75(%rdi), %xmm2
+ mov $-75, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(59bytes):
+ movdqu -59(%rdi), %xmm2
+ movdqu -59(%rsi), %xmm1
+ mov $-59, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ movdqu -43(%rdi), %xmm2
+ movdqu -43(%rsi), %xmm1
+ mov $-43, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ movdqu -27(%rdi), %xmm2
+ movdqu -27(%rsi), %xmm1
+ mov $-27, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -11(%rdi), %rax
+ mov -11(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+#endif
+ ALIGN (4)
+L(76bytes):
+ movdqu -76(%rsi), %xmm1
+ movdqu -76(%rdi), %xmm2
+ mov $-76, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(60bytes):
+ movdqu -60(%rdi), %xmm2
+ movdqu -60(%rsi), %xmm1
+ mov $-60, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ movdqu -44(%rdi), %xmm2
+ movdqu -44(%rsi), %xmm1
+ mov $-44, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ movdqu -28(%rdi), %xmm2
+ movdqu -28(%rsi), %xmm1
+ mov $-28, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* cases that cannot occur for wmemcmp */
+ ALIGN (4)
+L(77bytes):
+ movdqu -77(%rsi), %xmm1
+ movdqu -77(%rdi), %xmm2
+ mov $-77, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(61bytes):
+ movdqu -61(%rdi), %xmm2
+ movdqu -61(%rsi), %xmm1
+ mov $-61, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ movdqu -45(%rdi), %xmm2
+ movdqu -45(%rsi), %xmm1
+ mov $-45, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ movdqu -29(%rdi), %xmm2
+ movdqu -29(%rsi), %xmm1
+ mov $-29, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%rdi), %rax
+ mov -13(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(78bytes):
+ movdqu -78(%rsi), %xmm1
+ movdqu -78(%rdi), %xmm2
+ mov $-78, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(62bytes):
+ movdqu -62(%rdi), %xmm2
+ movdqu -62(%rsi), %xmm1
+ mov $-62, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ movdqu -46(%rdi), %xmm2
+ movdqu -46(%rsi), %xmm1
+ mov $-46, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ movdqu -30(%rdi), %xmm2
+ movdqu -30(%rsi), %xmm1
+ mov $-30, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(79bytes):
+ movdqu -79(%rsi), %xmm1
+ movdqu -79(%rdi), %xmm2
+ mov $-79, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(63bytes):
+ movdqu -63(%rdi), %xmm2
+ movdqu -63(%rsi), %xmm1
+ mov $-63, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ movdqu -47(%rdi), %xmm2
+ movdqu -47(%rsi), %xmm1
+ mov $-47, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ movdqu -31(%rdi), %xmm2
+ movdqu -31(%rsi), %xmm1
+ mov $-31, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -15(%rdi), %rax
+ mov -15(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+#endif
+ ALIGN (4)
+L(64bytes):
+ movdqu -64(%rdi), %xmm2
+ movdqu -64(%rsi), %xmm1
+ mov $-64, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%rdi), %xmm2
+ movdqu -48(%rsi), %xmm1
+ mov $-48, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%rdi), %xmm2
+ movdqu -32(%rsi), %xmm1
+ mov $-32, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+/*
+ * Align to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
+ */
+ ALIGN (3)
+L(less16bytes):
+ movsbq %dl, %rdx
+ mov (%rsi, %rdx), %rcx
+ mov (%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov 8(%rsi, %rdx), %rcx
+ mov 8(%rdi, %rdx), %rax
+L(diffin8bytes):
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ shr $32, %rcx
+ shr $32, %rax
+
+#ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+#endif
+
+L(diffin4bytes):
+#ifndef USE_AS_WMEMCMP
+ cmp %cx, %ax
+ jne L(diffin2bytes)
+ shr $16, %ecx
+ shr $16, %eax
+L(diffin2bytes):
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+#else
+
+/* for wmemcmp */
+ mov $1, %eax
+ jl L(nequal_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(nequal_bigger):
+ ret
+
+L(unreal_case):
+ xor %eax, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(end):
+ and $0xff, %eax
+ and $0xff, %ecx
+ sub %ecx, %eax
+ ret
+
+END (MEMCMP)
+
+ .section .rodata.sse4.1,"a",@progbits
+ ALIGN (3)
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(65bytes), L(table_64bytes))
+ .int JMPTBL (L(66bytes), L(table_64bytes))
+ .int JMPTBL (L(67bytes), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(69bytes), L(table_64bytes))
+ .int JMPTBL (L(70bytes), L(table_64bytes))
+ .int JMPTBL (L(71bytes), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(73bytes), L(table_64bytes))
+ .int JMPTBL (L(74bytes), L(table_64bytes))
+ .int JMPTBL (L(75bytes), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(77bytes), L(table_64bytes))
+ .int JMPTBL (L(78bytes), L(table_64bytes))
+ .int JMPTBL (L(79bytes), L(table_64bytes))
+#else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+#endif
diff --git a/ssse3-strcmp-slm.S b/ssse3-strcmp-slm.S
new file mode 100644
index 000000000000..e8acd5ba4612
--- /dev/null
+++ b/ssse3-strcmp-slm.S
@@ -0,0 +1,1925 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef USE_AS_STRNCMP
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+   if the new counter is greater than the old one (i.e. the subtraction
+   wrapped) or is 0. */
+#define UPDATE_STRNCMP_COUNTER \
+ /* calculate left number to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb L(strcmp_exitz); \
+ test %r9, %r9; \
+ je L(strcmp_exitz); \
+ mov %r9, %r11
+
+#else
+#define UPDATE_STRNCMP_COUNTER
+#ifndef STRCMP
+#define STRCMP strcmp
+#endif
+#endif
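+
+/*
+ * Roughly, UPDATE_STRNCMP_COUNTER does the following (illustrative C-style
+ * sketch; %r11 is "count" and %rcx is the offset of %rsi within its 16-byte
+ * block at the point the macro is used):
+ *
+ *	left = count - (16 - offset);	// bytes still left to compare
+ *	if (left > count || left == 0)	// wrapped around, or nothing left
+ *		return 0;		// equal within the limit
+ *	count = left;
+ */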
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+#define RETURN ret
+ .section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+/*
+ * This implementation uses SSE2/SSSE3 instructions to compare up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCMP
+ test %rdx, %rdx
+ je L(strcmp_exitz)
+ cmp $1, %rdx
+ je L(Byte0)
+ mov %rdx, %r11
+#endif
+ mov %esi, %ecx
+ mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+ cmp $0x30, %ecx
+ ja L(crosscache) /* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja L(crosscache) /* rdi: 16-byte load will cross cache line */
+ movlpd (%rdi), %xmm1
+ movlpd (%rsi), %xmm2
+ movhpd 8(%rdi), %xmm1
+ movhpd 8(%rsi), %xmm2
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
+ jnz L(less16bytes) /* If not, find different value or null char */
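+	/*
+	 * The pcmpeqb/pcmpeqb/psubb sequence above leaves 0xff only in byte
+	 * positions that are equal and not a null terminator, so after the
+	 * sub, %edx is zero exactly when all 16 bytes match with no null.
+	 */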
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+	jbe	L(strcmp_exitz)	/* finish comparison */
+#endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+ * Determine source and destination string offsets from 16-byte alignment.
+	 * Use the relative offset difference between the two to determine
+	 * which case below to use.
+ */
+ .p2align 4
+L(crosscache):
+	and	$0xfffffffffffffff0, %rsi /* align %rsi down to a 16-byte boundary */
+	and	$0xfffffffffffffff0, %rdi /* align %rdi down to a 16-byte boundary */
+	mov	$0xffff, %edx /* 16-bit all-ones mask, shifted below by the offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ cmp %eax, %ecx
+ je L(ashr_0) /* rsi and rdi relative offset same */
+ ja L(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail */
+ xchg %ecx, %eax
+ xchg %rsi, %rdi
+L(bigger):
+ lea 15(%rax), %r9
+ sub %rcx, %r9
+ lea L(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9,4), %r9
+ lea (%r10, %r9), %r10
+ jmp *%r10 /* jump to corresponding case */
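+	/*
+	 * %r9 = 15 + %rax - %rcx is in the range 0..14 here (%rcx > %rax), and
+	 * each L(unaligned_table) entry is a 32-bit offset relative to the
+	 * table itself, keeping the dispatch position-independent.  Entry k
+	 * sends us to L(ashr_(k+1)).
+	 */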
+
+/*
+ * The following cases will be handled by ashr_0
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+L(ashr_0):
+
+ movdqa (%rsi), %xmm1
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+	 * edx must equal r9d if the remaining bytes starting at (16-rcx)
+	 * match those starting at (16-rax) and no null char was seen.
+ */
+ jne L(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+ pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
+
+ /*
+ * Now both strings are aligned at 16-byte boundary. Loop over strings
+ * checking 32-bytes per iteration.
+ */
+ .p2align 4
+L(loop_ashr_0):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit) /* mismatch or null char seen */
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ jmp L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+L(ashr_1):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz L(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads*/
+ mov $1, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
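+	/*
+	 * %r10 now holds ((%rdi + 1) & 0xfff) - 4096, which is negative; the
+	 * loop adds 16 per block, and once %r10 turns positive the next
+	 * aligned 16-byte load may touch a new page, so the nibble path first
+	 * scans the bytes already loaded for a terminating null.
+	 */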
+
+ .p2align 4
+L(loop_ashr_1):
+ add $16, %r10
+ jg L(nibble_ashr_1) /* cross page boundary */
+
+L(gobble_ashr_1):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_1) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_1)
+
+ /*
+	 * The nibble path avoids loads that cross a page boundary, so we do
+	 * not risk touching unmapped memory past the end of the string.
+ */
+ .p2align 4
+L(nibble_ashr_1):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
+ pmovmskb %xmm0, %edx
+ test $0xfffe, %edx
+ jnz L(ashr_1_exittail) /* find null char*/
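+	/*
+	 * Byte 0 of %xmm3 was already compared as part of the previous
+	 * palignr window, so only a null in the remaining bytes 1..15 forces
+	 * the tail handling here.
+	 */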
+
+#ifdef USE_AS_STRNCMP
+ cmp $14, %r11
+ jbe L(ashr_1_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+	sub	$0x1000, %r10	/* subtract 4K from %r10 */
+ jmp L(gobble_ashr_1)
+
+ /*
+	 * Once a null char has been found, determine whether there is a
+	 * string mismatch before it.
+ */
+ .p2align 4
+L(ashr_1_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
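+	/*
+	 * The psrldq by the shift amount drops the low byte(s) of %xmm3 (and
+	 * of the null mask in %xmm0) that were already compared, leaving
+	 * L(aftertail) to check only the pending tail against the freshly
+	 * loaded %rsi data.
+	 */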
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+L(ashr_2):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_2):
+ add $16, %r10
+ jg L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_2) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_2)
+
+ .p2align 4
+L(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfffc, %edx
+ jnz L(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $13, %r11
+ jbe L(ashr_2_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_2)
+
+ .p2align 4
+L(ashr_2_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+L(ashr_3):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $3, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 3(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_3):
+ add $16, %r10
+ jg L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_3) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_3)
+
+ .p2align 4
+L(nibble_ashr_3):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff8, %edx
+ jnz L(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $12, %r11
+ jbe L(ashr_3_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_3)
+
+ .p2align 4
+L(ashr_3_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $3, %xmm0
+ psrldq $3, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+L(ashr_4):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $4, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 4(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_4):
+ add $16, %r10
+ jg L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_4) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_4)
+
+ .p2align 4
+L(nibble_ashr_4):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff0, %edx
+ jnz L(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $11, %r11
+ jbe L(ashr_4_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_4)
+
+ .p2align 4
+L(ashr_4_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $4, %xmm0
+ psrldq $4, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+L(ashr_5):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $5, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 5(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_5):
+ add $16, %r10
+ jg L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_5) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_5)
+
+ .p2align 4
+L(nibble_ashr_5):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffe0, %edx
+ jnz L(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $10, %r11
+ jbe L(ashr_5_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_5)
+
+ .p2align 4
+L(ashr_5_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $5, %xmm0
+ psrldq $5, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
+ */
+ .p2align 4
+L(ashr_6):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $6, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 6(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_6):
+ add $16, %r10
+ jg L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_6) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_6)
+
+ .p2align 4
+L(nibble_ashr_6):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffc0, %edx
+ jnz L(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $9, %r11
+ jbe L(ashr_6_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_6)
+
+ .p2align 4
+L(ashr_6_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $6, %xmm0
+ psrldq $6, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
+ */
+ .p2align 4
+L(ashr_7):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $9, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $7, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 7(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_7):
+ add $16, %r10
+ jg L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_7) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_7)
+
+ .p2align 4
+L(nibble_ashr_7):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff80, %edx
+ jnz L(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $8, %r11
+ jbe L(ashr_7_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_7)
+
+ .p2align 4
+L(ashr_7_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $7, %xmm0
+ psrldq $7, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
+ */
+ .p2align 4
+L(ashr_8):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $8, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $8, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 8(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_8):
+ add $16, %r10
+ jg L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_8) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_8)
+
+ .p2align 4
+L(nibble_ashr_8):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff00, %edx
+ jnz L(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $7, %r11
+ jbe L(ashr_8_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_8)
+
+ .p2align 4
+L(ashr_8_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $8, %xmm0
+ psrldq $8, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
+ */
+ .p2align 4
+L(ashr_9):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $7, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $9, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 9(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_9):
+ add $16, %r10
+ jg L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_9) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* store for next cycle */
+ jmp L(loop_ashr_9)
+
+ .p2align 4
+L(nibble_ashr_9):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfe00, %edx
+ jnz L(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $6, %r11
+ jbe L(ashr_9_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_9)
+
+ .p2align 4
+L(ashr_9_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $9, %xmm0
+ psrldq $9, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
+ */
+ .p2align 4
+L(ashr_10):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $6, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $10, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 10(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_10):
+ add $16, %r10
+ jg L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_10) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_10)
+
+ .p2align 4
+L(nibble_ashr_10):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfc00, %edx
+ jnz L(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $5, %r11
+ jbe L(ashr_10_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_10)
+
+ .p2align 4
+L(ashr_10_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $10, %xmm0
+ psrldq $10, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
+ */
+ .p2align 4
+L(ashr_11):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $11, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 11(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_11):
+ add $16, %r10
+ jg L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_11) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_11)
+
+ .p2align 4
+L(nibble_ashr_11):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf800, %edx
+ jnz L(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $4, %r11
+ jbe L(ashr_11_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_11)
+
+ .p2align 4
+L(ashr_11_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $11, %xmm0
+ psrldq $11, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
+ */
+ .p2align 4
+L(ashr_12):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $4, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $12, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 12(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_12):
+ add $16, %r10
+ jg L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_12) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_12)
+
+ .p2align 4
+L(nibble_ashr_12):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf000, %edx
+ jnz L(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $3, %r11
+ jbe L(ashr_12_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_12)
+
+ .p2align 4
+L(ashr_12_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $12, %xmm0
+ psrldq $12, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
+ */
+ .p2align 4
+L(ashr_13):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $3, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $13, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 13(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_13):
+ add $16, %r10
+ jg L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_13) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_13)
+
+ .p2align 4
+L(nibble_ashr_13):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xe000, %edx
+ jnz L(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $2, %r11
+ jbe L(ashr_13_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_13)
+
+ .p2align 4
+L(ashr_13_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $13, %xmm0
+ psrldq $13, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
+ */
+ .p2align 4
+L(ashr_14):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $2, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $14, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_14):
+ add $16, %r10
+ jg L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_14) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_14)
+
+ .p2align 4
+L(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xc000, %edx
+ jnz L(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $1, %r11
+ jbe L(ashr_14_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_14)
+
+ .p2align 4
+L(ashr_14_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+L(ashr_15):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_15):
+ add $16, %r10
+ jg L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_15) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_15)
+
+ .p2align 4
+L(nibble_ashr_15):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0x8000, %edx
+ jnz L(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+ test %r11, %r11
+ je L(ashr_15_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_15)
+
+ .p2align 4
+L(ashr_15_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $15, %xmm3
+ psrldq $15, %xmm0
+
+ .p2align 4
+L(aftertail):
+ pcmpeqb %xmm3, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ not %edx
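+	/*
+	 * After the NOT, a set bit in %edx marks a position where the shifted
+	 * %rdi tail and the %rsi data differ or where the string ends; the
+	 * bsf in L(ret) converts the lowest such bit into the byte offset
+	 * used for the final comparison.
+	 */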
+
+ .p2align 4
+L(exit):
+ lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
+L(less32bytes):
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
+ test %r8d, %r8d
+ jz L(ret)
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+L(ret):
+L(less16bytes):
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+ sub %rdx, %r11
+ jbe L(strcmp_exitz)
+#endif
+ movzbl (%rsi, %rdx), %ecx
+ movzbl (%rdi, %rdx), %eax
+
+ sub %ecx, %eax
+ ret
+
+L(strcmp_exitz):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(Byte0):
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+END (STRCMP)
+
+ .section .rodata,"a",@progbits
+ .p2align 3
+L(unaligned_table):
+ .int L(ashr_1) - L(unaligned_table)
+ .int L(ashr_2) - L(unaligned_table)
+ .int L(ashr_3) - L(unaligned_table)
+ .int L(ashr_4) - L(unaligned_table)
+ .int L(ashr_5) - L(unaligned_table)
+ .int L(ashr_6) - L(unaligned_table)
+ .int L(ashr_7) - L(unaligned_table)
+ .int L(ashr_8) - L(unaligned_table)
+ .int L(ashr_9) - L(unaligned_table)
+ .int L(ashr_10) - L(unaligned_table)
+ .int L(ashr_11) - L(unaligned_table)
+ .int L(ashr_12) - L(unaligned_table)
+ .int L(ashr_13) - L(unaligned_table)
+ .int L(ashr_14) - L(unaligned_table)
+ .int L(ashr_15) - L(unaligned_table)
+ .int L(ashr_0) - L(unaligned_table)
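+/*
+ * Entry k (0-based) dispatches to L(ashr_(k+1)); the equal-alignment case,
+ * L(ashr_0), occupies the last slot and is normally reached directly from
+ * the alignment check rather than through this table.
+ */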
diff --git a/ssse3-strncmp-slm.S b/ssse3-strncmp-slm.S
new file mode 100644
index 000000000000..0e407751714f
--- /dev/null
+++ b/ssse3-strncmp-slm.S
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
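+/*
+ * strncmp is generated from the SSSE3 strcmp implementation
+ * (ssse3-strcmp-slm.S) by enabling its counted-compare (USE_AS_STRNCMP)
+ * paths and renaming the entry point.
+ */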
+#define USE_AS_STRNCMP
+#define STRCMP strncmp
+#include "ssse3-strcmp-slm.S"