diff options
author | Ed Maste <emaste@FreeBSD.org> | 2022-08-15 21:24:08 +0000 |
---|---|---|
committer | Ed Maste <emaste@FreeBSD.org> | 2022-08-16 17:24:08 +0000 |
commit | d77520609f5240f5fad18fa1fd2275ac1de7cbb5 (patch) | |
tree | 0d4c3ec4a774bc05892f5f5e9f40715682bd11bc /sse2-strcpy-slm.S | |
download | src-d77520609f5240f5fad18fa1fd2275ac1de7cbb5.tar.gz src-d77520609f5240f5fad18fa1fd2275ac1de7cbb5.zip |
Import bionic's x86_64 string routines to vendorvendor/bionic-x86_64-string
Obtained from https://android.googlesource.com/platform/bionic
libc/arch-x86_64/string/ at commit
919fb7f2e0e0c877dd5e9bbaa71d4c4a73e50ad3
Requested by: mjg
Sponsored by: The FreeBSD Foundation
Diffstat (limited to 'sse2-strcpy-slm.S')
-rw-r--r-- | sse2-strcpy-slm.S | 1921 |
1 files changed, 1921 insertions, 0 deletions
diff --git a/sse2-strcpy-slm.S b/sse2-strcpy-slm.S new file mode 100644 index 000000000000..3e146bfbcfa5 --- /dev/null +++ b/sse2-strcpy-slm.S @@ -0,0 +1,1921 @@ +/* +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef USE_AS_STRCAT + +# ifndef STRCPY +# define STRCPY strcpy +# endif + +# ifndef L +# define L(label) .L##label +# endif + +# ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +# endif + +# ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +# endif + +# ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +# endif + +# ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +# endif + +#endif + +#define JMPTBL(I, B) I - B +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +#ifndef USE_AS_STRCAT + +# define RETURN ret + +.text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +#endif + and $63, %rcx + cmp $32, %rcx + jbe L(SourceStringAlignmentLess32) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx +#ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $17, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx +#ifdef USE_AS_STRNCPY + add $16, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + +/* If source adress alignment != destination adress alignment */ + .p2align 4 +L(Unalign16Both): + sub %rcx, %rdi +#ifdef USE_AS_STRNCPY + add %rcx, %r8 +#endif + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm1) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +#ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +#endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +#ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +#endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +#ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +#endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + +/* If source adress alignment == destination adress alignment */ + +L(SourceStringAlignmentLess32): + pxor %xmm0, %xmm0 + movdqu (%rsi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +#ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb %xmm2, %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +#ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + and $15, %rcx + and $-16, %rsi + + jmp L(Unalign16Both) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +#if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 +#endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + +#ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) +# endif + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +#endif + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +#ifdef USE_AS_STPCPY + lea (%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +#ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $6, %r8 + lea 6(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 + lea 16(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $21, %r8 + lea 21(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + +#ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 18(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +#endif + RETURN + +#ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + RETURN + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + RETURN + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + RETURN + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + RETURN + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + RETURN + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + RETURN + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + RETURN + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + RETURN + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + RETURN + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + RETURN + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + RETURN + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov %edx, 8(%rdi) + RETURN + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + RETURN + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +#ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +#endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +/* end of ifndef USE_AS_STRCAT */ +#endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +#ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +#endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +#ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +#else + jnz L(CopyFrom1To16Bytes) +#endif + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +#ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm5) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +#ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm6) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): +#ifndef USE_AS_STRCAT + mov %rdi, %rax +#endif + RETURN + +#endif + +#ifndef USE_AS_STRCAT +END (STRCPY) +#else +END (STRCAT) +#endif + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +#ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# endif +#endif |