Diffstat (limited to 'lib/libc/amd64')
41 files changed, 3534 insertions, 575 deletions
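The bulk of this diff replaces the old single-implementation amd64 string routines with archlevel-dispatched kernels (scalar and SSE baseline variants selected via amd64_archlevel.c) and adds new assembly implementations of memchr, memccpy, memrchr, stpncpy, strcspn, strlcpy and others. As an orientation aid only, here is a minimal C sketch of the strategy the scalar __memccpy path in this diff follows (locate the stop character with memchr, then bulk-copy up to and including it); the name memccpy_sketch is hypothetical and the committed code is assembly calling the internal __memchr:

#include <string.h>

/*
 * Illustrative C equivalent of the scalar __memccpy path: find the stop
 * character with memchr(), then copy everything up to and including it.
 * If the character is absent, copy the whole buffer and return NULL.
 */
void *
memccpy_sketch(void *restrict dst, const void *restrict src, int c, size_t len)
{
	const char *stop = memchr(src, c, len);	/* first occurrence of c, if any */
	size_t n = stop != NULL ? (size_t)(stop - (const char *)src) + 1 : len;

	memcpy(dst, src, n);			/* copy up to and including c */
	return (stop != NULL ? (char *)dst + n : NULL);
}

The committed scalar routine does the same work in two calls (__memchr followed by memcpy), while the baseline variant inlines the search and copy using SSE compares.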
diff --git a/lib/libc/amd64/SYS.h b/lib/libc/amd64/SYS.h deleted file mode 100644 index c2c8ef8a56ef..000000000000 --- a/lib/libc/amd64/SYS.h +++ /dev/null @@ -1,53 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-3-Clause - * - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)SYS.h 5.5 (Berkeley) 5/7/91 - */ - -#include <sys/syscall.h> -#include <machine/asm.h> - -#define RSYSCALL(name) ENTRY(__sys_##name); \ - WEAK_REFERENCE(__sys_##name, name); \ - WEAK_REFERENCE(__sys_##name, _##name); \ - mov $SYS_##name,%eax; KERNCALL; \ - jb HIDENAME(cerror); ret; \ - END(__sys_##name) - -#define PSEUDO(name) ENTRY(__sys_##name); \ - WEAK_REFERENCE(__sys_##name, _##name); \ - mov $SYS_##name,%eax; KERNCALL; \ - jb HIDENAME(cerror); ret; \ - END(__sys_##name) - -#define KERNCALL movq %rcx, %r10; syscall diff --git a/lib/libc/amd64/Symbol.map b/lib/libc/amd64/Symbol.map index 39a913bd5e84..36f54de24fbd 100644 --- a/lib/libc/amd64/Symbol.map +++ b/lib/libc/amd64/Symbol.map @@ -1,20 +1,12 @@ /* - */ - -/* * This only needs to contain symbols that are not listed in * symbol maps from other parts of libc (i.e., not found in * stdlib/Symbol.map, string/Symbol.map, sys/Symbol.map, ...). 
*/ FBSD_1.0 { - /* PSEUDO syscalls */ - _exit; - .mcount; - _setjmp; - _longjmp; - fabs; __flt_rounds; + brk; fpgetmask; fpgetprec; fpgetround; @@ -22,32 +14,7 @@ FBSD_1.0 { fpsetmask; fpsetprec; fpsetround; - __infinity; - __nan; - makecontext; - rfork_thread; - setjmp; - longjmp; - sigsetjmp; - siglongjmp; - htonl; - htons; - ntohl; - ntohs; - amd64_get_fsbase; - amd64_get_gsbase; - amd64_set_fsbase; - amd64_set_gsbase; - brk; sbrk; - vfork; -}; - -FBSD_1.6 { - x86_pkru_get_perm; - x86_pkru_set_perm; - x86_pkru_protect_range; - x86_pkru_unprotect_range; }; /* @@ -56,15 +23,10 @@ FBSD_1.6 { * */ FBSDprivate_1.0 { - /* PSEUDO syscalls */ - _getlogin; - ___longjmp; - __makecontext; __longjmp; __signalcontext; signalcontext; __siglongjmp; _brk; - _vfork; }; diff --git a/lib/libc/amd64/gen/Makefile.inc b/lib/libc/amd64/gen/Makefile.inc index 4869973ca254..aaffcb0481f1 100644 --- a/lib/libc/amd64/gen/Makefile.inc +++ b/lib/libc/amd64/gen/Makefile.inc @@ -1,6 +1,4 @@ -# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 - -SRCS+= _setjmp.S rfork_thread.S setjmp.S sigsetjmp.S \ +SRCS+= _setjmp.S setjmp.S sigsetjmp.S \ fabs.S \ infinity.c ldexp.c makecontext.c signalcontext.c \ flt_rounds.c fpgetmask.c fpsetmask.c fpgetprec.c fpsetprec.c \ diff --git a/lib/libc/amd64/gen/_setjmp.S b/lib/libc/amd64/gen/_setjmp.S index 43af2b68b3f2..93b27de49ea0 100644 --- a/lib/libc/amd64/gen/_setjmp.S +++ b/lib/libc/amd64/gen/_setjmp.S @@ -30,9 +30,6 @@ * SUCH DAMAGE. */ -#if defined(LIBC_SCCS) && !defined(lint) - .asciz "@(#)_setjmp.s 5.1 (Berkeley) 4/23/90" -#endif /* LIBC_SCCS and not lint */ #include <machine/asm.h> /* * C library -- _setjmp, _longjmp diff --git a/lib/libc/amd64/gen/flt_rounds.c b/lib/libc/amd64/gen/flt_rounds.c index 018ea029ee3f..cd7e501af5af 100644 --- a/lib/libc/amd64/gen/flt_rounds.c +++ b/lib/libc/amd64/gen/flt_rounds.c @@ -3,7 +3,6 @@ * Public domain. */ -#include <sys/cdefs.h> #include <float.h> static const int map[] = { diff --git a/lib/libc/amd64/gen/infinity.c b/lib/libc/amd64/gen/infinity.c index b9db2fc84efa..bc05708abd2b 100644 --- a/lib/libc/amd64/gen/infinity.c +++ b/lib/libc/amd64/gen/infinity.c @@ -2,7 +2,6 @@ * infinity.c */ -#include <sys/cdefs.h> #include <math.h> /* bytes for +Infinity on a 387 */ diff --git a/lib/libc/amd64/gen/makecontext.c b/lib/libc/amd64/gen/makecontext.c index dcc3b8ab9b45..c5767c9d5d75 100644 --- a/lib/libc/amd64/gen/makecontext.c +++ b/lib/libc/amd64/gen/makecontext.c @@ -26,7 +26,6 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> #include <sys/types.h> #include <sys/ucontext.h> #include <stdarg.h> diff --git a/lib/libc/amd64/gen/rfork_thread.S b/lib/libc/amd64/gen/rfork_thread.S deleted file mode 100644 index a3c64fad7994..000000000000 --- a/lib/libc/amd64/gen/rfork_thread.S +++ /dev/null @@ -1,93 +0,0 @@ -/*- - * Copyright (c) 2000 Peter Wemm <peter@FreeBSD.org> - * Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <machine/asm.h> -/* - * With thanks to John Dyson for the original version of this. - */ - -#include <SYS.h> - -/* - * %edi %rsi %rdx %rcx - * rfork_thread(flags, stack_addr, start_fnc, start_arg); - * - * flags: Flags to rfork system call. See rfork(2). - * stack_addr: Top of stack for thread. - * start_fnc: Address of thread function to call in child. - * start_arg: Argument to pass to the thread function in child. - */ - -ENTRY(rfork_thread) - pushq %rbx - pushq %r12 - movq %rdx, %rbx - movq %rcx, %r12 - - /* - * Prepare and execute the thread creation syscall - */ - movq $SYS_rfork, %rax - KERNCALL - jb 2f - - /* - * Check to see if we are in the parent or child - */ - cmpl $0, %edx - jnz 1f - popq %r12 - popq %rbx - ret - - /* - * If we are in the child (new thread), then - * set-up the call to the internal subroutine. If it - * returns, then call __exit. - */ -1: - movq %rsi, %rsp - movq %r12, %rdi - call *%rbx - movl %eax, %edi - - /* - * Exit system call - */ - movq $SYS_exit, %rax - KERNCALL - - /* - * Branch here if the thread creation fails: - */ -2: - popq %r12 - popq %rbx - jmp HIDENAME(cerror) -END(rfork_thread) - - .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/gen/setjmp.S b/lib/libc/amd64/gen/setjmp.S index 6f469c4c08e8..54939f123807 100644 --- a/lib/libc/amd64/gen/setjmp.S +++ b/lib/libc/amd64/gen/setjmp.S @@ -30,9 +30,6 @@ * SUCH DAMAGE. */ -#if defined(LIBC_SCCS) && !defined(lint) - .asciz "@(#)setjmp.s 5.1 (Berkeley) 4/23/90" -#endif /* LIBC_SCCS and not lint */ #include <machine/asm.h> /* * C library -- _setjmp, _longjmp diff --git a/lib/libc/amd64/gen/signalcontext.c b/lib/libc/amd64/gen/signalcontext.c index cc1c2523c754..a97dd158542a 100644 --- a/lib/libc/amd64/gen/signalcontext.c +++ b/lib/libc/amd64/gen/signalcontext.c @@ -26,7 +26,6 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> #include <sys/types.h> #include <sys/ucontext.h> #include <signal.h> diff --git a/lib/libc/amd64/gen/sigsetjmp.S b/lib/libc/amd64/gen/sigsetjmp.S index 757280159d82..c4775b1c2bea 100644 --- a/lib/libc/amd64/gen/sigsetjmp.S +++ b/lib/libc/amd64/gen/sigsetjmp.S @@ -28,8 +28,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
- * - * @(#)setjmp.s 5.1 (Berkeley) 4/23/90" */ #if defined(LIBC_SCCS) && !defined(lint) diff --git a/lib/libc/amd64/stdlib/Makefile.inc b/lib/libc/amd64/stdlib/Makefile.inc index 8b9af2b3eab1..568f8eb4afa7 100644 --- a/lib/libc/amd64/stdlib/Makefile.inc +++ b/lib/libc/amd64/stdlib/Makefile.inc @@ -1,3 +1 @@ -# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 - MDSRCS+=div.S ldiv.S lldiv.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 4df4ff8f1417..d5bb646c5c53 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -1,14 +1,36 @@ - MDSRCS+= \ amd64_archlevel.c \ bcmp.S \ + memchr.S \ memcmp.S \ + memccpy.S \ memcpy.S \ memmove.S \ + memrchr.S \ memset.S \ stpcpy.S \ + stpncpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ + strcpy.c \ + strcspn.S \ + strlcat.c \ + strlcpy.S \ strlen.S \ - strcpy.c + strncat.c \ + strncmp.S \ + strncpy.c \ + strnlen.c \ + strpbrk.c \ + strrchr.S \ + strsep.c \ + strspn.S \ + timingsafe_bcmp.S \ + timingsafe_memcmp.S + +.if ${MK_ASAN} != "no" +# Disable ASAN for amd64_archlevel.c since its code is executed before the +# sanitizer runtime can initialize itself. +CFLAGS.amd64_archlevel.c+= -fno-sanitize=address +.endif diff --git a/lib/libc/amd64/string/bcopy.c b/lib/libc/amd64/string/bcopy.c index 406b28f0b97a..0dee529fb9df 100644 --- a/lib/libc/amd64/string/bcopy.c +++ b/lib/libc/amd64/string/bcopy.c @@ -2,9 +2,10 @@ * Public domain. */ -#include <sys/cdefs.h> #include <string.h> +#undef bcopy /* _FORTIFY_SOURCE */ + void bcopy(const void *src, void *dst, size_t len) { diff --git a/lib/libc/amd64/string/bzero.c b/lib/libc/amd64/string/bzero.c index a4fdb74d6bb4..d82f3061865b 100644 --- a/lib/libc/amd64/string/bzero.c +++ b/lib/libc/amd64/string/bzero.c @@ -2,9 +2,10 @@ * Public domain. */ -#include <sys/cdefs.h> #include <string.h> +#undef bzero /* _FORTIFY_SOURCE */ + void bzero(void *b, size_t len) { diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S new file mode 100644 index 000000000000..69b650fffc33 --- /dev/null +++ b/lib/libc/amd64/string/memccpy.S @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2023, 2024 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak memccpy + .set memccpy, __memccpy +ARCHFUNCS(__memccpy) + ARCHFUNC(__memccpy, scalar) + ARCHFUNC(__memccpy, baseline) +ENDARCHFUNCS(__memccpy) + +ARCHENTRY(__memccpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + push %rax # dummy push for alignment + push %rbx + push %rdi + push %rsi + + mov %rsi, %rdi + mov %edx, %esi + mov %rcx, %rdx + mov %rcx, %rbx + call CNAME(__memchr) # ptr = memchr(src, c, len) + + pop %rsi + pop %rdi + lea 1(%rax), %rdx + sub %rsi, %rdx # size = ptr - src + 1 + mov %rbx, %rcx + lea (%rdi, %rdx, 1), %rbx # res = dest + size + test %rax, %rax # if (ptr == NULL) + cmovz %rcx, %rdx # size = len + cmovz %rax, %rbx # res = NULL + call CNAME(memcpy) + + mov %rbx, %rax # return (res) + pop %rbx + leave + ret +ARCHEND(__memccpy, scalar) + +ARCHENTRY(__memccpy, baseline) + sub $1, %rcx # RCX refers to last character in buffer + jb .L0 # go to special code path if len was 0 + + movd %edx, %xmm4 + mov %rcx, %rdx + punpcklbw %xmm4, %xmm4 # c -> cc + mov %esi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + mov %rsi, %r9 # stash a copy of the source pointer for later + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $~0xf, %rsi + movdqa %xmm4, %xmm1 + pcmpeqb (%rsi), %xmm1 # c found in head? + and $0xf, %ecx + mov $-1, %eax + pmovmskb %xmm1, %r8d + lea -32(%rcx), %r11 + shl %cl, %eax # mask of bytes in the string + add %rdx, %r11 # distance from alignment boundary - 32 + jnc .Lrunt # jump if buffer length is 32 or less + + and %r8d, %eax + jz 0f # match (or induced match) found? + + /* match in first chunk */ + tzcnt %eax, %edx # where is c? + sub %ecx, %edx # ... from the beginning of the string? + lea 1(%rdi, %rdx, 1), %rax # return value + jmp .L0116 + +0: movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + movdqa %xmm4, %xmm1 + pcmpeqb %xmm3, %xmm1 # c found in second chunk? + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jz 0f + + /* match in second chunk */ + tzcnt %eax, %edx # where is c? + sub $16, %ecx + sub %ecx, %edx # adjust for alignment offset + lea 1(%rdi, %rdx, 1), %rax # return value + jmp .L0132 + + /* c not found in second chunk: prepare for main loop */ +0: movdqa 32(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + mov %r11, %rdx + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jb 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # c encountered? 
+ pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jb 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # c encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + sub $32, %rdx + jae 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # c encountered? + pmovmskb %xmm0, %r8d + mov %r8d, %ecx + bts %edx, %r8d # treat end of buffer as end of string + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, 1(%rdi, %r8, 1) # store string tail + lea 17(%rdi, %r8, 1), %rsi # return value if terminator encountered + xor %eax, %eax # return value if no terminator encountered + bt %r8d, %ecx # terminator encountered inside buffer? + cmovc %rsi, %rax # if yes, return pointer, else NULL + ret + +4: sub $16, %rsi # undo second advancement + + /* terminator found and buffer has not ended yet */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c) + lea 1(%rdi, %rax, 1), %rax # compute return value + ret + + /* buffer is 1--32 bytes in size */ + ALIGN_TEXT +.Lrunt: add $32, %r11d # undo earlier decrement + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce match at buffer end + and %ax, %r8w # is there a match in the first 16 bytes? + jnz 0f # if yes, skip looking at second chunk + + pcmpeqb 16(%rsi), %xmm4 # check for match in second chunk + pmovmskb %xmm4, %r8d + shl $16, %r8d # place second chunk matches in bits 16--31 + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce a match at buffer end + +0: xor %eax, %eax # return value if terminator not found + tzcnt %r8d, %edx # find string/buffer length from alignment boundary + lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx + sub %rcx, %r8 + bt %edx, %r10d # was the terminator present? + cmovc %r8, %rax # if yes, return pointer, else NULL + sub %ecx, %edx # find actual string/buffer length + + ALIGN_TEXT +.L0132: cmp $16, %rdx # at least 17 bytes to copy? + jb .L0116 + + /* copy 17--32 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -15(%rdi, %rdx, 1) + ret + + /* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */ + ALIGN_TEXT +.L0116: cmp $8, %rdx # at least 9 bytes to copy? + jae .L0916 + + cmp $4, %rdx # at least 5 bytes to copy? + jae .L0508 + + cmp $2, %rdx # at least 3 bytes to copy? 
+ jae .L0304 + + /* copy one or two bytes */ + movzbl (%r9), %ecx # load first byte from src + movzbl (%r9, %rdx, 1), %esi # load last byte from src + mov %cl, (%rdi) # deposit into destination + mov %sil, (%rdi, %rdx, 1) + ret + +.L0304: movzwl (%r9), %ecx + movzwl -1(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -1(%rdi, %rdx, 1) + ret + +.L0508: mov (%r9), %ecx + mov -3(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -3(%rdi, %rdx, 1) + ret + +.L0916: mov (%r9), %rcx + mov -7(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -7(%rdi, %rdx, 1) + ret + + /* length zero destination: return null pointer */ +.L0: xor %eax, %eax + ret +ARCHEND(__memccpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/memchr.S b/lib/libc/amd64/string/memchr.S new file mode 100644 index 000000000000..cfab9b1302de --- /dev/null +++ b/lib/libc/amd64/string/memchr.S @@ -0,0 +1,207 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + + .weak memchr + .set memchr, __memchr +ARCHFUNCS(__memchr) + ARCHFUNC(__memchr, scalar) + ARCHFUNC(__memchr, baseline) +ENDARCHFUNCS(__memchr) + +ARCHENTRY(__memchr, scalar) + test %rdx, %rdx # empty input? 
+ je .Lnomatch + + lea (, %rdi, 8), %ecx + mov $-1, %rax + add %rdi, %rdx # pointer to end of buffer or to end of + cmovc %rax, %rdx # address space (whichever comes first) + and $~7, %rdi # align to 8 bytes + mov (%rdi), %rax # load first word + movzbl %sil, %esi # clear stray high bits + movabs $0x0101010101010101, %r8 + imul %r8, %rsi # replicate char 8 times + + /* compute head and tail masks */ + mov %r8, %r10 + movabs $0x8080808080808080, %r9 + shl %cl, %r10 # 0x01 where string head is + lea (, %rdx, 8), %ecx + xor %r8, %r10 # 0x01 where it is not + neg %r8 # negate 01..01 so we can use lea + mov %r9, %r11 + xor %rsi, %rax # str ^ c (0x00 where str[i] == c) + neg %ecx + or %r10, %rax # except before the string + shr %cl, %r11 # 0x80 where string tail is + + add $8, %rdi # advance to next 8 bytes + cmp %rdx, %rdi # end of buffer reached during head? + jae .Ltail # and go to tail-processing code + + /* main loop, unrolled twice */ + ALIGN_TEXT +0: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 + not %rax # ~(str ^ c) + and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) + and %rcx, %rax # not including junk bytes + jnz .Lmatch + + mov (%rdi), %rax + add $8, %rdi + xor %rsi, %rax # str ^ c + cmp %rdx, %rdi + jae .Ltail + + lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 + not %rax # ~(str ^ c) + and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) + and %rcx, %rax # not including junk bytes + jnz .Lmatch + + mov (%rdi), %rax + add $8, %rdi + xor %rsi, %rax # str ^ c + cmp %rdx, %rdi + jb 0b + +.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 + not %rax # ~(str ^ c) + and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c) + and %rcx, %rax # not including junk bytes or bytes past buffer + jz .Lnomatch + +.Lmatch: + tzcnt %rax, %rax # first match + shr $3, %eax # scale from bit to byte index + lea -8(%rdi, %rax), %rax # pointer to found c + ret + + /* no match found */ +.Lnomatch: + xor %eax, %eax # return null pointer + ret +ARCHEND(__memchr, scalar) + +ARCHENTRY(__memchr, baseline) + test %rdx, %rdx # empty input? + je .Lnomatchb + + movd %esi, %xmm2 + mov %edi, %ecx + mov $-1, %r9 + add %rdi, %rdx # pointer to end of buffer or to end of + cmovc %r9, %rdx # address space (whichever comes first) + and $~0x1f, %rdi # align to 32 bytes + movdqa (%rdi), %xmm0 # load first 32 bytes + movdqa 16(%rdi), %xmm1 + + punpcklbw %xmm2, %xmm2 # c -> cc + + shl %cl, %r9d # mask with zeroes before the string + + punpcklwd %xmm2, %xmm2 # cc -> cccc + + mov $-1, %r8d + xor %ecx, %ecx + sub %edx, %ecx # edx = -ecx + shr %cl, %r8d # bytes in tail that are part of the buffer + + pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc + + add $32, %rdi # advance to next 32 bytes + mov $-1, %eax + cmp %rdx, %rdi # end of buffer reached during head? + cmovae %r8d, %eax # if yes, do combined head/tail processing + and %r9d, %eax # mask of bytes in head part of string + + /* process head */ + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %esi + pmovmskb %xmm0, %ecx + shl $16, %esi + or %esi, %ecx # locations of matches + and %ecx, %eax # any match inside buffer? + jnz .Lprecisematchb + + cmp %rdx, %rdi # did the buffer end here? + jae .Lnomatchb # if yes we are done + + /* main loop */ + ALIGN_TEXT +0: movdqa (%rdi), %xmm0 # load next string chunk + movdqa 16(%rdi), %xmm1 + add $32, %rdi + cmp %rdx, %rdi # ready for main loop? + jae .Ltailb + + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm2, %xmm1 + por %xmm1, %xmm0 # match in either half? 
+ pmovmskb %xmm0, %eax + test %eax, %eax + jz 0b + +.Lmatchb: + pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes + pmovmskb %xmm1, %ecx + pmovmskb %xmm2, %eax + shl $16, %ecx + or %ecx, %eax # location of matches + +.Lprecisematchb: + tzcnt %eax, %eax # find location of match + lea -32(%rdi, %rax, 1), %rax # point to matching byte + ret + +.Ltailb: + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %edx + pmovmskb %xmm0, %eax + shl $16, %edx + or %edx, %eax # location of matches + and %r8d, %eax # mask out matches beyond buffer + bsf %eax, %edx # location of match + lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any) + cmovnz %rdx, %rax # point to match if present, + ret # else null pointer + +.Lnomatchb: + xor %eax, %eax # return null pointer + ret +ARCHEND(__memchr, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S index d192229677b3..dc8bcff73cb9 100644 --- a/lib/libc/amd64/string/memcmp.S +++ b/lib/libc/amd64/string/memcmp.S @@ -328,13 +328,28 @@ ARCHENTRY(memcmp, baseline) movdqu 16(%rsi, %rdi, 1), %xmm1 pcmpeqb 16(%rdi), %xmm1 # compare second half of this iteration add %rcx, %rdx # pointer to last byte in buffer - pcmpeqb %xmm2, %xmm0 + jc .Loverflow # did this overflow? +0: pcmpeqb %xmm2, %xmm0 pmovmskb %xmm0, %eax xor $0xffff, %eax # any mismatch? jne .Lmismatch_head add $64, %rdi # advance to next iteration jmp 1f # and get going with the loop + /* + * If we got here, a buffer length was passed to memcmp(a, b, len) + * such that a + len < a. While this sort of usage is illegal, + * it is plausible that a caller tries to do something like + * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending + * for memcmp() to stop comparing at the first mismatch. This + * behaviour is not guaranteed by any version of ISO/IEC 9899, + * but usually works out in practice. Let's try to make this + * case work by comparing until the end of the address space. + */ +.Loverflow: + mov $-1, %rdx # compare until the end of memory + jmp 0b + /* process buffer 32 bytes at a time */ ALIGN_TEXT 0: movdqu -32(%rsi, %rdi, 1), %xmm0 diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S new file mode 100644 index 000000000000..4f6c5a238daa --- /dev/null +++ b/lib/libc/amd64/string/memrchr.S @@ -0,0 +1,166 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(memrchr) + ARCHFUNC(memrchr, scalar) + ARCHFUNC(memrchr, baseline) +ENDARCHFUNCS(memrchr) + +ARCHENTRY(memrchr, scalar) + xor %eax, %eax # prospective return value + sub $4, %rdx # 4 bytes left to process? + jb 1f + + ALIGN_TEXT +0: xor %r8, %r8 + lea 2(%rdi), %r10 + cmp %sil, 2(%rdi) + cmovne %r8, %r10 # point to null if no match + + cmp %sil, (%rdi) + cmove %rdi, %r8 # point to first char if match + + lea 1(%rdi), %r9 + cmp %sil, 1(%rdi) + cmovne %r8, %r9 # point to first result if no match in second + + lea 3(%rdi), %r11 + cmp %sil, 3(%rdi) + cmovne %r10, %r11 + + test %r11, %r11 + cmovz %r9, %r11 # take first pair match if none in second + + test %r11, %r11 + cmovnz %r11, %rax # take match in current set if any + + add $4, %rdi + sub $4, %rdx + jae 0b + +1: cmp $-3, %edx # a least one character left to process? + jb 2f + + cmp %sil, (%rdi) + cmove %rdi, %rax + + lea 1(%rdi), %rcx + cmp $-2, %edx # at least two characters left to process? 
+ jb 2f + + cmp %sil, 1(%rdi) + cmove %rcx, %rax + + lea 2(%rdi), %rcx + cmp $-1, %edx # at least three character left to process? + jb 2f + + cmp %sil, 2(%rdi) + cmove %rcx, %rax + +2: ret +ARCHEND(memrchr, scalar) + +ARCHENTRY(memrchr, baseline) + movd %esi, %xmm4 + test %rdx, %rdx # empty buffer? + jz .L0 # if yes, return immediately + + punpcklbw %xmm4, %xmm4 # c -> cc + mov %edi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + and $~0xf, %rdi # align source pointer + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $0xf, %ecx + movdqa %xmm4, %xmm0 + mov $-1, %r8d + pcmpeqb (%rdi), %xmm0 # compare aligned head + shl %cl, %r8d # mask of bytes in the head of the buffer + pmovmskb %xmm0, %eax + + sub $16, %rcx + and %r8d, %eax # match mask + add %rcx, %rdx # advance past head + cmc + jbe .Lrunt # did the string end in the buffer? + + mov %rdi, %rsi # pointer to matching chunk + add $16, %rdi + sub $16, %rdx # enough left for another round? + jbe 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa %xmm4, %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %r8d + + cmp $16, %rdx # enough left for second chunk? + jbe 2f + + movdqa %xmm4, %xmm0 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm0, %ecx + + lea 16(%rdi), %r9 + test %ecx, %ecx # match found in second chunk? + cmovz %r8d, %ecx # if not, use match data from first chunk + cmovz %rdi, %r9 + + test %ecx, %ecx # any match found? + cmovnz %ecx, %eax # if yes, overwrite previously found match + cmovnz %r9, %rsi + + add $32, %rdi # advance to next iteration + sub $32, %rdx # advance to next chunks + ja 0b + + /* process remaining 1--16 bytes */ +1: pcmpeqb (%rdi), %xmm4 + mov $0xffff, %r8d + xor %ecx, %ecx + sub %edx, %ecx # number of bytes to be masked out + pmovmskb %xmm4, %r9d + shr %cl, %r8d # mask of bytes to be kept in the buffer + and %r9d, %r8d + cmovnz %r8d, %eax + cmovnz %rdi, %rsi + bsr %eax, %eax + lea (%rsi, %rax, 1), %rsi # pointer to match (or junk) + cmovnz %rsi, %rax # if any match was found, return it + ret + + /* end of chunk reached within first half iteration */ +2: test %r8d, %r8d # match in previous chunk? + cmovnz %r8d, %eax # if yes, overwrite previous chunks + cmovnz %rdi, %rsi + add $16, %rdi # point to tail + sub $16, %edx + jmp 1b # handle tail the same otherwise + + /* runt: string ends within head, edx has negated amount of invalid head bytes */ +.Lrunt: mov $0xffff, %r8d + xor %ecx, %ecx + sub %edx, %ecx + shr %cl, %r8d + and %r8d, %eax + bsr %eax, %eax + lea (%rdi, %rax, 1), %rdi + cmovnz %rdi, %rax + ret + + /* empty buffer: return a null pointer */ +.L0: xor %eax, %eax + ret +ARCHEND(memrchr, baseline) + + .section .note.GNU-stack, "", %progbits diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S new file mode 100644 index 000000000000..5ce0dd093a9e --- /dev/null +++ b/lib/libc/amd64/string/stpncpy.S @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpncpy + .set stpncpy, __stpncpy +ARCHFUNCS(__stpncpy) + ARCHFUNC(__stpncpy, scalar) + ARCHFUNC(__stpncpy, baseline) +ENDARCHFUNCS(__stpncpy) + +ARCHENTRY(__stpncpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + + push %rdx + push %rdi + push %rsi + push %rax # dummy push for alignment + + mov %rsi, %rdi + xor %esi, %esi + call CNAME(__memchr) # memchr(src, '\0', len) + pop %rcx # dummy pop + pop %rsi + mov -16(%rbp), %rdi + + test %rax, %rax # NUL found? + jz .Lfullcopy + + mov %rax, %rdx + sub %rsi, %rdx # copy until the NUL byte + add %rdx, -16(%rbp) # advance destination by string length + sub %rdx, -8(%rbp) # and shorten buffer size by string length + call CNAME(memcpy) + + pop %rdi + pop %rdx + xor %esi, %esi + pop %rbp + jmp CNAME(memset) # clear remaining buffer + +.Lfullcopy: + mov -8(%rbp), %rdx + call CNAME(memcpy) # copy whole string + add -8(%rbp), %rax # point to dest[n] + leave + ret +ARCHEND(__stpncpy, scalar) + + /* + * this mask allows us to generate masks of 16-n 0xff bytes + * followed by n 0x00 bytes by loading from .Lmask+n. + */ + .section .rodata +.Lmask: .quad 0xffffffffffffffff + .quad 0xffffffffffffffff + .quad 0x0000000000000000 + .quad 0x0000000000000000 + +/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */ +ARCHENTRY(__stpncpy, baseline) +#define bounce (-3*16-8) /* location of on-stack bounce buffer */ + + test %rdx, %rdx # no bytes to copy? + jz .L0 + + mov %esi, %ecx + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # load head + and $0xf, %ecx # offset from alignment + mov $-1, %r9d + lea -32(%rcx), %rax # set up overflow-proof comparison rdx+rcx<=32 + shl %cl, %r9d # mask of bytes belonging to the string + sub %rcx, %rdi # adjust RDI to correspond to RSI + pxor %xmm1, %xmm1 + movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r8d + + lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary + add %rdx, %rax # less than 2 chunks (32 bytes) to play with? + jnc .Lrunt # if yes, use special runt processing + + movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination + and %r9d, %r8d # end of string within head? 
+ jnz .Lheadnul + + movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer + movdqu %xmm2, (%rdi, %rcx, 1) # an deposit + + add $16, %rsi + add $16, %rdi + sub $32, %r10 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 3f + + movdqu %xmm0, (%rdi) + cmp $16, %r10 # more than a full chunk left? + jbe 1f + + movdqa 16(%rsi), %xmm0 + add $32, %rdi # advance pointers to next chunk + add $32, %rsi + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 2f + + movdqu %xmm0, -16(%rdi) + sub $32, %r10 # more than another full chunk left? + ja 0b + + sub $16, %rdi # undo second advancement + sub $16, %rsi + add $16, %r10d # restore number of remaining bytes + + /* 1--16 bytes left but string has not ended yet */ +1: pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi), %xmm1 # NUL byte in source tail? + pmovmskb %xmm1, %r8d + bts %r10d, %r8d # treat end of buffer as NUL + tzcnt %r8d, %r8d # where is the NUL byte? + movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL + lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte + # or end of buffer + movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer + ret + +2: sub $16, %rdi # undo second advancement + sub $16, %rsi + sub $16, %r10 + + /* string has ended and buffer has not */ +3: tzcnt %r8d, %r8d # where did the string end? + lea .Lmask+16(%rip), %rcx + lea (%rdi, %r8, 1), %rax # where the NUL byte will be + neg %r8 + movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is, + # 00 where it is not + pand %xmm1, %xmm0 # mask out bytes after the string + movdqu %xmm0, (%rdi) # store masked current chunk + pxor %xmm1, %xmm1 + sub $16, %r10 # another full chunk left? + jbe 1f + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, 16(%rdi) + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 32(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* at least two chunks to play with and NUL while processing head */ +.Lheadnul: + movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack + tzcnt %r8d, %r8d # find location of NUL byte + movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination + movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes + movdqu %xmm1, 16(%rdi) # clear out second chunk + lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte + + add $32, %rdi # advance past first two chunks + sub $32+16, %r10 # advance past first three chunks + jbe 1f # did we pass the end of the buffer? + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, (%rdi) # clear out buffer chunk + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 16(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* 1--32 bytes to copy, bounce through the stack */ +.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy + bts %r10d, %r8d # treat end of buffer as end of string + and %r9w, %r8w # end of string within first buffer? + jnz 0f # if yes, do not inspect second buffer + + movdqa 16(%rsi), %xmm0 # load second chunk of input + movdqa %xmm0, bounce+16(%rsp) # stash copy on stack + pcmpeqb %xmm1, %xmm0 # NUL in second chunk? 
+ pmovmskb %xmm0, %r9d + shl $16, %r9d + or %r9d, %r8d # merge found NUL bytes into NUL mask + + /* end of string after one buffer */ +0: tzcnt %r8d, %r8d # location of last char in string + movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string + lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack + lea (%rdi, %r8, 1), %rax # return pointer to NUL byte + + cmp $16, %edx # at least 16 bytes to transfer? + jae .L1631 + + mov (%rsi), %r8 # load string head + cmp $8, %edx # at least 8 bytes to transfer? + jae .L0815 + + cmp $4, %edx # at least 4 bytes to transfer? + jae .L0407 + + movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string + mov %r8b, (%rdi, %rcx, 1) # store first byte + + cmp $2, %edx # at least 2 bytes to transfer? + jb .L1 + + mov %si, -2(%rdi, %r10, 1) # store last two bytes of string +.L1: ret + +.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string + movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string + movdqu %xmm0, (%rdi, %rcx, 1) + movdqu %xmm1, -16(%rdi, %r10, 1) + ret + +.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string + mov %r8, (%rdi, %rcx, 1) + mov %rdx, -8(%rdi, %r10, 1) + ret + +.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string + mov %r8d, (%rdi, %rcx, 1) + mov %edx, -4(%rdi, %r10, 1) + ret + + /* length 0 buffer: just return dest */ +.L0: mov %rdi, %rax + ret +ARCHEND(__stpncpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strcat.S b/lib/libc/amd64/string/strcat.S index 0834408acfb7..081e98840cee 100644 --- a/lib/libc/amd64/string/strcat.S +++ b/lib/libc/amd64/string/strcat.S @@ -1,6 +1,14 @@ -/* - * Written by J.T. Conklin <jtc@acorntoolworks.com> - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Expression: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S + * written by J.T. Conklin <jtc@acorntoolworks.com> + * that was originally dedicated to the public domain */ #include <machine/asm.h> @@ -8,7 +16,14 @@ RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $") #endif -ENTRY(strcat) +#include "amd64_archlevel.h" + +ARCHFUNCS(strcat) + ARCHFUNC(strcat, scalar) + ARCHFUNC(strcat, baseline) +ENDARCHFUNCS(strcat) + +ARCHENTRY(strcat, scalar) movq %rdi,%rax movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -161,6 +176,28 @@ ENTRY(strcat) .Ldone: ret -END(strcat) +ARCHEND(strcat, scalar) + +/* + * Call into strlen + strcpy if we have any SIMD at all. + * The scalar implementation above is better for the scalar + * case as it avoids the function call overhead, but pessimal + * if we could call SIMD routines instead. + */ +ARCHENTRY(strcat, baseline) + push %rbp + mov %rsp, %rbp + push %rsi + push %rbx + mov %rdi, %rbx # remember destination for later + call CNAME(strlen) # strlen(dest) + mov -8(%rbp), %rsi + lea (%rbx, %rax, 1), %rdi # dest + strlen(dest) + call CNAME(__stpcpy) # stpcpy(dest + strlen(dest), src) + mov %rbx, %rax # return dest + pop %rbx + leave + ret +ARCHEND(strcat, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strcmp.S b/lib/libc/amd64/string/strcmp.S index 437db7eca43a..eb354bd2af82 100644 --- a/lib/libc/amd64/string/strcmp.S +++ b/lib/libc/amd64/string/strcmp.S @@ -1,14 +1,33 @@ -/* - * Written by J.T. 
Conklin <jtc@acorntoolworks.com> - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Expression: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S + * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally + * dedicated to the public domain. */ #include <machine/asm.h> +#include <machine/param.h> + #if 0 RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $") #endif -ENTRY(strcmp) +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(strcmp) + ARCHFUNC(strcmp, scalar) + ARCHFUNC(strcmp, baseline) +ENDARCHFUNCS(strcmp) + +ARCHENTRY(strcmp, scalar) /* * Align s1 to word boundary. * Consider unrolling loop? @@ -39,7 +58,7 @@ ENTRY(strcmp) movabsq $0x8080808080808080,%r9 subq $8,%rsi - .align 4 + ALIGN_TEXT .Lword_loop: movq 8(%rdi),%rax addq $8,%rdi @@ -53,7 +72,7 @@ ENTRY(strcmp) testq %r9,%rdx je .Lword_loop - .align 4 + ALIGN_TEXT .Lbyte_loop: movb (%rdi),%al incq %rdi @@ -69,6 +88,272 @@ ENTRY(strcmp) movzbq %dl,%rdx subq %rdx,%rax ret -END(strcmp) +ARCHEND(strcmp, scalar) + +ARCHENTRY(strcmp, baseline) + /* check if either string crosses a page in the head */ + lea 15(%rdi), %r8d # end of head + lea 15(%rsi), %r9d + mov %edi, %eax + mov %esi, %edx + xor %edi, %r8d # bits that changed between first and last byte + xor %esi, %r9d + and $~0xf, %rdi # align heads to 16 bytes + and $~0xf, %rsi + or %r8d, %r9d # in either RSI or RDI + and $0xf, %eax # offset from alignment + and $0xf, %edx + pxor %xmm1, %xmm1 + test $PAGE_SIZE, %r9d # did the page change? + jz 0f # if not, take fast path + + /* heads may cross page boundary, avoid unmapped loads */ + movdqa (%rdi), %xmm0 # load aligned heads + movdqa (%rsi), %xmm2 + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %edx, %ecx + shl %cl, %r9d # string head in XMM2 + movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -24(%rsp) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + test %r8d, %r10d # NUL byte present in first string? + lea -40(%rsp), %r8 + cmovz %rdi, %r8 + test %r9d, %r11d # NUL byte present in second string? + lea -24(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rdx, 1), %xmm4 + jmp 1f + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rdx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + xor $0xffff, %r9d # mismatch or NUL byte? + jnz .Lhead_mismatch + + /* load head and second chunk */ + movdqa 16(%rdi), %xmm2 # load second chunks + movdqa 16(%rsi), %xmm3 + sub %rdx, %rax # is a&0xf >= b&0xf? + jb .Lswapped # if not, proceed with swapped operands + + neg %rax + movdqu 16(%rsi, %rax, 1), %xmm0 + sub %rdi, %rsi # express RSI as distance from RDI + lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string + neg %rax + pcmpeqb %xmm3, %xmm1 # ... 
corresponding to RDI + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $16, %rdi + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d + jnz .Lmismatch + add $16, %rdi # advance aligned pointers + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regsions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb 16(%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rdi + test %r8d, %r8d + jnz .Lnul_found2 + xor $0xffff, %r9d # any mismatches? + jz 0b + + sub $16, %rdi # roll back second increment + + /* a mismatch has been found between RDX and RSI */ +.Lmismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rdi, %rdx # turn RDX from offset to pointer + movzbl (%rdx, %r9, 1), %ecx + movzbl (%rdi, %r9, 1), %eax + sub %ecx, %eax # difference of the mismatching chars + ret + + /* mismatch in true heads */ +.Lhead_mismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rax, %rdi # return to true heads + add %rdx, %rsi + movzbl (%rdi, %r9, 1), %eax # mismatching characters + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + ret + +.Lnul_found2: + sub $16, %rdi # roll back second increment + + /* a NUL has been found in RSI */ +.Lnul_found: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX + xor $0xffff, %r9d # mask of mismatches + or %r8d, %r9d # NUL bytes also count as mismatches + jnz .Lmismatch + + /* + * (RDI) == (RSI) and NUL is past the string. + * Compare (RSI) with the corresponding part + * of the other string until the NUL byte. + */ + movdqu (%rdi, %rax, 1), %xmm0 + pcmpeqb (%rdi, %rsi, 1), %xmm0 + add %rdi, %rsi # restore RSI pointer + add %rax, %rdi # point RDI to chunk corresponding to (RSI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + ret + + /* + * If (a&0xf) < (b&0xf), we do the same thing but with swapped + * operands. I found that this performs slightly better than + * using conditional moves to do the swap branchless. 
+ */ +.Lswapped: + movdqu 16(%rdi, %rax, 1), %xmm0 + sub %rsi, %rdi # express RDI as distance from RSI + lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI + neg %rax # make difference positive + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $16, %rsi # advance aligned pointers + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d + jnz .Lmismatchs + add $16, %rsi + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regsions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI? + pcmpeqb 16(%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rsi + test %r8d, %r8d + jnz .Lnul_found2s + xor $0xffff, %r9d # any mismatches? + jz 0b + + sub $16, %rsi # roll back second increment + + /* a mismatch has been found between RDX and RDI */ +.Lmismatchs: + tzcnt %r9d, %r9d # where is the mismatch? + add %rsi, %rdx # turn RDX from offset to pointer + movzbl (%rdx, %r9, 1), %eax + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax # difference of the mismatching chars + ret + +.Lnul_found2s: + sub $16, %rsi # roll back second increment + + /* a NUL has been found in RSI */ +.Lnul_founds: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX + xor $0xffff, %r9d # mask of mismatches + or %r8d, %r9d # NUL bytes also count as mismatches + jnz .Lmismatchs + + /* + * (RDI) == (RSI) and NUL is past the string. + * Compare (RSI) with the corresponding part + * of the other string until the NUL byte. + */ + movdqu (%rsi, %rax, 1), %xmm0 + pcmpeqb (%rsi, %rdi, 1), %xmm0 + add %rsi, %rdi # restore RDI pointer + add %rax, %rsi # point RSI to chunk corresponding to (RDI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + ret +ARCHEND(strcmp, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strcpy.c b/lib/libc/amd64/string/strcpy.c index fbc661462ff2..eb93b0defbaa 100644 --- a/lib/libc/amd64/string/strcpy.c +++ b/lib/libc/amd64/string/strcpy.c @@ -27,7 +27,6 @@ * SUCH DAMAGE. 
*/ -#include <sys/cdefs.h> char *__stpcpy(char * __restrict, const char * __restrict); char * diff --git a/lib/libc/amd64/string/strcspn.S b/lib/libc/amd64/string/strcspn.S new file mode 100644 index 000000000000..7ebd7a847d67 --- /dev/null +++ b/lib/libc/amd64/string/strcspn.S @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + + .weak strcspn + .set strcspn, __strcspn +ARCHFUNCS(__strcspn) + ARCHFUNC(__strcspn, scalar) + NOARCHFUNC + ARCHFUNC(__strcspn, x86_64_v2) +ENDARCHFUNCS(__strcspn) + +ARCHENTRY(__strcspn, scalar) + push %rbp # align stack to enable function call + mov %rsp, %rbp + sub $256, %rsp # allocate space for lookup table + + /* check for special cases */ + movzbl (%rsi), %eax # first character in the set + test %eax, %eax + jz .Lstrlen + + movzbl 1(%rsi), %edx # second character in the set + test %edx, %edx + jz .Lstrchr + + /* no special case matches -- prepare lookup table */ + xor %r8d, %r8d + mov $28, %ecx +0: mov %r8, (%rsp, %rcx, 8) + mov %r8, 8(%rsp, %rcx, 8) + mov %r8, 16(%rsp, %rcx, 8) + mov %r8, 24(%rsp, %rcx, 8) + sub $4, %ecx + jnc 0b + + add $2, %rsi + movb $1, (%rsp, %rax, 1) # register first chars in set + movb $1, (%rsp, %rdx, 1) + mov %rdi, %rax # a copy of the source to iterate over + + /* process remaining chars in set */ + ALIGN_TEXT +0: movzbl (%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 1(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + add $2, %rsi + jmp 0b + + /* find match */ + ALIGN_TEXT +1: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + je 1b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret + + /* set is empty, degrades to strlen */ +.Lstrlen: + leave + jmp 
CNAME(strlen) + + /* just one character in set, degrades to strchr */ +.Lstrchr: + mov %rdi, (%rsp) # stash a copy of the string + mov %eax, %esi # find the character in the set + call CNAME(strchrnul) + sub (%rsp), %rax # length of prefix before match + leave + ret +ARCHEND(__strcspn, scalar) + + /* + * This kernel uses pcmpistri to do the heavy lifting. + * We provide five code paths, depending on set size: + * + * 0: call strlen() + * 1: call strchr() + * 2--16: one pcmpistri per 16 bytes of input + * 17--32: two pcmpistri per 16 bytes of input + * >=33: fall back to look up table + */ +ARCHENTRY(__strcspn, x86_64_v2) + push %rbp + mov %rsp, %rbp + sub $256, %rsp + + /* check for special cases */ + movzbl (%rsi), %eax + test %eax, %eax # empty string? + jz .Lstrlenv2 + + cmpb $0, 1(%rsi) # single character string? + jz .Lstrchrv2 + + /* find set size and copy up to 32 bytes to (%rsp) */ + mov %esi, %ecx + and $~0xf, %rsi # align set pointer + movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + and $0xf, %ecx # amount of bytes rsi is past alignment + xor %edx, %edx + pcmpeqb %xmm0, %xmm1 # end of string reached? + movdqa %xmm0, 32(%rsp) # transfer head of set to stack + pmovmskb %xmm1, %eax + shr %cl, %eax # clear out junk before string + test %eax, %eax # end of set reached? + jnz 0f + + movdqa 16(%rsi), %xmm0 # second chunk of the set + mov $16, %edx + sub %ecx, %edx # length of set preceding xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 48(%rsp) + movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 32(%rsi), %xmm0 # third chunk + add $16, %edx + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 64(%rsp) + pmovmskb %xmm1, %eax + test %eax, %eax # still not done? + jz .Lgt32v2 + +0: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set +1: tzcnt %eax, %eax + add %eax, %edx # length of set (excluding NUL byte) + cmp $32, %edx # above 32 bytes? + ja .Lgt32v2 + + /* + * At this point we know that we want to use pcmpistri. + * one last problem obtains: the head of the string is not + * aligned and may cross a cacheline. If this is the case, + * we take the part before the page boundary and repeat the + * last byte to fill up the xmm register. + */ + mov %rdi, %rax # save original string pointer + lea 15(%rdi), %esi # last byte of the head + xor %edi, %esi + test $PAGE_SIZE, %esi # does the head cross a page? + jz 0f + + /* head crosses page: copy to stack to fix up */ + and $~0xf, %rax # align head pointer temporarily + movzbl 15(%rax), %esi # last head byte on the page + movdqa (%rax), %xmm0 + movabs $0x0101010101010101, %r8 + imul %r8, %rsi # repeated 8 times + movdqa %xmm0, (%rsp) # head word on stack + mov %rsi, 16(%rsp) # followed by filler (last byte x8) + mov %rsi, 24(%rsp) + mov %edi, %eax + and $0xf, %eax # offset of head from alignment + add %rsp, %rax # pointer to fake head + +0: movdqu (%rax), %xmm0 # load head (fake or real) + lea 16(%rdi), %rax + and $~0xf, %rax # second 16 bytes of string (aligned) +1: cmp $16, %edx # 16--32 bytes? + ja .Lgt16v2 + + + /* set is 2--16 bytes in size */ + + /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */ + pcmpistri $0, %xmm0, %xmm2 # match in head? + jbe .Lheadmatchv2 + + ALIGN_TEXT +0: pcmpistri $0, (%rax), %xmm2 + jbe 1f # match or end of string? + pcmpistri $0, 16(%rax), %xmm2 + lea 32(%rax), %rax + ja 0b # match or end of string? 
+ +3: lea -16(%rax), %rax # go back to second half +1: jc 2f # jump if match found + movdqa (%rax), %xmm0 # reload string piece + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 # where is the NUL byte? + pmovmskb %xmm0, %ecx + tzcnt %ecx, %ecx # location of NUL byte in (%rax) +2: sub %rdi, %rax # offset of %xmm0 from beginning of string + add %rcx, %rax # prefix length before match/NUL + leave + ret + +.Lheadmatchv2: + jc 2f # jump if match found + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %ecx + tzcnt %ecx, %ecx # location of NUL byte +2: mov %ecx, %eax # prefix length before match/NUL + leave + ret + + /* match in first set half during head */ +.Lheadmatchv2first: + mov %ecx, %eax + pcmpistri $0, %xmm0, %xmm3 # match in second set half? + cmp %ecx, %eax # before the first half match? + cmova %ecx, %eax # use the earlier match + leave + ret + +.Lgt16v2: + movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set + + /* set is 17--32 bytes in size */ + pcmpistri $0, %xmm0, %xmm2 # match in first set half? + jb .Lheadmatchv2first + pcmpistri $0, %xmm0, %xmm3 # match in second set half or end of string? + jbe .Lheadmatchv2 + + ALIGN_TEXT +0: movdqa (%rax), %xmm0 + pcmpistri $0, %xmm0, %xmm2 + jb 4f # match in first set half? + pcmpistri $0, %xmm0, %xmm3 + jbe 1f # match in second set half or end of string? + movdqa 16(%rax), %xmm0 + add $32, %rax + pcmpistri $0, %xmm0, %xmm2 + jb 3f # match in first set half? + pcmpistri $0, %xmm0, %xmm3 + ja 0b # neither match in 2nd half nor string end? + + /* match in second half or NUL */ + lea -16(%rax), %rax # go back to second half +1: jc 2f # jump if match found + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 # where is the NUL byte? + pmovmskb %xmm0, %ecx + tzcnt %ecx, %ecx # location of NUL byte in (%rax) +2: sub %rdi, %rax # offset of %xmm0 from beginning of string + add %rcx, %rax # prefix length before match/NUL + leave + ret + + /* match in first half */ +3: sub $16, %rax # go back to second half +4: sub %rdi, %rax # offset of %xmm0 from beginning of string + mov %ecx, %edx + pcmpistri $0, %xmm0, %xmm3 # match in second set half? + cmp %ecx, %edx # before the first half match? 
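
The unaligned-head fixup above relies on a cheap page-crossing test: a 16-byte load starting at p straddles two pages exactly when p and p + 15 differ in the PAGE_SIZE address bit, which is what the xor/test pair computes. A portable sketch of that test (assuming a 4 KiB page; the function name is made up):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE	4096	/* assumed page size for the sketch */

static bool
load16_crosses_page(const void *p)
{
	uintptr_t a = (uintptr_t)p;

	/* the low page-number bit flips iff the 16 bytes span two pages */
	return (((a ^ (a + 15)) & PAGE_SIZE) != 0);
}
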
+ cmova %ecx, %edx # use the earlier match + add %rdx, %rax # return full ofset + leave + ret + + /* set is empty, degrades to strlen */ +.Lstrlenv2: + leave + jmp CNAME(strlen) + + /* just one character in set, degrades to strchr */ +.Lstrchrv2: + mov %rdi, (%rsp) # stash a copy of the string + mov %eax, %esi # find this character + call CNAME(strchrnul) + sub (%rsp), %rax # length of prefix before match + leave + ret + + /* set is >=33 bytes in size */ +.Lgt32v2: + xorps %xmm0, %xmm0 + mov $256-64, %edx + + /* clear out look up table */ +0: movaps %xmm0, (%rsp, %rdx, 1) + movaps %xmm0, 16(%rsp, %rdx, 1) + movaps %xmm0, 32(%rsp, %rdx, 1) + movaps %xmm0, 48(%rsp, %rdx, 1) + sub $64, %edx + jnc 0b + + add %rcx, %rsi # restore string pointer + mov %rdi, %rax # keep a copy of the string + + /* initialise look up table */ + ALIGN_TEXT +0: movzbl (%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 1(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 2(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 3(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + add $4, %rsi + jmp 0b + + /* find match */ + ALIGN_TEXT +1: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + je 1b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret +ARCHEND(__strcspn, x86_64_v2) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strlcat.c b/lib/libc/amd64/string/strlcat.c new file mode 100644 index 000000000000..94fdc0963dc3 --- /dev/null +++ b/lib/libc/amd64/string/strlcat.c @@ -0,0 +1,27 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include <sys/cdefs.h> + +#include <string.h> + +#undef strlcat /* FORTIFY_SOURCE */ + +void *__memchr(const void *, int, size_t); +size_t __strlcpy(char *restrict, const char *restrict, size_t); + +size_t +strlcat(char *restrict dst, const char *restrict src, size_t dstsize) +{ + char *loc = __memchr(dst, '\0', dstsize); + + if (loc != NULL) { + size_t dstlen = (size_t)(loc - dst); + + return (dstlen + __strlcpy(loc, src, dstsize - dstlen)); + } else + return (dstsize + strlen(src)); +} diff --git a/lib/libc/amd64/string/strlcpy.S b/lib/libc/amd64/string/strlcpy.S new file mode 100644 index 000000000000..2b32c6c78047 --- /dev/null +++ b/lib/libc/amd64/string/strlcpy.S @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
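
The strlcat() above composes __memchr() and __strlcpy(): find the end of dst within the buffer, then bounded-copy src after it. A small stand-alone usage example of the resulting semantics -- the return value is the length the concatenation would have needed, so a value >= dstsize signals truncation (example only, not part of the change):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[8] = "ab";
	size_t want;

	want = strlcat(buf, "cdefgh", sizeof(buf));
	/* want == 8, but buf only has room for "abcdefg" plus the NUL */
	if (want >= sizeof(buf))
		printf("truncated: \"%s\" (needed %zu bytes)\n", buf, want + 1);
	return (0);
}
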
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak strlcpy + .set strlcpy, __strlcpy +ARCHFUNCS(__strlcpy) + ARCHFUNC(__strlcpy, scalar) + ARCHFUNC(__strlcpy, baseline) +ENDARCHFUNCS(__strlcpy) + +ARCHENTRY(__strlcpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + push %rsi + push %rbx + push %rdi + push %rdx + mov %rsi, %rdi + call CNAME(strlen) # strlen(src) + pop %rdx + pop %rdi + mov -8(%rbp), %rsi + mov %rax, %rbx # remember string length for return value + sub $1, %rdx # do not copy into the final byte of the buffer + jc 0f # skip copying altogether if buffer was empty + cmp %rax, %rdx # is the buffer longer than the input? + cmova %rax, %rdx # if yes, only copy the part that fits + movb $0, (%rdi, %rdx, 1) # NUL-terminate output buffer + call CNAME(memcpy) # copy string to output +0: mov %rbx, %rax # restore return value + pop %rbx + leave + ret +ARCHEND(__strlcpy, scalar) + +ARCHENTRY(__strlcpy, baseline) + sub $1, %rdx # do not count NUL byte in buffer length + jb .L0 # go to special code path if len was 0 + + mov %esi, %ecx + pxor %xmm1, %xmm1 + mov %rsi, %r9 # stash a copy of the source pointer for later + and $~0xf, %rsi + pcmpeqb (%rsi), %xmm1 # NUL found in head? + mov $-1, %r8d + and $0xf, %ecx + shl %cl, %r8d # mask of bytes in the string + pmovmskb %xmm1, %eax + and %r8d, %eax + jnz .Lhead_nul + + movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + mov $32, %r8d + sub %ecx, %r8d # head length + length of second chunk + pxor %xmm1, %xmm1 + pcmpeqb %xmm3, %xmm1 # NUL found in second chunk? + + sub %r8, %rdx # enough space left for the second chunk? + jbe .Lhead_buf_end + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jnz .Lsecond_nul + + /* string didn't end in second chunk and neither did buffer -- not a runt! */ + movdqa 32(%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jbe 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jbe 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? 
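
The pcmpeqb/pmovmskb pairs in this loop are the standard way of locating a NUL within a 16-byte chunk: compare against zero, turn the byte-wise result into a 16-bit mask, then tzcnt the mask. With SSE2 intrinsics the idiom looks roughly like this (sketch only; first_nul_index is a made-up name and the 16 bytes at p are assumed to be readable):

#include <emmintrin.h>		/* SSE2 */

/* index of the first NUL in the 16 bytes at p, or 16 if there is none */
static int
first_nul_index(const void *p)
{
	__m128i chunk = _mm_loadu_si128((const __m128i *)p);
	__m128i zeroes = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());
	unsigned mask = (unsigned)_mm_movemask_epi8(zeroes);

	return (mask != 0 ? __builtin_ctz(mask) : 16);	/* ctz ~ tzcnt */
}
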
+ pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + sub $32, %rdx + ja 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered? + pmovmskb %xmm0, %r8d + mov %r8d, %eax + bts %edx, %r8d # treat end of buffer as end of string + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu (%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, (%rdi, %r8, 1) # store string tail + movb $0, 16(%rdi, %r8, 1) # NUL terminate + + /* continue to find the end of the string */ + test %eax, %eax # end of string already reached? + jnz 1f + + ALIGN_TEXT +0: pcmpeqb 32(%rsi), %xmm1 + pmovmskb %xmm1, %eax + pxor %xmm1, %xmm1 + test %eax, %eax + jnz 2f + + pcmpeqb 48(%rsi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rsi + pxor %xmm1, %xmm1 + test %eax, %eax + jz 0b + +1: sub $16, %rsi # undo second advancement +2: tzcnt %eax, %eax # where is the NUL byte? + sub %r9, %rsi + lea 32(%rsi, %rax, 1), %rax # return string length + ret + +4: sub $16, %rsi # undo second advancement + add $16, %rdx # restore number of remaining bytes + + /* string has ended but buffer has not */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL) + sub %r9, %rsi # string length to current chunk + add %rsi, %rax # plus length of current chunk + ret + +.Lhead_buf_end: + pmovmskb %xmm1, %r8d + add $32, %edx # restore edx to (len-1) + ecx + mov %r8d, %eax + shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31 + bts %rdx, %r8 # treat end of buffer as end of string + tzcnt %r8, %rdx # find string/bufer len from alignment boundary + sub %ecx, %edx # find actual string/buffer len + movb $0, (%rdi, %rdx, 1) # write NUL terminator + + /* continue to find the end of the string */ + test %eax, %eax # end of string already reached? + jnz 1f + + ALIGN_TEXT +0: pcmpeqb 32(%rsi), %xmm1 + pmovmskb %xmm1, %eax + pxor %xmm1, %xmm1 + test %eax, %eax + jnz 2f + + pcmpeqb 48(%rsi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rsi + pxor %xmm1, %xmm1 + test %eax, %eax + jz 0b + +1: sub $16, %rsi +2: tzcnt %eax, %eax + sub %r9, %rsi + lea 32(%rsi, %rax, 1), %rax # return string length + jmp .L0031 + +.Lsecond_nul: + add %r8, %rdx # restore buffer length + tzcnt %eax, %eax # where is the NUL byte? + lea -16(%rcx), %r8d + sub %r8d, %eax # string length + cmp %rax, %rdx # is the string shorter than the buffer? + cmova %rax, %rdx # copy only min(buflen, srclen) bytes + movb $0, (%rdi, %rdx, 1) # write NUL terminator +.L0031: cmp $16, %rdx # at least 16 bytes to copy (not incl NUL)? + jb .L0015 + + /* copy 16--31 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -16(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi, %rdx, 1) + ret + +.Lhead_nul: + tzcnt %eax, %eax # where is the NUL byte? + sub %ecx, %eax # ... from the beginning of the string? + cmp %rax, %rdx # is the string shorter than the buffer? + cmova %rax, %rdx # copy only min(buflen, srclen) bytes + movb $0, (%rdi, %rdx, 1) # write NUL terminator + + /* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */ +.L0015: cmp $8, %rdx # at least 8 bytes to copy? + jae .L0815 + + cmp $4, %rdx # at least 4 bytes to copy? 
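
The 16--31 byte copy above avoids a loop by issuing just two loads and two stores: one block anchored at the first byte and one anchored at the last byte, overlapping in the middle when the length is not a multiple of 16. The same trick in portable C (sketch; copy16to32 is a made-up name, and src and dst must not overlap, as is the case in strlcpy):

#include <string.h>

static void
copy16to32(void *dst, const void *src, size_t len)
{
	/* requires 16 <= len <= 32; the two blocks may overlap inside dst */
	memcpy(dst, src, 16);
	memcpy((char *)dst + len - 16, (const char *)src + len - 16, 16);
}
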
+ jae .L0407 + + cmp $2, %rdx # at least 2 bytes to copy? + jae .L0203 + + movzbl (%r9), %ecx # load first byte from src + mov %cl, (%rdi) # deposit into destination + movb $0, (%rdi, %rdx, 1) # add NUL terminator (again) + ret + +.L0203: movzwl (%r9), %ecx + movzwl -2(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -2(%rdi, %rdx, 1) + ret + +.L0407: mov (%r9), %ecx + mov -4(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -4(%rdi, %rdx, 1) + ret + +.L0815: mov (%r9), %rcx + mov -8(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -8(%rdi, %rdx, 1) + ret + + /* length zero destination: just return the string length */ +.L0: mov %rsi, %rdi + jmp CNAME(strlen) +ARCHEND(__strlcpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strncat.c b/lib/libc/amd64/string/strncat.c new file mode 100644 index 000000000000..2c63ab50b3c3 --- /dev/null +++ b/lib/libc/amd64/string/strncat.c @@ -0,0 +1,31 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include <sys/cdefs.h> + +#include <string.h> + +#undef strncat /* _FORTIFY_SOURCE */ + +void *__memccpy(void *restrict, const void *restrict, int, size_t); + +char * +strncat(char *dest, const char *src, size_t n) +{ + size_t len; + char *endptr; + + len = strlen(dest); + endptr = __memccpy(dest + len, src, '\0', n); + + /* avoid an extra branch */ + if (endptr == NULL) + endptr = dest + len + n + 1; + + endptr[-1] = '\0'; + + return (dest); +} diff --git a/lib/libc/amd64/string/strncmp.S b/lib/libc/amd64/string/strncmp.S new file mode 100644 index 000000000000..932cf078bdfc --- /dev/null +++ b/lib/libc/amd64/string/strncmp.S @@ -0,0 +1,488 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(strncmp) + ARCHFUNC(strncmp, scalar) + ARCHFUNC(strncmp, baseline) +ENDARCHFUNCS(strncmp) + +/* + * This is just the scalar loop unrolled a bunch of times. 
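
strncat() above leans on __memccpy(), which copies until either n bytes have been written or the given byte ('\0' here) has been copied, returning a pointer just past that byte; the wrapper then forces NUL termination either way. A short stand-alone usage example (not part of the change):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[16] = "foo";

	strncat(buf, "barbaz", 3);	/* appends at most 3 characters */
	printf("%s\n", buf);		/* prints "foobar" */
	return (0);
}
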
+ */ +ARCHENTRY(strncmp, scalar) + xor %eax, %eax + sub $4, %rdx # 4 chars left to compare? + jbe 1f + + ALIGN_TEXT +0: movzbl (%rdi), %ecx + test %ecx, %ecx # NUL char in first string? + jz .L0 + cmpb (%rsi), %cl # mismatch between strings? + jnz .L0 + + movzbl 1(%rdi), %ecx + test %ecx, %ecx + jz .L1 + cmpb 1(%rsi), %cl + jnz .L1 + + movzbl 2(%rdi), %ecx + test %ecx, %ecx + jz .L2 + cmpb 2(%rsi), %cl + jnz .L2 + + movzbl 3(%rdi), %ecx + test %ecx, %ecx + jz .L3 + cmpb 3(%rsi), %cl + jnz .L3 + + add $4, %rdi # advance to next iteration + add $4, %rsi + sub $4, %rdx + ja 0b + + /* end of string within the next 4 characters */ +1: cmp $-4, %edx # end of string reached immediately? + jz .Leq + movzbl (%rdi), %ecx + test %ecx, %ecx + jz .L0 + cmpb (%rsi), %cl + jnz .L0 + + cmp $-3, %edx # end of string reached after 1 char? + jz .Leq + movzbl 1(%rdi), %ecx + test %ecx, %ecx + jz .L1 + cmpb 1(%rsi), %cl + jnz .L1 + + cmp $-2, %edx + jz .Leq + movzbl 2(%rdi), %ecx + test %ecx, %ecx + jz .L2 + cmpb 2(%rsi), %cl + jnz .L2 + + cmp $-1, %edx # either end of string after 3 chars, + jz .Leq # or it boils down to the last char + +.L3: inc %eax +.L2: inc %eax +.L1: inc %eax +.L0: movzbl (%rsi, %rax, 1), %ecx + movzbl (%rdi, %rax, 1), %eax + sub %ecx, %eax +.Leq: ret +ARCHEND(strncmp, scalar) + +ARCHENTRY(strncmp, baseline) + push %rbx + sub $1, %rdx # RDX--, so RDX points to the last byte to compare + jb .Lempty # where there any bytes to compare at all? + + lea 15(%rdi), %r8d # end of head + lea 15(%rsi), %r9d + mov %edi, %eax + mov %esi, %ebx + xor %edi, %r8d # bits that changed between first and last byte + xor %esi, %r9d + and $~0xf, %rdi # align heads to 16 bytes + and $~0xf, %rsi + or %r8d, %r9d + and $0xf, %eax # offset from alignment + and $0xf, %ebx + movdqa (%rdi), %xmm0 # load aligned heads + movdqa (%rsi), %xmm2 + pxor %xmm1, %xmm1 + cmp $16, %rdx # end of buffer within the first 32 bytes? + jb .Llt16 + + test $PAGE_SIZE, %r9d # did the page change? + jz 0f # if not, take fast path + + + /* heads may cross page boundary, avoid unmapped loads */ + movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -16(%rsp) + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %ebx, %ecx + shl %cl, %r9d # string head in XMM2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + test %r8d, %r10d # NUL byte present in first string? + lea -32(%rsp), %r8 + cmovz %rdi, %r8 + test %r9d, %r11d # NUL byte present in second string? + lea -16(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rbx, 1), %xmm4 + jmp 1f + + /* rdx == 0 */ +.Lempty: + xor %eax, %eax # zero-length buffers compare equal + pop %rbx + ret + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rbx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + xor $0xffff, %r9d # mismatch or NUL byte? + jnz .Lhead_mismatch + + /* load head and second chunk */ + movdqa 16(%rdi), %xmm2 # load second chunks + movdqa 16(%rsi), %xmm3 + lea -16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk + sub %rbx, %rax # is a&0xf >= b&0xf? + jb .Lswapped # if not, proceed with swapped operands + jmp .Lnormal + + /* buffer ends within the first 16 bytes */ +.Llt16: test $PAGE_SIZE, %r9d # did the page change? 
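
Stripped of the four-way unrolling, the scalar kernel above is equivalent to the textbook byte loop: stop at a mismatch or at a NUL in the first string, and return the difference of the mismatching bytes taken as unsigned chars. A plain C sketch of that behaviour (strncmp_ref is a made-up name):

#include <stddef.h>

static int
strncmp_ref(const char *a, const char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		unsigned char ca = a[i], cb = b[i];

		if (ca != cb || ca == '\0')
			return (ca - cb);	/* also 0 when both are NUL */
	}
	return (0);				/* first n characters equal */
}
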
+ jz 0f # if not, take fast path + + /* heads may cross page boundary */ + movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -16(%rsp) + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %ebx, %ecx + shl %cl, %r9d # string head in XMM2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + lea (%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0 + bts %ecx, %r10d # treat as if NUL byte present + lea (%rdx, %rbx, 1), %ecx + bts %ecx, %r11d + test %r8w, %r10w # NUL byte present in first string head? + lea -32(%rsp), %r8 + cmovz %rdi, %r8 + test %r9w, %r11w # NUL byte present in second string head? + lea -16(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rbx, 1), %xmm4 + jmp 1f + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rbx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + btr %edx, %r9d # induce mismatch in last byte of buffer + not %r9d # mismatch or NUL byte? + + /* mismatch in true heads */ + ALIGN_TEXT +.Lhead_mismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rax, %rdi # return to true heads + add %rbx, %rsi + movzbl (%rdi, %r9, 1), %eax # mismatching characters + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + + /* rax >= 0 */ + ALIGN_TEXT +.Lnormal: + neg %rax + movdqu 16(%rsi, %rax, 1), %xmm0 + sub %rdi, %rsi # express RSI as distance from RDI + lea (%rsi, %rax, 1), %rbx # point RBX to offset in second string + neg %rax # ... corresponding to RDI + pcmpeqb %xmm3, %xmm1 # NUL present? + pcmpeqb %xmm2, %xmm0 # Mismatch between chunks? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + mov $16, %ecx + cmp %rcx, %rdx # does the buffer end within (RDI,RSI,1)? + cmovb %edx, %ecx # ECX = min(16, RDX) + add $32, %rdi # advance to next iteration + bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte + test %r8w, %r8w # NUL or end of buffer found? + jnz .Lnul_found2 + xor $0xffff, %r9d + jnz .Lmismatch2 + sub $48, %rdx # end of buffer within first main loop iteration? + jb .Ltail # if yes, process tail + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regsions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI + pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? 
+ jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rdi + test %r8d, %r8d + jnz .Lnul_found2 + xor $0xffff, %r9d + jnz .Lmismatch2 + sub $32, %rdx # end of buffer within next iteration? + jae 0b + + /* end of buffer will occur in next 32 bytes */ +.Ltail: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI + pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + bts %edx, %r8d # indicate NUL byte at last byte in buffer + test %r8w, %r8w # NUL byte in first chunk? + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + sub $16, %edx # take first half into account + bts %edx, %r8d # indicate NUL byte at last byte in buffer + add $32, %rdi + +.Lnul_found2: + sub $16, %rdi + +.Lnul_found: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8d # adjust NUL mask to positions in RDI/RBX + not %r9d # mask of mismatches + or %r8w, %r9w # NUL bytes als count as mismatches + jnz .Lmismatch + + /* + * (RDI) == (RSI) and NUL is past the string. + * compare (RSI) with the corresponding part + * of the other string until the NUL byte. + */ + movdqu (%rdi, %rax, 1), %xmm0 + pcmpeqb (%rdi, %rsi, 1), %xmm0 + add %rdi, %rsi # restore RSI pointer + add %rax, %rdi # point RDI to chunk corresponding to (RSI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + +.Lmismatch2: + sub $16, %rdi + + /* a mismatch has been found between RBX and RSI */ +.Lmismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rdi, %rbx # turn RBX from offset into pointer + movzbl (%rbx, %r9, 1), %ecx + movzbl (%rdi, %r9, 1), %eax + sub %ecx, %eax + pop %rbx + ret + + /* rax < 0 */ + ALIGN_TEXT +.Lswapped: + movdqu 16(%rdi, %rax, 1), %xmm0 + sub %rsi, %rdi # express RDI as distance from RDI + lea (%rdi, %rax, 1), %rbx # point RBX to offset in first string + pcmpeqb %xmm2, %xmm1 # NUL present? + pcmpeqb %xmm3, %xmm0 # mismatch between chunks? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add %rax, %rdx # RDX points to buffer end in RSI + neg %rax # ... corresponding to RSI + mov $16, %ecx + cmp %rcx, %rdx # does the buffer end within (RSI,RDI,1)? + cmovb %edx, %ecx # ECX = min(16, RDX) + add $32, %rsi + bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte + test %r8w, %r8w # NUL or end of buffer found? + jnz .Lnul_found2s + xor $0xffff, %r9d + jnz .Lmismatch2s + sub $48, %rdx # end of buffer within first main loop iteration? + jb .Ltails # if yes, process tail + + ALIGN_TEXT +0: movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? 
+ jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rsi + test %r8d, %r8d + jnz .Lnul_found2s + xor $0xffff, %r9d + jnz .Lmismatch2s + sub $32, %rdx # end of buffer within next iteration? + jae 0b + + /* end of buffer will occur in next 32 bytes */ +.Ltails: + movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + bts %edx, %r8d # indicate NUL byte at laste byte in buffer + test %r8w, %r8w # NUL byte in first chunk? + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + sub $16, %edx # take first half into account + bts %edx, %r8d # indicate NUL byte at laste byte in buffer + add $32, %rsi + +.Lnul_found2s: + sub $16, %rsi + +.Lnul_founds: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8d # adjust NUL mask to positions in RSI/RBX + not %r9d # mask of mismatches + or %r8w, %r9w # NUL bytes also count as mismatches + jnz .Lmismatchs + + movdqu (%rsi, %rax, 1), %xmm0 + pcmpeqb (%rsi, %rdi, 1), %xmm0 + add %rsi, %rdi # restore RDI pointer + add %rax, %rsi # point RSI to chunk corresponding to (RDI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + +.Lmismatch2s: + sub $16, %rsi + +.Lmismatchs: + tzcnt %r9d, %r9d # where is the mismatch? + add %rsi, %rbx # turn RBX from offset into pointer + movzbl (%rbx, %r9, 1), %eax + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + pop %rbx + ret +ARCHEND(strncmp, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/static_tls.h b/lib/libc/amd64/string/strncpy.c index 1ee738b231c7..0e7a58222aa8 100644 --- a/lib/libc/amd64/static_tls.h +++ b/lib/libc/amd64/string/strncpy.c @@ -1,9 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause + * Copyright (c) 2023 The FreeBSD Foundation * - * Copyright (c) 2019 The FreeBSD Foundation - * - * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without @@ -15,7 +13,7 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE @@ -25,20 +23,21 @@ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * SUCH DAMAGE */ -#ifndef _LIBC_AMD64_STATIC_TLS_H -#define _LIBC_AMD64_STATIC_TLS_H +#include <sys/cdefs.h> +#include <string.h> + +#undef strncpy /* _FORTIFY_SOURCE */ + +char *__stpncpy(char *restrict, const char *restrict, size_t); -static __inline uintptr_t -_libc_get_static_tls_base(size_t offset) +char * +strncpy(char *restrict dst, const char *restrict src, size_t len) { - uintptr_t tlsbase; - __asm __volatile("movq %%fs:0, %0" : "=r" (tlsbase)); - tlsbase -= offset; - return (tlsbase); -} + __stpncpy(dst, src, len); -#endif + return (dst); +} diff --git a/lib/libc/amd64/sys/amd64_set_gsbase.c b/lib/libc/amd64/string/strnlen.c index 10004afe8234..74020f1b1c65 100644 --- a/lib/libc/amd64/sys/amd64_set_gsbase.c +++ b/lib/libc/amd64/string/strnlen.c @@ -1,11 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause + * Copyright (c) 2023 The FreeBSD Foundation * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without @@ -17,48 +13,29 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * SUCH DAMAGE */ -#include <sys/cdefs.h> -#define _WANT_P_OSREL -#include <sys/param.h> -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" - -static int -amd64_set_gsbase_cpu(void *addr) -{ +#include <string.h> - wrgsbase((uintptr_t)addr); - return (0); -} +char *__memchr(const void *, int, size_t); -static int -amd64_set_gsbase_syscall(void *addr) +size_t +strnlen(const char *s, size_t maxlen) { + const char *loc; - return (sysarch(AMD64_SET_GSBASE, &addr)); -} - -DEFINE_UIFUNC(, int, amd64_set_gsbase, (void *)) -{ + loc = __memchr(s, '\0', maxlen); - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_set_gsbase_cpu); - return (amd64_set_gsbase_syscall); + return (loc == NULL ? 
maxlen : (size_t)(loc - s)); } diff --git a/lib/libc/amd64/sys/amd64_set_fsbase.c b/lib/libc/amd64/string/strpbrk.c index 24dddcad48f8..87f587789991 100644 --- a/lib/libc/amd64/sys/amd64_set_fsbase.c +++ b/lib/libc/amd64/string/strpbrk.c @@ -1,11 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause + * Copyright (c) 2023 The FreeBSD Foundation * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without @@ -17,48 +13,31 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * SUCH DAMAGE */ #include <sys/cdefs.h> -#define _WANT_P_OSREL -#include <sys/param.h> -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" -static int -amd64_set_fsbase_cpu(void *addr) -{ +#include <string.h> - wrfsbase((uintptr_t)addr); - return (0); -} +size_t __strcspn(const char *, const char *); -static int -amd64_set_fsbase_syscall(void *addr) +char * +strpbrk(const char *s, const char *charset) { + size_t loc; - return (sysarch(AMD64_SET_FSBASE, &addr)); -} - -DEFINE_UIFUNC(, int, amd64_set_fsbase, (void *)) -{ + loc = __strcspn(s, charset); - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_set_fsbase_cpu); - return (amd64_set_fsbase_syscall); + return (s[loc] == '\0' ? NULL : (char *)&s[loc]); } diff --git a/lib/libc/amd64/string/strrchr.S b/lib/libc/amd64/string/strrchr.S new file mode 100644 index 000000000000..e397bbcd3478 --- /dev/null +++ b/lib/libc/amd64/string/strrchr.S @@ -0,0 +1,209 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled + + .weak rindex + .set rindex, strrchr + +ARCHFUNCS(strrchr) + ARCHFUNC(strrchr, scalar) + ARCHFUNC(strrchr, baseline) +ENDARCHFUNCS(strrchr) + +ARCHENTRY(strrchr, scalar) + mov %edi, %ecx + and $~7, %rdi # align to 8 byte + movzbl %sil, %esi # clear stray high bits + movabs $0x0101010101010101, %r8 + mov (%rdi), %rax # load first word + imul %r8, %rsi # replicate char 8 times + + /* + * Unaligned input: align to 8 bytes. Then proceed the same + * way as with aligned input, but prevent matches before the + * beginning of the string. This is achieved by oring 0x01 + * into each byte of the buffer before the string + */ + shl $3, %ecx + mov %r8, %r10 + shl %cl, %r10 # 0x01 where the string is + xor %r8, %r10 # 0x01 where it is not + neg %r8 # negate 01..01 so we can use lea + movabs $0x8080808080808080, %r9 + + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + or %r10, %rax # ensure str != 0 before string + or %r10, %rcx # ensure str^c != 0 before string + bswap %rcx # in reverse order, to find last match + mov %rdi, %r10 # location of initial mismatch (if any) + xor %r11, %r11 # initial mismatch (none) + add $8, %rdi # advance to next iteration + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jnz 1f # end of string? + + lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + mov %rcx, %r11 # remember mismatch in head + jmp 0f + + /* main loop unrolled twice */ + ALIGN_TEXT +3: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + lea -8(%rdi), %rdx + cmovnz %rdx, %r10 # remember location of current mismatch + cmovnz %rcx, %r11 + +0: mov (%rdi), %rax # str + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + bswap %rcx # in reverse order, to find last match + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jnz 2f # end of string? 
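
The scalar kernel relies on the classic SWAR zero-byte test visible in the lea/not/and sequences above: for a 64-bit word x, (x - 0x01...01) & ~x & 0x80...80 flags the lowest zero byte of x (borrow propagation may also flag bytes above it, so the result is nonzero exactly when x contains a zero byte), and applying the same test to x ^ (c * 0x01...01) finds bytes equal to c instead. As a C sketch (helper names made up):

#include <stdint.h>

#define ONES	0x0101010101010101ULL
#define HIGHS	0x8080808080808080ULL

/* nonzero iff the word x contains a zero byte */
static uint64_t
haszero64(uint64_t x)
{
	return ((x - ONES) & ~x & HIGHS);
}

/* nonzero iff the word x contains the byte c */
static uint64_t
hasbyte64(uint64_t x, unsigned char c)
{
	return (haszero64(x ^ (ONES * c)));
}
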
+ + lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + cmovnz %rdi, %r10 # remember location of current mismatch + cmovnz %rcx, %r11 + + mov 8(%rdi), %rax # str + add $16, %rdi + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + bswap %rcx + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jz 3b # end of string? + + /* NUL found */ +1: sub $8, %rdi # undo advance past buffer +2: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + lea -1(%rax), %rdx + xor %rdx, %rax # mask of bytes in the string + bswap %rdx # in reverse order + and %rdx, %rcx # c found in the tail? + cmovnz %rdi, %r10 + cmovnz %rcx, %r11 + bswap %r11 # unreverse byte order + bsr %r11, %rcx # last location of c in (R10) + shr $3, %rcx # as byte offset + lea (%r10, %rcx, 1), %rax # pointer to match + test %r11, %r11 # was there actually a match? + cmovz %r11, %rax # if not, return null pointer + ret +ARCHEND(strrchr, scalar) + +ARCHENTRY(strrchr, baseline) + mov %edi, %ecx + and $~0xf, %rdi # align to 16 bytes + movdqa (%rdi), %xmm1 + movd %esi, %xmm0 + and $0xf, %ecx # offset from alignment + pxor %xmm2, %xmm2 + mov $-1, %edx + punpcklbw %xmm0, %xmm0 # c -> cc + shl %cl, %edx # bits corresponding to bytes in the string + punpcklwd %xmm0, %xmm0 # cc -> cccc + xor %r8, %r8 # address of latest match + mov $1, %esi # bit mask of latest match + mov %rdi, %r9 # candidate location for next match + add $16, %rdi # advance to next chunk + + /* check for match in head */ + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + and %edx, %ecx # c present in the string? + and %edx, %eax # NUL present in the string? + jnz .Lend2 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rdi), %xmm1 + test %ecx, %ecx # was there a match in the last iter.? + cmovnz %r9, %r8 # remember match if any + cmovnz %ecx, %esi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + test %eax, %eax # end of string in first half? + jnz .Lend + + movdqa 16(%rdi), %xmm1 + test %ecx, %ecx # was there a match in the last iter.? + cmovnz %rdi, %r8 # remember match if any + cmovnz %ecx, %esi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + lea 16(%rdi), %r9 + add $32, %rdi + test %eax, %eax # end of string in second half? + jz 0b + + ALIGN_TEXT +.Lend2: sub $16, %rdi +.Lend: lea -1(%rax), %edx + xor %edx, %eax # mask of bytes in the string + and %eax, %ecx # c found in the tail? + cmovnz %rdi, %r8 + cmovnz %ecx, %esi + bsr %esi, %esi # last location of c in (R8) + lea (%r8, %rsi, 1), %rax # pointer to match + ret +ARCHEND(strrchr, baseline) + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/sys/getcontext.S b/lib/libc/amd64/string/strsep.c index 6860a3cf9bef..9fda56d7e135 100644 --- a/lib/libc/amd64/sys/getcontext.S +++ b/lib/libc/amd64/string/strsep.c @@ -1,6 +1,8 @@ /*- - * Copyright (c) 2003 Peter Wemm <peter@FreeBSD.org> - * All rights reserved. 
+ * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -11,36 +13,45 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * SUCH DAMAGE */ -#include <machine/asm.h> -#include <SYS.h> +#include <sys/cdefs.h> +#include <string.h> + +size_t __strcspn(const char *, const char *); /* - * This has to be magic to handle the multiple returns. - * Otherwise, the setcontext() syscall will return here and we'll - * pop off the return address and go to the *setcontext* call. + * We have a fast strcspn() on amd64. Use it over a direct + * implementation of strsep for better performance. */ - WEAK_REFERENCE(__sys_getcontext, _getcontext) - WEAK_REFERENCE(__sys_getcontext, getcontext) -ENTRY(__sys_getcontext) - movq (%rsp),%rsi /* save getcontext return address */ - mov $SYS_getcontext,%rax - KERNCALL - jb HIDENAME(cerror) - addq $8,%rsp /* remove stale (setcontext) return address */ - jmp *%rsi /* restore return address */ -END(__sys_getcontext) +char * +strsep(char **stringp, const char *delim) +{ + size_t n; + char *s; + + s = *stringp; + if (s == NULL) + return (NULL); + + n = __strcspn(s, delim); + if (s[n] == '\0') + *stringp = NULL; + else { + s[n] = '\0'; + *stringp = s + n + 1; + } - .section .note.GNU-stack,"",%progbits + return (s); +} diff --git a/lib/libc/amd64/string/strspn.S b/lib/libc/amd64/string/strspn.S new file mode 100644 index 000000000000..565330f0c385 --- /dev/null +++ b/lib/libc/amd64/string/strspn.S @@ -0,0 +1,358 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
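
strsep() above now just measures the leading span with the vectorised __strcspn() and patches in the terminator itself; its observable behaviour is unchanged, including the empty tokens produced by adjacent delimiters. A short stand-alone usage example (not part of the change):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char line[] = "a,b,,c";
	char *cp = line, *tok;

	while ((tok = strsep(&cp, ",")) != NULL)
		printf("[%s]\n", tok);	/* prints [a] [b] [] [c] */
	return (0);
}
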
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +ARCHFUNCS(strspn) + ARCHFUNC(strspn, scalar) + NOARCHFUNC + ARCHFUNC(strspn, x86_64_v2) +ENDARCHFUNCS(strspn) + +ARCHENTRY(strspn, scalar) + push %rbp # align stack to enable function call + mov %rsp, %rbp + sub $256, %rsp # allocate space for lookup table + + /* check for special cases */ + movzbl (%rsi), %edx # first character in the set + test %edx, %edx + jz .Lzero # empty set always returns 0 + + movzbl 1(%rsi), %eax # second character in the set + test %eax, %eax + jz .Lsingle + + /* no special case matches -- prepare lookup table */ + xor %r8d, %r8d + mov $28, %ecx +0: mov %r8, (%rsp, %rcx, 8) + mov %r8, 8(%rsp, %rcx, 8) + mov %r8, 16(%rsp, %rcx, 8) + mov %r8, 24(%rsp, %rcx, 8) + sub $4, %ecx + jnc 0b + + movb $1, (%rsp, %rdx, 1) # register first char in set + add $2, %rsi + + /* process remaining chars in set */ + ALIGN_TEXT +0: movb $1, (%rsp, %rax, 1) # register previous char + movzbl (%rsi), %eax # next char in set + test %eax, %eax # end of string? + jz 1f + + movb $1, (%rsp, %rax, 1) + add $2, %rsi + movzbl -1(%rsi), %eax + test %eax, %eax + jnz 0b + +1: mov %rdi, %rax # a copy of the source to iterate over + + /* find mismatch */ + ALIGN_TEXT +0: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + jne 0b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret + + /* empty set never matches */ +.Lzero: xor %eax, %eax + leave + ret + + /* find repeated single character */ + ALIGN_TEXT +.Lsingle: + cmpb %dl, (%rdi, %rax, 1) + jne 1f + + cmpb %dl, 1(%rdi, %rax, 1) + jne 2f + + cmpb %dl, 2(%rdi, %rax, 1) + jne 3f + + cmpb %dl, 3(%rdi, %rax, 1) + lea 4(%rax), %rax + je .Lsingle + + sub $3, %rax +3: inc %rax +2: inc %rax +1: leave + ret +ARCHEND(strspn, scalar) + + /* + * This kernel uses pcmpistri to do the heavy lifting. + * We provide three code paths, depending on set size: + * + * 0--16: one pcmpistri per 16 bytes of input + * 17--32: two pcmpistri per 16 bytes of input + * >=33: fall back to look up table + */ +ARCHENTRY(strspn, x86_64_v2) + push %rbp + mov %rsp, %rbp + sub $256, %rsp + + /* find set size and copy up to 32 bytes to (%rsp) */ + mov %esi, %ecx + and $~0xf, %rsi # align set pointer + movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + and $0xf, %ecx # amount of bytes rsi is past alignment + xor %edx, %edx + pcmpeqb %xmm0, %xmm1 # end of string reached? 
+ movdqa %xmm0, 32(%rsp) # transfer head of set to stack + pmovmskb %xmm1, %eax + shr %cl, %eax # clear out junk before string + test %eax, %eax # end of set reached? + jnz 0f + + movdqa 16(%rsi), %xmm0 # second chunk of the set + mov $16, %edx + sub %ecx, %edx # length of set preceding xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 48(%rsp) + movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 32(%rsi), %xmm0 # third chunk + add $16, %edx + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 64(%rsp) + pmovmskb %xmm1, %eax + test %eax, %eax # still not done? + jz .Lgt32v2 + +0: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set +1: tzcnt %eax, %eax + add %eax, %edx # length of set (excluding NUL byte) + cmp $32, %edx # above 32 bytes? + ja .Lgt32v2 + + /* + * At this point we know that we want to use pcmpistri. + * one last problem obtains: the head of the string is not + * aligned and may cross a cacheline. If this is the case, + * we take the part before the page boundary and repeat the + * last byte to fill up the xmm register. + */ + mov %rdi, %rax # save original string pointer + lea 15(%rdi), %esi # last byte of the head + xor %edi, %esi + test $PAGE_SIZE, %esi # does the head cross a page? + jz 0f + + /* head crosses page: copy to stack to fix up */ + and $~0xf, %rax # align head pointer temporarily + movzbl 15(%rax), %esi # last head byte on the page + movdqa (%rax), %xmm0 + movabs $0x0101010101010101, %r8 + imul %r8, %rsi # repeated 8 times + movdqa %xmm0, (%rsp) # head word on stack + mov %rsi, 16(%rsp) # followed by filler (last byte x8) + mov %rsi, 24(%rsp) + mov %edi, %eax + and $0xf, %eax # offset of head from alignment + add %rsp, %rax # pointer to fake head + +0: movdqu (%rax), %xmm1 # load head (fake or real) + lea 16(%rdi), %rax + and $~0xf, %rax # second 16 bytes of string (aligned) +1: cmp $16, %edx # 16--32 bytes? + ja .Lgt16v2 + + + /* set is 2--16 bytes in size */ + + /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */ + pcmpistri $0x10, %xmm1, %xmm2 # match in head? + jc .Lheadmismatchv2 + + ALIGN_TEXT +0: pcmpistri $0x10, (%rax), %xmm2 + jc 1f # match or end of string? + pcmpistri $0x10, 16(%rax), %xmm2 + lea 32(%rax), %rax + jnc 0b # match or end of string? + + sub $16, %rax # go back to second half +1: sub %rdi, %rax # offset of (%rax) from beginning of string + add %rcx, %rax # prefix length before match/NUL + leave + ret + +.Lheadmismatchv2: + mov %ecx, %eax # prefix length before mismatch/NUL + leave + ret + + /* set is 17--32 bytes in size */ +.Lgt16v2: + movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set + + /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */ + pcmpistrm $0x10, %xmm1, %xmm2 # any mismatch in first half? + movdqa %xmm0, %xmm4 + pcmpistrm $0x10, %xmm1, %xmm3 # any mismatch in the second half? + ptest %xmm0, %xmm4 # any entry that doesn't match either? 
+ jnz 2f + + ALIGN_TEXT +0: movdqa (%rax), %xmm1 + pcmpistrm $0x10, %xmm1, %xmm2 + movdqa %xmm0, %xmm4 + pcmpistrm $0x10, %xmm1, %xmm3 + ptest %xmm0, %xmm4 + jnz 1f + movdqa 16(%rax), %xmm1 + add $32, %rax + pcmpistrm $0x10, %xmm1, %xmm2 + movdqa %xmm0, %xmm4 + pcmpistrm $0x10, %xmm1, %xmm3 + ptest %xmm0, %xmm4 + jz 0b + + sub $16, %rax +1: pand %xmm4, %xmm0 + movd %xmm0, %ecx + sub %rdi, %rax # offset of %xmm1 from beginning of string + tzcnt %ecx, %ecx + add %rcx, %rax # prefix length before match/NUL + leave + ret + + /* mismatch or string end in head */ +2: pand %xmm4, %xmm0 # bit mask of mismatches (end of string counts) + movd %xmm0, %eax + tzcnt %eax, %eax # prefix length before mismatch/NUL + leave + ret + + /* set is >=33 bytes in size */ +.Lgt32v2: + xorps %xmm0, %xmm0 + mov $256-64, %edx + + /* clear out look up table */ +0: movaps %xmm0, (%rsp, %rdx, 1) + movaps %xmm0, 16(%rsp, %rdx, 1) + movaps %xmm0, 32(%rsp, %rdx, 1) + movaps %xmm0, 48(%rsp, %rdx, 1) + sub $64, %edx + jnc 0b + + add %rcx, %rsi # restore string pointer + mov %rdi, %rax # keep a copy of the string + + /* initialise look up table */ + movzbl (%rsi), %ecx # string is known not to be empty + + ALIGN_TEXT +0: movb $1, (%rsp, %rcx, 1) + movzbl 1(%rsi), %ecx + test %ecx, %ecx + jz 1f + + movb $1, (%rsp, %rcx, 1) + movzbl 2(%rsi), %ecx + test %ecx, %ecx + jz 1f + + movb $1, (%rsp, %rcx, 1) + movzbl 3(%rsi), %ecx + add $4, %rsi + test %ecx, %ecx + jz 1f + + movb $1, (%rsp, %rcx, 1) + movzbl (%rsi), %ecx + test %ecx, %ecx + jnz 0b + + /* find match */ + ALIGN_TEXT +1: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + jne 1b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret +ARCHEND(strspn, x86_64_v2) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/timingsafe_bcmp.S b/lib/libc/amd64/string/timingsafe_bcmp.S new file mode 100644 index 000000000000..c003da2ea9a7 --- /dev/null +++ b/lib/libc/amd64/string/timingsafe_bcmp.S @@ -0,0 +1,232 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +ARCHFUNCS(timingsafe_bcmp) + ARCHFUNC(timingsafe_bcmp, scalar) + ARCHFUNC(timingsafe_bcmp, baseline) +ENDARCHFUNCS(timingsafe_bcmp) + +ARCHENTRY(timingsafe_bcmp, scalar) + cmp $16, %rdx # at least 17 bytes to process? + ja .Lgt16 + + cmp $8, %edx # at least 9 bytes to process? + ja .L0916 + + cmp $4, %edx # at least 5 bytes to process? + ja .L0508 + + cmp $2, %edx # at least 3 bytes to process? + ja .L0304 + + test %edx, %edx # buffer empty? + jnz .L0102 + + xor %eax, %eax # empty buffer always matches + ret + +.L0102: movzbl (%rdi), %eax # load 1--2 bytes from first buffer + movzbl -1(%rdi, %rdx, 1), %ecx + xor (%rsi), %al # xor in second buffer + xor -1(%rsi, %rdx, 1), %cl + or %ecx, %eax # mismatch in any of the two? + ret + +.L0304: movzwl (%rdi), %eax + movzwl -2(%rdi, %rdx, 1), %ecx + xor (%rsi), %ax + xor -2(%rsi, %rdx, 1), %cx + or %ecx, %eax + ret + +.L0508: mov (%rdi), %eax + mov -4(%rdi, %rdx, 1), %ecx + xor (%rsi), %eax + xor -4(%rsi, %rdx, 1), %ecx + or %ecx, %eax + ret + +.L0916: mov (%rdi), %rax + mov -8(%rdi, %rdx, 1), %rcx + xor (%rsi), %rax + xor -8(%rsi, %rdx, 1), %rcx + or %rcx, %rax + setnz %al # ensure EAX nonzero even if only + ret # high bits of RAX were set + + /* more than 16 bytes: process buffer in a loop */ +.Lgt16: mov (%rdi), %rax # process first 16 bytes + mov 8(%rdi), %r9 + mov $32, %ecx + xor (%rsi), %rax + xor 8(%rsi), %r9 + or %r9, %rax + + cmp %rdx, %rcx # enough left for a full iteration? + jae .Ltail + + /* main loop processing 16 bytes per iteration */ + ALIGN_TEXT +0: mov -16(%rdi, %rcx, 1), %r8 + mov -8(%rdi, %rcx, 1), %r9 + xor -16(%rsi, %rcx, 1), %r8 + xor -8(%rsi, %rcx, 1), %r9 + add $16, %rcx + or %r9, %r8 + or %r8, %rax + + cmp %rdx, %rcx + jb 0b + + /* process last 16 bytes */ +.Ltail: mov -16(%rdi, %rdx, 1), %r8 + mov -8(%rdi, %rdx, 1), %r9 + xor -16(%rsi, %rdx, 1), %r8 + xor -8(%rsi, %rdx, 1), %r9 + or %r9, %r8 + or %r8, %rax + setnz %al + ret +ARCHEND(timingsafe_bcmp, scalar) + +ARCHENTRY(timingsafe_bcmp, baseline) + cmp $32, %rdx # at least 33 bytes to process? + ja .Lgt32b + + cmp $16, %edx # at least 17 bytes to process? + ja .L1732b + + cmp $8, %edx # at least 9 bytes to process? + ja .L0916b + + cmp $4, %edx # at least 5 bytes to process? + ja .L0508b + + cmp $2, %edx # at least 3 bytes to process? + ja .L0304b + + test %edx, %edx # buffer empty? + jnz .L0102b + + xor %eax, %eax # empty buffer always matches + ret + +.L0102b: + movzbl (%rdi), %eax # load 1--2 bytes from first buffer + movzbl -1(%rdi, %rdx, 1), %ecx + xor (%rsi), %al # xor in second buffer + xor -1(%rsi, %rdx, 1), %cl + or %ecx, %eax # mismatch in any of the two? 
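
Both timingsafe_bcmp() kernels follow the usual constant-time pattern: XOR corresponding bytes, OR every difference into a single accumulator, and only inspect the accumulator at the end, so the running time depends on the length alone and never on where (or whether) the buffers differ. The portable shape of that contract, as a sketch (timingsafe_bcmp_ref is a made-up name):

#include <stddef.h>

static int
timingsafe_bcmp_ref(const void *a, const void *b, size_t n)
{
	const unsigned char *p = a, *q = b;
	unsigned char acc = 0;
	size_t i;

	for (i = 0; i < n; i++)
		acc |= p[i] ^ q[i];	/* no data-dependent branches */

	return (acc != 0);		/* 0 if equal, nonzero otherwise */
}
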
+ ret + +.L0304b: + movzwl (%rdi), %eax + movzwl -2(%rdi, %rdx, 1), %ecx + xor (%rsi), %ax + xor -2(%rsi, %rdx, 1), %cx + or %ecx, %eax + ret + +.L0508b: + mov (%rdi), %eax + mov -4(%rdi, %rdx, 1), %ecx + xor (%rsi), %eax + xor -4(%rsi, %rdx, 1), %ecx + or %ecx, %eax + ret + +.L0916b: + mov (%rdi), %rax + mov -8(%rdi, %rdx, 1), %rcx + xor (%rsi), %rax + xor -8(%rsi, %rdx, 1), %rcx + or %rcx, %rax + setnz %al # ensure EAX nonzero even if only + ret # high bits of RAX were set + +.L1732b: + movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm2 + movdqu -16(%rdi, %rdx, 1), %xmm1 + movdqu -16(%rsi, %rdx, 1), %xmm3 + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm0 + pmovmskb %xmm0, %eax # 1 where equal + xor $0xffff, %eax # 1 where not equal + ret + + /* more than 32 bytes: process buffer in a loop */ +.Lgt32b: + movdqu (%rdi), %xmm4 + movdqu (%rsi), %xmm2 + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm3 + mov $64, %ecx + pcmpeqb %xmm2, %xmm4 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm4 + cmp %rdx, %rcx # enough left for a full iteration? + jae .Ltailb + + /* main loop processing 32 bytes per iteration */ + ALIGN_TEXT +0: movdqu -32(%rdi, %rcx, 1), %xmm0 + movdqu -32(%rsi, %rcx, 1), %xmm2 + movdqu -16(%rdi, %rcx, 1), %xmm1 + movdqu -16(%rsi, %rcx, 1), %xmm3 + add $32, %rcx + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm0 + pand %xmm0, %xmm4 + cmp %rdx, %rcx + jb 0b + + /* process last 32 bytes */ +.Ltailb: + movdqu -32(%rdi, %rdx, 1), %xmm0 + movdqu -32(%rsi, %rdx, 1), %xmm2 + movdqu -16(%rdi, %rdx, 1), %xmm1 + movdqu -16(%rsi, %rdx, 1), %xmm3 + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm0 + pand %xmm4, %xmm0 + pmovmskb %xmm0, %eax + xor $0xffff, %eax + ret +ARCHEND(timingsafe_bcmp, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/timingsafe_memcmp.S b/lib/libc/amd64/string/timingsafe_memcmp.S new file mode 100644 index 000000000000..3f1eccdbd640 --- /dev/null +++ b/lib/libc/amd64/string/timingsafe_memcmp.S @@ -0,0 +1,145 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +/* int timingsafe_memcmp(const void *rdi, const void *rsi, size_t rdx) */ +ENTRY(timingsafe_memcmp) + cmp $16, %rdx # at least 17 bytes to process? + ja .Lgt16 + + cmp $8, %edx # at least 9 bytes to process? + ja .L0916 + + cmp $4, %edx # at least 5 bytes to process? + ja .L0508 + + cmp $2, %edx # at least 3 bytes to process? + ja .L0304 + + test %edx, %edx # buffer empty? + jnz .L0102 + + xor %eax, %eax # empty buffer always matches + ret + +.L0102: movzbl -1(%rdi, %rdx, 1), %eax # load 1--2 bytes from first buffer + movzbl -1(%rsi, %rdx, 1), %ecx + mov (%rdi), %ah # in big endian + mov (%rsi), %ch + sub %ecx, %eax + ret + +.L0304: movzwl -2(%rdi, %rdx, 1), %ecx + movzwl -2(%rsi, %rdx, 1), %edx + movzwl (%rdi), %eax + movzwl (%rsi), %esi + bswap %ecx # convert to big endian + bswap %edx # dito for edx, (e)ax, and (e)si + rol $8, %ax # ROLW is used here so the upper two + rol $8, %si # bytes stay clear, allowing us to + sub %edx, %ecx # save a SBB compared to .L0508 + sbb %esi, %eax + or %eax, %ecx # nonzero if not equal + setnz %al + ret + +.L0508: mov -4(%rdi, %rdx, 1), %ecx + mov -4(%rsi, %rdx, 1), %edx + mov (%rdi), %edi + mov (%rsi), %esi + bswap %ecx # compare in big endian + bswap %edx + bswap %edi + bswap %esi + sub %edx, %ecx + sbb %esi, %edi + sbb %eax, %eax # -1 if less, 0 if greater or equal + or %edi, %ecx # nonzero if not equal + setnz %al # negative if <, 0 if =, 1 if > + ret + +.L0916: mov -8(%rdi, %rdx, 1), %rcx + mov -8(%rsi, %rdx, 1), %rdx + mov (%rdi), %rdi + mov (%rsi), %rsi + bswap %rcx # compare in big endian + bswap %rdx + bswap %rdi + bswap %rsi + sub %rdx, %rcx + sbb %rsi, %rdi + sbb %eax, %eax # -1 if less, 0 if greater or equal + or %rdi, %rcx # nonzero if not equal + setnz %al # negative if <, 0 if =, 1 if > + ret + + /* compare 17+ bytes */ +.Lgt16: mov (%rdi), %r8 # process first 16 bytes + mov (%rsi), %r9 + mov $32, %ecx + cmp %r8, %r9 # mismatch in head? + cmove 8(%rdi), %r8 # if not, try second pair + cmove 8(%rsi), %r9 + cmp %rdx, %rcx + jae .Ltail + + /* main loop processing 16 bytes per iteration */ + ALIGN_TEXT +0: mov -16(%rdi, %rcx, 1), %r10 + mov -16(%rsi, %rcx, 1), %r11 + cmp %r10, %r11 # mismatch in first pair? + cmove -8(%rdi, %rcx, 1), %r10 # if not, try second pair + cmove -8(%rsi, %rcx, 1), %r11 + cmp %r8, %r9 # was there a mismatch previously? 
+ cmove %r10, %r8 # apply new pair if there was not + cmove %r11, %r9 + add $16, %rcx + cmp %rdx, %rcx + jb 0b + +.Ltail: mov -8(%rdi, %rdx, 1), %r10 + mov -8(%rsi, %rdx, 1), %r11 + cmp %r8, %r9 + cmove -16(%rdi, %rdx, 1), %r8 + cmove -16(%rsi, %rdx, 1), %r9 + bswap %r10 # compare in big endian + bswap %r11 + bswap %r8 + bswap %r9 + sub %r11, %r10 + sbb %r9, %r8 + sbb %eax, %eax # -1 if less, 0 if greater or equal + or %r10, %r8 # nonzero if not equal + setnz %al # negative if <, 0 if =, 1 if > + ret +END(timingsafe_memcmp) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/sys/Makefile.inc b/lib/libc/amd64/sys/Makefile.inc deleted file mode 100644 index 43af1d2a85a2..000000000000 --- a/lib/libc/amd64/sys/Makefile.inc +++ /dev/null @@ -1,12 +0,0 @@ -# from: Makefile.inc,v 1.1 1993/09/03 19:04:23 jtc Exp - -SRCS+= \ - amd64_get_fsbase.c \ - amd64_get_gsbase.c \ - amd64_set_fsbase.c \ - amd64_set_gsbase.c - -MDASM= vfork.S cerror.S getcontext.S - -# Don't generate default code for these syscalls: -NOASM+= sbrk.o vfork.o diff --git a/lib/libc/amd64/sys/amd64_get_fsbase.c b/lib/libc/amd64/sys/amd64_get_fsbase.c deleted file mode 100644 index 4784bb0baf42..000000000000 --- a/lib/libc/amd64/sys/amd64_get_fsbase.c +++ /dev/null @@ -1,64 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> -#define _WANT_P_OSREL -#include <sys/param.h> -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" - -static int -amd64_get_fsbase_cpu(void **addr) -{ - - *addr = (void *)rdfsbase(); - return (0); -} - -static int -amd64_get_fsbase_syscall(void **addr) -{ - - return (sysarch(AMD64_GET_FSBASE, addr)); -} - -DEFINE_UIFUNC(, int, amd64_get_fsbase, (void **)) -{ - - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_get_fsbase_cpu); - return (amd64_get_fsbase_syscall); -} diff --git a/lib/libc/amd64/sys/amd64_get_gsbase.c b/lib/libc/amd64/sys/amd64_get_gsbase.c deleted file mode 100644 index c81773c4b78c..000000000000 --- a/lib/libc/amd64/sys/amd64_get_gsbase.c +++ /dev/null @@ -1,64 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -#define _WANT_P_OSREL -#include <sys/param.h> -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" - -static int -amd64_get_gsbase_cpu(void **addr) -{ - - *addr = (void *)rdgsbase(); - return (0); -} - -static int -amd64_get_gsbase_syscall(void **addr) -{ - - return (sysarch(AMD64_GET_GSBASE, addr)); -} - -DEFINE_UIFUNC(, int, amd64_get_gsbase, (void **)) -{ - - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_get_gsbase_cpu); - return (amd64_get_gsbase_syscall); -} diff --git a/lib/libc/amd64/sys/cerror.S b/lib/libc/amd64/sys/cerror.S deleted file mode 100644 index d0b11888562e..000000000000 --- a/lib/libc/amd64/sys/cerror.S +++ /dev/null @@ -1,58 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(SYSLIBC_SCCS) && !defined(lint) - .asciz "@(#)cerror.s 5.1 (Berkeley) 4/23/90" -#endif /* SYSLIBC_SCCS and not lint */ -#include <machine/asm.h> -#include "SYS.h" - - .globl HIDENAME(cerror) - .hidden HIDENAME(cerror) - - /* - * The __error() function is thread aware. For non-threaded - * programs and the initial thread in threaded programs, - * it returns a pointer to the global errno variable. - */ - .globl CNAME(__error) - .type CNAME(__error),@function -HIDENAME(cerror): - pushq %rax - call PIC_PLT(CNAME(__error)) - popq %rcx - movl %ecx,(%rax) - movq $-1,%rax - movq $-1,%rdx - ret - - .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/sys/vfork.S b/lib/libc/amd64/sys/vfork.S deleted file mode 100644 index 11faadc6b310..000000000000 --- a/lib/libc/amd64/sys/vfork.S +++ /dev/null @@ -1,52 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(SYSLIBC_SCCS) && !defined(lint) - .asciz "@(#)Ovfork.s 5.1 (Berkeley) 4/23/90" -#endif /* SYSLIBC_SCCS and not lint */ -#include <machine/asm.h> -#include "SYS.h" - - WEAK_REFERENCE(__sys_vfork, _vfork) - WEAK_REFERENCE(__sys_vfork, vfork) -ENTRY(__sys_vfork) - popq %rsi /* fetch return address (%rsi preserved) */ - mov $SYS_vfork,%rax - KERNCALL - jb 1f - jmp *%rsi -1: - pushq %rsi - jmp HIDENAME(cerror) -END(__sys_vfork) - - .section .note.GNU-stack,"",%progbits
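For readers following the new timingsafe_bcmp kernels added above, here is a minimal C sketch of the constant-time idea they implement; the name timingsafe_bcmp_ref and the plain byte-by-byte loop are illustrative assumptions for exposition, not the committed libc code. Each pair of bytes is XORed and the results are ORed into a sticky accumulator, so there are no data-dependent branches and the running time depends only on the length.

/*
 * Illustrative sketch only: timingsafe_bcmp_ref is a hypothetical name,
 * not the routine added by this change.  It mirrors the scalar kernel's
 * strategy of accumulating XORed differences with OR instead of
 * branching on a mismatch.
 */
#include <stddef.h>

static int
timingsafe_bcmp_ref(const void *a, const void *b, size_t len)
{
	const unsigned char *p = a;
	const unsigned char *q = b;
	unsigned char acc = 0;
	size_t i;

	for (i = 0; i < len; i++)
		acc |= p[i] ^ q[i];	/* stays nonzero once any byte differs */

	return (acc != 0);		/* 0 iff the buffers match */
}

The SSE baseline variant reaches the same result 16 bytes at a time with PCMPEQB/PAND followed by a final PMOVMSKB, while timingsafe_memcmp additionally byte-swaps each quadword before the SUB/SBB chain so that the sign of the result reflects the first differing byte, matching memcmp's ordering contract.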