Diffstat (limited to 'lib/libc/amd64')
64 files changed, 4402 insertions, 690 deletions
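The bulk of this change adds SIMD-accelerated string routines dispatched through the new amd64_archlevel framework: each routine is exported as a GNU indirect function whose resolver (__archlevel_resolve) picks the best implementation allowed by the CPU's feature bits and the ARCHLEVEL environment variable. As a rough, self-contained C sketch of that ifunc dispatch idea (illustrative only, not part of the patch; the my_strlen names and the SSE4.2 check are hypothetical):

/* build: cc -O2 ifunc_demo.c  (GCC or Clang, ELF targets) */
#include <cpuid.h>
#include <stddef.h>
#include <stdio.h>

/* two stand-in implementations of the same routine */
static size_t
my_strlen_scalar(const char *s)
{
	const char *p = s;

	while (*p != '\0')
		p++;

	return (p - s);
}

static size_t
my_strlen_fast(const char *s)		/* stand-in for a SIMD version */
{
	return (my_strlen_scalar(s));
}

/*
 * The resolver runs while the dynamic linker processes relocations, so
 * it must not rely on libc being initialised -- the same reason
 * amd64_archlevel.c below avoids getenv() and strcmp().  Here a plain
 * CPUID check stands in for the archlevel table lookup.
 */
static size_t (*
resolve_my_strlen(void))(const char *)
{
	unsigned eax, ebx, ecx, edx;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_SSE4_2))
		return (my_strlen_fast);

	return (my_strlen_scalar);
}

size_t my_strlen(const char *)
    __attribute__((ifunc("resolve_my_strlen")));

int
main(void)
{
	printf("%zu\n", my_strlen("hello"));
	return (0);
}

The assembly macros added in amd64_archlevel.h do the equivalent with a per-function table of 32-bit offsets instead of absolute pointers, which avoids load-time relocations; the remainder of this page is the unmodified diff.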
diff --git a/lib/libc/amd64/Makefile.inc b/lib/libc/amd64/Makefile.inc index cd8f0f121ea6..1e5bebcab52f 100644 --- a/lib/libc/amd64/Makefile.inc +++ b/lib/libc/amd64/Makefile.inc @@ -1,4 +1,3 @@ -# $FreeBSD$ # # Machine dependent definitions for the amd64 architecture. # diff --git a/lib/libc/amd64/SYS.h b/lib/libc/amd64/SYS.h deleted file mode 100644 index 3adc0b7fb655..000000000000 --- a/lib/libc/amd64/SYS.h +++ /dev/null @@ -1,54 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-3-Clause - * - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)SYS.h 5.5 (Berkeley) 5/7/91 - * $FreeBSD$ - */ - -#include <sys/syscall.h> -#include <machine/asm.h> - -#define RSYSCALL(name) ENTRY(__sys_##name); \ - WEAK_REFERENCE(__sys_##name, name); \ - WEAK_REFERENCE(__sys_##name, _##name); \ - mov $SYS_##name,%eax; KERNCALL; \ - jb HIDENAME(cerror); ret; \ - END(__sys_##name) - -#define PSEUDO(name) ENTRY(__sys_##name); \ - WEAK_REFERENCE(__sys_##name, _##name); \ - mov $SYS_##name,%eax; KERNCALL; \ - jb HIDENAME(cerror); ret; \ - END(__sys_##name) - -#define KERNCALL movq %rcx, %r10; syscall diff --git a/lib/libc/amd64/Symbol.map b/lib/libc/amd64/Symbol.map index 297792eb0e82..36f54de24fbd 100644 --- a/lib/libc/amd64/Symbol.map +++ b/lib/libc/amd64/Symbol.map @@ -1,21 +1,12 @@ /* - * $FreeBSD$ - */ - -/* * This only needs to contain symbols that are not listed in * symbol maps from other parts of libc (i.e., not found in * stdlib/Symbol.map, string/Symbol.map, sys/Symbol.map, ...). 
*/ FBSD_1.0 { - /* PSEUDO syscalls */ - _exit; - .mcount; - _setjmp; - _longjmp; - fabs; __flt_rounds; + brk; fpgetmask; fpgetprec; fpgetround; @@ -23,32 +14,7 @@ FBSD_1.0 { fpsetmask; fpsetprec; fpsetround; - __infinity; - __nan; - makecontext; - rfork_thread; - setjmp; - longjmp; - sigsetjmp; - siglongjmp; - htonl; - htons; - ntohl; - ntohs; - amd64_get_fsbase; - amd64_get_gsbase; - amd64_set_fsbase; - amd64_set_gsbase; - brk; sbrk; - vfork; -}; - -FBSD_1.6 { - x86_pkru_get_perm; - x86_pkru_set_perm; - x86_pkru_protect_range; - x86_pkru_unprotect_range; }; /* @@ -57,15 +23,10 @@ FBSD_1.6 { * */ FBSDprivate_1.0 { - /* PSEUDO syscalls */ - _getlogin; - ___longjmp; - __makecontext; __longjmp; __signalcontext; signalcontext; __siglongjmp; _brk; - _vfork; }; diff --git a/lib/libc/amd64/_fpmath.h b/lib/libc/amd64/_fpmath.h index 8be7b7dbaf7d..d56138f48ba0 100644 --- a/lib/libc/amd64/_fpmath.h +++ b/lib/libc/amd64/_fpmath.h @@ -1,5 +1,5 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002, 2003 David Schultz <das@FreeBSD.ORG> * All rights reserved. @@ -24,8 +24,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ union IEEEl2bits { diff --git a/lib/libc/amd64/amd64_archlevel.h b/lib/libc/amd64/amd64_archlevel.h new file mode 100644 index 000000000000..047beb9855d1 --- /dev/null +++ b/lib/libc/amd64/amd64_archlevel.h @@ -0,0 +1,90 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +/* must be macros so they can be accessed from assembly */ +#define X86_64_SCALAR 0 /* disable SIMD optimisations */ +#define X86_64_BASELINE 1 /* CMOV, CX8, FPU, FXSR, MMX, OSFXSR, SSE, SSE2 */ +#define X86_64_V2 2 /* CMPXCHG16B, LAHF-SAHF, POPCNT, SSE3, SSSE3, SSE4_1, SSE4_2 */ +#define X86_64_V3 3 /* AVX, AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE, OSXSAVE */ +#define X86_64_V4 4 /* AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL */ + +#define X86_64_MAX X86_64_V4 /* highest supported architecture level */ +#define X86_64_UNDEFINED -1 /* architecture level not set yet */ + +#ifndef __ASSEMBLER__ +#include <dlfcn.h> + +dlfunc_t __archlevel_resolve(u_int, u_int, u_int, u_int, + int32_t[X86_64_MAX + 1]) __hidden; +#else +#include <machine/asm.h> + +#define ARCHRESOLVE(func) \ + .globl CNAME(func); \ + .type CNAME(func), @gnu_indirect_function; \ + .set CNAME(func), __CONCAT(func,_resolver); \ + ARCHENTRY(func, resolver); \ + lea __CONCAT(func,_funcs)(%rip), %r8; \ + jmp CNAME(__archlevel_resolve); \ + ARCHEND(func, resolver) + +/* + * The func_funcs array stores the location of the implementations + * as the distance from the func_funcs array to the function. Due + * to compiling for the medium code model, a 32 bit integer suffices + * to hold the distance. + * + * Doing it this way both saves storage and avoids giving rtld + * relocations to process at load time. + */ +#define ARCHFUNCS(func) \ + ARCHRESOLVE(func); \ + .section .rodata; \ + .align 4; \ + __CONCAT(func,_funcs): + +#define NOARCHFUNC \ + .4byte 0 + +#define ARCHFUNC(func, level) \ + .4byte __CONCAT(__CONCAT(func,_),level) - __CONCAT(func,_funcs) + +#define ENDARCHFUNCS(func) \ + .zero 4*(X86_64_MAX+1)-(.-__CONCAT(func,_funcs)); \ + .size __CONCAT(func,_funcs), .-__CONCAT(func,_funcs) + +#define ARCHENTRY(func, level) \ + _START_ENTRY; \ + .type __CONCAT(__CONCAT(func,_),level), @function; \ + __CONCAT(__CONCAT(func,_),level):; \ + .cfi_startproc + +#define ARCHEND(func, level) \ + END(__CONCAT(__CONCAT(func,_),level)) + +#endif /* __ASSEMBLER__ */ diff --git a/lib/libc/amd64/arith.h b/lib/libc/amd64/arith.h index ecb1a33fccb0..9c695c74a33e 100644 --- a/lib/libc/amd64/arith.h +++ b/lib/libc/amd64/arith.h @@ -1,7 +1,5 @@ /* * MD header for contrib/gdtoa - * - * $FreeBSD$ */ /* diff --git a/lib/libc/amd64/gd_qnan.h b/lib/libc/amd64/gd_qnan.h index 39923860db07..3387ae94a862 100644 --- a/lib/libc/amd64/gd_qnan.h +++ b/lib/libc/amd64/gd_qnan.h @@ -3,8 +3,6 @@ * * This file can be generated by compiling and running contrib/gdtoa/qnan.c * on the target architecture after arith.h has been generated. 
- * - * $FreeBSD$ */ #define f_QNAN 0x7fc00000 diff --git a/lib/libc/amd64/gen/Makefile.inc b/lib/libc/amd64/gen/Makefile.inc index f904e1e71eb0..aaffcb0481f1 100644 --- a/lib/libc/amd64/gen/Makefile.inc +++ b/lib/libc/amd64/gen/Makefile.inc @@ -1,7 +1,4 @@ -# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 -# $FreeBSD$ - -SRCS+= _setjmp.S rfork_thread.S setjmp.S sigsetjmp.S \ +SRCS+= _setjmp.S setjmp.S sigsetjmp.S \ fabs.S \ infinity.c ldexp.c makecontext.c signalcontext.c \ flt_rounds.c fpgetmask.c fpsetmask.c fpgetprec.c fpsetprec.c \ diff --git a/lib/libc/amd64/gen/_setjmp.S b/lib/libc/amd64/gen/_setjmp.S index 0e1e5f8f265d..93b27de49ea0 100644 --- a/lib/libc/amd64/gen/_setjmp.S +++ b/lib/libc/amd64/gen/_setjmp.S @@ -30,12 +30,7 @@ * SUCH DAMAGE. */ -#if defined(LIBC_SCCS) && !defined(lint) - .asciz "@(#)_setjmp.s 5.1 (Berkeley) 4/23/90" -#endif /* LIBC_SCCS and not lint */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - /* * C library -- _setjmp, _longjmp * @@ -48,8 +43,8 @@ __FBSDID("$FreeBSD$"); ENTRY(_setjmp) movq %rdi,%rax - movq 0(%rsp),%rdx /* retval */ - movq %rdx, 0(%rax) /* 0; retval */ + movq 0(%rsp),%rdx /* return address */ + movq %rdx, 0(%rax) /* 0; return address */ movq %rbx, 8(%rax) /* 1; rbx */ movq %rsp,16(%rax) /* 2; rsp */ movq %rbp,24(%rax) /* 3; rbp */ @@ -88,7 +83,7 @@ ENTRY(___longjmp) testq %rax,%rax jnz 1f incq %rax -1: movq %rcx,0(%rsp) +1: movq %rcx,0(%rsp) /* return address */ ret END(___longjmp) diff --git a/lib/libc/amd64/gen/fabs.S b/lib/libc/amd64/gen/fabs.S index 38e67ab03873..8c7e4464d1f5 100644 --- a/lib/libc/amd64/gen/fabs.S +++ b/lib/libc/amd64/gen/fabs.S @@ -25,8 +25,6 @@ */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - /* * Return floating point absolute value of a double. */ diff --git a/lib/libc/amd64/gen/flt_rounds.c b/lib/libc/amd64/gen/flt_rounds.c index c0ce81f6dfa9..cd7e501af5af 100644 --- a/lib/libc/amd64/gen/flt_rounds.c +++ b/lib/libc/amd64/gen/flt_rounds.c @@ -3,9 +3,6 @@ * Public domain. 
*/ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include <float.h> static const int map[] = { diff --git a/lib/libc/amd64/gen/fpgetmask.c b/lib/libc/amd64/gen/fpgetmask.c index 03bb2741ef6a..a1ab8a79c2ea 100644 --- a/lib/libc/amd64/gen/fpgetmask.c +++ b/lib/libc/amd64/gen/fpgetmask.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/fpgetprec.c b/lib/libc/amd64/gen/fpgetprec.c index 22d21480566a..eed244f3c36f 100644 --- a/lib/libc/amd64/gen/fpgetprec.c +++ b/lib/libc/amd64/gen/fpgetprec.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/fpgetround.c b/lib/libc/amd64/gen/fpgetround.c index 9c066b149ca2..6d79bb9d30c6 100644 --- a/lib/libc/amd64/gen/fpgetround.c +++ b/lib/libc/amd64/gen/fpgetround.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/fpgetsticky.c b/lib/libc/amd64/gen/fpgetsticky.c index c3acb91d63e5..842fb2cdf748 100644 --- a/lib/libc/amd64/gen/fpgetsticky.c +++ b/lib/libc/amd64/gen/fpgetsticky.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/fpsetmask.c b/lib/libc/amd64/gen/fpsetmask.c index 996e167ac1d9..808144dce4b8 100644 --- a/lib/libc/amd64/gen/fpsetmask.c +++ b/lib/libc/amd64/gen/fpsetmask.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/fpsetprec.c b/lib/libc/amd64/gen/fpsetprec.c index 5898de7e0e82..8fd0249652bd 100644 --- a/lib/libc/amd64/gen/fpsetprec.c +++ b/lib/libc/amd64/gen/fpsetprec.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/fpsetround.c b/lib/libc/amd64/gen/fpsetround.c index 6f13367510d6..a069ec2e372c 100644 --- a/lib/libc/amd64/gen/fpsetround.c +++ b/lib/libc/amd64/gen/fpsetround.c @@ -1,4 +1,3 @@ -/* $FreeBSD$ */ #define __IEEEFP_NOINLINES__ 1 #include <ieeefp.h> diff --git a/lib/libc/amd64/gen/infinity.c b/lib/libc/amd64/gen/infinity.c index 464b4029b35a..bc05708abd2b 100644 --- a/lib/libc/amd64/gen/infinity.c +++ b/lib/libc/amd64/gen/infinity.c @@ -2,9 +2,6 @@ * infinity.c */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include <math.h> /* bytes for +Infinity on a 387 */ diff --git a/lib/libc/amd64/gen/makecontext.c b/lib/libc/amd64/gen/makecontext.c index 720b48b6a97d..c5767c9d5d75 100644 --- a/lib/libc/amd64/gen/makecontext.c +++ b/lib/libc/amd64/gen/makecontext.c @@ -1,5 +1,5 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2003 Marcel Moolenaar * All rights reserved. @@ -26,9 +26,6 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include <sys/types.h> #include <sys/ucontext.h> #include <stdarg.h> diff --git a/lib/libc/amd64/gen/rfork_thread.S b/lib/libc/amd64/gen/rfork_thread.S deleted file mode 100644 index d08700fb4fe4..000000000000 --- a/lib/libc/amd64/gen/rfork_thread.S +++ /dev/null @@ -1,95 +0,0 @@ -/*- - * Copyright (c) 2000 Peter Wemm <peter@FreeBSD.org> - * Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <machine/asm.h> -__FBSDID("$FreeBSD$"); - -/* - * With thanks to John Dyson for the original version of this. - */ - -#include <SYS.h> - -/* - * %edi %rsi %rdx %rcx - * rfork_thread(flags, stack_addr, start_fnc, start_arg); - * - * flags: Flags to rfork system call. See rfork(2). - * stack_addr: Top of stack for thread. - * start_fnc: Address of thread function to call in child. - * start_arg: Argument to pass to the thread function in child. - */ - -ENTRY(rfork_thread) - pushq %rbx - pushq %r12 - movq %rdx, %rbx - movq %rcx, %r12 - - /* - * Prepare and execute the thread creation syscall - */ - movq $SYS_rfork, %rax - KERNCALL - jb 2f - - /* - * Check to see if we are in the parent or child - */ - cmpl $0, %edx - jnz 1f - popq %r12 - popq %rbx - ret - - /* - * If we are in the child (new thread), then - * set-up the call to the internal subroutine. If it - * returns, then call __exit. - */ -1: - movq %rsi, %rsp - movq %r12, %rdi - call *%rbx - movl %eax, %edi - - /* - * Exit system call - */ - movq $SYS_exit, %rax - KERNCALL - - /* - * Branch here if the thread creation fails: - */ -2: - popq %r12 - popq %rbx - jmp HIDENAME(cerror) -END(rfork_thread) - - .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/gen/setjmp.S b/lib/libc/amd64/gen/setjmp.S index 90eff9e9ed6b..54939f123807 100644 --- a/lib/libc/amd64/gen/setjmp.S +++ b/lib/libc/amd64/gen/setjmp.S @@ -30,12 +30,7 @@ * SUCH DAMAGE. 
*/ -#if defined(LIBC_SCCS) && !defined(lint) - .asciz "@(#)setjmp.s 5.1 (Berkeley) 4/23/90" -#endif /* LIBC_SCCS and not lint */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - /* * C library -- _setjmp, _longjmp * @@ -58,8 +53,8 @@ ENTRY(setjmp) call __libc_sigprocmask popq %rdi movq %rdi,%rcx - movq 0(%rsp),%rdx /* retval */ - movq %rdx, 0(%rcx) /* 0; retval */ + movq 0(%rsp),%rdx /* return address */ + movq %rdx, 0(%rcx) /* 0; return address */ movq %rbx, 8(%rcx) /* 1; rbx */ movq %rsp,16(%rcx) /* 2; rsp */ movq %rbp,24(%rcx) /* 3; rbp */ @@ -109,7 +104,7 @@ ENTRY(__longjmp) testq %rax,%rax jnz 1f incq %rax -1: movq %rcx,0(%rsp) +1: movq %rcx,0(%rsp) /* return address */ ret END(__longjmp) diff --git a/lib/libc/amd64/gen/signalcontext.c b/lib/libc/amd64/gen/signalcontext.c index a489a14b3012..a97dd158542a 100644 --- a/lib/libc/amd64/gen/signalcontext.c +++ b/lib/libc/amd64/gen/signalcontext.c @@ -1,5 +1,5 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2003 Marcel Moolenaar * All rights reserved. @@ -26,9 +26,6 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include <sys/types.h> #include <sys/ucontext.h> #include <signal.h> diff --git a/lib/libc/amd64/gen/sigsetjmp.S b/lib/libc/amd64/gen/sigsetjmp.S index 447134122019..c4775b1c2bea 100644 --- a/lib/libc/amd64/gen/sigsetjmp.S +++ b/lib/libc/amd64/gen/sigsetjmp.S @@ -28,8 +28,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * @(#)setjmp.s 5.1 (Berkeley) 4/23/90" */ #if defined(LIBC_SCCS) && !defined(lint) @@ -37,8 +35,6 @@ .asciz "$Id: sigsetjmp.S,v 1.1 1993/12/05 13:01:05 ats Exp $" #endif /* LIBC_SCCS and not lint */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - #include "SYS.h" /*- @@ -66,8 +62,8 @@ ENTRY(sigsetjmp) call __libc_sigprocmask popq %rdi 2: movq %rdi,%rcx - movq 0(%rsp),%rdx /* retval */ - movq %rdx, 0(%rcx) /* 0; retval */ + movq 0(%rsp),%rdx /* return address */ + movq %rdx, 0(%rcx) /* 0; return address */ movq %rbx, 8(%rcx) /* 1; rbx */ movq %rsp,16(%rcx) /* 2; rsp */ movq %rbp,24(%rcx) /* 3; rbp */ @@ -109,7 +105,7 @@ ENTRY(__siglongjmp) testq %rax,%rax jnz 1f incq %rax -1: movq %rcx,0(%rsp) +1: movq %rcx,0(%rsp) /* return address */ ret END(__siglongjmp) diff --git a/lib/libc/amd64/stdlib/Makefile.inc b/lib/libc/amd64/stdlib/Makefile.inc index 5b7e675ede0d..568f8eb4afa7 100644 --- a/lib/libc/amd64/stdlib/Makefile.inc +++ b/lib/libc/amd64/stdlib/Makefile.inc @@ -1,4 +1 @@ -# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 -# $FreeBSD$ - MDSRCS+=div.S ldiv.S lldiv.S diff --git a/lib/libc/amd64/stdlib/div.S b/lib/libc/amd64/stdlib/div.S index 366010c3d208..f15ef2ffc5bb 100644 --- a/lib/libc/amd64/stdlib/div.S +++ b/lib/libc/amd64/stdlib/div.S @@ -6,8 +6,6 @@ */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - ENTRY(div) movl %edi,%eax cltd diff --git a/lib/libc/amd64/stdlib/ldiv.S b/lib/libc/amd64/stdlib/ldiv.S index f11472c671dc..6c1e2292d7f2 100644 --- a/lib/libc/amd64/stdlib/ldiv.S +++ b/lib/libc/amd64/stdlib/ldiv.S @@ -6,8 +6,6 @@ */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - ENTRY(ldiv) movq %rdi,%rax cqto diff --git a/lib/libc/amd64/stdlib/lldiv.S b/lib/libc/amd64/stdlib/lldiv.S index 4dab0fdd5278..744cfefd064b 100644 --- a/lib/libc/amd64/stdlib/lldiv.S +++ b/lib/libc/amd64/stdlib/lldiv.S @@ -6,8 +6,6 @@ */ #include <machine/asm.h> 
-__FBSDID("$FreeBSD$"); - ENTRY(lldiv) movq %rdi,%rax cqto diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index cb370bc6be1c..d5bb646c5c53 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -1,12 +1,36 @@ -# $FreeBSD$ - MDSRCS+= \ + amd64_archlevel.c \ bcmp.S \ + memchr.S \ memcmp.S \ + memccpy.S \ memcpy.S \ memmove.S \ + memrchr.S \ memset.S \ + stpcpy.S \ + stpncpy.S \ strcat.S \ + strchrnul.S \ strcmp.S \ + strcpy.c \ + strcspn.S \ + strlcat.c \ + strlcpy.S \ strlen.S \ - stpcpy.S + strncat.c \ + strncmp.S \ + strncpy.c \ + strnlen.c \ + strpbrk.c \ + strrchr.S \ + strsep.c \ + strspn.S \ + timingsafe_bcmp.S \ + timingsafe_memcmp.S + +.if ${MK_ASAN} != "no" +# Disable ASAN for amd64_archlevel.c since its code is executed before the +# sanitizer runtime can initialize itself. +CFLAGS.amd64_archlevel.c+= -fno-sanitize=address +.endif diff --git a/lib/libc/amd64/string/amd64_archlevel.c b/lib/libc/amd64/string/amd64_archlevel.c new file mode 100644 index 000000000000..c06566658c59 --- /dev/null +++ b/lib/libc/amd64/string/amd64_archlevel.c @@ -0,0 +1,241 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <sys/types.h> + +#include <machine/atomic.h> +#include <machine/cpufunc.h> +#include <machine/specialreg.h> + +#include <stddef.h> +#include <string.h> + +#include "amd64_archlevel.h" +#include "libc_private.h" + +#define ARCHLEVEL_ENV "ARCHLEVEL" + +static volatile int amd64_archlevel = X86_64_UNDEFINED; + +static const struct archlevel { + char name[10]; + /* CPUID feature bits that need to be present */ + u_int feat_edx, feat_ecx, amd_ecx, ext_ebx; +} levels[] = { + { + .name = "scalar", + .feat_edx = 0, + .feat_ecx = 0, + .amd_ecx = 0, + .ext_ebx = 0, + }, { +#define FEAT_EDX_BASELINE (CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX | \ + CPUID_FXSR | CPUID_SSE | CPUID_SSE2) + .name = "baseline", + .feat_edx = FEAT_EDX_BASELINE, + .feat_ecx = 0, + .amd_ecx = 0, + .ext_ebx = 0, + }, { +#define FEAT_ECX_V2 (CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_CX16 | CPUID2_SSE41 | \ + CPUID2_SSE42 | CPUID2_POPCNT) +#define AMD_ECX_V2 AMDID2_LAHF + .name = "x86-64-v2", + .feat_edx = FEAT_EDX_BASELINE, + .feat_ecx = FEAT_ECX_V2, + .amd_ecx = AMD_ECX_V2, + .ext_ebx = 0, + }, { +#define FEAT_ECX_V3 (FEAT_ECX_V2 | CPUID2_FMA | CPUID2_MOVBE | \ + CPUID2_OSXSAVE | CPUID2_AVX | CPUID2_F16C) +#define AMD_ECX_V3 (AMD_ECX_V2 | AMDID2_ABM) +#define EXT_EBX_V3 (CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2) + .name = "x86-64-v3", + .feat_edx = FEAT_EDX_BASELINE, + .feat_ecx = FEAT_ECX_V3, + .amd_ecx = AMD_ECX_V3, + .ext_ebx = EXT_EBX_V3, + }, { +#define EXT_EBX_V4 (EXT_EBX_V3 | CPUID_STDEXT_AVX512F | \ + CPUID_STDEXT_AVX512DQ | CPUID_STDEXT_AVX512CD | \ + CPUID_STDEXT_AVX512BW | CPUID_STDEXT_AVX512VL) + .name = "x86-64-v4", + .feat_edx = FEAT_EDX_BASELINE, + .feat_ecx = FEAT_ECX_V3, + .amd_ecx = AMD_ECX_V3, + .ext_ebx = EXT_EBX_V4, + } +}; + +static int +supported_archlevel(u_int feat_edx, u_int feat_ecx, u_int ext_ebx, u_int ext_ecx) +{ + int level; + u_int p[4], max_leaf; + u_int amd_ecx = 0; + + (void)ext_ecx; + + do_cpuid(0x80000000, p); + max_leaf = p[0]; + + if (max_leaf >= 0x80000001) { + do_cpuid(0x80000001, p); + amd_ecx = p[2]; + } + + for (level = X86_64_BASELINE; level <= X86_64_MAX; level++) { + const struct archlevel *lvl = &levels[level]; + + if ((lvl->feat_edx & feat_edx) != lvl->feat_edx || + (lvl->feat_ecx & feat_ecx) != lvl->feat_ecx || + (lvl->amd_ecx & amd_ecx) != lvl->amd_ecx || + (lvl->ext_ebx & ext_ebx) != lvl->ext_ebx) + return (level - 1); + } + + return (X86_64_MAX); +} + +static int +match_archlevel(const char *str, int *force) +{ + int level, want_force = 0; + + *force = 0; + + if (str[0] == '!') { + str++; + want_force = 1; + } + + for (level = 0; level <= X86_64_MAX; level++) { + size_t i; + const char *candidate = levels[level].name; + + /* can't use strcmp here: would recurse during ifunc resolution */ + for (i = 0; str[i] == candidate[i]; i++) + /* suffixes starting with : or + are ignored for future extensions */ + if (str[i] == '\0' || str[i] == ':' || str[i] == '+') { + if (want_force) + *force = 1; + + return (level); + } + } + + 
return (X86_64_UNDEFINED); +} + +/* + * We can't use getenv(), strcmp(), and a bunch of other functions here as + * they may in turn call SIMD-optimised string functions. + * + * *force is set to 1 if the architecture level is valid and begins with a ! + * and to 0 otherwise. + */ +static int +env_archlevel(int *force) +{ + size_t i; + + if (environ == NULL) + return (X86_64_UNDEFINED); + + for (i = 0; environ[i] != NULL; i++) { + size_t j; + + for (j = 0; environ[i][j] == ARCHLEVEL_ENV "="[j]; j++) + if (environ[i][j] == '=') + return (match_archlevel(&environ[i][j + 1], force)); + } + + *force = 0; + + return (X86_64_UNDEFINED); + +} + +/* + * Determine the architecture level by checking the CPU capabilities + * and the environment: + * + * 1. If environment variable ARCHLEVEL starts with a ! and is followed + * by a valid architecture level, that level is returned. + * 2. Else if ARCHLEVEL is set to a valid architecture level that is + * supported by the CPU, that level is returned. + * 3. Else the highest architecture level supported by the CPU is + * returned. + * + * Valid architecture levels are those defined in the levels array. + * The architecture level "scalar" indicates that SIMD enhancements + * shall not be used. + */ +static int +archlevel(u_int feat_edx, u_int feat_ecx, u_int ext_ebx, u_int ext_ecx) +{ + int islevel, wantlevel, hwlevel, force; + + islevel = atomic_load_int(&amd64_archlevel); + if (islevel != X86_64_UNDEFINED) + return (islevel); + + wantlevel = env_archlevel(&force); + if (!force) { + hwlevel = supported_archlevel(feat_edx, feat_ecx, ext_ebx, ext_ecx); + if (wantlevel == X86_64_UNDEFINED || wantlevel > hwlevel) + wantlevel = hwlevel; + } + + /* + * Ensure amd64_archlevel is set only once and + * all calls agree on what it was set to. + */ + if (atomic_cmpset_int(&amd64_archlevel, islevel, wantlevel)) + return (wantlevel); + else + return (atomic_load_int(&amd64_archlevel)); +} + +/* + * Helper function for SIMD ifunc dispatch: select the highest level + * implementation up to the current architecture level. + */ +dlfunc_t +__archlevel_resolve(u_int feat_edx, u_int feat_ecx, u_int ext_ebx, + u_int ext_ecx, int32_t funcs[static X86_64_MAX + 1]) +{ + int level; + + for (level = archlevel(feat_edx, feat_ecx, ext_ebx, ext_ecx); level >= 0; level--) + if (funcs[level] != 0) + return (dlfunc_t)((uintptr_t)funcs + (ptrdiff_t)funcs[level]); + + /* no function is present -- what now? */ + __builtin_trap(); +} diff --git a/lib/libc/amd64/string/bcopy.c b/lib/libc/amd64/string/bcopy.c index 9e0c4187e439..0dee529fb9df 100644 --- a/lib/libc/amd64/string/bcopy.c +++ b/lib/libc/amd64/string/bcopy.c @@ -2,11 +2,10 @@ * Public domain. */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include <string.h> +#undef bcopy /* _FORTIFY_SOURCE */ + void bcopy(const void *src, void *dst, size_t len) { diff --git a/lib/libc/amd64/string/bzero.c b/lib/libc/amd64/string/bzero.c index 1ab391076b0d..d82f3061865b 100644 --- a/lib/libc/amd64/string/bzero.c +++ b/lib/libc/amd64/string/bzero.c @@ -2,11 +2,10 @@ * Public domain. 
*/ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - #include <string.h> +#undef bzero /* _FORTIFY_SOURCE */ + void bzero(void *b, size_t len) { diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S new file mode 100644 index 000000000000..69b650fffc33 --- /dev/null +++ b/lib/libc/amd64/string/memccpy.S @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2023, 2024 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak memccpy + .set memccpy, __memccpy +ARCHFUNCS(__memccpy) + ARCHFUNC(__memccpy, scalar) + ARCHFUNC(__memccpy, baseline) +ENDARCHFUNCS(__memccpy) + +ARCHENTRY(__memccpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + push %rax # dummy push for alignment + push %rbx + push %rdi + push %rsi + + mov %rsi, %rdi + mov %edx, %esi + mov %rcx, %rdx + mov %rcx, %rbx + call CNAME(__memchr) # ptr = memchr(src, c, len) + + pop %rsi + pop %rdi + lea 1(%rax), %rdx + sub %rsi, %rdx # size = ptr - src + 1 + mov %rbx, %rcx + lea (%rdi, %rdx, 1), %rbx # res = dest + size + test %rax, %rax # if (ptr == NULL) + cmovz %rcx, %rdx # size = len + cmovz %rax, %rbx # res = NULL + call CNAME(memcpy) + + mov %rbx, %rax # return (res) + pop %rbx + leave + ret +ARCHEND(__memccpy, scalar) + +ARCHENTRY(__memccpy, baseline) + sub $1, %rcx # RCX refers to last character in buffer + jb .L0 # go to special code path if len was 0 + + movd %edx, %xmm4 + mov %rcx, %rdx + punpcklbw %xmm4, %xmm4 # c -> cc + mov %esi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + mov %rsi, %r9 # stash a copy of the source pointer for later + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $~0xf, %rsi + movdqa %xmm4, %xmm1 + pcmpeqb (%rsi), %xmm1 # c found in head? + and $0xf, %ecx + mov $-1, %eax + pmovmskb %xmm1, %r8d + lea -32(%rcx), %r11 + shl %cl, %eax # mask of bytes in the string + add %rdx, %r11 # distance from alignment boundary - 32 + jnc .Lrunt # jump if buffer length is 32 or less + + and %r8d, %eax + jz 0f # match (or induced match) found? + + /* match in first chunk */ + tzcnt %eax, %edx # where is c? 
+ sub %ecx, %edx # ... from the beginning of the string? + lea 1(%rdi, %rdx, 1), %rax # return value + jmp .L0116 + +0: movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + movdqa %xmm4, %xmm1 + pcmpeqb %xmm3, %xmm1 # c found in second chunk? + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jz 0f + + /* match in second chunk */ + tzcnt %eax, %edx # where is c? + sub $16, %ecx + sub %ecx, %edx # adjust for alignment offset + lea 1(%rdi, %rdx, 1), %rax # return value + jmp .L0132 + + /* c not found in second chunk: prepare for main loop */ +0: movdqa 32(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + mov %r11, %rdx + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jb 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # c encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jb 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # c encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + sub $32, %rdx + jae 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # c encountered? + pmovmskb %xmm0, %r8d + mov %r8d, %ecx + bts %edx, %r8d # treat end of buffer as end of string + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, 1(%rdi, %r8, 1) # store string tail + lea 17(%rdi, %r8, 1), %rsi # return value if terminator encountered + xor %eax, %eax # return value if no terminator encountered + bt %r8d, %ecx # terminator encountered inside buffer? + cmovc %rsi, %rax # if yes, return pointer, else NULL + ret + +4: sub $16, %rsi # undo second advancement + + /* terminator found and buffer has not ended yet */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c) + lea 1(%rdi, %rax, 1), %rax # compute return value + ret + + /* buffer is 1--32 bytes in size */ + ALIGN_TEXT +.Lrunt: add $32, %r11d # undo earlier decrement + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce match at buffer end + and %ax, %r8w # is there a match in the first 16 bytes? + jnz 0f # if yes, skip looking at second chunk + + pcmpeqb 16(%rsi), %xmm4 # check for match in second chunk + pmovmskb %xmm4, %r8d + shl $16, %r8d # place second chunk matches in bits 16--31 + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce a match at buffer end + +0: xor %eax, %eax # return value if terminator not found + tzcnt %r8d, %edx # find string/buffer length from alignment boundary + lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx + sub %rcx, %r8 + bt %edx, %r10d # was the terminator present? 
+ cmovc %r8, %rax # if yes, return pointer, else NULL + sub %ecx, %edx # find actual string/buffer length + + ALIGN_TEXT +.L0132: cmp $16, %rdx # at least 17 bytes to copy? + jb .L0116 + + /* copy 17--32 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -15(%rdi, %rdx, 1) + ret + + /* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */ + ALIGN_TEXT +.L0116: cmp $8, %rdx # at least 9 bytes to copy? + jae .L0916 + + cmp $4, %rdx # at least 5 bytes to copy? + jae .L0508 + + cmp $2, %rdx # at least 3 bytes to copy? + jae .L0304 + + /* copy one or two bytes */ + movzbl (%r9), %ecx # load first byte from src + movzbl (%r9, %rdx, 1), %esi # load last byte from src + mov %cl, (%rdi) # deposit into destination + mov %sil, (%rdi, %rdx, 1) + ret + +.L0304: movzwl (%r9), %ecx + movzwl -1(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -1(%rdi, %rdx, 1) + ret + +.L0508: mov (%r9), %ecx + mov -3(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -3(%rdi, %rdx, 1) + ret + +.L0916: mov (%r9), %rcx + mov -7(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -7(%rdi, %rdx, 1) + ret + + /* length zero destination: return null pointer */ +.L0: xor %eax, %eax + ret +ARCHEND(__memccpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/memchr.S b/lib/libc/amd64/string/memchr.S new file mode 100644 index 000000000000..cfab9b1302de --- /dev/null +++ b/lib/libc/amd64/string/memchr.S @@ -0,0 +1,207 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + + .weak memchr + .set memchr, __memchr +ARCHFUNCS(__memchr) + ARCHFUNC(__memchr, scalar) + ARCHFUNC(__memchr, baseline) +ENDARCHFUNCS(__memchr) + +ARCHENTRY(__memchr, scalar) + test %rdx, %rdx # empty input? 
+ je .Lnomatch + + lea (, %rdi, 8), %ecx + mov $-1, %rax + add %rdi, %rdx # pointer to end of buffer or to end of + cmovc %rax, %rdx # address space (whichever comes first) + and $~7, %rdi # align to 8 bytes + mov (%rdi), %rax # load first word + movzbl %sil, %esi # clear stray high bits + movabs $0x0101010101010101, %r8 + imul %r8, %rsi # replicate char 8 times + + /* compute head and tail masks */ + mov %r8, %r10 + movabs $0x8080808080808080, %r9 + shl %cl, %r10 # 0x01 where string head is + lea (, %rdx, 8), %ecx + xor %r8, %r10 # 0x01 where it is not + neg %r8 # negate 01..01 so we can use lea + mov %r9, %r11 + xor %rsi, %rax # str ^ c (0x00 where str[i] == c) + neg %ecx + or %r10, %rax # except before the string + shr %cl, %r11 # 0x80 where string tail is + + add $8, %rdi # advance to next 8 bytes + cmp %rdx, %rdi # end of buffer reached during head? + jae .Ltail # and go to tail-processing code + + /* main loop, unrolled twice */ + ALIGN_TEXT +0: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 + not %rax # ~(str ^ c) + and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) + and %rcx, %rax # not including junk bytes + jnz .Lmatch + + mov (%rdi), %rax + add $8, %rdi + xor %rsi, %rax # str ^ c + cmp %rdx, %rdi + jae .Ltail + + lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 + not %rax # ~(str ^ c) + and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) + and %rcx, %rax # not including junk bytes + jnz .Lmatch + + mov (%rdi), %rax + add $8, %rdi + xor %rsi, %rax # str ^ c + cmp %rdx, %rdi + jb 0b + +.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 + not %rax # ~(str ^ c) + and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c) + and %rcx, %rax # not including junk bytes or bytes past buffer + jz .Lnomatch + +.Lmatch: + tzcnt %rax, %rax # first match + shr $3, %eax # scale from bit to byte index + lea -8(%rdi, %rax), %rax # pointer to found c + ret + + /* no match found */ +.Lnomatch: + xor %eax, %eax # return null pointer + ret +ARCHEND(__memchr, scalar) + +ARCHENTRY(__memchr, baseline) + test %rdx, %rdx # empty input? + je .Lnomatchb + + movd %esi, %xmm2 + mov %edi, %ecx + mov $-1, %r9 + add %rdi, %rdx # pointer to end of buffer or to end of + cmovc %r9, %rdx # address space (whichever comes first) + and $~0x1f, %rdi # align to 32 bytes + movdqa (%rdi), %xmm0 # load first 32 bytes + movdqa 16(%rdi), %xmm1 + + punpcklbw %xmm2, %xmm2 # c -> cc + + shl %cl, %r9d # mask with zeroes before the string + + punpcklwd %xmm2, %xmm2 # cc -> cccc + + mov $-1, %r8d + xor %ecx, %ecx + sub %edx, %ecx # edx = -ecx + shr %cl, %r8d # bytes in tail that are part of the buffer + + pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc + + add $32, %rdi # advance to next 32 bytes + mov $-1, %eax + cmp %rdx, %rdi # end of buffer reached during head? + cmovae %r8d, %eax # if yes, do combined head/tail processing + and %r9d, %eax # mask of bytes in head part of string + + /* process head */ + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %esi + pmovmskb %xmm0, %ecx + shl $16, %esi + or %esi, %ecx # locations of matches + and %ecx, %eax # any match inside buffer? + jnz .Lprecisematchb + + cmp %rdx, %rdi # did the buffer end here? + jae .Lnomatchb # if yes we are done + + /* main loop */ + ALIGN_TEXT +0: movdqa (%rdi), %xmm0 # load next string chunk + movdqa 16(%rdi), %xmm1 + add $32, %rdi + cmp %rdx, %rdi # ready for main loop? + jae .Ltailb + + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm2, %xmm1 + por %xmm1, %xmm0 # match in either half? 
+ pmovmskb %xmm0, %eax + test %eax, %eax + jz 0b + +.Lmatchb: + pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes + pmovmskb %xmm1, %ecx + pmovmskb %xmm2, %eax + shl $16, %ecx + or %ecx, %eax # location of matches + +.Lprecisematchb: + tzcnt %eax, %eax # find location of match + lea -32(%rdi, %rax, 1), %rax # point to matching byte + ret + +.Ltailb: + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %edx + pmovmskb %xmm0, %eax + shl $16, %edx + or %edx, %eax # location of matches + and %r8d, %eax # mask out matches beyond buffer + bsf %eax, %edx # location of match + lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any) + cmovnz %rdx, %rax # point to match if present, + ret # else null pointer + +.Lnomatchb: + xor %eax, %eax # return null pointer + ret +ARCHEND(__memchr, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S index 3e6df7966312..dc8bcff73cb9 100644 --- a/lib/libc/amd64/string/memcmp.S +++ b/lib/libc/amd64/string/memcmp.S @@ -1,9 +1,12 @@ /*- - * Copyright (c) 2018 The FreeBSD Foundation + * Copyright (c) 2018, 2023 The FreeBSD Foundation * * This software was developed by Mateusz Guzik <mjg@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -24,12 +27,12 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); +#include <machine/param.h> + +#include "amd64_archlevel.h" /* * Note: this routine was written with kernel use in mind (read: no simd), @@ -40,10 +43,15 @@ __FBSDID("$FreeBSD$"); #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ #ifdef BCMP -ENTRY(bcmp) -#else -ENTRY(memcmp) +#define memcmp bcmp #endif + +ARCHFUNCS(memcmp) + ARCHFUNC(memcmp, scalar) + ARCHFUNC(memcmp, baseline) +ENDARCHFUNCS(memcmp) + +ARCHENTRY(memcmp, scalar) xorl %eax,%eax 10: cmpq $16,%rdx @@ -161,7 +169,6 @@ ENTRY(memcmp) 1: leal 1(%eax),%eax ret -END(bcmp) #else /* * We need to compute the difference between strings. @@ -234,7 +241,180 @@ END(bcmp) 2: subl %r8d,%eax ret -END(memcmp) #endif +ARCHEND(memcmp, scalar) + +ARCHENTRY(memcmp, baseline) + cmp $32, %rdx # enough to permit use of the long kernel? + ja .Llong + + test %rdx, %rdx # zero bytes buffer? + je .L0 + + /* + * Compare strings of 1--32 bytes. We want to do this by + * loading into two xmm registers and then comparing. To avoid + * crossing into unmapped pages, we either load 32 bytes from + * the start of the buffer or 32 bytes before its end, depending + * on whether there is a page boundary between the overread area + * or not. + */ + + /* check for page boundaries overreads */ + lea 31(%rdi), %eax # end of overread + lea 31(%rsi), %r8d + lea -1(%rdi, %rdx, 1), %ecx # last character in buffer + lea -1(%rsi, %rdx, 1), %r9d + xor %ecx, %eax + xor %r9d, %r8d + test $PAGE_SIZE, %eax # are they on different pages? 
+ jz 0f + + /* fix up rdi */ + movdqu -32(%rdi, %rdx, 1), %xmm0 + movdqu -16(%rdi, %rdx, 1), %xmm1 + lea -8(%rsp), %rdi # end of replacement buffer + sub %rdx, %rdi # start of replacement buffer + movdqa %xmm0, -40(%rsp) # copy to replacement buffer + movdqa %xmm1, -24(%rsp) + +0: test $PAGE_SIZE, %r8d + jz 0f + + /* fix up rsi */ + movdqu -32(%rsi, %rdx, 1), %xmm0 + movdqu -16(%rsi, %rdx, 1), %xmm1 + lea -40(%rsp), %rsi # end of replacement buffer + sub %rdx, %rsi # start of replacement buffer + movdqa %xmm0, -72(%rsp) # copy to replacement buffer + movdqa %xmm1, -56(%rsp) + + /* load data and compare properly */ +0: movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm3 + movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm2 + mov %edx, %ecx + mov $-1, %edx + shl %cl, %rdx # ones where the buffer is not + pcmpeqb %xmm3, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm0, %eax + shl $16, %ecx + or %ecx, %eax # ones where the buffers match + or %edx, %eax # including where the buffer is not + not %eax # ones where there is a mismatch +#ifndef BCMP + bsf %eax, %edx # location of the first mismatch + cmovz %eax, %edx # including if there is no mismatch + movzbl (%rdi, %rdx, 1), %eax # mismatching bytes + movzbl (%rsi, %rdx, 1), %edx + sub %edx, %eax +#endif + ret + + /* empty input */ +.L0: xor %eax, %eax + ret + + /* compare 33+ bytes */ + ALIGN_TEXT +.Llong: movdqu (%rdi), %xmm0 # load head + movdqu (%rsi), %xmm2 + mov %rdi, %rcx + sub %rdi, %rsi # express rsi as distance from rdi + and $~0xf, %rdi # align rdi to 16 bytes + movdqu 16(%rsi, %rdi, 1), %xmm1 + pcmpeqb 16(%rdi), %xmm1 # compare second half of this iteration + add %rcx, %rdx # pointer to last byte in buffer + jc .Loverflow # did this overflow? +0: pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + xor $0xffff, %eax # any mismatch? + jne .Lmismatch_head + add $64, %rdi # advance to next iteration + jmp 1f # and get going with the loop + + /* + * If we got here, a buffer length was passed to memcmp(a, b, len) + * such that a + len < a. While this sort of usage is illegal, + * it is plausible that a caller tries to do something like + * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending + * for memcmp() to stop comparing at the first mismatch. This + * behaviour is not guaranteed by any version of ISO/IEC 9899, + * but usually works out in practice. Let's try to make this + * case work by comparing until the end of the address space. + */ +.Loverflow: + mov $-1, %rdx # compare until the end of memory + jmp 0b + + /* process buffer 32 bytes at a time */ + ALIGN_TEXT +0: movdqu -32(%rsi, %rdi, 1), %xmm0 + movdqu -16(%rsi, %rdi, 1), %xmm1 + pcmpeqb -32(%rdi), %xmm0 + pcmpeqb -16(%rdi), %xmm1 + add $32, %rdi # advance to next iteration +1: pand %xmm0, %xmm1 # 0xff where both halves matched + pmovmskb %xmm1, %eax + cmp $0xffff, %eax # all bytes matched? + jne .Lmismatch + cmp %rdx, %rdi # end of buffer reached? 
+ jb 0b + + /* less than 32 bytes left to compare */ + movdqu -16(%rdx), %xmm1 # load 32 byte tail through end pointer + movdqu -16(%rdx, %rsi, 1), %xmm3 + movdqu -32(%rdx), %xmm0 + movdqu -32(%rdx, %rsi, 1), %xmm2 + pcmpeqb %xmm3, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm0, %eax + shl $16, %ecx + or %ecx, %eax # ones where the buffers match + not %eax # ones where there is a mismatch +#ifndef BCMP + bsf %eax, %ecx # location of the first mismatch + cmovz %eax, %ecx # including if there is no mismatch + add %rcx, %rdx # pointer to potential mismatch + movzbl -32(%rdx), %eax # mismatching bytes + movzbl -32(%rdx, %rsi, 1), %edx + sub %edx, %eax +#endif + ret + +#ifdef BCMP +.Lmismatch: + mov $1, %eax +.Lmismatch_head: + ret +#else /* memcmp */ +.Lmismatch_head: + tzcnt %eax, %eax # location of mismatch + add %rax, %rcx # pointer to mismatch + movzbl (%rcx), %eax # mismatching bytes + movzbl (%rcx, %rsi, 1), %ecx + sub %ecx, %eax + ret + +.Lmismatch: + movdqu -48(%rsi, %rdi, 1), %xmm1 + pcmpeqb -48(%rdi), %xmm1 # reconstruct xmm1 before PAND + pmovmskb %xmm0, %eax # mismatches in first 16 bytes + pmovmskb %xmm1, %edx # mismatches in second 16 bytes + shl $16, %edx + or %edx, %eax # mismatches in both + not %eax # matches in both + tzcnt %eax, %eax # location of mismatch + add %rax, %rdi # pointer to mismatch + movzbl -64(%rdi), %eax # mismatching bytes + movzbl -64(%rdi, %rsi, 1), %ecx + sub %ecx, %eax + ret +#endif +ARCHEND(memcmp, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/memcpy.S b/lib/libc/amd64/string/memcpy.S index 2b6c73abeb98..1876ee600f20 100644 --- a/lib/libc/amd64/string/memcpy.S +++ b/lib/libc/amd64/string/memcpy.S @@ -1,5 +1,4 @@ /* $NetBSD: memcpy.S,v 1.1 2001/06/19 00:25:05 fvdl Exp $ */ -/* $FreeBSD$ */ #define MEMCPY #include "memmove.S" diff --git a/lib/libc/amd64/string/memmove.S b/lib/libc/amd64/string/memmove.S index ea92cb18782a..7878e6e9bee6 100644 --- a/lib/libc/amd64/string/memmove.S +++ b/lib/libc/amd64/string/memmove.S @@ -27,8 +27,6 @@ */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - /* * Note: this routine was written with kernel use in mind (read: no simd), * it is only present in userspace as a temporary measure until something @@ -306,3 +304,5 @@ ENTRY(memcpy) MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END END(memcpy) #endif + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S new file mode 100644 index 000000000000..80fb306af2a3 --- /dev/null +++ b/lib/libc/amd64/string/memrchr.S @@ -0,0 +1,158 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org> + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(memrchr) + ARCHFUNC(memrchr, scalar) + ARCHFUNC(memrchr, baseline) +ENDARCHFUNCS(memrchr) + +ARCHENTRY(memrchr, scalar) + lea -1(%rdi, %rdx, 1), %rax # point to last char in buffer + sub $4, %rdx # 4 bytes left to process? + jb .Ltail + + ALIGN_TEXT +0: cmp %sil, (%rax) # match at last entry? + je 1f + + cmp %sil, -1(%rax) # match at second to last entry? + je 2f + + cmp %sil, -2(%rax) # match at third to last entry? + je 3f + + cmp %sil, -3(%rax) # match at fourth to last entry? + je 4f + + sub $4, %rax + sub $4, %rdx + jae 0b + +.Ltail: cmp $-3, %edx # at least one character left to process? 
+ jb .Lnotfound + + cmp %sil, (%rax) + je 1f + + cmp $-2, %edx # at least two characters left to process? + jb .Lnotfound + + cmp %sil, -1(%rax) + je 2f + + cmp $-1, %edx # at least three characters left to process? + jb .Lnotfound + + cmp %sil, -2(%rax) + je 3f + +.Lnotfound: + xor %eax, %eax + ret + + /* match found -- adjust rax to point to matching byte */ +4: dec %rax +3: dec %rax +2: dec %rax +1: ret +ARCHEND(memrchr, scalar) + +ARCHENTRY(memrchr, baseline) + test %rdx, %rdx # empty input? + je .Lnomatchb + + + lea (%rdi, %rdx, 1), %ecx # pointer to end of buffer + lea -1(%rdi, %rdx, 1), %rdx # pointer to last char in buffer + movd %esi, %xmm2 + and $~0x1f, %rdx # pointer to final 32 buffer bytes + movdqa (%rdx), %xmm0 # load last 32 bytes + movdqa 16(%rdx), %xmm1 + + punpcklbw %xmm2, %xmm2 # c -> cc + + mov $-1, %r8d + neg %ecx + mov %r8d, %r9d + shr %cl, %r8d # mask with zeroes after the string + + punpcklwd %xmm2, %xmm2 # cc -> cccc + + mov %edi, %ecx + mov %r9d, %eax + shl %cl, %r9d # mask with zeroes before the string + + pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc + + cmp %rdx, %rdi # tail is beginning of buffer? + cmovae %r9d, %eax # if yes, do combined head/tail processing + and %r8d, %eax # mak of bytes in tail part of string + + /* process tail */ + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %esi + pmovmskb %xmm0, %ecx + shl $16, %esi + or %esi, %ecx # locations of matches + and %ecx, %eax # any match inside buffer? + jnz .Lprecisematchb + + cmp %rdx, %rdi # did the buffer begin here? + jae .Lnomatchb # if yes, we are done + + /* main loop */ + ALIGN_TEXT +0: movdqa -32(%rdx), %xmm0 # load previous string chunk + movdqa -16(%rdx), %xmm1 + sub $32, %rdx # beginning of string reached? + cmp %rdx, %rdi + jae .Ltailb + + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm2, %xmm1 + por %xmm1, %xmm0 # match in either half? + pmovmskb %xmm0, %eax + test %eax, %eax + jz 0b + +.Lmatchb: + pcmpeqb (%rdx), %xmm2 # redo comparison of first 16 bytes + pmovmskb %xmm1, %ecx + pmovmskb %xmm2, %eax + shl $16, %ecx + or %ecx, %eax # location of matches + +.Lprecisematchb: + bsr %eax, %eax # find location of match + add %rdx, %rax # point to matching byte + ret + +.Ltailb: + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm0, %eax + shl $16, %ecx + or %ecx, %eax # location of matches + and %r9d, %eax # mask out matches before buffer + bsr %eax, %edi # location of match + lea (%rdx, %rdi, 1), %rdx # pointer to match (if any) + cmovnz %rdx, %rax # point to match if present, + ret # else null pointer + +.Lnomatchb: + xor %eax, %eax # return null pointer + ret +ARCHEND(memrchr, baseline) + + .section .note.GNU-stack, "", %progbits diff --git a/lib/libc/amd64/string/memset.S b/lib/libc/amd64/string/memset.S index 050711302354..bfab34eeaeb9 100644 --- a/lib/libc/amd64/string/memset.S +++ b/lib/libc/amd64/string/memset.S @@ -24,13 +24,9 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
- * - * $FreeBSD$ */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - /* * Note: this routine was written with kernel use in mind (read: no simd), * it is only present in userspace as a temporary measure until something diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S index 52ac69c5f7e2..59358e3245a8 100644 --- a/lib/libc/amd64/string/stpcpy.S +++ b/lib/libc/amd64/string/stpcpy.S @@ -1,11 +1,29 @@ -/* - * Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S - * written by J.T. Conklin <jtc@acorntoolworks.com> - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Expression: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S + * written by J.T. Conklin <jtc@acorntoolworks.com> and + * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy + * that was originally dedicated to the public domain */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpcpy + .set stpcpy, __stpcpy +ARCHFUNCS(__stpcpy) + ARCHFUNC(__stpcpy, scalar) + ARCHFUNC(__stpcpy, baseline) +ENDARCHFUNCS(__stpcpy) /* * This stpcpy implementation copies a byte at a time until the @@ -20,9 +38,7 @@ __FBSDID("$FreeBSD$"); * requirements. */ - .globl stpcpy,__stpcpy -ENTRY(stpcpy) -__stpcpy: +ARCHENTRY(__stpcpy, scalar) movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -43,7 +59,7 @@ __stpcpy: dec %rax ret - .p2align 4 + ALIGN_TEXT .Lloop: movq %rdx,(%rdi) addq $8,%rdi @@ -111,6 +127,111 @@ __stpcpy: .Ldone: movq %rdi,%rax ret -END(stpcpy) - +ARCHEND(__stpcpy, scalar) + +ARCHENTRY(__stpcpy, baseline) + mov %esi, %ecx + mov %rdi, %rdx + sub %rsi, %rdi # express destination as distance to surce + and $~0xf, %rsi # align source to 16 byte + movdqa (%rsi), %xmm0 # head of string with junk before + pxor %xmm1, %xmm1 + and $0xf, %ecx # misalignment in bytes + pcmpeqb %xmm1, %xmm0 # NUL byte present? + pmovmskb %xmm0, %eax + shr %cl, %eax # clear out matches in junk bytes + bsf %eax, %eax # find match if any + jnz .Lrunt + + /* first normal iteration: write head back if it succeeds */ + movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration + movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string + pcmpeqb %xmm0, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax # find match if any + jnz .Lshorty + + movdqu %xmm2, (%rdx) # store beginning of string + + /* main loop, unrolled twice */ + ALIGN_TEXT +0: movdqa 32(%rsi), %xmm2 # load current iteraion + movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteraion + pxor %xmm1, %xmm1 + add $32, %rsi + pcmpeqb %xmm2, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 16(%rsi), %xmm0 # load current iteraion + movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteraion + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte present? 
+ pmovmskb %xmm1, %eax + test %eax, %eax + jz 0b + + /* end of string after main loop has iterated */ + add $16, %rsi # advance rsi to second unrolled half +1: tzcnt %eax, %eax # find location of match + # (behaves as bsf on pre-x86-64-v3 CPUs) + add %rsi, %rax # point to NUL byte + movdqu -15(%rax), %xmm0 # last 16 bytes of string + movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination + add %rdi, %rax # point to destination's NUL byte + ret + + /* NUL encountered in second iteration */ +.Lshorty: + tzcnt %eax, %eax + add $16, %eax # account for length of first iteration + sub %ecx, %eax # but not the parts before the string + + /* NUL encountered in first iteration */ +.Lrunt: lea 1(%rax), %edi # string length including NUL byte + add %rcx, %rsi # point to beginning of string + add %rdx, %rax # point to NUL byte + + /* transfer 16--32 bytes */ +.L1632: cmp $16, %edi + jb .L0815 + + movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes + movdqu %xmm2, (%rdx) # store first 16 bytes + movdqu %xmm0, -15(%rax) # store last 16 bytes + ret + + /* transfer 8--15 bytes */ +.L0815: cmp $8, %edi + jb .L0407 + + mov (%rsi), %rcx # load first 8 bytes + mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes + mov %rcx, (%rdx) # store to dst + mov %rdi, -7(%rax) # dito + ret + + /* transfer 4--7 bytes */ +.L0407: cmp $4, %edi + jb .L0203 + + mov (%rsi), %ecx + mov -4(%rsi, %rdi, 1), %edi + mov %ecx, (%rdx) + mov %edi, -3(%rax) + ret + + /* transfer 2--3 bytes */ +.L0203: cmp $2, %edi + jb .L0101 + + movzwl (%rsi), %ecx + mov %cx, (%rdx) # store first two bytes + + /* transfer 0 bytes (last byte is always NUL) */ +.L0101: movb $0, (%rax) # store terminating NUL byte + ret +ARCHEND(__stpcpy, baseline) + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S new file mode 100644 index 000000000000..5ce0dd093a9e --- /dev/null +++ b/lib/libc/amd64/string/stpncpy.S @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpncpy + .set stpncpy, __stpncpy +ARCHFUNCS(__stpncpy) + ARCHFUNC(__stpncpy, scalar) + ARCHFUNC(__stpncpy, baseline) +ENDARCHFUNCS(__stpncpy) + +ARCHENTRY(__stpncpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + + push %rdx + push %rdi + push %rsi + push %rax # dummy push for alignment + + mov %rsi, %rdi + xor %esi, %esi + call CNAME(__memchr) # memchr(src, '\0', len) + pop %rcx # dummy pop + pop %rsi + mov -16(%rbp), %rdi + + test %rax, %rax # NUL found? + jz .Lfullcopy + + mov %rax, %rdx + sub %rsi, %rdx # copy until the NUL byte + add %rdx, -16(%rbp) # advance destination by string length + sub %rdx, -8(%rbp) # and shorten buffer size by string length + call CNAME(memcpy) + + pop %rdi + pop %rdx + xor %esi, %esi + pop %rbp + jmp CNAME(memset) # clear remaining buffer + +.Lfullcopy: + mov -8(%rbp), %rdx + call CNAME(memcpy) # copy whole string + add -8(%rbp), %rax # point to dest[n] + leave + ret +ARCHEND(__stpncpy, scalar) + + /* + * this mask allows us to generate masks of 16-n 0xff bytes + * followed by n 0x00 bytes by loading from .Lmask+n. + */ + .section .rodata +.Lmask: .quad 0xffffffffffffffff + .quad 0xffffffffffffffff + .quad 0x0000000000000000 + .quad 0x0000000000000000 + +/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */ +ARCHENTRY(__stpncpy, baseline) +#define bounce (-3*16-8) /* location of on-stack bounce buffer */ + + test %rdx, %rdx # no bytes to copy? + jz .L0 + + mov %esi, %ecx + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # load head + and $0xf, %ecx # offset from alignment + mov $-1, %r9d + lea -32(%rcx), %rax # set up overflow-proof comparison rdx+rcx<=32 + shl %cl, %r9d # mask of bytes belonging to the string + sub %rcx, %rdi # adjust RDI to correspond to RSI + pxor %xmm1, %xmm1 + movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r8d + + lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary + add %rdx, %rax # less than 2 chunks (32 bytes) to play with? + jnc .Lrunt # if yes, use special runt processing + + movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination + and %r9d, %r8d # end of string within head? + jnz .Lheadnul + + movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer + movdqu %xmm2, (%rdi, %rcx, 1) # an deposit + + add $16, %rsi + add $16, %rdi + sub $32, %r10 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 3f + + movdqu %xmm0, (%rdi) + cmp $16, %r10 # more than a full chunk left? + jbe 1f + + movdqa 16(%rsi), %xmm0 + add $32, %rdi # advance pointers to next chunk + add $32, %rsi + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? 
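
The scalar stpncpy above does not open-code a copy loop; it leans on the existing memchr, memcpy and memset kernels. In C the same composition looks roughly like this (a sketch for readability, not the committed code):

#include <string.h>

static char *
stpncpy_ref(char *restrict dst, const char *restrict src, size_t len)
{
	const char *nul = memchr(src, '\0', len);
	size_t n;

	if (nul == NULL) {
		memcpy(dst, src, len);		/* no NUL: copy the whole buffer */
		return (dst + len);
	}

	n = (size_t)(nul - src);
	memcpy(dst, src, n);			/* copy up to the NUL */
	memset(dst + n, 0, len - n);		/* clear the rest of the buffer */

	return (dst + n);			/* points at dst's first NUL */
}
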
+ pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 2f + + movdqu %xmm0, -16(%rdi) + sub $32, %r10 # more than another full chunk left? + ja 0b + + sub $16, %rdi # undo second advancement + sub $16, %rsi + add $16, %r10d # restore number of remaining bytes + + /* 1--16 bytes left but string has not ended yet */ +1: pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi), %xmm1 # NUL byte in source tail? + pmovmskb %xmm1, %r8d + bts %r10d, %r8d # treat end of buffer as NUL + tzcnt %r8d, %r8d # where is the NUL byte? + movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL + lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte + # or end of buffer + movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer + ret + +2: sub $16, %rdi # undo second advancement + sub $16, %rsi + sub $16, %r10 + + /* string has ended and buffer has not */ +3: tzcnt %r8d, %r8d # where did the string end? + lea .Lmask+16(%rip), %rcx + lea (%rdi, %r8, 1), %rax # where the NUL byte will be + neg %r8 + movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is, + # 00 where it is not + pand %xmm1, %xmm0 # mask out bytes after the string + movdqu %xmm0, (%rdi) # store masked current chunk + pxor %xmm1, %xmm1 + sub $16, %r10 # another full chunk left? + jbe 1f + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, 16(%rdi) + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 32(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* at least two chunks to play with and NUL while processing head */ +.Lheadnul: + movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack + tzcnt %r8d, %r8d # find location of NUL byte + movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination + movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes + movdqu %xmm1, 16(%rdi) # clear out second chunk + lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte + + add $32, %rdi # advance past first two chunks + sub $32+16, %r10 # advance past first three chunks + jbe 1f # did we pass the end of the buffer? + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, (%rdi) # clear out buffer chunk + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 16(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* 1--32 bytes to copy, bounce through the stack */ +.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy + bts %r10d, %r8d # treat end of buffer as end of string + and %r9w, %r8w # end of string within first buffer? + jnz 0f # if yes, do not inspect second buffer + + movdqa 16(%rsi), %xmm0 # load second chunk of input + movdqa %xmm0, bounce+16(%rsp) # stash copy on stack + pcmpeqb %xmm1, %xmm0 # NUL in second chunk? + pmovmskb %xmm0, %r9d + shl $16, %r9d + or %r9d, %r8d # merge found NUL bytes into NUL mask + + /* end of string after one buffer */ +0: tzcnt %r8d, %r8d # location of last char in string + movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string + lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack + lea (%rdi, %r8, 1), %rax # return pointer to NUL byte + + cmp $16, %edx # at least 16 bytes to transfer? + jae .L1631 + + mov (%rsi), %r8 # load string head + cmp $8, %edx # at least 8 bytes to transfer? + jae .L0815 + + cmp $4, %edx # at least 4 bytes to transfer? + jae .L0407 + + movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string + mov %r8b, (%rdi, %rcx, 1) # store first byte + + cmp $2, %edx # at least 2 bytes to transfer? 
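
The size-bucketed tail copies around this point (.L1631, .L0815, .L0407 and friends) all rely on the same trick: cover the remaining range with one load from the start and one from the end and let the two overlap, so no byte loop is needed. A small C sketch of the 8 to 16 byte case (function name is illustrative, assuming 8 <= len <= 16):

#include <stdint.h>
#include <string.h>

static void
copy8to16(char *dst, const char *src, size_t len)
{
	uint64_t head, tail;

	memcpy(&head, src, 8);			/* first 8 bytes */
	memcpy(&tail, src + len - 8, 8);	/* last 8 bytes, may overlap head */
	memcpy(dst, &head, 8);
	memcpy(dst + len - 8, &tail, 8);
}
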
+ jb .L1 + + mov %si, -2(%rdi, %r10, 1) # store last two bytes of string +.L1: ret + +.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string + movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string + movdqu %xmm0, (%rdi, %rcx, 1) + movdqu %xmm1, -16(%rdi, %r10, 1) + ret + +.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string + mov %r8, (%rdi, %rcx, 1) + mov %rdx, -8(%rdi, %r10, 1) + ret + +.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string + mov %r8d, (%rdi, %rcx, 1) + mov %edx, -4(%rdi, %r10, 1) + ret + + /* length 0 buffer: just return dest */ +.L0: mov %rdi, %rax + ret +ARCHEND(__stpncpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strcat.S b/lib/libc/amd64/string/strcat.S index 7b5a1dd39cd3..081e98840cee 100644 --- a/lib/libc/amd64/string/strcat.S +++ b/lib/libc/amd64/string/strcat.S @@ -1,16 +1,29 @@ -/* - * Written by J.T. Conklin <jtc@acorntoolworks.com> - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Expression: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S + * written by J.T. Conklin <jtc@acorntoolworks.com> + * that was originally dedicated to the public domain */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); - #if 0 RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $") #endif -ENTRY(strcat) +#include "amd64_archlevel.h" + +ARCHFUNCS(strcat) + ARCHFUNC(strcat, scalar) + ARCHFUNC(strcat, baseline) +ENDARCHFUNCS(strcat) + +ARCHENTRY(strcat, scalar) movq %rdi,%rax movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -163,6 +176,28 @@ ENTRY(strcat) .Ldone: ret -END(strcat) +ARCHEND(strcat, scalar) + +/* + * Call into strlen + strcpy if we have any SIMD at all. + * The scalar implementation above is better for the scalar + * case as it avoids the function call overhead, but pessimal + * if we could call SIMD routines instead. + */ +ARCHENTRY(strcat, baseline) + push %rbp + mov %rsp, %rbp + push %rsi + push %rbx + mov %rdi, %rbx # remember destination for later + call CNAME(strlen) # strlen(dest) + mov -8(%rbp), %rsi + lea (%rbx, %rax, 1), %rdi # dest + strlen(dest) + call CNAME(__stpcpy) # stpcpy(dest + strlen(dest), src) + mov %rbx, %rax # return dest + pop %rbx + leave + ret +ARCHEND(strcat, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strchrnul.S b/lib/libc/amd64/string/strchrnul.S new file mode 100644 index 000000000000..0e70b02311d7 --- /dev/null +++ b/lib/libc/amd64/string/strchrnul.S @@ -0,0 +1,170 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
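
The baseline strcat above is just the composition its comment describes: once SIMD strlen and stpcpy are available, appending degenerates into one call to each, which beats keeping a separate scalar loop. As C (illustrative sketch, assuming the internal __stpcpy entry point the commit declares elsewhere):

#include <string.h>

char *__stpcpy(char *restrict, const char *restrict);

static char *
strcat_sketch(char *dest, const char *src)
{
	__stpcpy(dest + strlen(dest), src);	/* copy src after dest's NUL */
	return (dest);				/* strcat returns dest */
}
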
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled + + .weak strchrnul + .set strchrnul, __strchrnul + +ARCHFUNCS(__strchrnul) + ARCHFUNC(__strchrnul, scalar) + ARCHFUNC(__strchrnul, baseline) +ENDARCHFUNCS(__strchrnul) + +/* + * strchrnul(str, c) + * This is implemented like strlen(str), but we check for the + * presence of both NUL and c in each iteration. + */ +ARCHENTRY(__strchrnul, scalar) + mov %edi, %ecx + and $~7, %rdi # align to 8 byte + movzbl %sil, %esi # clear stray high bits + movabs $0x0101010101010101, %r8 + mov (%rdi), %rax # load first word + imul %r8, %rsi # replicate char 8 times + + /* + * Unaligned input: align to 8 bytes. Then proceed the same + * way as with aligned input, but prevent matches before the + * beginning of the string. This is achieved by oring 0x01 + * into each byte of the buffer before the string + */ + shl $3, %ecx + mov %r8, %r10 + add $8, %rdi + shl %cl, %r10 # 0x01 where the string is + xor %r8, %r10 # 0x01 where it is not + neg %r8 # negate 01..01 so we can use lea + movabs $0x8080808080808080, %r9 + + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + or %r10, %rax # str without NUL bytes before it + or %r10, %rcx # (str ^ c) without matches before it + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01 + not %rax # ~str + not %rcx # ~(str ^ c) + and %rdx, %rax # (str - 0x01..01) & ~str + and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + or %rcx, %rax # matches for both + and %r9, %rax # not including junk bytes + jnz 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: mov (%rdi), %rax # str + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01 + not %rax # ~str + not %rcx # ~(str ^ c) + and %rdx, %rax # (str - 0x01..01) & ~str + and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + or %rcx, %rax # matches for both + and %r9, %rax # not including junk bits + jnz 2f + + mov 8(%rdi), %rax # str + add $16, %rdi + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01 + not %rax # ~str + not %rcx # ~(str ^ c) + and %rdx, %rax # (str - 0x01..01) & ~str + and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + or %rcx, %rax # matches for both + and %r9, %rax # not including junk bits + jz 0b + + /* NUL or c found */ +1: sub $8, %rdi # undo advance past buffer +2: tzcnt %rax, %rax # first NUL or c byte match + shr $3, %eax # scale from bit to byte index + add %rdi, %rax # pointer to found c or NUL + ret +ARCHEND(__strchrnul, scalar) + +ARCHENTRY(__strchrnul, baseline) + mov %edi, %ecx + and $~0xf, 
%rdi # align to 16 byte + movdqa (%rdi), %xmm1 + movd %esi, %xmm0 + and $0xf, %ecx # distance from (%rdi) to start of string + pxor %xmm2, %xmm2 + mov $-1, %edx + punpcklbw %xmm0, %xmm0 # c -> cc + shl %cl, %edx # bits corresponding to bytes in the string + punpcklwd %xmm0, %xmm0 # cc -> cccc + add $16, %rdi + + /* check for match in head */ + pcmpeqb %xmm1, %xmm2 # NUL bytes present? + pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc + pcmpeqb %xmm0, %xmm1 # c present? + por %xmm2, %xmm1 # either present? + pmovmskb %xmm1, %eax + and %edx, %eax # match in the string? + jnz 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rdi), %xmm1 + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL bytes present? + pcmpeqb %xmm0, %xmm1 # c present? + por %xmm2, %xmm1 # either present? + pmovmskb %xmm1, %eax + test %eax, %eax # match in the string? + jnz 2f + + movdqa 16(%rdi), %xmm1 + add $32, %rdi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL bytes present? + pcmpeqb %xmm0, %xmm1 # c present? + por %xmm2, %xmm1 # either present? + pmovmskb %xmm1, %eax + test %eax, %eax # match in the string? + jz 0b + +1: sub $16, %rdi # undo advance past buffer +2: tzcnt %eax, %eax # where is the match? + add %rdi, %rax # pointer to found c or NUL + ret +ARCHEND(__strchrnul, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strcmp.S b/lib/libc/amd64/string/strcmp.S index 787e1e07b1b0..eb354bd2af82 100644 --- a/lib/libc/amd64/string/strcmp.S +++ b/lib/libc/amd64/string/strcmp.S @@ -1,16 +1,33 @@ -/* - * Written by J.T. Conklin <jtc@acorntoolworks.com> - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Expression: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S + * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally + * dedicated to the public domain. */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); +#include <machine/param.h> #if 0 RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $") #endif -ENTRY(strcmp) +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(strcmp) + ARCHFUNC(strcmp, scalar) + ARCHFUNC(strcmp, baseline) +ENDARCHFUNCS(strcmp) + +ARCHENTRY(strcmp, scalar) /* * Align s1 to word boundary. * Consider unrolling loop? @@ -41,7 +58,7 @@ ENTRY(strcmp) movabsq $0x8080808080808080,%r9 subq $8,%rsi - .align 4 + ALIGN_TEXT .Lword_loop: movq 8(%rdi),%rax addq $8,%rdi @@ -55,7 +72,7 @@ ENTRY(strcmp) testq %r9,%rdx je .Lword_loop - .align 4 + ALIGN_TEXT .Lbyte_loop: movb (%rdi),%al incq %rdi @@ -71,6 +88,272 @@ ENTRY(strcmp) movzbq %dl,%rdx subq %rdx,%rax ret -END(strcmp) +ARCHEND(strcmp, scalar) + +ARCHENTRY(strcmp, baseline) + /* check if either string crosses a page in the head */ + lea 15(%rdi), %r8d # end of head + lea 15(%rsi), %r9d + mov %edi, %eax + mov %esi, %edx + xor %edi, %r8d # bits that changed between first and last byte + xor %esi, %r9d + and $~0xf, %rdi # align heads to 16 bytes + and $~0xf, %rsi + or %r8d, %r9d # in either RSI or RDI + and $0xf, %eax # offset from alignment + and $0xf, %edx + pxor %xmm1, %xmm1 + test $PAGE_SIZE, %r9d # did the page change? 
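
The page check just above guards the unaligned 16-byte head loads: loading 16 bytes starting at an arbitrary pointer is only safe if the load does not run into the next, possibly unmapped, page. The assembly XORs the addresses of the first and last byte of the load and tests the page-size bit, folding both strings into one test. A C sketch of the single-pointer version (PAGE_SIZE is defined locally here; the committed code takes it from machine/param.h):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096			/* amd64 base page size */

static bool
head_crosses_page(const void *p)
{
	uintptr_t a = (uintptr_t)p;

	/* the page-number bit changes iff a page boundary lies within p..p+15 */
	return (((a ^ (a + 15)) & PAGE_SIZE) != 0);
}
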
+ jz 0f # if not, take fast path + + /* heads may cross page boundary, avoid unmapped loads */ + movdqa (%rdi), %xmm0 # load aligned heads + movdqa (%rsi), %xmm2 + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %edx, %ecx + shl %cl, %r9d # string head in XMM2 + movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -24(%rsp) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + test %r8d, %r10d # NUL byte present in first string? + lea -40(%rsp), %r8 + cmovz %rdi, %r8 + test %r9d, %r11d # NUL byte present in second string? + lea -24(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rdx, 1), %xmm4 + jmp 1f + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rdx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + xor $0xffff, %r9d # mismatch or NUL byte? + jnz .Lhead_mismatch + + /* load head and second chunk */ + movdqa 16(%rdi), %xmm2 # load second chunks + movdqa 16(%rsi), %xmm3 + sub %rdx, %rax # is a&0xf >= b&0xf? + jb .Lswapped # if not, proceed with swapped operands + + neg %rax + movdqu 16(%rsi, %rax, 1), %xmm0 + sub %rdi, %rsi # express RSI as distance from RDI + lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string + neg %rax + pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $16, %rdi + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d + jnz .Lmismatch + add $16, %rdi # advance aligned pointers + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regsions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb 16(%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rdi + test %r8d, %r8d + jnz .Lnul_found2 + xor $0xffff, %r9d # any mismatches? + jz 0b + + sub $16, %rdi # roll back second increment + + /* a mismatch has been found between RDX and RSI */ +.Lmismatch: + tzcnt %r9d, %r9d # where is the mismatch? 
+ add %rdi, %rdx # turn RDX from offset to pointer + movzbl (%rdx, %r9, 1), %ecx + movzbl (%rdi, %r9, 1), %eax + sub %ecx, %eax # difference of the mismatching chars + ret + + /* mismatch in true heads */ +.Lhead_mismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rax, %rdi # return to true heads + add %rdx, %rsi + movzbl (%rdi, %r9, 1), %eax # mismatching characters + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + ret + +.Lnul_found2: + sub $16, %rdi # roll back second increment + + /* a NUL has been found in RSI */ +.Lnul_found: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX + xor $0xffff, %r9d # mask of mismatches + or %r8d, %r9d # NUL bytes also count as mismatches + jnz .Lmismatch + + /* + * (RDI) == (RSI) and NUL is past the string. + * Compare (RSI) with the corresponding part + * of the other string until the NUL byte. + */ + movdqu (%rdi, %rax, 1), %xmm0 + pcmpeqb (%rdi, %rsi, 1), %xmm0 + add %rdi, %rsi # restore RSI pointer + add %rax, %rdi # point RDI to chunk corresponding to (RSI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + ret + + /* + * If (a&0xf) < (b&0xf), we do the same thing but with swapped + * operands. I found that this performs slightly better than + * using conditional moves to do the swap branchless. + */ +.Lswapped: + movdqu 16(%rdi, %rax, 1), %xmm0 + sub %rsi, %rdi # express RDI as distance from RSI + lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI + neg %rax # make difference positive + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $16, %rsi # advance aligned pointers + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d + jnz .Lmismatchs + add $16, %rsi + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regsions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI? + pcmpeqb 16(%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rsi + test %r8d, %r8d + jnz .Lnul_found2s + xor $0xffff, %r9d # any mismatches? + jz 0b + + sub $16, %rsi # roll back second increment + + /* a mismatch has been found between RDX and RDI */ +.Lmismatchs: + tzcnt %r9d, %r9d # where is the mismatch? 
+ add %rsi, %rdx # turn RDX from offset to pointer + movzbl (%rdx, %r9, 1), %eax + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax # difference of the mismatching chars + ret + +.Lnul_found2s: + sub $16, %rsi # roll back second increment + + /* a NUL has been found in RSI */ +.Lnul_founds: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX + xor $0xffff, %r9d # mask of mismatches + or %r8d, %r9d # NUL bytes also count as mismatches + jnz .Lmismatchs + + /* + * (RDI) == (RSI) and NUL is past the string. + * Compare (RSI) with the corresponding part + * of the other string until the NUL byte. + */ + movdqu (%rsi, %rax, 1), %xmm0 + pcmpeqb (%rsi, %rdi, 1), %xmm0 + add %rsi, %rdi # restore RDI pointer + add %rax, %rsi # point RSI to chunk corresponding to (RDI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + ret +ARCHEND(strcmp, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strcpy.c b/lib/libc/amd64/string/strcpy.c index df1facefb105..eb93b0defbaa 100644 --- a/lib/libc/amd64/string/strcpy.c +++ b/lib/libc/amd64/string/strcpy.c @@ -1,5 +1,5 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2011 George V. Neville-Neil. All rights reserved. * @@ -27,9 +27,6 @@ * SUCH DAMAGE. */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - char *__stpcpy(char * __restrict, const char * __restrict); char * diff --git a/lib/libc/amd64/string/strcspn.S b/lib/libc/amd64/string/strcspn.S new file mode 100644 index 000000000000..7ebd7a847d67 --- /dev/null +++ b/lib/libc/amd64/string/strcspn.S @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + + .weak strcspn + .set strcspn, __strcspn +ARCHFUNCS(__strcspn) + ARCHFUNC(__strcspn, scalar) + NOARCHFUNC + ARCHFUNC(__strcspn, x86_64_v2) +ENDARCHFUNCS(__strcspn) + +ARCHENTRY(__strcspn, scalar) + push %rbp # align stack to enable function call + mov %rsp, %rbp + sub $256, %rsp # allocate space for lookup table + + /* check for special cases */ + movzbl (%rsi), %eax # first character in the set + test %eax, %eax + jz .Lstrlen + + movzbl 1(%rsi), %edx # second character in the set + test %edx, %edx + jz .Lstrchr + + /* no special case matches -- prepare lookup table */ + xor %r8d, %r8d + mov $28, %ecx +0: mov %r8, (%rsp, %rcx, 8) + mov %r8, 8(%rsp, %rcx, 8) + mov %r8, 16(%rsp, %rcx, 8) + mov %r8, 24(%rsp, %rcx, 8) + sub $4, %ecx + jnc 0b + + add $2, %rsi + movb $1, (%rsp, %rax, 1) # register first chars in set + movb $1, (%rsp, %rdx, 1) + mov %rdi, %rax # a copy of the source to iterate over + + /* process remaining chars in set */ + ALIGN_TEXT +0: movzbl (%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 1(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + add $2, %rsi + jmp 0b + + /* find match */ + ALIGN_TEXT +1: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + je 1b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret + + /* set is empty, degrades to strlen */ +.Lstrlen: + leave + jmp CNAME(strlen) + + /* just one character in set, degrades to strchr */ +.Lstrchr: + mov %rdi, (%rsp) # stash a copy of the string + mov %eax, %esi # find the character in the set + call CNAME(strchrnul) + sub (%rsp), %rax # length of prefix before match + leave + ret +ARCHEND(__strcspn, scalar) + + /* + * This kernel uses pcmpistri to do the heavy lifting. + * We provide five code paths, depending on set size: + * + * 0: call strlen() + * 1: call strchr() + * 2--16: one pcmpistri per 16 bytes of input + * 17--32: two pcmpistri per 16 bytes of input + * >=33: fall back to look up table + */ +ARCHENTRY(__strcspn, x86_64_v2) + push %rbp + mov %rsp, %rbp + sub $256, %rsp + + /* check for special cases */ + movzbl (%rsi), %eax + test %eax, %eax # empty string? + jz .Lstrlenv2 + + cmpb $0, 1(%rsi) # single character string? + jz .Lstrchrv2 + + /* find set size and copy up to 32 bytes to (%rsp) */ + mov %esi, %ecx + and $~0xf, %rsi # align set pointer + movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + and $0xf, %ecx # amount of bytes rsi is past alignment + xor %edx, %edx + pcmpeqb %xmm0, %xmm1 # end of string reached? 
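
The scalar strcspn above builds a 256-byte membership table on the stack, marks every set character (and the NUL terminator), and then scans the string until it hits a marked byte. Roughly this, in C (sketch only, names are mine):

#include <stddef.h>

static size_t
strcspn_table(const char *s, const char *set)
{
	unsigned char table[256] = { 0 };
	const char *p;

	table[0] = 1;				/* the terminating NUL always stops the scan */
	for (; *set != '\0'; set++)
		table[(unsigned char)*set] = 1;

	for (p = s; table[(unsigned char)*p] == 0; p++)
		;

	return ((size_t)(p - s));
}
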
+ movdqa %xmm0, 32(%rsp) # transfer head of set to stack + pmovmskb %xmm1, %eax + shr %cl, %eax # clear out junk before string + test %eax, %eax # end of set reached? + jnz 0f + + movdqa 16(%rsi), %xmm0 # second chunk of the set + mov $16, %edx + sub %ecx, %edx # length of set preceding xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 48(%rsp) + movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 32(%rsi), %xmm0 # third chunk + add $16, %edx + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 64(%rsp) + pmovmskb %xmm1, %eax + test %eax, %eax # still not done? + jz .Lgt32v2 + +0: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set +1: tzcnt %eax, %eax + add %eax, %edx # length of set (excluding NUL byte) + cmp $32, %edx # above 32 bytes? + ja .Lgt32v2 + + /* + * At this point we know that we want to use pcmpistri. + * one last problem obtains: the head of the string is not + * aligned and may cross a cacheline. If this is the case, + * we take the part before the page boundary and repeat the + * last byte to fill up the xmm register. + */ + mov %rdi, %rax # save original string pointer + lea 15(%rdi), %esi # last byte of the head + xor %edi, %esi + test $PAGE_SIZE, %esi # does the head cross a page? + jz 0f + + /* head crosses page: copy to stack to fix up */ + and $~0xf, %rax # align head pointer temporarily + movzbl 15(%rax), %esi # last head byte on the page + movdqa (%rax), %xmm0 + movabs $0x0101010101010101, %r8 + imul %r8, %rsi # repeated 8 times + movdqa %xmm0, (%rsp) # head word on stack + mov %rsi, 16(%rsp) # followed by filler (last byte x8) + mov %rsi, 24(%rsp) + mov %edi, %eax + and $0xf, %eax # offset of head from alignment + add %rsp, %rax # pointer to fake head + +0: movdqu (%rax), %xmm0 # load head (fake or real) + lea 16(%rdi), %rax + and $~0xf, %rax # second 16 bytes of string (aligned) +1: cmp $16, %edx # 16--32 bytes? + ja .Lgt16v2 + + + /* set is 2--16 bytes in size */ + + /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */ + pcmpistri $0, %xmm0, %xmm2 # match in head? + jbe .Lheadmatchv2 + + ALIGN_TEXT +0: pcmpistri $0, (%rax), %xmm2 + jbe 1f # match or end of string? + pcmpistri $0, 16(%rax), %xmm2 + lea 32(%rax), %rax + ja 0b # match or end of string? + +3: lea -16(%rax), %rax # go back to second half +1: jc 2f # jump if match found + movdqa (%rax), %xmm0 # reload string piece + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 # where is the NUL byte? + pmovmskb %xmm0, %ecx + tzcnt %ecx, %ecx # location of NUL byte in (%rax) +2: sub %rdi, %rax # offset of %xmm0 from beginning of string + add %rcx, %rax # prefix length before match/NUL + leave + ret + +.Lheadmatchv2: + jc 2f # jump if match found + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %ecx + tzcnt %ecx, %ecx # location of NUL byte +2: mov %ecx, %eax # prefix length before match/NUL + leave + ret + + /* match in first set half during head */ +.Lheadmatchv2first: + mov %ecx, %eax + pcmpistri $0, %xmm0, %xmm3 # match in second set half? + cmp %ecx, %eax # before the first half match? + cmova %ecx, %eax # use the earlier match + leave + ret + +.Lgt16v2: + movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set + + /* set is 17--32 bytes in size */ + pcmpistri $0, %xmm0, %xmm2 # match in first set half? + jb .Lheadmatchv2first + pcmpistri $0, %xmm0, %xmm3 # match in second set half or end of string? + jbe .Lheadmatchv2 + + ALIGN_TEXT +0: movdqa (%rax), %xmm0 + pcmpistri $0, %xmm0, %xmm2 + jb 4f # match in first set half? 
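
For the 2 to 16 byte set case described above, the kernel issues one pcmpistri per 16 bytes of input; when the chunk contains the terminator but no set byte, the NUL position has to be recomputed with an ordinary byte compare, exactly as the assembly does. A simplified intrinsics sketch of that inner scan (hypothetical helper: assumes p is 16-byte aligned at the start of the string and set16 holds the zero-padded set; the committed code additionally handles misaligned and page-crossing heads):

#include <nmmintrin.h>	/* SSE4.2: pcmpistri */
#include <stddef.h>

#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT)

static size_t
scan16(const char *p, __m128i set16)
{
	const char *start = p;

	for (;; p += 16) {
		__m128i chunk = _mm_load_si128((const __m128i *)p);

		if (_mm_cmpistrc(set16, chunk, MODE))	/* CF: a set byte occurs in chunk */
			return ((p - start) + _mm_cmpistri(set16, chunk, MODE));

		if (_mm_cmpistrz(set16, chunk, MODE)) {	/* ZF: chunk holds the NUL */
			__m128i z = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());

			return ((p - start) +
			    __builtin_ctz(_mm_movemask_epi8(z)));
		}
	}
}
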
+ pcmpistri $0, %xmm0, %xmm3 + jbe 1f # match in second set half or end of string? + movdqa 16(%rax), %xmm0 + add $32, %rax + pcmpistri $0, %xmm0, %xmm2 + jb 3f # match in first set half? + pcmpistri $0, %xmm0, %xmm3 + ja 0b # neither match in 2nd half nor string end? + + /* match in second half or NUL */ + lea -16(%rax), %rax # go back to second half +1: jc 2f # jump if match found + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 # where is the NUL byte? + pmovmskb %xmm0, %ecx + tzcnt %ecx, %ecx # location of NUL byte in (%rax) +2: sub %rdi, %rax # offset of %xmm0 from beginning of string + add %rcx, %rax # prefix length before match/NUL + leave + ret + + /* match in first half */ +3: sub $16, %rax # go back to second half +4: sub %rdi, %rax # offset of %xmm0 from beginning of string + mov %ecx, %edx + pcmpistri $0, %xmm0, %xmm3 # match in second set half? + cmp %ecx, %edx # before the first half match? + cmova %ecx, %edx # use the earlier match + add %rdx, %rax # return full ofset + leave + ret + + /* set is empty, degrades to strlen */ +.Lstrlenv2: + leave + jmp CNAME(strlen) + + /* just one character in set, degrades to strchr */ +.Lstrchrv2: + mov %rdi, (%rsp) # stash a copy of the string + mov %eax, %esi # find this character + call CNAME(strchrnul) + sub (%rsp), %rax # length of prefix before match + leave + ret + + /* set is >=33 bytes in size */ +.Lgt32v2: + xorps %xmm0, %xmm0 + mov $256-64, %edx + + /* clear out look up table */ +0: movaps %xmm0, (%rsp, %rdx, 1) + movaps %xmm0, 16(%rsp, %rdx, 1) + movaps %xmm0, 32(%rsp, %rdx, 1) + movaps %xmm0, 48(%rsp, %rdx, 1) + sub $64, %edx + jnc 0b + + add %rcx, %rsi # restore string pointer + mov %rdi, %rax # keep a copy of the string + + /* initialise look up table */ + ALIGN_TEXT +0: movzbl (%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 1(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 2(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + movzbl 3(%rsi), %ecx + movb $1, (%rsp, %rcx, 1) + test %ecx, %ecx + jz 1f + + add $4, %rsi + jmp 0b + + /* find match */ + ALIGN_TEXT +1: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + jne 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + je 1b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret +ARCHEND(__strcspn, x86_64_v2) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strlcat.c b/lib/libc/amd64/string/strlcat.c new file mode 100644 index 000000000000..94fdc0963dc3 --- /dev/null +++ b/lib/libc/amd64/string/strlcat.c @@ -0,0 +1,27 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include <sys/cdefs.h> + +#include <string.h> + +#undef strlcat /* FORTIFY_SOURCE */ + +void *__memchr(const void *, int, size_t); +size_t __strlcpy(char *restrict, const char *restrict, size_t); + +size_t +strlcat(char *restrict dst, const char *restrict src, size_t dstsize) +{ + char *loc = __memchr(dst, '\0', dstsize); + + if (loc != NULL) { + size_t dstlen = (size_t)(loc - dst); + + return (dstlen + __strlcpy(loc, src, dstsize - dstlen)); + } else + return (dstsize + strlen(src)); +} diff --git a/lib/libc/amd64/string/strlcpy.S b/lib/libc/amd64/string/strlcpy.S new file mode 100644 index 000000000000..2b32c6c78047 --- /dev/null +++ b/lib/libc/amd64/string/strlcpy.S @@ 
-0,0 +1,281 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak strlcpy + .set strlcpy, __strlcpy +ARCHFUNCS(__strlcpy) + ARCHFUNC(__strlcpy, scalar) + ARCHFUNC(__strlcpy, baseline) +ENDARCHFUNCS(__strlcpy) + +ARCHENTRY(__strlcpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + push %rsi + push %rbx + push %rdi + push %rdx + mov %rsi, %rdi + call CNAME(strlen) # strlen(src) + pop %rdx + pop %rdi + mov -8(%rbp), %rsi + mov %rax, %rbx # remember string length for return value + sub $1, %rdx # do not copy into the final byte of the buffer + jc 0f # skip copying altogether if buffer was empty + cmp %rax, %rdx # is the buffer longer than the input? + cmova %rax, %rdx # if yes, only copy the part that fits + movb $0, (%rdi, %rdx, 1) # NUL-terminate output buffer + call CNAME(memcpy) # copy string to output +0: mov %rbx, %rax # restore return value + pop %rbx + leave + ret +ARCHEND(__strlcpy, scalar) + +ARCHENTRY(__strlcpy, baseline) + sub $1, %rdx # do not count NUL byte in buffer length + jb .L0 # go to special code path if len was 0 + + mov %esi, %ecx + pxor %xmm1, %xmm1 + mov %rsi, %r9 # stash a copy of the source pointer for later + and $~0xf, %rsi + pcmpeqb (%rsi), %xmm1 # NUL found in head? + mov $-1, %r8d + and $0xf, %ecx + shl %cl, %r8d # mask of bytes in the string + pmovmskb %xmm1, %eax + and %r8d, %eax + jnz .Lhead_nul + + movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + mov $32, %r8d + sub %ecx, %r8d # head length + length of second chunk + pxor %xmm1, %xmm1 + pcmpeqb %xmm3, %xmm1 # NUL found in second chunk? + + sub %r8, %rdx # enough space left for the second chunk? + jbe .Lhead_buf_end + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jnz .Lsecond_nul + + /* string didn't end in second chunk and neither did buffer -- not a runt! 
*/ + movdqa 32(%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jbe 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jbe 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + sub $32, %rdx + ja 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered? + pmovmskb %xmm0, %r8d + mov %r8d, %eax + bts %edx, %r8d # treat end of buffer as end of string + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu (%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, (%rdi, %r8, 1) # store string tail + movb $0, 16(%rdi, %r8, 1) # NUL terminate + + /* continue to find the end of the string */ + test %eax, %eax # end of string already reached? + jnz 1f + + ALIGN_TEXT +0: pcmpeqb 32(%rsi), %xmm1 + pmovmskb %xmm1, %eax + pxor %xmm1, %xmm1 + test %eax, %eax + jnz 2f + + pcmpeqb 48(%rsi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rsi + pxor %xmm1, %xmm1 + test %eax, %eax + jz 0b + +1: sub $16, %rsi # undo second advancement +2: tzcnt %eax, %eax # where is the NUL byte? + sub %r9, %rsi + lea 32(%rsi, %rax, 1), %rax # return string length + ret + +4: sub $16, %rsi # undo second advancement + add $16, %rdx # restore number of remaining bytes + + /* string has ended but buffer has not */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL) + sub %r9, %rsi # string length to current chunk + add %rsi, %rax # plus length of current chunk + ret + +.Lhead_buf_end: + pmovmskb %xmm1, %r8d + add $32, %edx # restore edx to (len-1) + ecx + mov %r8d, %eax + shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31 + bts %rdx, %r8 # treat end of buffer as end of string + tzcnt %r8, %rdx # find string/bufer len from alignment boundary + sub %ecx, %edx # find actual string/buffer len + movb $0, (%rdi, %rdx, 1) # write NUL terminator + + /* continue to find the end of the string */ + test %eax, %eax # end of string already reached? + jnz 1f + + ALIGN_TEXT +0: pcmpeqb 32(%rsi), %xmm1 + pmovmskb %xmm1, %eax + pxor %xmm1, %xmm1 + test %eax, %eax + jnz 2f + + pcmpeqb 48(%rsi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rsi + pxor %xmm1, %xmm1 + test %eax, %eax + jz 0b + +1: sub $16, %rsi +2: tzcnt %eax, %eax + sub %r9, %rsi + lea 32(%rsi, %rax, 1), %rax # return string length + jmp .L0031 + +.Lsecond_nul: + add %r8, %rdx # restore buffer length + tzcnt %eax, %eax # where is the NUL byte? + lea -16(%rcx), %r8d + sub %r8d, %eax # string length + cmp %rax, %rdx # is the string shorter than the buffer? 
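
Both strlcpy paths implement the usual contract: copy at most dstsize - 1 bytes, always NUL-terminate when dstsize is nonzero, and return the full source length so the caller can detect truncation. The scalar kernel composes strlen and memcpy exactly as in this C reference (sketch only):

#include <string.h>

static size_t
strlcpy_ref(char *restrict dst, const char *restrict src, size_t dstsize)
{
	size_t srclen = strlen(src);

	if (dstsize > 0) {
		size_t n = srclen < dstsize - 1 ? srclen : dstsize - 1;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}

	return (srclen);	/* length of src, not of what was copied */
}
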
+ cmova %rax, %rdx # copy only min(buflen, srclen) bytes + movb $0, (%rdi, %rdx, 1) # write NUL terminator +.L0031: cmp $16, %rdx # at least 16 bytes to copy (not incl NUL)? + jb .L0015 + + /* copy 16--31 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -16(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi, %rdx, 1) + ret + +.Lhead_nul: + tzcnt %eax, %eax # where is the NUL byte? + sub %ecx, %eax # ... from the beginning of the string? + cmp %rax, %rdx # is the string shorter than the buffer? + cmova %rax, %rdx # copy only min(buflen, srclen) bytes + movb $0, (%rdi, %rdx, 1) # write NUL terminator + + /* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */ +.L0015: cmp $8, %rdx # at least 8 bytes to copy? + jae .L0815 + + cmp $4, %rdx # at least 4 bytes to copy? + jae .L0407 + + cmp $2, %rdx # at least 2 bytes to copy? + jae .L0203 + + movzbl (%r9), %ecx # load first byte from src + mov %cl, (%rdi) # deposit into destination + movb $0, (%rdi, %rdx, 1) # add NUL terminator (again) + ret + +.L0203: movzwl (%r9), %ecx + movzwl -2(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -2(%rdi, %rdx, 1) + ret + +.L0407: mov (%r9), %ecx + mov -4(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -4(%rdi, %rdx, 1) + ret + +.L0815: mov (%r9), %rcx + mov -8(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -8(%rdi, %rdx, 1) + ret + + /* length zero destination: just return the string length */ +.L0: mov %rsi, %rdi + jmp CNAME(strlen) +ARCHEND(__strlcpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S index 1d2428e3420e..cc248af001ac 100644 --- a/lib/libc/amd64/string/strlen.S +++ b/lib/libc/amd64/string/strlen.S @@ -1,10 +1,15 @@ -/* +/*- * Written by Mateusz Guzik <mjg@freebsd.org> + * Copyright (c) 2023 The FreeBSD Foundation + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * * Public domain. */ #include <machine/asm.h> -__FBSDID("$FreeBSD$"); +#include "amd64_archlevel.h" /* * Note: this routine was written with kernel use in mind (read: no simd), @@ -14,6 +19,11 @@ __FBSDID("$FreeBSD$"); #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ +ARCHFUNCS(strlen) + ARCHFUNC(strlen, scalar) + ARCHFUNC(strlen, baseline) +ENDARCHFUNCS(strlen) + /* * strlen(string) * %rdi @@ -30,7 +40,7 @@ __FBSDID("$FreeBSD$"); * * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. */ -ENTRY(strlen) +ARCHENTRY(strlen, scalar) movabsq $0xfefefefefefefeff,%r8 movabsq $0x8080808080808080,%r9 @@ -76,6 +86,46 @@ ENTRY(strlen) leaq (%rcx,%rdi),%rax subq %r10,%rax ret -END(strlen) +ARCHEND(strlen, scalar) + +ARCHENTRY(strlen, baseline) + mov %rdi, %rcx + pxor %xmm1, %xmm1 + and $~0xf, %rdi # align string + pcmpeqb (%rdi), %xmm1 # compare head (with junk before string) + mov %rcx, %rsi # string pointer copy for later + and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment + pmovmskb %xmm1, %eax + add $32, %rdi # advance to next iteration + shr %cl, %eax # clear out matches in junk bytes + test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible) + jnz 2f + + ALIGN_TEXT +1: pxor %xmm1, %xmm1 + pcmpeqb -16(%rdi), %xmm1 # find NUL bytes + pmovmskb %xmm1, %eax + test %eax, %eax # were any NUL bytes present? 
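
The scalar strlen above uses the classic word-at-a-time NUL test with the constants 0x0101010101010101 and 0x8080808080808080: (x - 0x01..01) & ~x & 0x80..80 has its lowest set bit in the first zero byte of x. The scalar strchrnul kernel runs the same test on both the word and the word XORed with the replicated search character. A C sketch of the strlen form (assuming the string starts 8-byte aligned so whole-word loads never cross past the terminator; the committed code also handles misaligned heads):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t
strlen_words(const char *s)
{
	const uint64_t ones = 0x0101010101010101;
	const uint64_t highs = 0x8080808080808080;
	const char *p = s;
	uint64_t word, zeroes;

	for (;; p += 8) {
		memcpy(&word, p, sizeof(word));		/* one 8-byte load */
		zeroes = (word - ones) & ~word & highs;	/* 0x80 marks each zero byte */
		if (zeroes != 0)
			return ((p - s) + __builtin_ctzll(zeroes) / 8);
	}
}
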
+ jnz 3f + + /* the same unrolled once more */ + pxor %xmm1, %xmm1 + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rdi # advance to next iteration + test %eax, %eax + jz 1b + + /* match found in loop body */ + sub $16, %rdi # undo half the advancement +3: tzcnt %eax, %eax # find the first NUL byte + sub %rsi, %rdi # string length until beginning of (%rdi) + lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length + ret + + /* match found in head */ +2: tzcnt %eax, %eax # compute string length + ret +ARCHEND(strlen, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strncat.c b/lib/libc/amd64/string/strncat.c new file mode 100644 index 000000000000..2c63ab50b3c3 --- /dev/null +++ b/lib/libc/amd64/string/strncat.c @@ -0,0 +1,31 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include <sys/cdefs.h> + +#include <string.h> + +#undef strncat /* _FORTIFY_SOURCE */ + +void *__memccpy(void *restrict, const void *restrict, int, size_t); + +char * +strncat(char *dest, const char *src, size_t n) +{ + size_t len; + char *endptr; + + len = strlen(dest); + endptr = __memccpy(dest + len, src, '\0', n); + + /* avoid an extra branch */ + if (endptr == NULL) + endptr = dest + len + n + 1; + + endptr[-1] = '\0'; + + return (dest); +} diff --git a/lib/libc/amd64/string/strncmp.S b/lib/libc/amd64/string/strncmp.S new file mode 100644 index 000000000000..932cf078bdfc --- /dev/null +++ b/lib/libc/amd64/string/strncmp.S @@ -0,0 +1,488 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(strncmp) + ARCHFUNC(strncmp, scalar) + ARCHFUNC(strncmp, baseline) +ENDARCHFUNCS(strncmp) + +/* + * This is just the scalar loop unrolled a bunch of times. + */ +ARCHENTRY(strncmp, scalar) + xor %eax, %eax + sub $4, %rdx # 4 chars left to compare? + jbe 1f + + ALIGN_TEXT +0: movzbl (%rdi), %ecx + test %ecx, %ecx # NUL char in first string? 
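
The unrolled scalar loop above implements the standard strncmp contract: stop at the first mismatch, at a NUL that terminates both strings, or after n bytes, whichever comes first, and return the difference of the mismatching bytes as unsigned chars. A plain C reference for comparison (sketch only):

#include <stddef.h>

static int
strncmp_ref(const char *a, const char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (a[i] != b[i] || a[i] == '\0')
			return ((unsigned char)a[i] - (unsigned char)b[i]);
	}

	return (0);	/* first n bytes are equal */
}
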
+ jz .L0 + cmpb (%rsi), %cl # mismatch between strings? + jnz .L0 + + movzbl 1(%rdi), %ecx + test %ecx, %ecx + jz .L1 + cmpb 1(%rsi), %cl + jnz .L1 + + movzbl 2(%rdi), %ecx + test %ecx, %ecx + jz .L2 + cmpb 2(%rsi), %cl + jnz .L2 + + movzbl 3(%rdi), %ecx + test %ecx, %ecx + jz .L3 + cmpb 3(%rsi), %cl + jnz .L3 + + add $4, %rdi # advance to next iteration + add $4, %rsi + sub $4, %rdx + ja 0b + + /* end of string within the next 4 characters */ +1: cmp $-4, %edx # end of string reached immediately? + jz .Leq + movzbl (%rdi), %ecx + test %ecx, %ecx + jz .L0 + cmpb (%rsi), %cl + jnz .L0 + + cmp $-3, %edx # end of string reached after 1 char? + jz .Leq + movzbl 1(%rdi), %ecx + test %ecx, %ecx + jz .L1 + cmpb 1(%rsi), %cl + jnz .L1 + + cmp $-2, %edx + jz .Leq + movzbl 2(%rdi), %ecx + test %ecx, %ecx + jz .L2 + cmpb 2(%rsi), %cl + jnz .L2 + + cmp $-1, %edx # either end of string after 3 chars, + jz .Leq # or it boils down to the last char + +.L3: inc %eax +.L2: inc %eax +.L1: inc %eax +.L0: movzbl (%rsi, %rax, 1), %ecx + movzbl (%rdi, %rax, 1), %eax + sub %ecx, %eax +.Leq: ret +ARCHEND(strncmp, scalar) + +ARCHENTRY(strncmp, baseline) + push %rbx + sub $1, %rdx # RDX--, so RDX points to the last byte to compare + jb .Lempty # where there any bytes to compare at all? + + lea 15(%rdi), %r8d # end of head + lea 15(%rsi), %r9d + mov %edi, %eax + mov %esi, %ebx + xor %edi, %r8d # bits that changed between first and last byte + xor %esi, %r9d + and $~0xf, %rdi # align heads to 16 bytes + and $~0xf, %rsi + or %r8d, %r9d + and $0xf, %eax # offset from alignment + and $0xf, %ebx + movdqa (%rdi), %xmm0 # load aligned heads + movdqa (%rsi), %xmm2 + pxor %xmm1, %xmm1 + cmp $16, %rdx # end of buffer within the first 32 bytes? + jb .Llt16 + + test $PAGE_SIZE, %r9d # did the page change? + jz 0f # if not, take fast path + + + /* heads may cross page boundary, avoid unmapped loads */ + movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -16(%rsp) + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %ebx, %ecx + shl %cl, %r9d # string head in XMM2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + test %r8d, %r10d # NUL byte present in first string? + lea -32(%rsp), %r8 + cmovz %rdi, %r8 + test %r9d, %r11d # NUL byte present in second string? + lea -16(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rbx, 1), %xmm4 + jmp 1f + + /* rdx == 0 */ +.Lempty: + xor %eax, %eax # zero-length buffers compare equal + pop %rbx + ret + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rbx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + xor $0xffff, %r9d # mismatch or NUL byte? + jnz .Lhead_mismatch + + /* load head and second chunk */ + movdqa 16(%rdi), %xmm2 # load second chunks + movdqa 16(%rsi), %xmm3 + lea -16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk + sub %rbx, %rax # is a&0xf >= b&0xf? + jb .Lswapped # if not, proceed with swapped operands + jmp .Lnormal + + /* buffer ends within the first 16 bytes */ +.Llt16: test $PAGE_SIZE, %r9d # did the page change? 
+ jz 0f # if not, take fast path + + /* heads may cross page boundary */ + movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -16(%rsp) + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %ebx, %ecx + shl %cl, %r9d # string head in XMM2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + lea (%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0 + bts %ecx, %r10d # treat as if NUL byte present + lea (%rdx, %rbx, 1), %ecx + bts %ecx, %r11d + test %r8w, %r10w # NUL byte present in first string head? + lea -32(%rsp), %r8 + cmovz %rdi, %r8 + test %r9w, %r11w # NUL byte present in second string head? + lea -16(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rbx, 1), %xmm4 + jmp 1f + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rbx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + btr %edx, %r9d # induce mismatch in last byte of buffer + not %r9d # mismatch or NUL byte? + + /* mismatch in true heads */ + ALIGN_TEXT +.Lhead_mismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rax, %rdi # return to true heads + add %rbx, %rsi + movzbl (%rdi, %r9, 1), %eax # mismatching characters + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + + /* rax >= 0 */ + ALIGN_TEXT +.Lnormal: + neg %rax + movdqu 16(%rsi, %rax, 1), %xmm0 + sub %rdi, %rsi # express RSI as distance from RDI + lea (%rsi, %rax, 1), %rbx # point RBX to offset in second string + neg %rax # ... corresponding to RDI + pcmpeqb %xmm3, %xmm1 # NUL present? + pcmpeqb %xmm2, %xmm0 # Mismatch between chunks? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + mov $16, %ecx + cmp %rcx, %rdx # does the buffer end within (RDI,RSI,1)? + cmovb %edx, %ecx # ECX = min(16, RDX) + add $32, %rdi # advance to next iteration + bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte + test %r8w, %r8w # NUL or end of buffer found? + jnz .Lnul_found2 + xor $0xffff, %r9d + jnz .Lmismatch2 + sub $48, %rdx # end of buffer within first main loop iteration? + jb .Ltail # if yes, process tail + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regsions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI + pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? 
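
The diagram above describes the steady state of the main loop; whichever path is taken, the
kernel must reproduce the usual strncmp() semantics, which in portable C are (a reference
sketch for comparison, not the committed code):

	#include <stddef.h>

	static int
	strncmp_ref(const char *a, const char *b, size_t n)
	{
		size_t i;

		for (i = 0; i < n; i++) {
			unsigned char ca = a[i], cb = b[i];

			if (ca != cb)
				return (ca - cb);	/* first mismatch decides */
			if (ca == '\0')
				break;			/* common NUL: equal */
		}

		return (0);
	}

The unrolling, the 16-byte chunks, and the swapped-operand path below exist only to reach this
result quickly while keeping every load within mapped memory.
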
+ jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rdi + test %r8d, %r8d + jnz .Lnul_found2 + xor $0xffff, %r9d + jnz .Lmismatch2 + sub $32, %rdx # end of buffer within next iteration? + jae 0b + + /* end of buffer will occur in next 32 bytes */ +.Ltail: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI + pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + bts %edx, %r8d # indicate NUL byte at last byte in buffer + test %r8w, %r8w # NUL byte in first chunk? + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + sub $16, %edx # take first half into account + bts %edx, %r8d # indicate NUL byte at last byte in buffer + add $32, %rdi + +.Lnul_found2: + sub $16, %rdi + +.Lnul_found: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8d # adjust NUL mask to positions in RDI/RBX + not %r9d # mask of mismatches + or %r8w, %r9w # NUL bytes als count as mismatches + jnz .Lmismatch + + /* + * (RDI) == (RSI) and NUL is past the string. + * compare (RSI) with the corresponding part + * of the other string until the NUL byte. + */ + movdqu (%rdi, %rax, 1), %xmm0 + pcmpeqb (%rdi, %rsi, 1), %xmm0 + add %rdi, %rsi # restore RSI pointer + add %rax, %rdi # point RDI to chunk corresponding to (RSI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + +.Lmismatch2: + sub $16, %rdi + + /* a mismatch has been found between RBX and RSI */ +.Lmismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rdi, %rbx # turn RBX from offset into pointer + movzbl (%rbx, %r9, 1), %ecx + movzbl (%rdi, %r9, 1), %eax + sub %ecx, %eax + pop %rbx + ret + + /* rax < 0 */ + ALIGN_TEXT +.Lswapped: + movdqu 16(%rdi, %rax, 1), %xmm0 + sub %rsi, %rdi # express RDI as distance from RDI + lea (%rdi, %rax, 1), %rbx # point RBX to offset in first string + pcmpeqb %xmm2, %xmm1 # NUL present? + pcmpeqb %xmm3, %xmm0 # mismatch between chunks? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add %rax, %rdx # RDX points to buffer end in RSI + neg %rax # ... corresponding to RSI + mov $16, %ecx + cmp %rcx, %rdx # does the buffer end within (RSI,RDI,1)? + cmovb %edx, %ecx # ECX = min(16, RDX) + add $32, %rsi + bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte + test %r8w, %r8w # NUL or end of buffer found? + jnz .Lnul_found2s + xor $0xffff, %r9d + jnz .Lmismatch2s + sub $48, %rdx # end of buffer within first main loop iteration? + jb .Ltails # if yes, process tail + + ALIGN_TEXT +0: movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? 
+ jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rsi + test %r8d, %r8d + jnz .Lnul_found2s + xor $0xffff, %r9d + jnz .Lmismatch2s + sub $32, %rdx # end of buffer within next iteration? + jae 0b + + /* end of buffer will occur in next 32 bytes */ +.Ltails: + movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + bts %edx, %r8d # indicate NUL byte at laste byte in buffer + test %r8w, %r8w # NUL byte in first chunk? + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rbx, 1), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + sub $16, %edx # take first half into account + bts %edx, %r8d # indicate NUL byte at laste byte in buffer + add $32, %rsi + +.Lnul_found2s: + sub $16, %rsi + +.Lnul_founds: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8d # adjust NUL mask to positions in RSI/RBX + not %r9d # mask of mismatches + or %r8w, %r9w # NUL bytes also count as mismatches + jnz .Lmismatchs + + movdqu (%rsi, %rax, 1), %xmm0 + pcmpeqb (%rsi, %rdi, 1), %xmm0 + add %rsi, %rdi # restore RDI pointer + add %rax, %rsi # point RSI to chunk corresponding to (RDI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + +.Lmismatch2s: + sub $16, %rsi + +.Lmismatchs: + tzcnt %r9d, %r9d # where is the mismatch? + add %rsi, %rbx # turn RBX from offset into pointer + movzbl (%rbx, %r9, 1), %eax + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + pop %rbx + ret +ARCHEND(strncmp, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/static_tls.h b/lib/libc/amd64/string/strncpy.c index 1e9b76f58221..0e7a58222aa8 100644 --- a/lib/libc/amd64/static_tls.h +++ b/lib/libc/amd64/string/strncpy.c @@ -1,9 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * Copyright (c) 2023 The FreeBSD Foundation * - * Copyright (c) 2019 The FreeBSD Foundation - * - * This software was developed by Konstantin Belousov <kib@FreeBSD.org> + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without @@ -15,7 +13,7 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE @@ -25,22 +23,21 @@ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ + * SUCH DAMAGE */ -#ifndef _LIBC_AMD64_STATIC_TLS_H -#define _LIBC_AMD64_STATIC_TLS_H +#include <sys/cdefs.h> +#include <string.h> + +#undef strncpy /* _FORTIFY_SOURCE */ -static __inline uintptr_t -_libc_get_static_tls_base(size_t offset) +char *__stpncpy(char *restrict, const char *restrict, size_t); + +char * +strncpy(char *restrict dst, const char *restrict src, size_t len) { - uintptr_t tlsbase; - __asm __volatile("movq %%fs:0, %0" : "=r" (tlsbase)); - tlsbase -= offset; - return (tlsbase); -} + __stpncpy(dst, src, len); -#endif + return (dst); +} diff --git a/lib/libc/amd64/sys/amd64_set_gsbase.c b/lib/libc/amd64/string/strnlen.c index c4880c126ae9..74020f1b1c65 100644 --- a/lib/libc/amd64/sys/amd64_set_gsbase.c +++ b/lib/libc/amd64/string/strnlen.c @@ -1,11 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * Copyright (c) 2023 The FreeBSD Foundation * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without @@ -17,51 +13,29 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
+ * SUCH DAMAGE */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#define IN_RTLD 1 -#include <sys/param.h> -#undef IN_RTLD -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" - -static int -amd64_set_gsbase_cpu(void *addr) -{ +#include <string.h> - wrgsbase((uintptr_t)addr); - return (0); -} +char *__memchr(const void *, int, size_t); -static int -amd64_set_gsbase_syscall(void *addr) +size_t +strnlen(const char *s, size_t maxlen) { + const char *loc; - return (sysarch(AMD64_SET_GSBASE, &addr)); -} - -DEFINE_UIFUNC(, int, amd64_set_gsbase, (void *)) -{ + loc = __memchr(s, '\0', maxlen); - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_set_gsbase_cpu); - return (amd64_set_gsbase_syscall); + return (loc == NULL ? maxlen : (size_t)(loc - s)); } diff --git a/lib/libc/amd64/sys/amd64_set_fsbase.c b/lib/libc/amd64/string/strpbrk.c index 02ca9233d855..87f587789991 100644 --- a/lib/libc/amd64/sys/amd64_set_fsbase.c +++ b/lib/libc/amd64/string/strpbrk.c @@ -1,11 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * Copyright (c) 2023 The FreeBSD Foundation * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without @@ -17,51 +13,31 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
+ * SUCH DAMAGE */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#define IN_RTLD 1 -#include <sys/param.h> -#undef IN_RTLD -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" -static int -amd64_set_fsbase_cpu(void *addr) -{ +#include <string.h> - wrfsbase((uintptr_t)addr); - return (0); -} +size_t __strcspn(const char *, const char *); -static int -amd64_set_fsbase_syscall(void *addr) +char * +strpbrk(const char *s, const char *charset) { + size_t loc; - return (sysarch(AMD64_SET_FSBASE, &addr)); -} - -DEFINE_UIFUNC(, int, amd64_set_fsbase, (void *)) -{ + loc = __strcspn(s, charset); - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_set_fsbase_cpu); - return (amd64_set_fsbase_syscall); + return (s[loc] == '\0' ? NULL : (char *)&s[loc]); } diff --git a/lib/libc/amd64/string/strrchr.S b/lib/libc/amd64/string/strrchr.S new file mode 100644 index 000000000000..e397bbcd3478 --- /dev/null +++ b/lib/libc/amd64/string/strrchr.S @@ -0,0 +1,209 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled + + .weak rindex + .set rindex, strrchr + +ARCHFUNCS(strrchr) + ARCHFUNC(strrchr, scalar) + ARCHFUNC(strrchr, baseline) +ENDARCHFUNCS(strrchr) + +ARCHENTRY(strrchr, scalar) + mov %edi, %ecx + and $~7, %rdi # align to 8 byte + movzbl %sil, %esi # clear stray high bits + movabs $0x0101010101010101, %r8 + mov (%rdi), %rax # load first word + imul %r8, %rsi # replicate char 8 times + + /* + * Unaligned input: align to 8 bytes. Then proceed the same + * way as with aligned input, but prevent matches before the + * beginning of the string. 
This is achieved by oring 0x01 + * into each byte of the buffer before the string + */ + shl $3, %ecx + mov %r8, %r10 + shl %cl, %r10 # 0x01 where the string is + xor %r8, %r10 # 0x01 where it is not + neg %r8 # negate 01..01 so we can use lea + movabs $0x8080808080808080, %r9 + + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + or %r10, %rax # ensure str != 0 before string + or %r10, %rcx # ensure str^c != 0 before string + bswap %rcx # in reverse order, to find last match + mov %rdi, %r10 # location of initial mismatch (if any) + xor %r11, %r11 # initial mismatch (none) + add $8, %rdi # advance to next iteration + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jnz 1f # end of string? + + lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + mov %rcx, %r11 # remember mismatch in head + jmp 0f + + /* main loop unrolled twice */ + ALIGN_TEXT +3: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + lea -8(%rdi), %rdx + cmovnz %rdx, %r10 # remember location of current mismatch + cmovnz %rcx, %r11 + +0: mov (%rdi), %rax # str + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + bswap %rcx # in reverse order, to find last match + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jnz 2f # end of string? + + lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + cmovnz %rdi, %r10 # remember location of current mismatch + cmovnz %rcx, %r11 + + mov 8(%rdi), %rax # str + add $16, %rdi + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + bswap %rcx + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jz 3b # end of string? + + /* NUL found */ +1: sub $8, %rdi # undo advance past buffer +2: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + lea -1(%rax), %rdx + xor %rdx, %rax # mask of bytes in the string + bswap %rdx # in reverse order + and %rdx, %rcx # c found in the tail? + cmovnz %rdi, %r10 + cmovnz %rcx, %r11 + bswap %r11 # unreverse byte order + bsr %r11, %rcx # last location of c in (R10) + shr $3, %rcx # as byte offset + lea (%r10, %rcx, 1), %rax # pointer to match + test %r11, %r11 # was there actually a match? + cmovz %r11, %rax # if not, return null pointer + ret +ARCHEND(strrchr, scalar) + +ARCHENTRY(strrchr, baseline) + mov %edi, %ecx + and $~0xf, %rdi # align to 16 bytes + movdqa (%rdi), %xmm1 + movd %esi, %xmm0 + and $0xf, %ecx # offset from alignment + pxor %xmm2, %xmm2 + mov $-1, %edx + punpcklbw %xmm0, %xmm0 # c -> cc + shl %cl, %edx # bits corresponding to bytes in the string + punpcklwd %xmm0, %xmm0 # cc -> cccc + xor %r8, %r8 # address of latest match + mov $1, %esi # bit mask of latest match + mov %rdi, %r9 # candidate location for next match + add $16, %rdi # advance to next chunk + + /* check for match in head */ + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc + pcmpeqb %xmm0, %xmm1 # c present? 
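
The scalar strrchr kernel above is built on two classic SWAR identities: multiplying a byte by
0x0101010101010101 replicates it into every byte of a word, and (x - 0x01..01) & ~x & 0x80..80
is nonzero exactly when some byte of x is zero, with the lowest set bit falling in the first
such byte. In C (a sketch of the identities only; the search for the last match is omitted):

	#include <stdint.h>

	#define LOWS	UINT64_C(0x0101010101010101)
	#define HIGHS	UINT64_C(0x8080808080808080)

	/* nonzero iff some byte of x is 0 */
	static inline uint64_t
	haszero(uint64_t x)
	{
		return ((x - LOWS) & ~x & HIGHS);
	}

	/* nonzero iff some byte of x equals c */
	static inline uint64_t
	hasbyte(uint64_t x, unsigned char c)
	{
		return (haszero(x ^ (LOWS * c)));	/* LOWS * c: c in every byte */
	}

The kernel applies the same tests, ORs 0x01 into the bytes that precede the start of the
string so they can register neither as a NUL nor as a match, and uses bswap plus bsr so the
final bit scan yields the last occurrence rather than the first.
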
+ pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + and %edx, %ecx # c present in the string? + and %edx, %eax # NUL present in the string? + jnz .Lend2 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rdi), %xmm1 + test %ecx, %ecx # was there a match in the last iter.? + cmovnz %r9, %r8 # remember match if any + cmovnz %ecx, %esi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + test %eax, %eax # end of string in first half? + jnz .Lend + + movdqa 16(%rdi), %xmm1 + test %ecx, %ecx # was there a match in the last iter.? + cmovnz %rdi, %r8 # remember match if any + cmovnz %ecx, %esi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + lea 16(%rdi), %r9 + add $32, %rdi + test %eax, %eax # end of string in second half? + jz 0b + + ALIGN_TEXT +.Lend2: sub $16, %rdi +.Lend: lea -1(%rax), %edx + xor %edx, %eax # mask of bytes in the string + and %eax, %ecx # c found in the tail? + cmovnz %rdi, %r8 + cmovnz %ecx, %esi + bsr %esi, %esi # last location of c in (R8) + lea (%r8, %rsi, 1), %rax # pointer to match + ret +ARCHEND(strrchr, baseline) + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/sys/getcontext.S b/lib/libc/amd64/string/strsep.c index b11e65caf580..9fda56d7e135 100644 --- a/lib/libc/amd64/sys/getcontext.S +++ b/lib/libc/amd64/string/strsep.c @@ -1,6 +1,8 @@ /*- - * Copyright (c) 2003 Peter Wemm <peter@FreeBSD.org> - * All rights reserved. + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -11,38 +13,45 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * SUCH DAMAGE */ -#include <machine/asm.h> -__FBSDID("$FreeBSD$"); +#include <sys/cdefs.h> +#include <string.h> -#include <SYS.h> +size_t __strcspn(const char *, const char *); /* - * This has to be magic to handle the multiple returns. - * Otherwise, the setcontext() syscall will return here and we'll - * pop off the return address and go to the *setcontext* call. + * We have a fast strcspn() on amd64. Use it over a direct + * implementation of strsep for better performance. 
*/ - WEAK_REFERENCE(__sys_getcontext, _getcontext) - WEAK_REFERENCE(__sys_getcontext, getcontext) -ENTRY(__sys_getcontext) - movq (%rsp),%rsi /* save getcontext return address */ - mov $SYS_getcontext,%rax - KERNCALL - jb HIDENAME(cerror) - addq $8,%rsp /* remove stale (setcontext) return address */ - jmp *%rsi /* restore return address */ -END(__sys_getcontext) +char * +strsep(char **stringp, const char *delim) +{ + size_t n; + char *s; + + s = *stringp; + if (s == NULL) + return (NULL); + + n = __strcspn(s, delim); + if (s[n] == '\0') + *stringp = NULL; + else { + s[n] = '\0'; + *stringp = s + n + 1; + } - .section .note.GNU-stack,"",%progbits + return (s); +} diff --git a/lib/libc/amd64/string/strspn.S b/lib/libc/amd64/string/strspn.S new file mode 100644 index 000000000000..565330f0c385 --- /dev/null +++ b/lib/libc/amd64/string/strspn.S @@ -0,0 +1,358 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +ARCHFUNCS(strspn) + ARCHFUNC(strspn, scalar) + NOARCHFUNC + ARCHFUNC(strspn, x86_64_v2) +ENDARCHFUNCS(strspn) + +ARCHENTRY(strspn, scalar) + push %rbp # align stack to enable function call + mov %rsp, %rbp + sub $256, %rsp # allocate space for lookup table + + /* check for special cases */ + movzbl (%rsi), %edx # first character in the set + test %edx, %edx + jz .Lzero # empty set always returns 0 + + movzbl 1(%rsi), %eax # second character in the set + test %eax, %eax + jz .Lsingle + + /* no special case matches -- prepare lookup table */ + xor %r8d, %r8d + mov $28, %ecx +0: mov %r8, (%rsp, %rcx, 8) + mov %r8, 8(%rsp, %rcx, 8) + mov %r8, 16(%rsp, %rcx, 8) + mov %r8, 24(%rsp, %rcx, 8) + sub $4, %ecx + jnc 0b + + movb $1, (%rsp, %rdx, 1) # register first char in set + add $2, %rsi + + /* process remaining chars in set */ + ALIGN_TEXT +0: movb $1, (%rsp, %rax, 1) # register previous char + movzbl (%rsi), %eax # next char in set + test %eax, %eax # end of string? 
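
The scalar strspn path above (and the large-set fallback of the vectorised kernel below) is
the classic table-driven approach: mark every member of the set in a 256-entry table, then
scan the string until a byte that is not marked, which includes the terminating NUL. A compact
C model (the helper name is illustrative):

	#include <stddef.h>

	static size_t
	strspn_table(const char *s, const char *set)
	{
		unsigned char table[256] = { 0 };
		size_t i;

		while (*set != '\0')
			table[(unsigned char)*set++] = 1;	/* mark set members */

		for (i = 0; table[(unsigned char)s[i]] != 0; i++)
			;					/* NUL is never marked */

		return (i);
	}

The assembly additionally special-cases the empty set and the single-character set and unrolls
the scan loop four ways.
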
+ jz 1f + + movb $1, (%rsp, %rax, 1) + add $2, %rsi + movzbl -1(%rsi), %eax + test %eax, %eax + jnz 0b + +1: mov %rdi, %rax # a copy of the source to iterate over + + /* find mismatch */ + ALIGN_TEXT +0: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + jne 0b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret + + /* empty set never matches */ +.Lzero: xor %eax, %eax + leave + ret + + /* find repeated single character */ + ALIGN_TEXT +.Lsingle: + cmpb %dl, (%rdi, %rax, 1) + jne 1f + + cmpb %dl, 1(%rdi, %rax, 1) + jne 2f + + cmpb %dl, 2(%rdi, %rax, 1) + jne 3f + + cmpb %dl, 3(%rdi, %rax, 1) + lea 4(%rax), %rax + je .Lsingle + + sub $3, %rax +3: inc %rax +2: inc %rax +1: leave + ret +ARCHEND(strspn, scalar) + + /* + * This kernel uses pcmpistri to do the heavy lifting. + * We provide three code paths, depending on set size: + * + * 0--16: one pcmpistri per 16 bytes of input + * 17--32: two pcmpistri per 16 bytes of input + * >=33: fall back to look up table + */ +ARCHENTRY(strspn, x86_64_v2) + push %rbp + mov %rsp, %rbp + sub $256, %rsp + + /* find set size and copy up to 32 bytes to (%rsp) */ + mov %esi, %ecx + and $~0xf, %rsi # align set pointer + movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + and $0xf, %ecx # amount of bytes rsi is past alignment + xor %edx, %edx + pcmpeqb %xmm0, %xmm1 # end of string reached? + movdqa %xmm0, 32(%rsp) # transfer head of set to stack + pmovmskb %xmm1, %eax + shr %cl, %eax # clear out junk before string + test %eax, %eax # end of set reached? + jnz 0f + + movdqa 16(%rsi), %xmm0 # second chunk of the set + mov $16, %edx + sub %ecx, %edx # length of set preceding xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 48(%rsp) + movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 32(%rsi), %xmm0 # third chunk + add $16, %edx + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 + movdqa %xmm0, 64(%rsp) + pmovmskb %xmm1, %eax + test %eax, %eax # still not done? + jz .Lgt32v2 + +0: movdqu 32(%rsp, %rcx, 1), %xmm2 # head of set +1: tzcnt %eax, %eax + add %eax, %edx # length of set (excluding NUL byte) + cmp $32, %edx # above 32 bytes? + ja .Lgt32v2 + + /* + * At this point we know that we want to use pcmpistri. + * one last problem obtains: the head of the string is not + * aligned and may cross a cacheline. If this is the case, + * we take the part before the page boundary and repeat the + * last byte to fill up the xmm register. + */ + mov %rdi, %rax # save original string pointer + lea 15(%rdi), %esi # last byte of the head + xor %edi, %esi + test $PAGE_SIZE, %esi # does the head cross a page? + jz 0f + + /* head crosses page: copy to stack to fix up */ + and $~0xf, %rax # align head pointer temporarily + movzbl 15(%rax), %esi # last head byte on the page + movdqa (%rax), %xmm0 + movabs $0x0101010101010101, %r8 + imul %r8, %rsi # repeated 8 times + movdqa %xmm0, (%rsp) # head word on stack + mov %rsi, 16(%rsp) # followed by filler (last byte x8) + mov %rsi, 24(%rsp) + mov %edi, %eax + and $0xf, %eax # offset of head from alignment + add %rsp, %rax # pointer to fake head + +0: movdqu (%rax), %xmm1 # load head (fake or real) + lea 16(%rdi), %rax + and $~0xf, %rax # second 16 bytes of string (aligned) +1: cmp $16, %edx # 16--32 bytes? 
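
For sets of at most 16 bytes, the path that follows needs only one pcmpistri per 16 bytes of
input: with _SIDD_CMP_EQUAL_ANY and _SIDD_NEGATIVE_POLARITY the instruction reports the first
input byte that is not a member of the set, and an input NUL reports as a non-member too, so a
single flag test covers both "span ends" and "string ends". A rough intrinsics model
(alignment and page-boundary handling are omitted, so this sketch over-reads the string and is
for illustration only; the helper name is made up):

	#include <nmmintrin.h>		/* SSE4.2 */
	#include <stddef.h>
	#include <string.h>

	static size_t
	strspn_small(const char *s, const char *set)	/* strlen(set) <= 16 */
	{
		__m128i set16 = _mm_setzero_si128();
		__m128i chunk;
		size_t i;
		int idx;

		memcpy(&set16, set, strlen(set));	/* set, NUL-padded */

		for (i = 0;; i += 16) {
			chunk = _mm_loadu_si128((const __m128i *)(s + i));

			/* index of the first byte of chunk that is not in set16;
			   the terminating NUL also qualifies */
			idx = _mm_cmpistri(set16, chunk,
			    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
			    _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY);
			if (idx < 16)
				return (i + idx);
		}
	}
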
+ ja .Lgt16v2 + + + /* set is 2--16 bytes in size */ + + /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */ + pcmpistri $0x10, %xmm1, %xmm2 # match in head? + jc .Lheadmismatchv2 + + ALIGN_TEXT +0: pcmpistri $0x10, (%rax), %xmm2 + jc 1f # match or end of string? + pcmpistri $0x10, 16(%rax), %xmm2 + lea 32(%rax), %rax + jnc 0b # match or end of string? + + sub $16, %rax # go back to second half +1: sub %rdi, %rax # offset of (%rax) from beginning of string + add %rcx, %rax # prefix length before match/NUL + leave + ret + +.Lheadmismatchv2: + mov %ecx, %eax # prefix length before mismatch/NUL + leave + ret + + /* set is 17--32 bytes in size */ +.Lgt16v2: + movdqu 48(%rsp, %rcx, 1), %xmm3 # second part of set + + /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */ + pcmpistrm $0x10, %xmm1, %xmm2 # any mismatch in first half? + movdqa %xmm0, %xmm4 + pcmpistrm $0x10, %xmm1, %xmm3 # any mismatch in the second half? + ptest %xmm0, %xmm4 # any entry that doesn't match either? + jnz 2f + + ALIGN_TEXT +0: movdqa (%rax), %xmm1 + pcmpistrm $0x10, %xmm1, %xmm2 + movdqa %xmm0, %xmm4 + pcmpistrm $0x10, %xmm1, %xmm3 + ptest %xmm0, %xmm4 + jnz 1f + movdqa 16(%rax), %xmm1 + add $32, %rax + pcmpistrm $0x10, %xmm1, %xmm2 + movdqa %xmm0, %xmm4 + pcmpistrm $0x10, %xmm1, %xmm3 + ptest %xmm0, %xmm4 + jz 0b + + sub $16, %rax +1: pand %xmm4, %xmm0 + movd %xmm0, %ecx + sub %rdi, %rax # offset of %xmm1 from beginning of string + tzcnt %ecx, %ecx + add %rcx, %rax # prefix length before match/NUL + leave + ret + + /* mismatch or string end in head */ +2: pand %xmm4, %xmm0 # bit mask of mismatches (end of string counts) + movd %xmm0, %eax + tzcnt %eax, %eax # prefix length before mismatch/NUL + leave + ret + + /* set is >=33 bytes in size */ +.Lgt32v2: + xorps %xmm0, %xmm0 + mov $256-64, %edx + + /* clear out look up table */ +0: movaps %xmm0, (%rsp, %rdx, 1) + movaps %xmm0, 16(%rsp, %rdx, 1) + movaps %xmm0, 32(%rsp, %rdx, 1) + movaps %xmm0, 48(%rsp, %rdx, 1) + sub $64, %edx + jnc 0b + + add %rcx, %rsi # restore string pointer + mov %rdi, %rax # keep a copy of the string + + /* initialise look up table */ + movzbl (%rsi), %ecx # string is known not to be empty + + ALIGN_TEXT +0: movb $1, (%rsp, %rcx, 1) + movzbl 1(%rsi), %ecx + test %ecx, %ecx + jz 1f + + movb $1, (%rsp, %rcx, 1) + movzbl 2(%rsi), %ecx + test %ecx, %ecx + jz 1f + + movb $1, (%rsp, %rcx, 1) + movzbl 3(%rsi), %ecx + add $4, %rsi + test %ecx, %ecx + jz 1f + + movb $1, (%rsp, %rcx, 1) + movzbl (%rsi), %ecx + test %ecx, %ecx + jnz 0b + + /* find match */ + ALIGN_TEXT +1: movzbl (%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 2f + + movzbl 1(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 3f + + movzbl 2(%rax), %ecx + cmpb $0, (%rsp, %rcx, 1) + je 4f + + movzbl 3(%rax), %ecx + add $4, %rax + cmpb $0, (%rsp, %rcx, 1) + jne 1b + + sub $3, %rax +4: dec %rdi +3: inc %rax +2: sub %rdi, %rax # number of characters preceding match + leave + ret +ARCHEND(strspn, x86_64_v2) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/timingsafe_bcmp.S b/lib/libc/amd64/string/timingsafe_bcmp.S new file mode 100644 index 000000000000..c003da2ea9a7 --- /dev/null +++ b/lib/libc/amd64/string/timingsafe_bcmp.S @@ -0,0 +1,232 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +ARCHFUNCS(timingsafe_bcmp) + ARCHFUNC(timingsafe_bcmp, scalar) + ARCHFUNC(timingsafe_bcmp, baseline) +ENDARCHFUNCS(timingsafe_bcmp) + +ARCHENTRY(timingsafe_bcmp, scalar) + cmp $16, %rdx # at least 17 bytes to process? + ja .Lgt16 + + cmp $8, %edx # at least 9 bytes to process? + ja .L0916 + + cmp $4, %edx # at least 5 bytes to process? + ja .L0508 + + cmp $2, %edx # at least 3 bytes to process? + ja .L0304 + + test %edx, %edx # buffer empty? + jnz .L0102 + + xor %eax, %eax # empty buffer always matches + ret + +.L0102: movzbl (%rdi), %eax # load 1--2 bytes from first buffer + movzbl -1(%rdi, %rdx, 1), %ecx + xor (%rsi), %al # xor in second buffer + xor -1(%rsi, %rdx, 1), %cl + or %ecx, %eax # mismatch in any of the two? + ret + +.L0304: movzwl (%rdi), %eax + movzwl -2(%rdi, %rdx, 1), %ecx + xor (%rsi), %ax + xor -2(%rsi, %rdx, 1), %cx + or %ecx, %eax + ret + +.L0508: mov (%rdi), %eax + mov -4(%rdi, %rdx, 1), %ecx + xor (%rsi), %eax + xor -4(%rsi, %rdx, 1), %ecx + or %ecx, %eax + ret + +.L0916: mov (%rdi), %rax + mov -8(%rdi, %rdx, 1), %rcx + xor (%rsi), %rax + xor -8(%rsi, %rdx, 1), %rcx + or %rcx, %rax + setnz %al # ensure EAX nonzero even if only + ret # high bits of RAX were set + + /* more than 16 bytes: process buffer in a loop */ +.Lgt16: mov (%rdi), %rax # process first 16 bytes + mov 8(%rdi), %r9 + mov $32, %ecx + xor (%rsi), %rax + xor 8(%rsi), %r9 + or %r9, %rax + + cmp %rdx, %rcx # enough left for a full iteration? + jae .Ltail + + /* main loop processing 16 bytes per iteration */ + ALIGN_TEXT +0: mov -16(%rdi, %rcx, 1), %r8 + mov -8(%rdi, %rcx, 1), %r9 + xor -16(%rsi, %rcx, 1), %r8 + xor -8(%rsi, %rcx, 1), %r9 + add $16, %rcx + or %r9, %r8 + or %r8, %rax + + cmp %rdx, %rcx + jb 0b + + /* process last 16 bytes */ +.Ltail: mov -16(%rdi, %rdx, 1), %r8 + mov -8(%rdi, %rdx, 1), %r9 + xor -16(%rsi, %rdx, 1), %r8 + xor -8(%rsi, %rdx, 1), %r9 + or %r9, %r8 + or %r8, %rax + setnz %al + ret +ARCHEND(timingsafe_bcmp, scalar) + +ARCHENTRY(timingsafe_bcmp, baseline) + cmp $32, %rdx # at least 33 bytes to process? + ja .Lgt32b + + cmp $16, %edx # at least 17 bytes to process? 
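
Both the scalar variant above and the SSE variant continuing below follow the same
constant-time discipline: XOR corresponding bytes, OR everything into one accumulator, and
only inspect the result after all len bytes have been processed, so the running time depends
on len alone and never on where (or whether) the buffers differ. The byte-at-a-time equivalent
(a sketch; like timingsafe_bcmp() it only promises zero/nonzero, not an ordering):

	#include <stddef.h>

	static int
	ct_bcmp(const void *a, const void *b, size_t len)
	{
		const unsigned char *p = a, *q = b;
		unsigned char acc = 0;
		size_t i;

		for (i = 0; i < len; i++)
			acc |= p[i] ^ q[i];	/* never branch on the data */

		return (acc != 0);		/* 0 if equal, nonzero otherwise */
	}

The assembly gets its speed from doing this 8 (scalar) or 32 (baseline) bytes at a time and
from overlapping head and tail loads, so lengths that are not a multiple of the chunk size
need no byte-by-byte loop.
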
+ ja .L1732b + + cmp $8, %edx # at least 9 bytes to process? + ja .L0916b + + cmp $4, %edx # at least 5 bytes to process? + ja .L0508b + + cmp $2, %edx # at least 3 bytes to process? + ja .L0304b + + test %edx, %edx # buffer empty? + jnz .L0102b + + xor %eax, %eax # empty buffer always matches + ret + +.L0102b: + movzbl (%rdi), %eax # load 1--2 bytes from first buffer + movzbl -1(%rdi, %rdx, 1), %ecx + xor (%rsi), %al # xor in second buffer + xor -1(%rsi, %rdx, 1), %cl + or %ecx, %eax # mismatch in any of the two? + ret + +.L0304b: + movzwl (%rdi), %eax + movzwl -2(%rdi, %rdx, 1), %ecx + xor (%rsi), %ax + xor -2(%rsi, %rdx, 1), %cx + or %ecx, %eax + ret + +.L0508b: + mov (%rdi), %eax + mov -4(%rdi, %rdx, 1), %ecx + xor (%rsi), %eax + xor -4(%rsi, %rdx, 1), %ecx + or %ecx, %eax + ret + +.L0916b: + mov (%rdi), %rax + mov -8(%rdi, %rdx, 1), %rcx + xor (%rsi), %rax + xor -8(%rsi, %rdx, 1), %rcx + or %rcx, %rax + setnz %al # ensure EAX nonzero even if only + ret # high bits of RAX were set + +.L1732b: + movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm2 + movdqu -16(%rdi, %rdx, 1), %xmm1 + movdqu -16(%rsi, %rdx, 1), %xmm3 + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm0 + pmovmskb %xmm0, %eax # 1 where equal + xor $0xffff, %eax # 1 where not equal + ret + + /* more than 32 bytes: process buffer in a loop */ +.Lgt32b: + movdqu (%rdi), %xmm4 + movdqu (%rsi), %xmm2 + movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm3 + mov $64, %ecx + pcmpeqb %xmm2, %xmm4 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm4 + cmp %rdx, %rcx # enough left for a full iteration? + jae .Ltailb + + /* main loop processing 32 bytes per iteration */ + ALIGN_TEXT +0: movdqu -32(%rdi, %rcx, 1), %xmm0 + movdqu -32(%rsi, %rcx, 1), %xmm2 + movdqu -16(%rdi, %rcx, 1), %xmm1 + movdqu -16(%rsi, %rcx, 1), %xmm3 + add $32, %rcx + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm0 + pand %xmm0, %xmm4 + cmp %rdx, %rcx + jb 0b + + /* process last 32 bytes */ +.Ltailb: + movdqu -32(%rdi, %rdx, 1), %xmm0 + movdqu -32(%rsi, %rdx, 1), %xmm2 + movdqu -16(%rdi, %rdx, 1), %xmm1 + movdqu -16(%rsi, %rdx, 1), %xmm3 + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm3, %xmm1 + pand %xmm1, %xmm0 + pand %xmm4, %xmm0 + pmovmskb %xmm0, %eax + xor $0xffff, %eax + ret +ARCHEND(timingsafe_bcmp, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/timingsafe_memcmp.S b/lib/libc/amd64/string/timingsafe_memcmp.S new file mode 100644 index 000000000000..3f1eccdbd640 --- /dev/null +++ b/lib/libc/amd64/string/timingsafe_memcmp.S @@ -0,0 +1,145 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +/* int timingsafe_memcmp(const void *rdi, const void *rsi, size_t rdx) */ +ENTRY(timingsafe_memcmp) + cmp $16, %rdx # at least 17 bytes to process? + ja .Lgt16 + + cmp $8, %edx # at least 9 bytes to process? + ja .L0916 + + cmp $4, %edx # at least 5 bytes to process? + ja .L0508 + + cmp $2, %edx # at least 3 bytes to process? + ja .L0304 + + test %edx, %edx # buffer empty? + jnz .L0102 + + xor %eax, %eax # empty buffer always matches + ret + +.L0102: movzbl -1(%rdi, %rdx, 1), %eax # load 1--2 bytes from first buffer + movzbl -1(%rsi, %rdx, 1), %ecx + mov (%rdi), %ah # in big endian + mov (%rsi), %ch + sub %ecx, %eax + ret + +.L0304: movzwl -2(%rdi, %rdx, 1), %ecx + movzwl -2(%rsi, %rdx, 1), %edx + movzwl (%rdi), %eax + movzwl (%rsi), %esi + bswap %ecx # convert to big endian + bswap %edx # dito for edx, (e)ax, and (e)si + rol $8, %ax # ROLW is used here so the upper two + rol $8, %si # bytes stay clear, allowing us to + sub %edx, %ecx # save a SBB compared to .L0508 + sbb %esi, %eax + or %eax, %ecx # nonzero if not equal + setnz %al + ret + +.L0508: mov -4(%rdi, %rdx, 1), %ecx + mov -4(%rsi, %rdx, 1), %edx + mov (%rdi), %edi + mov (%rsi), %esi + bswap %ecx # compare in big endian + bswap %edx + bswap %edi + bswap %esi + sub %edx, %ecx + sbb %esi, %edi + sbb %eax, %eax # -1 if less, 0 if greater or equal + or %edi, %ecx # nonzero if not equal + setnz %al # negative if <, 0 if =, 1 if > + ret + +.L0916: mov -8(%rdi, %rdx, 1), %rcx + mov -8(%rsi, %rdx, 1), %rdx + mov (%rdi), %rdi + mov (%rsi), %rsi + bswap %rcx # compare in big endian + bswap %rdx + bswap %rdi + bswap %rsi + sub %rdx, %rcx + sbb %rsi, %rdi + sbb %eax, %eax # -1 if less, 0 if greater or equal + or %rdi, %rcx # nonzero if not equal + setnz %al # negative if <, 0 if =, 1 if > + ret + + /* compare 17+ bytes */ +.Lgt16: mov (%rdi), %r8 # process first 16 bytes + mov (%rsi), %r9 + mov $32, %ecx + cmp %r8, %r9 # mismatch in head? + cmove 8(%rdi), %r8 # if not, try second pair + cmove 8(%rsi), %r9 + cmp %rdx, %rcx + jae .Ltail + + /* main loop processing 16 bytes per iteration */ + ALIGN_TEXT +0: mov -16(%rdi, %rcx, 1), %r10 + mov -16(%rsi, %rcx, 1), %r11 + cmp %r10, %r11 # mismatch in first pair? + cmove -8(%rdi, %rcx, 1), %r10 # if not, try second pair + cmove -8(%rsi, %rcx, 1), %r11 + cmp %r8, %r9 # was there a mismatch previously? 
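
The bswap instructions used throughout this function are what make an arithmetic comparison
meaningful: memcmp() order is the order of the buffers read as big-endian integers, while
amd64 loads are little-endian, so each word is byte-swapped before the sub/sbb chain computes
the sign. A scalar sketch of just that point (it ignores the branch-free selection the kernel
performs with cmov, and the helper name is made up):

	#include <stdint.h>
	#include <string.h>

	/* compare the first 8 bytes of two buffers the way memcmp() would */
	static int
	cmp8(const void *a, const void *b)
	{
		uint64_t x, y;

		memcpy(&x, a, 8);
		memcpy(&y, b, 8);
		x = __builtin_bswap64(x);	/* gcc/clang builtin */
		y = __builtin_bswap64(y);

		return (x < y ? -1 : x > y);
	}
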
+ cmove %r10, %r8 # apply new pair if there was not + cmove %r11, %r9 + add $16, %rcx + cmp %rdx, %rcx + jb 0b + +.Ltail: mov -8(%rdi, %rdx, 1), %r10 + mov -8(%rsi, %rdx, 1), %r11 + cmp %r8, %r9 + cmove -16(%rdi, %rdx, 1), %r8 + cmove -16(%rsi, %rdx, 1), %r9 + bswap %r10 # compare in big endian + bswap %r11 + bswap %r8 + bswap %r9 + sub %r11, %r10 + sbb %r9, %r8 + sbb %eax, %eax # -1 if less, 0 if greater or equal + or %r10, %r8 # nonzero if not equal + setnz %al # negative if <, 0 if =, 1 if > + ret +END(timingsafe_memcmp) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/sys/Makefile.inc b/lib/libc/amd64/sys/Makefile.inc deleted file mode 100644 index aea6755182a0..000000000000 --- a/lib/libc/amd64/sys/Makefile.inc +++ /dev/null @@ -1,13 +0,0 @@ -# from: Makefile.inc,v 1.1 1993/09/03 19:04:23 jtc Exp -# $FreeBSD$ - -SRCS+= \ - amd64_get_fsbase.c \ - amd64_get_gsbase.c \ - amd64_set_fsbase.c \ - amd64_set_gsbase.c - -MDASM= vfork.S cerror.S getcontext.S - -# Don't generate default code for these syscalls: -NOASM+= sbrk.o vfork.o diff --git a/lib/libc/amd64/sys/amd64_get_fsbase.c b/lib/libc/amd64/sys/amd64_get_fsbase.c deleted file mode 100644 index 2de99912daf2..000000000000 --- a/lib/libc/amd64/sys/amd64_get_fsbase.c +++ /dev/null @@ -1,67 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#define IN_RTLD 1 -#include <sys/param.h> -#undef IN_RTLD -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" - -static int -amd64_get_fsbase_cpu(void **addr) -{ - - *addr = (void *)rdfsbase(); - return (0); -} - -static int -amd64_get_fsbase_syscall(void **addr) -{ - - return (sysarch(AMD64_GET_FSBASE, addr)); -} - -DEFINE_UIFUNC(, int, amd64_get_fsbase, (void **)) -{ - - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_get_fsbase_cpu); - return (amd64_get_fsbase_syscall); -} diff --git a/lib/libc/amd64/sys/amd64_get_gsbase.c b/lib/libc/amd64/sys/amd64_get_gsbase.c deleted file mode 100644 index 0deac34c90d1..000000000000 --- a/lib/libc/amd64/sys/amd64_get_gsbase.c +++ /dev/null @@ -1,67 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2003 Peter Wemm - * Copyright (c) 2017, 2018 The FreeBSD Foundation - * All rights reserved. - * - * Portions of this software were developed by Konstantin Belousov - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#define IN_RTLD 1 -#include <sys/param.h> -#undef IN_RTLD -#include <machine/cpufunc.h> -#include <machine/specialreg.h> -#include <machine/sysarch.h> -#include <x86/ifunc.h> -#include "libc_private.h" - -static int -amd64_get_gsbase_cpu(void **addr) -{ - - *addr = (void *)rdgsbase(); - return (0); -} - -static int -amd64_get_gsbase_syscall(void **addr) -{ - - return (sysarch(AMD64_GET_GSBASE, addr)); -} - -DEFINE_UIFUNC(, int, amd64_get_gsbase, (void **)) -{ - - if (__getosreldate() >= P_OSREL_WRFSBASE && - (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) - return (amd64_get_gsbase_cpu); - return (amd64_get_gsbase_syscall); -} diff --git a/lib/libc/amd64/sys/cerror.S b/lib/libc/amd64/sys/cerror.S deleted file mode 100644 index 1928acd0b7a9..000000000000 --- a/lib/libc/amd64/sys/cerror.S +++ /dev/null @@ -1,60 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. 
- * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(SYSLIBC_SCCS) && !defined(lint) - .asciz "@(#)cerror.s 5.1 (Berkeley) 4/23/90" -#endif /* SYSLIBC_SCCS and not lint */ -#include <machine/asm.h> -__FBSDID("$FreeBSD$"); - -#include "SYS.h" - - .globl HIDENAME(cerror) - .hidden HIDENAME(cerror) - - /* - * The __error() function is thread aware. For non-threaded - * programs and the initial thread in threaded programs, - * it returns a pointer to the global errno variable. - */ - .globl CNAME(__error) - .type CNAME(__error),@function -HIDENAME(cerror): - pushq %rax - call PIC_PLT(CNAME(__error)) - popq %rcx - movl %ecx,(%rax) - movq $-1,%rax - movq $-1,%rdx - ret - - .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/sys/vfork.S b/lib/libc/amd64/sys/vfork.S deleted file mode 100644 index 550f14c3b60b..000000000000 --- a/lib/libc/amd64/sys/vfork.S +++ /dev/null @@ -1,54 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(SYSLIBC_SCCS) && !defined(lint) - .asciz "@(#)Ovfork.s 5.1 (Berkeley) 4/23/90" -#endif /* SYSLIBC_SCCS and not lint */ -#include <machine/asm.h> -__FBSDID("$FreeBSD$"); - -#include "SYS.h" - - WEAK_REFERENCE(__sys_vfork, _vfork) - WEAK_REFERENCE(__sys_vfork, vfork) -ENTRY(__sys_vfork) - popq %rsi /* fetch return address (%rsi preserved) */ - mov $SYS_vfork,%rax - KERNCALL - jb 1f - jmp *%rsi -1: - pushq %rsi - jmp HIDENAME(cerror) -END(__sys_vfork) - - .section .note.GNU-stack,"",%progbits |