aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDmitry Chagin <dchagin@FreeBSD.org>2022-06-17 19:33:07 +0000
committerDmitry Chagin <dchagin@FreeBSD.org>2022-06-17 19:33:07 +0000
commita340b5b4bd4814ad2010c5e7bfaa51082427c4ae (patch)
tree8c9a413c2df5d174e0d27bcae2cb9792104ba1f3
parent54689a282aee8075063228881ee577de181967b6 (diff)
downloadsrc-a340b5b4bd48.tar.gz
src-a340b5b4bd48.zip
linux(4): Almost complete the vDSO.
The vDSO (virtual dynamic shared object) is a small shared library that the kernel maps R/O into the address space of all Linux processes on image activation. The vDSO is a fully formed ELF image, is shared by all processes with the same ABI, and has no process-private data. The primary purposes of the vDSO are: - non-executable stack: signal trampolines are not copied to the stack; - signal trampoline unwinding, mandatory for the NPTL; - to avoid context-switch overhead, frequently used system calls can be implemented in the vDSO: for now gettimeofday, clock_gettime. The first two have already been implemented, so add the implementation of the system calls. The system call implementation is based on the native timekeeping code, with some limitations: - ifunc can't be used, as the vDSO is mapped r/o into the process VA and rtld can't relocate symbols; - reading HPET memory is not implemented for now (TODO). In case of any error the vDSO system calls fall back to the kernel system calls. For unimplemented vDSO system calls, prototypes were added which call the corresponding kernel system call. Relnotes: yes Tested by: trasz (arm64) Differential revision: https://reviews.freebsd.org/D30900 MFC after: 2 weeks (cherry picked from commit 9931033bbfbe56a037723638cf3712366c6d943f)
-rw-r--r--sys/amd64/linux/linux_locore.asm2
-rw-r--r--sys/amd64/linux/linux_sysvec.c161
-rw-r--r--sys/amd64/linux/linux_vdso.lds.s12
-rw-r--r--sys/amd64/linux/linux_vdso_gtod.c146
-rw-r--r--sys/amd64/linux32/linux32_locore.asm6
-rw-r--r--sys/amd64/linux32/linux32_sysvec.c158
-rw-r--r--sys/amd64/linux32/linux32_vdso.lds.s24
-rw-r--r--sys/amd64/linux32/linux32_vdso_gtod.c146
-rw-r--r--sys/arm64/linux/linux_sysvec.c153
-rw-r--r--sys/arm64/linux/linux_vdso.lds.s65
-rw-r--r--sys/arm64/linux/linux_vdso_gtod.c153
-rw-r--r--sys/compat/linux/linux_vdso.c211
-rw-r--r--sys/compat/linux/linux_vdso.h10
-rw-r--r--sys/compat/linux/linux_vdso_gtod.inc337
-rw-r--r--sys/i386/linux/linux.h3
-rw-r--r--sys/i386/linux/linux_locore.asm6
-rw-r--r--sys/i386/linux/linux_sysvec.c161
-rw-r--r--sys/i386/linux/linux_vdso.lds.s25
-rw-r--r--sys/i386/linux/linux_vdso_gtod.c145
-rw-r--r--sys/modules/linux/Makefile60
-rw-r--r--sys/modules/linux64/Makefile49
-rw-r--r--sys/x86/linux/linux_vdso_gettc_x86.inc164
-rw-r--r--sys/x86/linux/linux_vdso_tsc_selector_x86.c57
-rw-r--r--sys/x86/linux/linux_x86.h33
24 files changed, 1955 insertions, 332 deletions
diff --git a/sys/amd64/linux/linux_locore.asm b/sys/amd64/linux/linux_locore.asm
index 4ac44c35274b..8f7431d42737 100644
--- a/sys/amd64/linux/linux_locore.asm
+++ b/sys/amd64/linux/linux_locore.asm
@@ -17,7 +17,7 @@ linux_platform:
* To avoid excess stack frame the signal trampoline code emulates
* the 'call' instruction.
*/
-NON_GPROF_ENTRY(linux_rt_sigcode)
+ENTRY(linux_rt_sigcode)
movq %rsp, %rbx /* preserve sigframe */
call .getip
.getip:
diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c
index bcc8cbf0b0bd..f13526b00d85 100644
--- a/sys/amd64/linux/linux_sysvec.c
+++ b/sys/amd64/linux/linux_sysvec.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/stddef.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
@@ -72,6 +73,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/trap.h>
+#include <x86/linux/linux_x86.h>
#include <amd64/linux/linux.h>
#include <amd64/linux/linux_proto.h>
#include <compat/linux/linux_emul.h>
@@ -85,11 +87,24 @@ __FBSDID("$FreeBSD$");
MODULE_VERSION(linux64, 1);
+#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2
+#define LINUX_VDSOPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - \
+ LINUX_VDSOPAGE_SIZE)
+#define LINUX_SHAREDPAGE_LA48 (LINUX_VDSOPAGE_LA48 - PAGE_SIZE)
+ /*
+ * PAGE_SIZE - the size
+ * of the native SHAREDPAGE
+ */
+#define LINUX_USRSTACK_LA48 LINUX_SHAREDPAGE_LA48
+#define LINUX_PS_STRINGS_LA48 (LINUX_USRSTACK_LA48 - \
+ sizeof(struct ps_strings))
+
static int linux_szsigcode;
-static vm_object_t linux_shared_page_obj;
-static char *linux_shared_page_mapping;
-extern char _binary_linux_locore_o_start;
-extern char _binary_linux_locore_o_end;
+static vm_object_t linux_vdso_obj;
+static char *linux_vdso_mapping;
+extern char _binary_linux_vdso_so_o_start;
+extern char _binary_linux_vdso_so_o_end;
+static vm_offset_t linux_vdso_base;
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
@@ -102,10 +117,12 @@ static int linux_fixup_elf(uintptr_t *stack_base,
static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
static void linux_vdso_install(void *param);
static void linux_vdso_deinstall(void *param);
+static void linux_vdso_reloc(char *mapping, Elf_Addr offset);
static void linux_set_syscall_retval(struct thread *td, int error);
static int linux_fetch_syscall_args(struct thread *td);
static void linux_exec_setregs(struct thread *td, struct image_params *imgp,
uintptr_t stack);
+static void linux_exec_sysvec_init(void *param);
static int linux_on_exec_vmspace(struct proc *p,
struct image_params *imgp);
static int linux_vsyscall(struct thread *td);
@@ -151,6 +168,8 @@ static int _bsd_to_linux_trapcode[] = {
LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
LINUX_VDSO_SYM_CHAR(linux_platform);
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
+LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
/*
* If FreeBSD & Linux have a difference of opinion about what a trap
@@ -263,8 +282,7 @@ linux_copyout_auxargs(struct image_params *imgp, uintptr_t base)
M_WAITOK | M_ZERO);
issetugid = p->p_flag & P_SUGID ? 1 : 0;
- AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
- imgp->proc->p_sysent->sv_shared_page_base);
+ AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz);
@@ -732,7 +750,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_transtrap = linux_translate_traps,
.sv_fixup = linux_fixup_elf,
.sv_sendsig = linux_rt_sendsig,
- .sv_sigcode = &_binary_linux_locore_o_start,
+ .sv_sigcode = &_binary_linux_vdso_so_o_start,
.sv_szsigcode = &linux_szsigcode,
.sv_name = "Linux ELF64",
.sv_coredump = elf64_coredump,
@@ -743,8 +761,8 @@ struct sysentvec elf_linux_sysvec = {
.sv_minsigstksz = LINUX_MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
.sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
- .sv_usrstack = USRSTACK_LA48,
- .sv_psstrings = PS_STRINGS_LA48,
+ .sv_usrstack = LINUX_USRSTACK_LA48,
+ .sv_psstrings = LINUX_PS_STRINGS_LA48,
.sv_psstringssz = sizeof(struct ps_strings),
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = linux_copyout_auxargs,
@@ -753,11 +771,11 @@ struct sysentvec elf_linux_sysvec = {
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN |
- SV_SIG_WAITNDQ,
+ SV_SIG_WAITNDQ | SV_TIMEKEEP,
.sv_set_syscall_retval = linux_set_syscall_retval,
.sv_fetch_syscall_args = linux_fetch_syscall_args,
.sv_syscallnames = NULL,
- .sv_shared_page_base = SHAREDPAGE_LA48,
+ .sv_shared_page_base = LINUX_SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = linux_schedtail,
.sv_thread_detach = linux_thread_detach,
@@ -771,47 +789,130 @@ struct sysentvec elf_linux_sysvec = {
static int
linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
{
+ int error;
- linux_on_exec(p, imgp);
- return (0);
+ error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
+ LINUX_VDSOPAGE_SIZE, imgp);
+ if (error == 0)
+ linux_on_exec(p, imgp);
+ return (error);
}
static void
-linux_vdso_install(void *param)
+linux_exec_sysvec_init(void *param)
{
+ l_uintptr_t *ktimekeep_base, *ktsc_selector;
+ struct sysentvec *sv;
+ ptrdiff_t tkoff;
+
+ sv = param;
+ amd64_lower_shared_page(sv);
+ /* Fill timekeep_base */
+ exec_sysvec_init(sv);
+
+ tkoff = kern_timekeep_base - linux_vdso_base;
+ ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktimekeep_base = sv->sv_timekeep_base;
+
+ tkoff = kern_tsc_selector - linux_vdso_base;
+ ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktsc_selector = linux_vdso_tsc_selector_idx();
+ if (bootverbose)
+ printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector);
+}
+SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY,
+ linux_exec_sysvec_init, &elf_linux_sysvec);
- amd64_lower_shared_page(&elf_linux_sysvec);
-
- linux_szsigcode = (&_binary_linux_locore_o_end -
- &_binary_linux_locore_o_start);
+static void
+linux_vdso_install(void *param)
+{
+ char *vdso_start = &_binary_linux_vdso_so_o_start;
+ char *vdso_end = &_binary_linux_vdso_so_o_end;
- if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
- panic("Linux invalid vdso size\n");
+ linux_szsigcode = vdso_end - vdso_start;
+ MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE);
- __elfN(linux_vdso_fixup)(&elf_linux_sysvec);
+ linux_vdso_base = LINUX_VDSOPAGE_LA48;
+ if (hw_lower_amd64_sharedpage != 0)
+ linux_vdso_base -= PAGE_SIZE;
- linux_shared_page_obj = __elfN(linux_shared_page_init)
- (&linux_shared_page_mapping);
+ __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
- __elfN(linux_vdso_reloc)(&elf_linux_sysvec);
+ linux_vdso_obj = __elfN(linux_shared_page_init)
+ (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
+ bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
- bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
- linux_szsigcode);
- elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
+ linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
}
-SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_install, NULL);
static void
linux_vdso_deinstall(void *param)
{
- __elfN(linux_shared_page_fini)(linux_shared_page_obj,
- linux_shared_page_mapping);
+ __elfN(linux_shared_page_fini)(linux_vdso_obj,
+ linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
}
SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_deinstall, NULL);
+static void
+linux_vdso_reloc(char *mapping, Elf_Addr offset)
+{
+ const Elf_Ehdr *ehdr;
+ const Elf_Shdr *shdr;
+ Elf64_Addr *where, val;
+ Elf_Size rtype, symidx;
+ const Elf_Rela *rela;
+ Elf_Addr addr, addend;
+ int relacnt;
+ int i, j;
+
+ MPASS(offset != 0);
+
+ relacnt = 0;
+ ehdr = (const Elf_Ehdr *)mapping;
+ shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
+ for (i = 0; i < ehdr->e_shnum; i++)
+ {
+ switch (shdr[i].sh_type) {
+ case SHT_REL:
+ printf("Linux x86_64 vDSO: unexpected Rel section\n");
+ break;
+ case SHT_RELA:
+ rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset);
+ relacnt = shdr[i].sh_size / sizeof(*rela);
+ }
+ }
+
+ for (j = 0; j < relacnt; j++, rela++) {
+ where = (Elf_Addr *)(mapping + rela->r_offset);
+ addend = rela->r_addend;
+ rtype = ELF_R_TYPE(rela->r_info);
+ symidx = ELF_R_SYM(rela->r_info);
+
+ switch (rtype) {
+ case R_X86_64_NONE: /* none */
+ break;
+
+ case R_X86_64_RELATIVE: /* B + A */
+ addr = (Elf_Addr)(offset + addend);
+ val = addr;
+ if (*where != val)
+ *where = val;
+ break;
+ case R_X86_64_IRELATIVE:
+ printf("Linux x86_64 vDSO: unexpected ifunc relocation, "
+ "symbol index %ld\n", symidx);
+ break;
+ default:
+ printf("Linux x86_64 vDSO: unexpected relocation type %ld, "
+ "symbol index %ld\n", rtype, symidx);
+ }
+ }
+}
+
static char GNULINUX_ABI_VENDOR[] = "GNU";
static int GNULINUX_ABI_DESC = 0;
diff --git a/sys/amd64/linux/linux_vdso.lds.s b/sys/amd64/linux/linux_vdso.lds.s
index 94f0266095fb..ccf7c80565bb 100644
--- a/sys/amd64/linux/linux_vdso.lds.s
+++ b/sys/amd64/linux/linux_vdso.lds.s
@@ -54,16 +54,20 @@ VERSION
{
LINUX_2.6 {
global:
- time;
__vdso_time;
- gettimeofday;
__vdso_gettimeofday;
- getcpu;
__vdso_getcpu;
- clock_gettime;
__vdso_clock_gettime;
+ __vdso_clock_getres;
+ local: *;
+ };
+
+ LINUX_0.0 {
+ global:
linux_rt_sigcode;
linux_platform;
+ kern_timekeep_base;
+ kern_tsc_selector;
local: *;
};
}
diff --git a/sys/amd64/linux/linux_vdso_gtod.c b/sys/amd64/linux/linux_vdso_gtod.c
new file mode 100644
index 000000000000..ad23dc33575a
--- /dev/null
+++ b/sys/amd64/linux/linux_vdso_gtod.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/elf.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#define _KERNEL
+#include <sys/vdso.h>
+#undef _KERNEL
+#include <stdbool.h>
+#include <strings.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <amd64/linux/linux.h>
+#include <amd64/linux/linux_syscall.h>
+#include <compat/linux/linux_errno.h>
+#include <compat/linux/linux_timer.h>
+
+/* The kernel fixes these up at vDSO install */
+uintptr_t *kern_timekeep_base = NULL;
+uint32_t kern_tsc_selector = 0;
+
+#include <x86/linux/linux_vdso_gettc_x86.inc>
+
+/* for debugging purposes */
+static int
+write(int fd, const void *buf, size_t size)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "syscall"
+ : "=a"(res)
+ : "a"(LINUX_SYS_write), "D"(fd), "S"(buf), "d"(size)
+ : "cc", "rcx", "r11", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "syscall"
+ : "=a"(res)
+ : "a"(LINUX_SYS_linux_clock_gettime), "D"(clock_id), "S"(ts)
+ : "cc", "rcx", "r11", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "syscall"
+ : "=a"(res)
+ : "a"(LINUX_SYS_gettimeofday), "D"(tv), "S"(tz)
+ : "cc", "rcx", "r11", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "syscall"
+ : "=a"(res)
+ : "a"(LINUX_SYS_linux_clock_getres), "D"(clock_id), "S"(ts)
+ : "cc", "rcx", "r11", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_getcpu_fallback(uint32_t *cpu, uint32_t *node, void *cache)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "syscall"
+ : "=a"(res)
+ : "a"(LINUX_SYS_linux_getcpu), "D"(cpu), "S"(node), "d"(cache)
+ : "cc", "rcx", "r11", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_time_fallback(long *tm)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "syscall"
+ : "=a"(res)
+ : "a"(LINUX_SYS_linux_time), "D"(tm)
+ : "cc", "rcx", "r11", "memory"
+ );
+ return (res);
+}
+
+#include <compat/linux/linux_vdso_gtod.inc>
diff --git a/sys/amd64/linux32/linux32_locore.asm b/sys/amd64/linux32/linux32_locore.asm
index 5862f0a0d674..f96b3e730f9f 100644
--- a/sys/amd64/linux32/linux32_locore.asm
+++ b/sys/amd64/linux32/linux32_locore.asm
@@ -18,7 +18,7 @@ linux_platform:
* To avoid excess stack frame the signal trampoline code emulates
* the 'call' instruction.
*/
-NON_GPROF_ENTRY(linux32_sigcode)
+ENTRY(__kernel_sigreturn)
movl %esp, %ebx /* preserve sigframe */
call .getip0
.getip0:
@@ -33,7 +33,7 @@ NON_GPROF_ENTRY(linux32_sigcode)
.endsigcode:
0: jmp 0b
-NON_GPROF_ENTRY(linux32_rt_sigcode)
+ENTRY(__kernel_rt_sigreturn)
leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */
leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */
movl %esp, %edi
@@ -49,7 +49,7 @@ NON_GPROF_ENTRY(linux32_rt_sigcode)
.endrtsigcode:
0: jmp 0b
-NON_GPROF_ENTRY(linux32_vsyscall)
+ENTRY(__kernel_vsyscall)
.startvsyscall:
int $0x80
ret
diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c
index 2a3fde78852d..10f616c56510 100644
--- a/sys/amd64/linux32/linux32_sysvec.c
+++ b/sys/amd64/linux32/linux32_sysvec.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/stddef.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
@@ -78,6 +79,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>
#include <machine/trap.h>
+#include <x86/linux/linux_x86.h>
#include <amd64/linux32/linux.h>
#include <amd64/linux32/linux32_proto.h>
#include <compat/linux/linux_emul.h>
@@ -91,14 +93,21 @@ __FBSDID("$FreeBSD$");
MODULE_VERSION(linux, 1);
#define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE)
-#define LINUX32_SHAREDPAGE (LINUX32_MAXUSER - PAGE_SIZE)
+#define LINUX32_VDSOPAGE_SIZE PAGE_SIZE * 2
+#define LINUX32_VDSOPAGE (LINUX32_MAXUSER - LINUX32_VDSOPAGE_SIZE)
+#define LINUX32_SHAREDPAGE (LINUX32_VDSOPAGE - PAGE_SIZE)
+ /*
+ * PAGE_SIZE - the size
+ * of the native SHAREDPAGE
+ */
#define LINUX32_USRSTACK LINUX32_SHAREDPAGE
static int linux_szsigcode;
-static vm_object_t linux_shared_page_obj;
-static char *linux_shared_page_mapping;
-extern char _binary_linux32_locore_o_start;
-extern char _binary_linux32_locore_o_end;
+static vm_object_t linux_vdso_obj;
+static char *linux_vdso_mapping;
+extern char _binary_linux32_vdso_so_o_start;
+extern char _binary_linux32_vdso_so_o_end;
+static vm_offset_t linux_vdso_base;
extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL];
@@ -111,12 +120,14 @@ static int linux_copyout_strings(struct image_params *imgp,
static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void linux_exec_setregs(struct thread *td,
struct image_params *imgp, uintptr_t stack);
+static void linux_exec_sysvec_init(void *param);
static int linux_on_exec_vmspace(struct proc *p,
struct image_params *imgp);
static void linux32_fixlimit(struct rlimit *rl, int which);
static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
static void linux_vdso_install(void *param);
static void linux_vdso_deinstall(void *param);
+static void linux_vdso_reloc(char *mapping, Elf_Addr offset);
static void linux32_set_syscall_retval(struct thread *td, int error);
#define LINUX_T_UNKNOWN 255
@@ -167,9 +178,11 @@ struct linux32_ps_strings {
#define LINUX32_PS_STRINGS (LINUX32_USRSTACK - \
sizeof(struct linux32_ps_strings))
-LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
-LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
-LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
+LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall);
+LINUX_VDSO_SYM_INTPTR(__kernel_sigreturn);
+LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn);
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
+LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
LINUX_VDSO_SYM_CHAR(linux_platform);
/*
@@ -206,9 +219,8 @@ linux_copyout_auxargs(struct image_params *imgp, uintptr_t base)
M_WAITOK | M_ZERO);
issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0;
- AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
- AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
- imgp->proc->p_sysent->sv_shared_page_base);
+ AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall);
+ AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
@@ -354,7 +366,7 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
/* Build context to run handler in. */
regs->tf_rsp = PTROUT(fp);
- regs->tf_rip = linux32_rt_sigcode;
+ regs->tf_rip = __kernel_rt_sigreturn;
regs->tf_rflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucode32sel;
regs->tf_ss = _udatasel;
@@ -460,7 +472,7 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
/* Build context to run handler in. */
regs->tf_rsp = PTROUT(fp);
- regs->tf_rip = linux32_sigcode;
+ regs->tf_rip = __kernel_sigreturn;
regs->tf_rflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucode32sel;
regs->tf_ss = _udatasel;
@@ -901,7 +913,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_transtrap = linux_translate_traps,
.sv_fixup = linux_fixup_elf,
.sv_sendsig = linux_sendsig,
- .sv_sigcode = &_binary_linux32_locore_o_start,
+ .sv_sigcode = &_binary_linux32_vdso_so_o_start,
.sv_szsigcode = &linux_szsigcode,
.sv_name = "Linux ELF32",
.sv_coredump = elf32_coredump,
@@ -922,7 +934,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_fixlimit = linux32_fixlimit,
.sv_maxssiz = &linux32_maxssiz,
.sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP |
- SV_SIG_DISCIGN | SV_SIG_WAITNDQ,
+ SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP,
.sv_set_syscall_retval = linux32_set_syscall_retval,
.sv_fetch_syscall_args = linux32_fetch_syscall_args,
.sv_syscallnames = NULL,
@@ -940,45 +952,127 @@ struct sysentvec elf_linux_sysvec = {
static int
linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
{
+ int error;
- linux_on_exec(p, imgp);
- return (0);
+ error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
+ LINUX32_VDSOPAGE_SIZE, imgp);
+ if (error == 0)
+ linux_on_exec(p, imgp);
+ return (error);
}
static void
-linux_vdso_install(void *param)
+linux_exec_sysvec_init(void *param)
{
+ l_uintptr_t *ktimekeep_base, *ktsc_selector;
+ struct sysentvec *sv;
+ ptrdiff_t tkoff;
+
+ sv = param;
+ /* Fill timekeep_base */
+ exec_sysvec_init(sv);
+
+ tkoff = kern_timekeep_base - linux_vdso_base;
+ ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktimekeep_base = sv->sv_timekeep_base;
+
+ tkoff = kern_tsc_selector - linux_vdso_base;
+ ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktsc_selector = linux_vdso_tsc_selector_idx();
+ if (bootverbose)
+ printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector);
+}
+SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY,
+ linux_exec_sysvec_init, &elf_linux_sysvec);
- linux_szsigcode = (&_binary_linux32_locore_o_end -
- &_binary_linux32_locore_o_start);
+static void
+linux_vdso_install(void *param)
+{
+ char *vdso_start = &_binary_linux32_vdso_so_o_start;
+ char *vdso_end = &_binary_linux32_vdso_so_o_end;
- if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
- panic("Linux invalid vdso size\n");
+ linux_szsigcode = vdso_end - vdso_start;
+ MPASS(linux_szsigcode <= LINUX32_VDSOPAGE_SIZE);
- __elfN(linux_vdso_fixup)(&elf_linux_sysvec);
+ linux_vdso_base = LINUX32_VDSOPAGE;
- linux_shared_page_obj = __elfN(linux_shared_page_init)
- (&linux_shared_page_mapping);
+ __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
- __elfN(linux_vdso_reloc)(&elf_linux_sysvec);
+ linux_vdso_obj = __elfN(linux_shared_page_init)
+ (&linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE);
+ bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
- bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
- linux_szsigcode);
- elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
+ linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
}
-SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_install, NULL);
static void
linux_vdso_deinstall(void *param)
{
- __elfN(linux_shared_page_fini)(linux_shared_page_obj,
- linux_shared_page_mapping);
+ __elfN(linux_shared_page_fini)(linux_vdso_obj,
+ linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE);
}
SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_deinstall, NULL);
+static void
+linux_vdso_reloc(char *mapping, Elf_Addr offset)
+{
+ const Elf_Shdr *shdr;
+ const Elf_Rel *rel;
+ const Elf_Ehdr *ehdr;
+ Elf32_Addr *where;
+ Elf_Size rtype, symidx;
+ Elf32_Addr addr, addend;
+ int i, relcnt;
+
+ MPASS(offset != 0);
+
+ relcnt = 0;
+ ehdr = (const Elf_Ehdr *)mapping;
+ shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
+ for (i = 0; i < ehdr->e_shnum; i++)
+ {
+ switch (shdr[i].sh_type) {
+ case SHT_REL:
+ rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset);
+ relcnt = shdr[i].sh_size / sizeof(*rel);
+ break;
+ case SHT_RELA:
+ printf("Linux i386 vDSO: unexpected Rela section\n");
+ break;
+ }
+ }
+
+ for (i = 0; i < relcnt; i++, rel++) {
+ where = (Elf32_Addr *)(mapping + rel->r_offset);
+ addend = *where;
+ rtype = ELF_R_TYPE(rel->r_info);
+ symidx = ELF_R_SYM(rel->r_info);
+
+ switch (rtype) {
+ case R_386_NONE: /* none */
+ break;
+
+ case R_386_RELATIVE: /* B + A */
+ addr = (Elf32_Addr)PTROUT(offset + addend);
+ if (*where != addr)
+ *where = addr;
+ break;
+
+ case R_386_IRELATIVE:
+ printf("Linux i386 vDSO: unexpected ifunc relocation, "
+ "symbol index %ld\n", (intmax_t)symidx);
+ break;
+ default:
+ printf("Linux i386 vDSO: unexpected relocation type %ld, "
+ "symbol index %ld\n", (intmax_t)rtype, (intmax_t)symidx);
+ }
+ }
+}
+
static char GNU_ABI_VENDOR[] = "GNU";
static int GNULINUX_ABI_DESC = 0;
diff --git a/sys/amd64/linux32/linux32_vdso.lds.s b/sys/amd64/linux32/linux32_vdso.lds.s
index a49c209a1ebc..0a392e6380b6 100644
--- a/sys/amd64/linux32/linux32_vdso.lds.s
+++ b/sys/amd64/linux32/linux32_vdso.lds.s
@@ -51,16 +51,30 @@ PHDRS
eh_frame_hdr PT_GNU_EH_FRAME;
}
-ENTRY(linux32_vsyscall);
-
VERSION
{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
+ };
+
LINUX_2.5 {
global:
- linux32_vsyscall;
- linux32_sigcode;
- linux32_rt_sigcode;
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+ local: *;
+ };
+
+ LINUX_0.0 {
+ global:
linux_platform;
+ kern_timekeep_base;
+ kern_tsc_selector;
local: *;
};
}
diff --git a/sys/amd64/linux32/linux32_vdso_gtod.c b/sys/amd64/linux32/linux32_vdso_gtod.c
new file mode 100644
index 000000000000..f1573ca3c1b1
--- /dev/null
+++ b/sys/amd64/linux32/linux32_vdso_gtod.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/elf.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#define _KERNEL
+#include <sys/vdso.h>
+#undef _KERNEL
+#include <stdbool.h>
+#include <strings.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_syscall.h>
+#include <compat/linux/linux_errno.h>
+#include <compat/linux/linux_timer.h>
+
+/* The kernel fixes these up at vDSO install */
+uintptr_t *kern_timekeep_base = NULL;
+uint32_t kern_tsc_selector = 0;
+
+#include <x86/linux/linux_vdso_gettc_x86.inc>
+
+static int
+write(int fd, const void *buf, size_t size)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "int $0x80"
+ : "=a"(res)
+ : "a"(LINUX32_SYS_write), "b"(fd), "c"(buf), "d"(size)
+ : "cc", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "int $0x80"
+ : "=a"(res)
+ : "a"(LINUX32_SYS_linux_clock_gettime), "b"(clock_id), "c"(ts)
+ : "cc", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "int $0x80"
+ : "=a"(res)
+ : "a"(LINUX32_SYS_linux_clock_gettime64), "b"(clock_id), "c"(ts)
+ : "cc", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "int $0x80"
+ : "=a"(res)
+ : "a"(LINUX32_SYS_linux_gettimeofday), "b"(tv), "c"(tz)
+ : "cc", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "int $0x80"
+ : "=a"(res)
+ : "a"(LINUX32_SYS_linux_clock_getres), "b"(clock_id), "c"(ts)
+ : "cc", "memory"
+ );
+ return (res);
+}
+
+static int
+__vdso_time_fallback(long *tm)
+{
+ int res;
+
+ __asm__ __volatile__
+ (
+ "int $0x80"
+ : "=a"(res)
+ : "a"(LINUX32_SYS_linux_time), "b"(tm)
+ : "cc", "memory"
+ );
+ return (res);
+}
+
+#include <compat/linux/linux_vdso_gtod.inc>
diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c
index 44f4ffab5286..48d628d365d8 100644
--- a/sys/arm64/linux/linux_sysvec.c
+++ b/sys/arm64/linux/linux_sysvec.c
@@ -41,14 +41,18 @@ __FBSDID("$FreeBSD$");
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/stddef.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <vm/vm.h>
#include <vm/pmap.h>
-#include <vm/vm_param.h>
#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
#include <arm64/linux/linux.h>
#include <arm64/linux/linux_proto.h>
@@ -68,11 +72,24 @@ __FBSDID("$FreeBSD$");
MODULE_VERSION(linux64elf, 1);
+#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2
+#define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - \
+ LINUX_VDSOPAGE_SIZE)
+#define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE)
+ /*
+ * PAGE_SIZE - the size
+ * of the native SHAREDPAGE
+ */
+#define LINUX_USRSTACK LINUX_SHAREDPAGE
+#define LINUX_PS_STRINGS (LINUX_USRSTACK - \
+ sizeof(struct ps_strings))
+
static int linux_szsigcode;
-static vm_object_t linux_shared_page_obj;
-static char *linux_shared_page_mapping;
-extern char _binary_linux_locore_o_start;
-extern char _binary_linux_locore_o_end;
+static vm_object_t linux_vdso_obj;
+static char *linux_vdso_mapping;
+extern char _binary_linux_vdso_so_o_start;
+extern char _binary_linux_vdso_so_o_end;
+static vm_offset_t linux_vdso_base;
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
@@ -85,10 +102,12 @@ static int linux_elf_fixup(uintptr_t *stack_base,
static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
static void linux_vdso_install(const void *param);
static void linux_vdso_deinstall(const void *param);
+static void linux_vdso_reloc(char *mapping, Elf_Addr offset);
static void linux_set_syscall_retval(struct thread *td, int error);
static int linux_fetch_syscall_args(struct thread *td);
static void linux_exec_setregs(struct thread *td, struct image_params *imgp,
uintptr_t stack);
+static void linux_exec_sysvec_init(void *param);
static int linux_on_exec_vmspace(struct proc *p,
struct image_params *imgp);
@@ -105,6 +124,10 @@ LIN_SDT_PROBE_DEFINE0(sysvec, linux_rt_sendsig, todo);
LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_install, todo);
LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_deinstall, todo);
+LINUX_VDSO_SYM_CHAR(linux_platform);
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
+LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn);
+
/* LINUXTODO: do we have traps to translate? */
static int
linux_translate_traps(int signal, int trap_code)
@@ -114,8 +137,6 @@ linux_translate_traps(int signal, int trap_code)
return (signal);
}
-LINUX_VDSO_SYM_CHAR(linux_platform);
-
static int
linux_fetch_syscall_args(struct thread *td)
{
@@ -171,8 +192,7 @@ linux_copyout_auxargs(struct image_params *imgp, uintptr_t base)
M_WAITOK | M_ZERO);
issetugid = p->p_flag & P_SUGID ? 1 : 0;
- AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
- imgp->proc->p_sysent->sv_shared_page_base);
+ AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, *imgp->sysent->sv_hwcap);
AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz);
@@ -401,7 +421,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_transtrap = linux_translate_traps,
.sv_fixup = linux_elf_fixup,
.sv_sendsig = linux_rt_sendsig,
- .sv_sigcode = &_binary_linux_locore_o_start,
+ .sv_sigcode = &_binary_linux_vdso_so_o_start,
.sv_szsigcode = &linux_szsigcode,
.sv_name = "Linux ELF64",
.sv_coredump = elf64_coredump,
@@ -412,8 +432,8 @@ struct sysentvec elf_linux_sysvec = {
.sv_minsigstksz = LINUX_MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
.sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS, /* XXX */
+ .sv_usrstack = LINUX_USRSTACK,
+ .sv_psstrings = LINUX_PS_STRINGS,
.sv_psstringssz = sizeof(struct ps_strings),
.sv_stackprot = VM_PROT_READ | VM_PROT_WRITE,
.sv_copyout_auxargs = linux_copyout_auxargs,
@@ -422,11 +442,11 @@ struct sysentvec elf_linux_sysvec = {
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN |
- SV_SIG_WAITNDQ,
+ SV_SIG_WAITNDQ | SV_TIMEKEEP,
.sv_set_syscall_retval = linux_set_syscall_retval,
.sv_fetch_syscall_args = linux_fetch_syscall_args,
.sv_syscallnames = NULL,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = LINUX_SHAREDPAGE,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = linux_schedtail,
.sv_thread_detach = linux_thread_detach,
@@ -442,46 +462,115 @@ struct sysentvec elf_linux_sysvec = {
static int
linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
{
+ int error;
- linux_on_exec(p, imgp);
- return (0);
+ error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
+ LINUX_VDSOPAGE_SIZE, imgp);
+ if (error == 0)
+ linux_on_exec(p, imgp);
+ return (error);
}
static void
-linux_vdso_install(const void *param)
+linux_exec_sysvec_init(void *param)
{
+ l_uintptr_t *ktimekeep_base;
+ struct sysentvec *sv;
+ ptrdiff_t tkoff;
+
+ sv = param;
+ /* Fill timekeep_base */
+ exec_sysvec_init(sv);
+
+ tkoff = kern_timekeep_base - linux_vdso_base;
+ ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktimekeep_base = sv->sv_timekeep_base;
+}
+SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY,
+ linux_exec_sysvec_init, &elf_linux_sysvec);
- linux_szsigcode = (&_binary_linux_locore_o_end -
- &_binary_linux_locore_o_start);
+static void
+linux_vdso_install(const void *param)
+{
+ char *vdso_start = &_binary_linux_vdso_so_o_start;
+ char *vdso_end = &_binary_linux_vdso_so_o_end;
- if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
- panic("invalid Linux VDSO size\n");
+ linux_szsigcode = vdso_end - vdso_start;
+ MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE);
- __elfN(linux_vdso_fixup)(&elf_linux_sysvec);
+ linux_vdso_base = LINUX_VDSOPAGE;
- linux_shared_page_obj = __elfN(linux_shared_page_init)
- (&linux_shared_page_mapping);
+ __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
- __elfN(linux_vdso_reloc)(&elf_linux_sysvec);
+ linux_vdso_obj = __elfN(linux_shared_page_init)
+ (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
+ bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
- memcpy(linux_shared_page_mapping, elf_linux_sysvec.sv_sigcode,
- linux_szsigcode);
- elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
+ linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
}
-SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_install, NULL);
static void
linux_vdso_deinstall(const void *param)
{
- LIN_SDT_PROBE0(sysvec, linux_vdso_deinstall, todo);
- __elfN(linux_shared_page_fini)(linux_shared_page_obj,
- linux_shared_page_mapping);
+ __elfN(linux_shared_page_fini)(linux_vdso_obj,
+ linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
}
SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_deinstall, NULL);
+/*
+ * Apply the RELA relocations of the vDSO image in place.
+ *
+ * mapping - kernel address at which the vDSO image can be written.
+ * offset  - base address at which the vDSO is mapped into the process
+ *	     (must be non-zero).
+ *
+ * Only R_AARCH64_NONE and R_AARCH64_RELATIVE are handled; any other
+ * relocation type, and any SHT_REL section, is reported with printf()
+ * and otherwise ignored.  If the image has several SHT_RELA sections
+ * only the last one found is processed.
+ */
+static void
+linux_vdso_reloc(char *mapping, Elf_Addr offset)
+{
+	Elf_Size rtype, symidx;
+	const Elf_Rela *rela;
+	const Elf_Shdr *shdr;
+	const Elf_Ehdr *ehdr;
+	Elf_Addr *where;
+	Elf_Addr addr, addend;
+	int i, relacnt;
+
+	MPASS(offset != 0);
+
+	rela = NULL;
+	relacnt = 0;
+	ehdr = (const Elf_Ehdr *)mapping;
+	shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		switch (shdr[i].sh_type) {
+		case SHT_REL:
+			printf("Linux Aarch64 vDSO: unexpected Rel section\n");
+			break;
+		case SHT_RELA:
+			rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset);
+			relacnt = shdr[i].sh_size / sizeof(*rela);
+			break;
+		default:
+			break;
+		}
+	}
+
+	for (i = 0; i < relacnt; i++, rela++) {
+		/* The patch site is addressed through the kernel mapping. */
+		where = (Elf_Addr *)(mapping + rela->r_offset);
+		addend = rela->r_addend;
+		rtype = ELF_R_TYPE(rela->r_info);
+		symidx = ELF_R_SYM(rela->r_info);
+
+		switch (rtype) {
+		case R_AARCH64_NONE:	/* none */
+			break;
+
+		case R_AARCH64_RELATIVE:	/* B + A */
+			/*
+			 * B is the address the image is loaded at in the
+			 * process, i.e. offset, not the kernel's private
+			 * mapping of the same pages.
+			 */
+			addr = (Elf_Addr)(offset + addend);
+			if (*where != addr)
+				*where = addr;
+			break;
+		default:
+			printf("Linux Aarch64 vDSO: unexpected relocation type %ld, "
+			    "symbol index %ld\n", rtype, symidx);
+			break;
+		}
+	}
+}
+
static char GNU_ABI_VENDOR[] = "GNU";
static int GNU_ABI_LINUX = 0;
diff --git a/sys/arm64/linux/linux_vdso.lds.s b/sys/arm64/linux/linux_vdso.lds.s
index 86f8de91bf60..98cbb9a5736b 100644
--- a/sys/arm64/linux/linux_vdso.lds.s
+++ b/sys/arm64/linux/linux_vdso.lds.s
@@ -1,6 +1,6 @@
/*
- * Stub arm64 vdso linker script.
- * LINUXTODO: update along with VDSO implementation
+ * Linker script for 64-bit vDSO.
+ * Copied from Linux kernel arch/x86/vdso/vdso-layout.lds.S
*
* $FreeBSD$
*/
@@ -8,15 +8,66 @@
SECTIONS
{
. = . + SIZEOF_HEADERS;
- .text : { *(.text*) }
- .rodata : { *(.rodata*) }
- .hash : { *(.hash) }
+
+ .hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
- .data : { *(.data*) }
- .dynamic : { *(.dynamic) }
+
+ .note : { *(.note.*) } :text :note
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : { *(.rodata*) } :text
+ .data : {
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+ }
+
+ .altinstructions : { *(.altinstructions) }
+ .altinstr_replacement : { *(.altinstr_replacement) }
+
+ . = ALIGN(0x100);
+ /*
+  * Collect all code sections.
+  * NOTE(review): the 0x90909090 fill pattern is inherited from the x86
+  * script this file was copied from; confirm it is appropriate here.
+  */
+ .text : { *(.text .text*) } :text =0x90909090
+}
+
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ LINUX_2.6.39 {
+ global:
+ __kernel_rt_sigreturn;
+ __kernel_gettimeofday;
+ __kernel_clock_gettime;
+ __kernel_clock_getres;
+ local: *;
+ };
+
+ LINUX_0.0 {
+ global:
+ linux_platform;
+ kern_timekeep_base;
+ local: *;
+ };
}
diff --git a/sys/arm64/linux/linux_vdso_gtod.c b/sys/arm64/linux/linux_vdso_gtod.c
new file mode 100644
index 000000000000..682735cf2fa1
--- /dev/null
+++ b/sys/arm64/linux/linux_vdso_gtod.c
@@ -0,0 +1,153 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/elf.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#define _KERNEL
+#include <sys/vdso.h>
+#undef _KERNEL
+#include <stdbool.h>
+#include <strings.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <arm64/linux/linux.h>
+#include <arm64/linux/linux_syscall.h>
+#include <compat/linux/linux_errno.h>
+#include <compat/linux/linux_timer.h>
+
+/*
+ * The kernel fixes this up: kern_timekeep_base is NULL in the image and
+ * is overwritten with the address of the timekeep data, which
+ * __vdso_gettimekeep() then hands to the time functions.
+ * NOTE(review): kern_tsc_selector is not referenced by the arm64 code
+ * visible here; presumably kept for parity with x86 - confirm.
+ */
+uintptr_t *kern_timekeep_base = NULL;
+uint32_t kern_tsc_selector = 0;
+
+/*
+ * Issue the Linux write system call directly with "svc #0".
+ *
+ * The system call number is placed in x8 and the arguments in x0-x2;
+ * the result is read back from x0 (which is why fd and res are bound
+ * to the same register).  The long result is narrowed to the int
+ * return type.
+ */
+static int
+write(int lfd, const void *lbuf, size_t lsize)
+{
+ register long svc asm("x8") = LINUX_SYS_write;
+ register int fd asm("x0") = lfd;
+ register const char *buf asm("x1") = lbuf;
+ register long size asm("x2") = lsize;
+ register long res asm ("x0");
+
+ asm volatile(
+ " svc #0\n"
+ : "=r" (res)
+ : "r" (fd), "r" (buf), "r" (size), "r" (svc)
+ : "memory");
+ return (res);
+}
+
+/*
+ * Fallback for __vdso_clock_gettime(): issue the Linux clock_gettime
+ * system call with "svc #0".
+ *
+ * x8 carries the system call number, x0 the clock id and x1 the
+ * result buffer; the return value is read back from x0 and narrowed
+ * to int.
+ */
+static int
+__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *lts)
+{
+ register long svc asm("x8") = LINUX_SYS_linux_clock_gettime;
+ register clockid_t clockid asm("x0") = clock_id;
+ register struct l_timespec *ts asm("x1") = lts;
+ register long res asm ("x0");
+
+ asm volatile(
+ " svc #0\n"
+ : "=r" (res)
+ : "r" (clockid), "r" (ts), "r" (svc)
+ : "memory");
+ return (res);
+}
+
+/*
+ * Fallback for __vdso_gettimeofday(): issue the gettimeofday system
+ * call with "svc #0".
+ *
+ * x8 carries the system call number, x0 the timeval pointer and x1
+ * the timezone pointer; the return value is read back from x0 and
+ * narrowed to int.
+ */
+static int
+__vdso_gettimeofday_fallback(l_timeval *ltv, struct timezone *ltz)
+{
+ register long svc asm("x8") = LINUX_SYS_gettimeofday;
+ register l_timeval *tv asm("x0") = ltv;
+ register struct timezone *tz asm("x1") = ltz;
+ register long res asm ("x0");
+
+ asm volatile(
+ " svc #0\n"
+ : "=r" (res)
+ : "r" (tv), "r" (tz), "r" (svc)
+ : "memory");
+ return (res);
+}
+
+/*
+ * Fallback for __vdso_clock_getres(): issue the Linux clock_getres
+ * system call with "svc #0".
+ *
+ * x8 carries the system call number, x0 the clock id and x1 the
+ * result buffer; the return value is read back from x0 and narrowed
+ * to int.
+ */
+static int
+__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *lts)
+{
+ register long svc asm("x8") = LINUX_SYS_linux_clock_getres;
+ register clockid_t clockid asm("x0") = clock_id;
+ register struct l_timespec *ts asm("x1") = lts;
+ register long res asm ("x0");
+
+ asm volatile(
+ " svc #0\n"
+ : "=r" (res)
+ : "r" (clockid), "r" (ts), "r" (svc)
+ : "memory");
+ return (res);
+}
+
+/*
+ * copied from lib/libc/aarch64/sys/__vdso_gettc.c
+ */
+
+/* Read the cntvct_el0 system register and return its 64-bit value. */
+static inline uint64_t
+cp15_cntvct_get(void)
+{
+ uint64_t reg;
+
+ __asm __volatile("mrs %0, cntvct_el0" : "=r" (reg));
+ return (reg);
+}
+
+/* Read the cntpct_el0 system register and return its 64-bit value. */
+static inline uint64_t
+cp15_cntpct_get(void)
+{
+ uint64_t reg;
+
+ __asm __volatile("mrs %0, cntpct_el0" : "=r" (reg));
+ return (reg);
+}
+
+/*
+ * Read the current timecounter value for the given timehands.
+ *
+ * Only the VDSO_TH_ALGO_ARM_GENTIM algorithm is supported; anything
+ * else yields ENOSYS.  On success *tc receives the counter value
+ * (truncated to u_int) and 0 is returned.  th_physical selects the
+ * register: zero reads cntvct_el0, non-zero reads cntpct_el0.
+ */
+int
+__vdso_gettc(const struct vdso_timehands *th, u_int *tc)
+{
+
+	if (th->th_algo != VDSO_TH_ALGO_ARM_GENTIM)
+		return (ENOSYS);
+
+	/* Instruction barrier ahead of the counter read. */
+	__asm __volatile("isb" : : : "memory");
+	if (th->th_physical == 0)
+		*tc = cp15_cntvct_get();
+	else
+		*tc = cp15_cntpct_get();
+	return (0);
+}
+
+#include <compat/linux/linux_vdso_gtod.inc>
diff --git a/sys/compat/linux/linux_vdso.c b/sys/compat/linux/linux_vdso.c
index ba828c1b6816..7c498e50d242 100644
--- a/sys/compat/linux/linux_vdso.c
+++ b/sys/compat/linux/linux_vdso.c
@@ -38,17 +38,16 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/elf.h>
+#include <sys/imgact.h>
#include <sys/kernel.h>
-#include <sys/lock.h>
+#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/sysent.h>
-#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
-#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
@@ -59,12 +58,6 @@ __FBSDID("$FreeBSD$");
SLIST_HEAD(, linux_vdso_sym) __elfN(linux_vdso_syms) =
SLIST_HEAD_INITIALIZER(__elfN(linux_vdso_syms));
-static int __elfN(symtabindex);
-static int __elfN(symstrindex);
-
-static void
-__elfN(linux_vdso_lookup)(Elf_Ehdr *, struct linux_vdso_sym *);
-
void
__elfN(linux_vdso_sym_init)(struct linux_vdso_sym *s)
{
@@ -73,176 +66,118 @@ __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *s)
}
vm_object_t
-__elfN(linux_shared_page_init)(char **mapping)
+__elfN(linux_shared_page_init)(char **mapping, vm_size_t size)
{
vm_page_t m;
vm_object_t obj;
vm_offset_t addr;
+ size_t n, pages;
+
+ pages = size / PAGE_SIZE;
- obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
+ addr = kva_alloc(size);
+ obj = vm_pager_allocate(OBJT_PHYS, 0, size,
VM_PROT_DEFAULT, 0, NULL);
VM_OBJECT_WLOCK(obj);
- m = vm_page_grab(obj, 0, VM_ALLOC_ZERO);
+ for (n = 0; n < pages; n++) {
+ m = vm_page_grab(obj, n,
+ VM_ALLOC_ZERO);
+ vm_page_valid(m);
+ vm_page_xunbusy(m);
+ pmap_qenter(addr + n * PAGE_SIZE, &m, 1);
+ }
VM_OBJECT_WUNLOCK(obj);
- vm_page_valid(m);
- vm_page_xunbusy(m);
- addr = kva_alloc(PAGE_SIZE);
- pmap_qenter(addr, &m, 1);
*mapping = (char *)addr;
return (obj);
}
void
-__elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping)
+__elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping,
+ vm_size_t size)
{
vm_offset_t va;
va = (vm_offset_t)mapping;
- pmap_qremove(va, 1);
- kva_free(va, PAGE_SIZE);
+ pmap_qremove(va, size / PAGE_SIZE);
+ kva_free(va, size);
vm_object_deallocate(obj);
}
void
-__elfN(linux_vdso_fixup)(struct sysentvec *sv)
+__elfN(linux_vdso_fixup)(char *base, vm_offset_t offset)
{
+ struct linux_vdso_sym *lsym;
+ const Elf_Shdr *shdr;
Elf_Ehdr *ehdr;
- Elf_Shdr *shdr;
- int i;
+ Elf_Sym *dsym, *sym;
+ char *strtab, *symname;
+ int i, symcnt;
- ehdr = __DECONST(Elf_Ehdr *, sv->sv_sigcode);
+ ehdr = __DECONST(Elf_Ehdr *, base);
- if (!IS_ELF(*ehdr) ||
- ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
- ehdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
- ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
- ehdr->e_shoff == 0 ||
- ehdr->e_shentsize != sizeof(Elf_Shdr))
- panic("Linux invalid vdso header.\n");
+ MPASS(IS_ELF(*ehdr));
+ MPASS(ehdr->e_ident[EI_CLASS] == ELF_TARG_CLASS);
+ MPASS(ehdr->e_ident[EI_DATA] == ELF_TARG_DATA);
+ MPASS(ehdr->e_ident[EI_VERSION] == EV_CURRENT);
+ MPASS(ehdr->e_shentsize == sizeof(Elf_Shdr));
+ MPASS(ehdr->e_shoff != 0);
+ MPASS(ehdr->e_type == ET_DYN);
- if (ehdr->e_type != ET_DYN)
- panic("Linux invalid vdso header.\n");
+ shdr = (const Elf_Shdr *)(base + ehdr->e_shoff);
- shdr = (Elf_Shdr *) ((caddr_t)ehdr + ehdr->e_shoff);
-
- __elfN(symtabindex) = -1;
- __elfN(symstrindex) = -1;
+ dsym = NULL;
for (i = 0; i < ehdr->e_shnum; i++) {
if (shdr[i].sh_size == 0)
continue;
if (shdr[i].sh_type == SHT_DYNSYM) {
- __elfN(symtabindex) = i;
- __elfN(symstrindex) = shdr[i].sh_link;
+ dsym = (Elf_Sym *)(base + shdr[i].sh_offset);
+ strtab = base + shdr[shdr[i].sh_link].sh_offset;
+ symcnt = shdr[i].sh_size / sizeof(*dsym);
+ break;
}
}
-
- if (__elfN(symtabindex) == -1 || __elfN(symstrindex) == -1)
- panic("Linux invalid vdso header.\n");
+ MPASS(dsym != NULL);
ehdr->e_ident[EI_OSABI] = ELFOSABI_LINUX;
-}
+ /*
+ * VDSO is readonly mapped to the process VA and
+ * can't be relocated by rtld.
+ */
+ SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym) {
+ for (i = 0, sym = dsym; i < symcnt; i++, sym++) {
+ symname = strtab + sym->st_name;
+ if (strncmp(lsym->symname, symname, lsym->size) == 0) {
+ sym->st_value += offset;
+ *lsym->ptr = sym->st_value;
+ break;
-void
-__elfN(linux_vdso_reloc)(struct sysentvec *sv)
-{
- struct linux_vdso_sym *lsym;
- Elf_Ehdr *ehdr;
- Elf_Phdr *phdr;
- Elf_Shdr *shdr;
- Elf_Dyn *dyn;
- Elf_Sym *sym;
- int i, j, symcnt;
-
- ehdr = __DECONST(Elf_Ehdr *, sv->sv_sigcode);
-
- /* Adjust our so relative to the sigcode_base */
- if (sv->sv_shared_page_base != 0) {
- ehdr->e_entry += sv->sv_shared_page_base;
- phdr = (Elf_Phdr *)((caddr_t)ehdr + ehdr->e_phoff);
-
- /* phdrs */
- for (i = 0; i < ehdr->e_phnum; i++) {
- phdr[i].p_vaddr += sv->sv_shared_page_base;
- if (phdr[i].p_type != PT_DYNAMIC)
- continue;
- dyn = (Elf_Dyn *)((caddr_t)ehdr + phdr[i].p_offset);
- for(; dyn->d_tag != DT_NULL; dyn++) {
- switch (dyn->d_tag) {
- case DT_PLTGOT:
- case DT_HASH:
- case DT_STRTAB:
- case DT_SYMTAB:
- case DT_RELA:
- case DT_INIT:
- case DT_FINI:
- case DT_REL:
- case DT_DEBUG:
- case DT_JMPREL:
- case DT_VERSYM:
- case DT_VERDEF:
- case DT_VERNEED:
- case DT_ADDRRNGLO ... DT_ADDRRNGHI:
- dyn->d_un.d_ptr += sv->sv_shared_page_base;
- break;
- case DT_ENCODING ... DT_LOOS-1:
- case DT_LOOS ... DT_HIOS:
- if (dyn->d_tag >= DT_ENCODING &&
- (dyn->d_tag & 1) == 0)
- dyn->d_un.d_ptr += sv->sv_shared_page_base;
- break;
- default:
- break;
- }
- }
- }
-
- /* sections */
- shdr = (Elf_Shdr *)((caddr_t)ehdr + ehdr->e_shoff);
- for(i = 0; i < ehdr->e_shnum; i++) {
- if (!(shdr[i].sh_flags & SHF_ALLOC))
- continue;
- shdr[i].sh_addr += sv->sv_shared_page_base;
- if (shdr[i].sh_type != SHT_SYMTAB &&
- shdr[i].sh_type != SHT_DYNSYM)
- continue;
-
- sym = (Elf_Sym *)((caddr_t)ehdr + shdr[i].sh_offset);
- symcnt = shdr[i].sh_size / sizeof(*sym);
-
- for(j = 0; j < symcnt; j++, sym++) {
- if (sym->st_shndx == SHN_UNDEF ||
- sym->st_shndx == SHN_ABS)
- continue;
- sym->st_value += sv->sv_shared_page_base;
}
}
}
-
- SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym)
- __elfN(linux_vdso_lookup)(ehdr, lsym);
}
-static void
-__elfN(linux_vdso_lookup)(Elf_Ehdr *ehdr, struct linux_vdso_sym *vsym)
+int
+linux_map_vdso(struct proc *p, vm_object_t obj, vm_offset_t base,
+ vm_offset_t size, struct image_params *imgp)
{
- vm_offset_t strtab, symname;
- uint32_t symcnt;
- Elf_Shdr *shdr;
- int i;
-
- shdr = (Elf_Shdr *) ((caddr_t)ehdr + ehdr->e_shoff);
-
- strtab = (vm_offset_t)((caddr_t)ehdr +
- shdr[__elfN(symstrindex)].sh_offset);
- Elf_Sym *sym = (Elf_Sym *)((caddr_t)ehdr +
- shdr[__elfN(symtabindex)].sh_offset);
- symcnt = shdr[__elfN(symtabindex)].sh_size / sizeof(*sym);
-
- for (i = 0; i < symcnt; ++i, ++sym) {
- symname = strtab + sym->st_name;
- if (strncmp(vsym->symname, (char *)symname, vsym->size) == 0) {
- *vsym->ptr = (uintptr_t)sym->st_value;
- break;
- }
+ struct vmspace *vmspace;
+ vm_map_t map;
+ int error;
+
+ MPASS((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX);
+ MPASS(obj != NULL);
+
+ vmspace = p->p_vmspace;
+ map = &vmspace->vm_map;
+
+ vm_object_reference(obj);
+ error = vm_map_fixed(map, obj, 0, base, size,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
+ if (error != KERN_SUCCESS) {
+ vm_object_deallocate(obj);
+ return (vm_mmap_to_errno(error));
}
+ return (0);
}
diff --git a/sys/compat/linux/linux_vdso.h b/sys/compat/linux/linux_vdso.h
index 073c51696387..870d0dd97fa2 100644
--- a/sys/compat/linux/linux_vdso.h
+++ b/sys/compat/linux/linux_vdso.h
@@ -38,12 +38,14 @@ struct linux_vdso_sym {
char symname[];
};
-vm_object_t __elfN(linux_shared_page_init)(char **);
-void __elfN(linux_shared_page_fini)(vm_object_t, void *);
-void __elfN(linux_vdso_fixup)(struct sysentvec *);
-void __elfN(linux_vdso_reloc)(struct sysentvec *);
+vm_object_t __elfN(linux_shared_page_init)(char **, vm_size_t);
+void __elfN(linux_shared_page_fini)(vm_object_t, void *, vm_size_t);
+void __elfN(linux_vdso_fixup)(char *, vm_offset_t);
void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *);
+int linux_map_vdso(struct proc *, vm_object_t, vm_offset_t,
+ vm_offset_t, struct image_params *);
+
#define LINUX_VDSO_SYM_INTPTR(name) \
uintptr_t name; \
LINUX_VDSO_SYM_DEFINE(name)
diff --git a/sys/compat/linux/linux_vdso_gtod.inc b/sys/compat/linux/linux_vdso_gtod.inc
new file mode 100644
index 000000000000..a90b7dc8efdf
--- /dev/null
+++ b/sys/compat/linux/linux_vdso_gtod.inc
@@ -0,0 +1,337 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * Convert a native timespec into a Linux l_timespec.
+ *
+ * Returns 0 on success.  When built with COMPAT_LINUX32, a tv_sec
+ * outside the int range is rejected with LINUX_EOVERFLOW and *lts is
+ * left untouched.
+ */
+static int
+__vdso_native_to_linux_timespec(struct l_timespec *lts,
+    struct timespec *nts)
+{
+
+#ifdef COMPAT_LINUX32
+	if (!(nts->tv_sec <= INT_MAX && nts->tv_sec >= INT_MIN))
+		return (LINUX_EOVERFLOW);
+#endif
+	lts->tv_sec = nts->tv_sec;
+	lts->tv_nsec = nts->tv_nsec;
+
+	return (0);
+}
+
+/*
+ * Convert a native timeval into a Linux l_timeval.
+ *
+ * Returns 0 on success.  When built with COMPAT_LINUX32, a tv_sec
+ * outside the int range is rejected with LINUX_EOVERFLOW and *ltv is
+ * left untouched.
+ */
+static int
+__vdso_native_to_linux_timeval(l_timeval *ltv,
+    struct timeval *ntv)
+{
+
+#ifdef COMPAT_LINUX32
+	if (!(ntv->tv_sec <= INT_MAX && ntv->tv_sec >= INT_MIN))
+		return (LINUX_EOVERFLOW);
+#endif
+	ltv->tv_sec = ntv->tv_sec;
+	ltv->tv_usec = ntv->tv_usec;
+
+	return (0);
+}
+
+
+/*
+ * Convert a native timespec into a Linux l_timespec64 by copying
+ * tv_sec and tv_nsec.  No range check is performed; always returns 0.
+ * Only built for i386, or for amd64 with COMPAT_LINUX32.
+ */
+#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
+static int
+__vdso_native_to_linux_timespec64(struct l_timespec64 *lts,
+ struct timespec *nts)
+{
+
+ lts->tv_sec = nts->tv_sec;
+ lts->tv_nsec = nts->tv_nsec;
+ return (0);
+}
+#endif
+
+/*
+ * Translate a Linux clock id (l) into the native clock id (*n).
+ *
+ * Returns 0 on success.  Clock ids without a mapping here yield
+ * LINUX_EINVAL and leave *n untouched.
+ */
+static int
+__vdso_linux_to_native_clockid(clockid_t *n, clockid_t l)
+{
+	clockid_t native;
+
+	switch (l) {
+	case LINUX_CLOCK_REALTIME:
+		native = CLOCK_REALTIME;
+		break;
+	case LINUX_CLOCK_MONOTONIC:
+		native = CLOCK_MONOTONIC;
+		break;
+	case LINUX_CLOCK_REALTIME_COARSE:
+		native = CLOCK_REALTIME_FAST;
+		break;
+	case LINUX_CLOCK_MONOTONIC_COARSE:
+	case LINUX_CLOCK_MONOTONIC_RAW:
+		/* Both the coarse and the raw clock share one native id. */
+		native = CLOCK_MONOTONIC_FAST;
+		break;
+	case LINUX_CLOCK_BOOTTIME:
+		native = CLOCK_UPTIME;
+		break;
+	default:
+		return (LINUX_EINVAL);
+	}
+
+	*n = native;
+	return (0);
+}
+
+/*
+ * The code below is adapted from
+ * lib/libc/sys/__vdso_gettimeofday.c
+ */
+
+/*
+ * Store the timekeep pointer in *tk.  The value is whatever
+ * kern_timekeep_base currently holds, so callers must check the
+ * result for NULL.
+ */
+static inline void
+__vdso_gettimekeep(struct vdso_timekeep **tk)
+{
+
+ *tk = (struct vdso_timekeep *)kern_timekeep_base;
+}
+
+/*
+ * Compute how far the timecounter has advanced since the timehands
+ * were published.
+ *
+ * On success stores (counter - th_offset_count) masked with
+ * th_counter_mask in *delta and returns 0.  If __vdso_gettc() fails,
+ * its error is returned and *delta is not written.
+ */
+static int
+tc_delta(const struct vdso_timehands *th, u_int *delta)
+{
+	u_int now;
+	int rv;
+
+	rv = __vdso_gettc(th, &now);
+	if (rv != 0)
+		return (rv);
+
+	*delta = (now - th->th_offset_count) & th->th_counter_mask;
+	return (0);
+}
+
+/*
+ * Calculate the absolute or boot-relative time from the
+ * machine-specific fast timecounter and the published timehands
+ * structure read from the shared page.
+ *
+ * The lockless reading scheme is similar to the one used to read the
+ * in-kernel timehands, see sys/kern/kern_tc.c:binuptime().  This code
+ * is based on the kernel implementation.
+ *
+ * bt  - receives the computed time.
+ * tk  - timekeep data published by the kernel.
+ * abs - when true, th_boottime is added to the result.
+ *
+ * Returns 0 on success, ENOSYS when tk_enabled is clear, or the error
+ * reported by tc_delta().
+ */
+static int
+freebsd_binuptime(struct bintime *bt, struct vdso_timekeep *tk, bool abs)
+{
+	struct vdso_timehands *th;
+	uint32_t curr, gen;
+	uint64_t scale, x;
+	u_int delta, scale_bits;
+	int error;
+
+	do {
+		if (!tk->tk_enabled)
+			return (ENOSYS);
+
+		curr = atomic_load_acq_32(&tk->tk_current);
+		th = &tk->tk_th[curr];
+		gen = atomic_load_acq_32(&th->th_gen);
+		*bt = th->th_offset;
+		error = tc_delta(th, &delta);
+		/*
+		 * NOTE(review): "continue" jumps to the loop condition, so
+		 * if the timehands did not change the loop exits with only
+		 * th_offset in *bt - confirm this is intended.
+		 */
+		if (error == EAGAIN)
+			continue;
+		if (error != 0)
+			return (error);
+		scale = th->th_scale;
+		/*
+		 * scale * delta overflows 64 bits when the positions of
+		 * the most significant set bits of both operands add up
+		 * past bit 63, so the highest set bit (fls family) is
+		 * needed here, not the lowest one (ffs family).
+		 */
+#ifdef _LP64
+		scale_bits = flsl(scale);
+#else
+		scale_bits = flsll(scale);
+#endif
+		if (__predict_false(scale_bits + fls(delta) > 63)) {
+			/* Account for the upper half of scale separately. */
+			x = (scale >> 32) * delta;
+			scale &= 0xffffffff;
+			bt->sec += x >> 32;
+			bintime_addx(bt, x << 32);
+		}
+		bintime_addx(bt, scale * delta);
+		if (abs)
+			bintime_add(bt, &th->th_boottime);
+
+		/*
+		 * Ensure that the load of th_offset is completed
+		 * before the load of th_gen.
+		 */
+		atomic_thread_fence_acq();
+	} while (curr != tk->tk_current || gen == 0 || gen != th->th_gen);
+	return (0);
+}
+
+/*
+ * Fetch the boot-relative time recorded in the current timehands
+ * (th_offset) without reading the timecounter.
+ *
+ * The read is retried until tk_current and th_gen are unchanged across
+ * the copy and th_gen is non-zero.  Returns 0 on success or ENOSYS
+ * when tk_enabled is clear.
+ */
+static int
+freebsd_getnanouptime(struct bintime *bt, struct vdso_timekeep *tk)
+{
+ struct vdso_timehands *th;
+ uint32_t curr, gen;
+
+ do {
+ if (!tk->tk_enabled)
+ return (ENOSYS);
+
+ curr = atomic_load_acq_32(&tk->tk_current);
+ th = &tk->tk_th[curr];
+ gen = atomic_load_acq_32(&th->th_gen);
+ *bt = th->th_offset;
+
+ /*
+ * Ensure that the load of th_offset is completed
+ * before the load of th_gen.
+ */
+ atomic_thread_fence_acq();
+ } while (curr != tk->tk_current || gen == 0 || gen != th->th_gen);
+ return (0);
+}
+
+/*
+ * Native gettimeofday() on top of the published timekeep data.
+ *
+ * Returns ENOSYS when a timezone is requested, when no timekeep data
+ * is available or when its version is not VDSO_TK_VER_CURR; otherwise
+ * returns the result of freebsd_binuptime() and, on success, stores
+ * the absolute time in *tv.
+ */
+static int
+freebsd_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	struct vdso_timekeep *tk;
+	struct bintime bt;
+	int error;
+
+	if (tz != NULL)
+		return (ENOSYS);
+
+	__vdso_gettimekeep(&tk);
+	if (tk == NULL || tk->tk_ver != VDSO_TK_VER_CURR)
+		return (ENOSYS);
+
+	error = freebsd_binuptime(&bt, tk, true);
+	if (error != 0)
+		return (error);
+
+	bintime2timeval(&bt, tv);
+	return (0);
+}
+
+/*
+ * Native clock_gettime() on top of the published timekeep data.
+ *
+ * Realtime clocks are computed as absolute time, monotonic/uptime
+ * clocks as boot-relative time, and the *_FAST monotonic/uptime
+ * clocks are taken from the timehands without reading the counter.
+ *
+ * Returns ENOSYS when no usable timekeep data is available or the
+ * clock id is not handled; otherwise the result of the underlying
+ * reader.  *ts is written only on success.
+ */
+static int
+freebsd_clock_gettime(clockid_t clock_id, struct timespec *ts)
+{
+	struct vdso_timekeep *tk;
+	struct bintime bt;
+	int error;
+
+	__vdso_gettimekeep(&tk);
+	if (tk == NULL || tk->tk_ver != VDSO_TK_VER_CURR)
+		return (ENOSYS);
+
+	switch (clock_id) {
+	case CLOCK_REALTIME:
+	case CLOCK_REALTIME_PRECISE:
+	case CLOCK_REALTIME_FAST:
+		error = freebsd_binuptime(&bt, tk, true);
+		break;
+	case CLOCK_MONOTONIC:
+	case CLOCK_MONOTONIC_PRECISE:
+	case CLOCK_UPTIME:
+	case CLOCK_UPTIME_PRECISE:
+		error = freebsd_binuptime(&bt, tk, false);
+		break;
+	case CLOCK_MONOTONIC_FAST:
+	case CLOCK_UPTIME_FAST:
+		error = freebsd_getnanouptime(&bt, tk);
+		break;
+	default:
+		return (ENOSYS);
+	}
+	if (error != 0)
+		return (error);
+
+	bintime2timespec(&bt, ts);
+	return (0);
+}
+
+/*
+ * Linux vDSO interfaces
+ *
+ */
+/*
+ * vDSO clock_gettime() entry point.
+ *
+ * The fast path translates the Linux clock id and reads the time from
+ * the timekeep data.  If either step fails, the request is handed to
+ * __vdso_clock_gettime_fallback() with the original arguments.  On
+ * the fast path the conversion status is returned negated.
+ */
+int
+__vdso_clock_gettime(clockid_t clock_id, struct l_timespec *lts)
+{
+	struct timespec ts;
+	clockid_t which;
+
+	if (__vdso_linux_to_native_clockid(&which, clock_id) != 0 ||
+	    freebsd_clock_gettime(which, &ts) != 0)
+		return (__vdso_clock_gettime_fallback(clock_id, lts));
+
+	return (-__vdso_native_to_linux_timespec(lts, &ts));
+}
+
+/*
+ * vDSO gettimeofday() entry point.
+ *
+ * Uses the timekeep data when possible and returns the negated
+ * conversion status; any failure of the fast path is handed to
+ * __vdso_gettimeofday_fallback() with the original arguments.
+ */
+int
+__vdso_gettimeofday(l_timeval *ltv, struct timezone *tz)
+{
+	struct timeval tv;
+
+	if (freebsd_gettimeofday(&tv, tz) == 0)
+		return (-__vdso_native_to_linux_timeval(ltv, &tv));
+
+	return (__vdso_gettimeofday_fallback(ltv, tz));
+}
+
+/*
+ * vDSO clock_getres() entry point.  There is no fast path: the call is
+ * always forwarded to __vdso_clock_getres_fallback().
+ */
+int
+__vdso_clock_getres(clockid_t clock_id, struct l_timespec *lts)
+{
+
+ return (__vdso_clock_getres_fallback(clock_id, lts));
+}
+
+#if defined(__i386__) || defined(COMPAT_LINUX32)
+/*
+ * vDSO clock_gettime64() entry point (i386 or COMPAT_LINUX32 builds).
+ *
+ * Same scheme as __vdso_clock_gettime(), but the result is stored in
+ * an l_timespec64.  Any failure of the fast path is handed to
+ * __vdso_clock_gettime64_fallback() with the original arguments.
+ */
+int
+__vdso_clock_gettime64(clockid_t clock_id, struct l_timespec64 *lts)
+{
+	struct timespec ts;
+	clockid_t which;
+
+	if (__vdso_linux_to_native_clockid(&which, clock_id) != 0 ||
+	    freebsd_clock_gettime(which, &ts) != 0)
+		return (__vdso_clock_gettime64_fallback(clock_id, lts));
+
+	return (-__vdso_native_to_linux_timespec64(lts, &ts));
+}
+
+/* Weak alias so the function is also reachable as clock_gettime64. */
+int clock_gettime64(clockid_t clock_id, struct l_timespec64 *lts)
+    __attribute__((weak, alias("__vdso_clock_gettime64")));
+#endif
+
+/*
+ * vDSO getcpu() entry point (amd64 without COMPAT_LINUX32 only).
+ * There is no fast path: the call is always forwarded to
+ * __vdso_getcpu_fallback().
+ */
+#if defined(__amd64__) && !defined(COMPAT_LINUX32)
+int
+__vdso_getcpu(uint32_t *cpu, uint32_t *node, void *cache)
+{
+
+ return (__vdso_getcpu_fallback(cpu, node, cache));
+}
+#endif
+
+/*
+ * vDSO time() entry point (i386 and amd64 only).  There is no fast
+ * path: the call is always forwarded to __vdso_time_fallback().
+ */
+#if defined(__i386__) || defined(__amd64__)
+int
+__vdso_time(long *tm)
+{
+
+ return (__vdso_time_fallback(tm));
+}
+#endif
diff --git a/sys/i386/linux/linux.h b/sys/i386/linux/linux.h
index bab89cb5a43a..af7bb554b340 100644
--- a/sys/i386/linux/linux.h
+++ b/sys/i386/linux/linux.h
@@ -39,9 +39,6 @@
#define LINUX_DTRACE linuxulator
-#define LINUX_SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE)
-#define LINUX_USRSTACK LINUX_SHAREDPAGE
-
/*
* Provide a separate set of types for the Linux types.
*/
diff --git a/sys/i386/linux/linux_locore.asm b/sys/i386/linux/linux_locore.asm
index 11427345f7ce..4c5246bd5725 100644
--- a/sys/i386/linux/linux_locore.asm
+++ b/sys/i386/linux/linux_locore.asm
@@ -19,7 +19,7 @@ linux_platform:
* To avoid excess stack frame the signal trampoline code emulates
* the 'call' instruction.
*/
-NON_GPROF_ENTRY(linux_sigcode)
+ENTRY(__kernel_sigreturn)
movl %esp, %ebx /* preserve sigframe */
call .getip0
.getip0:
@@ -34,7 +34,7 @@ NON_GPROF_ENTRY(linux_sigcode)
.endsigcode:
0: jmp 0b
-NON_GPROF_ENTRY(linux_rt_sigcode)
+ENTRY(__kernel_rt_sigreturn)
leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */
leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */
movl %esp, %edi
@@ -50,7 +50,7 @@ NON_GPROF_ENTRY(linux_rt_sigcode)
.endrtsigcode:
0: jmp 0b
-NON_GPROF_ENTRY(linux_vsyscall)
+ENTRY(__kernel_vsyscall)
.startvsyscall:
int $0x80
ret
diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c
index b1a5fb5ba062..0736fb8734d8 100644
--- a/sys/i386/linux/linux_sysvec.c
+++ b/sys/i386/linux/linux_sysvec.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/stddef.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
@@ -63,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <machine/pcb.h>
#include <machine/trap.h>
+#include <x86/linux/linux_x86.h>
#include <i386/linux/linux.h>
#include <i386/linux/linux_proto.h>
#include <compat/linux/linux_emul.h>
@@ -75,13 +77,22 @@ __FBSDID("$FreeBSD$");
MODULE_VERSION(linux, 1);
+/*
+ * Layout of the top of the Linux process address space, from the
+ * highest address down: the vDSO pages, then one page for the shared
+ * page, then the user stack.
+ *
+ * LINUX_VDSOPAGE_SIZE is parenthesized so that it expands safely inside
+ * any surrounding expression (e.g. division or a cast).
+ */
+#define	LINUX_VDSOPAGE_SIZE	(PAGE_SIZE * 2)
+#define	LINUX_VDSOPAGE		(VM_MAXUSER_ADDRESS - LINUX_VDSOPAGE_SIZE)
+#define	LINUX_SHAREDPAGE	(LINUX_VDSOPAGE - PAGE_SIZE)
+				/*
+				 * PAGE_SIZE - the size
+				 * of the native SHAREDPAGE
+				 */
+#define	LINUX_USRSTACK		LINUX_SHAREDPAGE
#define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings))
static int linux_szsigcode;
-static vm_object_t linux_shared_page_obj;
-static char *linux_shared_page_mapping;
-extern char _binary_linux_locore_o_start;
-extern char _binary_linux_locore_o_end;
+static vm_object_t linux_vdso_obj;
+static char *linux_vdso_mapping;
+extern char _binary_linux_vdso_so_o_start;
+extern char _binary_linux_vdso_so_o_end;
+static vm_offset_t linux_vdso_base;
extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
@@ -94,6 +105,7 @@ static int linux_fixup_elf(uintptr_t *stack_base,
static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
static void linux_exec_setregs(struct thread *td,
struct image_params *imgp, uintptr_t stack);
+static void linux_exec_sysvec_init(void *param);
static int linux_on_exec_vmspace(struct proc *p,
struct image_params *imgp);
static int linux_copyout_strings(struct image_params *imgp,
@@ -101,6 +113,7 @@ static int linux_copyout_strings(struct image_params *imgp,
static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
static void linux_vdso_install(void *param);
static void linux_vdso_deinstall(void *param);
+static void linux_vdso_reloc(char *mapping, Elf_Addr offset);
#define LINUX_T_UNKNOWN 255
static int _bsd_to_linux_trapcode[] = {
@@ -142,9 +155,11 @@ static int _bsd_to_linux_trapcode[] = {
LINUX_T_UNKNOWN)
LINUX_VDSO_SYM_CHAR(linux_platform);
-LINUX_VDSO_SYM_INTPTR(linux_sigcode);
-LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
-LINUX_VDSO_SYM_INTPTR(linux_vsyscall);
+LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall);
+LINUX_VDSO_SYM_INTPTR(__kernel_sigreturn);
+LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn);
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
+LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
/*
* If FreeBSD & Linux have a difference of opinion about what a trap
@@ -202,9 +217,8 @@ linux_copyout_auxargs(struct image_params *imgp, uintptr_t base)
argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP,
M_WAITOK | M_ZERO);
- AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
- imgp->proc->p_sysent->sv_shared_page_base);
- AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall);
+ AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
+ AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall);
AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
/*
@@ -464,7 +478,7 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
/* Build context to run handler in. */
regs->tf_esp = (int)fp;
- regs->tf_eip = linux_rt_sigcode;
+ regs->tf_eip = __kernel_rt_sigreturn;
regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
@@ -566,7 +580,7 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
/* Build context to run handler in. */
regs->tf_esp = (int)fp;
- regs->tf_eip = linux_sigcode;
+ regs->tf_eip = __kernel_sigreturn;
regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
@@ -812,7 +826,7 @@ struct sysentvec linux_sysvec = {
.sv_transtrap = linux_translate_traps,
.sv_fixup = linux_fixup,
.sv_sendsig = linux_sendsig,
- .sv_sigcode = &_binary_linux_locore_o_start,
+ .sv_sigcode = &_binary_linux_vdso_so_o_start,
.sv_szsigcode = &linux_szsigcode,
.sv_name = "Linux a.out",
.sv_coredump = NULL,
@@ -849,7 +863,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_transtrap = linux_translate_traps,
.sv_fixup = linux_fixup_elf,
.sv_sendsig = linux_sendsig,
- .sv_sigcode = &_binary_linux_locore_o_start,
+ .sv_sigcode = &_binary_linux_vdso_so_o_start,
.sv_szsigcode = &linux_szsigcode,
.sv_name = "Linux ELF32",
.sv_coredump = elf32_coredump,
@@ -870,7 +884,7 @@ struct sysentvec elf_linux_sysvec = {
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP |
- SV_SIG_DISCIGN | SV_SIG_WAITNDQ,
+ SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP,
.sv_set_syscall_retval = linux_set_syscall_retval,
.sv_fetch_syscall_args = linux_fetch_syscall_args,
.sv_syscallnames = NULL,
@@ -888,45 +902,128 @@ struct sysentvec elf_linux_sysvec = {
static int
linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
{
+ int error = 0;
- linux_on_exec(p, imgp);
- return (0);
+ if (SV_PROC_FLAG(p, SV_SHP) != 0)
+ error = linux_map_vdso(p, linux_vdso_obj,
+ linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp);
+ if (error == 0)
+ linux_on_exec(p, imgp);
+ return (error);
}
static void
-linux_vdso_install(void *param)
+linux_exec_sysvec_init(void *param)
{
+ l_uintptr_t *ktimekeep_base, *ktsc_selector;
+ struct sysentvec *sv;
+ ptrdiff_t tkoff;
+
+ sv = param;
+ /* Fill timekeep_base */
+ exec_sysvec_init(sv);
+
+ tkoff = kern_timekeep_base - linux_vdso_base;
+ ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktimekeep_base = sv->sv_timekeep_base;
+
+ tkoff = kern_tsc_selector - linux_vdso_base;
+ ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+ *ktsc_selector = linux_vdso_tsc_selector_idx();
+ if (bootverbose)
+ printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector);
+}
+SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY,
+ linux_exec_sysvec_init, &elf_linux_sysvec);
- linux_szsigcode = (&_binary_linux_locore_o_end -
- &_binary_linux_locore_o_start);
+static void
+linux_vdso_install(void *param)
+{
+ char *vdso_start = &_binary_linux_vdso_so_o_start;
+ char *vdso_end = &_binary_linux_vdso_so_o_end;
- if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
- panic("Linux invalid vdso size\n");
+ linux_szsigcode = vdso_end - vdso_start;
+ MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE);
- __elfN(linux_vdso_fixup)(&elf_linux_sysvec);
+ linux_vdso_base = LINUX_VDSOPAGE;
- linux_shared_page_obj = __elfN(linux_shared_page_init)
- (&linux_shared_page_mapping);
+ __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
- __elfN(linux_vdso_reloc)(&elf_linux_sysvec);
+ linux_vdso_obj = __elfN(linux_shared_page_init)
+ (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
+ bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
- bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
- linux_szsigcode);
- elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
+ linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
}
-SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_install, NULL);
static void
linux_vdso_deinstall(void *param)
{
- __elfN(linux_shared_page_fini)(linux_shared_page_obj,
- linux_shared_page_mapping);
+ __elfN(linux_shared_page_fini)(linux_vdso_obj,
+ linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
}
SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
linux_vdso_deinstall, NULL);
+/*
+ * Rebase the R_386_RELATIVE entries of the vDSO copy at `mapping' by `offset'.
+ */
+static void
+linux_vdso_reloc(char *mapping, Elf_Addr offset)
+{
+	const Elf_Shdr *shdr;
+	const Elf_Rel *rel;
+	const Elf_Ehdr *ehdr;
+	Elf_Addr *where;
+	Elf_Size rtype, symidx;
+	Elf_Addr addr, addend;
+	int i, relcnt;
+
+	MPASS(offset != 0);
+	rel = NULL;		/* stays unused when no SHT_REL is found */
+	relcnt = 0;
+	ehdr = (const Elf_Ehdr *)mapping;
+	shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		switch (shdr[i].sh_type) {
+		case SHT_REL:
+			rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset);
+			relcnt = shdr[i].sh_size / sizeof(*rel);
+			break;
+		case SHT_RELA:
+			printf("Linux i386 vDSO: unexpected Rela section\n");
+			break;
+		}
+	}
+
+	for (i = 0; i < relcnt; i++, rel++) {
+		where = (Elf_Addr *)(mapping + rel->r_offset);
+		addend = *where;	/* REL keeps the addend in place */
+		rtype = ELF_R_TYPE(rel->r_info);
+		symidx = ELF_R_SYM(rel->r_info);
+		switch (rtype) {
+		case R_386_NONE:	/* none */
+			break;
+		case R_386_RELATIVE:	/* B + A */
+			addr = (Elf_Addr)PTROUT(offset + addend);
+			if (*where != addr)
+				*where = addr;
+			break;
+		case R_386_IRELATIVE:	/* only reported, never applied */
+			printf("Linux i386 vDSO: unexpected ifunc relocation, "
+			    "symbol index %u\n", symidx);
+			break;
+		default:		/* Elf_Size is unsigned: print with %u */
+			printf("Linux i386 vDSO: unexpected relocation type %u, "
+			    "symbol index %u\n", rtype, symidx);
+			break;
+		}
+	}
+}
+
static char GNU_ABI_VENDOR[] = "GNU";
static int GNULINUX_ABI_DESC = 0;
diff --git a/sys/i386/linux/linux_vdso.lds.s b/sys/i386/linux/linux_vdso.lds.s
index dcb61cf5e06f..0a392e6380b6 100644
--- a/sys/i386/linux/linux_vdso.lds.s
+++ b/sys/i386/linux/linux_vdso.lds.s
@@ -51,15 +51,30 @@ PHDRS
eh_frame_hdr PT_GNU_EH_FRAME;
}
-ENTRY(linux_vsyscall);
-
VERSION
{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
+ };
+
LINUX_2.5 {
global:
- linux_vsyscall;
- linux_sigcode;
- linux_rt_sigcode;
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+ local: *;
+ };
+
+ LINUX_0.0 {
+ global:
+ linux_platform;
+ kern_timekeep_base;
+ kern_tsc_selector;
local: *;
};
}
diff --git a/sys/i386/linux/linux_vdso_gtod.c b/sys/i386/linux/linux_vdso_gtod.c
new file mode 100644
index 000000000000..b1e4a4620ee4
--- /dev/null
+++ b/sys/i386/linux/linux_vdso_gtod.c
@@ -0,0 +1,145 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/elf.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#define _KERNEL
+#include <sys/vdso.h>
+#undef _KERNEL
+#include <stdbool.h>
+#include <strings.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <i386/linux/linux.h>
+#include <i386/linux/linux_syscall.h>
+#include <compat/linux/linux_errno.h>
+#include <compat/linux/linux_timer.h>
+
+/* The kernel fixes these up at vDSO install. */
+uintptr_t *kern_timekeep_base = NULL;
+uint32_t kern_tsc_selector = 0;
+
+#include <x86/linux/linux_vdso_gettc_x86.inc>
+
+/*
+ * Issue LINUX_SYS_write via int $0x80 (fd, buf, size in %ebx, %ecx, %edx).
+ */
+static int
+write(int fd, const void *buf, size_t size)
+{
+	int rv;
+
+	__asm__ __volatile__("int $0x80"
+	    : "=a" (rv)
+	    : "a" (LINUX_SYS_write), "b" (fd), "c" (buf), "d" (size)
+	    : "cc", "memory");
+	return (rv);
+}
+
+/*
+ * Enter the kernel with LINUX_SYS_linux_clock_gettime through int $0x80.
+ */
+static int
+__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+	int rv;
+
+	__asm__ __volatile__("int $0x80"
+	    : "=a" (rv)
+	    : "a" (LINUX_SYS_linux_clock_gettime), "b" (clock_id), "c" (ts)
+	    : "cc", "memory");
+	return (rv);
+}
+
+/*
+ * Enter the kernel with LINUX_SYS_linux_clock_gettime64 through int $0x80.
+ */
+static int
+__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts)
+{
+	int rv;
+
+	__asm__ __volatile__("int $0x80"
+	    : "=a" (rv)
+	    : "a" (LINUX_SYS_linux_clock_gettime64), "b" (clock_id), "c" (ts)
+	    : "cc", "memory");
+	return (rv);
+}
+
+/*
+ * Enter the kernel with LINUX_SYS_gettimeofday through int $0x80.
+ */
+static int
+__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz)
+{
+	int rv;
+
+	__asm__ __volatile__("int $0x80"
+	    : "=a" (rv)
+	    : "a" (LINUX_SYS_gettimeofday), "b" (tv), "c" (tz)
+	    : "cc", "memory");
+	return (rv);
+}
+
+/*
+ * Enter the kernel with LINUX_SYS_linux_clock_getres through int $0x80.
+ */
+static int
+__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+	int rv;
+
+	__asm__ __volatile__("int $0x80"
+	    : "=a" (rv)
+	    : "a" (LINUX_SYS_linux_clock_getres), "b" (clock_id), "c" (ts)
+	    : "cc", "memory");
+	return (rv);
+}
+
+/*
+ * Enter the kernel with LINUX_SYS_linux_time through int $0x80.
+ */
+static int
+__vdso_time_fallback(long *tm)
+{
+	int rv;
+
+	__asm__ __volatile__("int $0x80"
+	    : "=a" (rv)
+	    : "a" (LINUX_SYS_linux_time), "b" (tm)
+	    : "cc", "memory");
+	return (rv);
+}
+
+#include <compat/linux/linux_vdso_gtod.inc>
diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile
index 4f12c4dc6913..6b5c3b9b8612 100644
--- a/sys/modules/linux/Makefile
+++ b/sys/modules/linux/Makefile
@@ -10,8 +10,6 @@ CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32
.PATH: ${SRCTOP}/sys/x86/linux
.endif
-VDSO= linux${SFX}_vdso
-
KMOD= linux
SRCS= linux_fork.c linux${SFX}_dummy_machdep.c linux_file.c linux_event.c \
linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
@@ -22,7 +20,8 @@ SRCS= linux_fork.c linux${SFX}_dummy_machdep.c linux_file.c linux_event.c \
opt_inet6.h opt_compat.h opt_posix.h opt_usb.h vnode_if.h \
device_if.h bus_if.h
.if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64"
-SRCS+= linux_dummy_x86.c
+SRCS+= linux_dummy_x86.c linux_vdso_tsc_selector_x86.c
+VDSODEPS=linux_vdso_gettc_x86.inc
.endif
.if ${MACHINE_CPUARCH} == "amd64"
SRCS+= linux${SFX}_support.s
@@ -38,7 +37,7 @@ SRCS+= opt_kstack_pages.h opt_nfs.h opt_hwpmc_hooks.h
SRCS+= opt_apic.h
.endif
-OBJS= ${VDSO}.so
+OBJS= linux${SFX}_vdso.so
.if ${MACHINE_CPUARCH} == "i386"
SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c linux_mmap.c \
@@ -55,33 +54,54 @@ EXPORT_SYMS+= linux_ioctl_unregister_handler
.endif
CLEANFILES= linux${SFX}_assym.h linux${SFX}_genassym.o linux${SFX}_locore.o \
- genassym.o
+ genassym.o linux${SFX}_vdso_gtod.o linux${SFX}_vdso.so.o
linux${SFX}_assym.h: linux${SFX}_genassym.o
sh ${SYSDIR}/kern/genassym.sh linux${SFX}_genassym.o > ${.TARGET}
+.if ${MACHINE_CPUARCH} == "amd64"
+VDSOFLAGS=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 -m32
+.endif
+
linux${SFX}_locore.o: linux${SFX}_assym.h assym.inc
- ${CC} ${CCLDFLAGS} -x assembler-with-cpp -DLOCORE -m32 -shared -s \
- -pipe -I. -I${SYSDIR} ${WERROR} -Wall -fno-common -nostdinc -nostdlib \
- -fno-omit-frame-pointer -fPIC \
- -Wl,-T${SRCTOP}/sys/${MACHINE_CPUARCH}/linux${SFX}/${VDSO}.lds.s \
- -Wl,-soname=${VDSO}.so.1,--eh-frame-hdr,-warn-common \
+ ${CC} -c -x assembler-with-cpp -DLOCORE -fPIC -pipe -O2 -Werror \
+ -msoft-float -mregparm=0 \
+ -mcmodel=small -fno-common -nostdinc -fasynchronous-unwind-tables \
+ -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \
+ -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \
${.IMPSRC} -o ${.TARGET}
+linux${SFX}_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS}
+ ${CC} -c -fPIC -pipe -O2 -Werror -msoft-float -mregparm=0 \
+ -mcmodel=small -fno-common -nostdinc -fasynchronous-unwind-tables \
+ -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \
+ -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \
+ ${.IMPSRC} -o ${.TARGET}
+
+linux${SFX}_vdso.so.o: linux${SFX}_locore.o linux${SFX}_vdso_gtod.o
+ ${LD} -m elf_i386 --shared --eh-frame-hdr -soname=linux-gate.so.1 \
+ --no-undefined --hash-style=both -warn-common -nostdlib \
+ --strip-debug -s --build-id=sha1 --Bsymbolic \
+ -T${SRCTOP}/sys/${MACHINE}/linux${SFX}/linux${SFX}_vdso.lds.s \
+ -o ${.TARGET} ${.ALLSRC:M*.o}
+
+.if ${MACHINE_CPUARCH} == "amd64"
+OBJCOPY_TARGET=--output-target elf64-x86-64-freebsd --binary-architecture i386
+.elif ${MACHINE_CPUARCH} == "i386"
+OBJCOPY_TARGET=--output-target elf32-i386-freebsd --binary-architecture i386
+.else
+.error ${MACHINE_CPUARCH} not yet supported by linux
+.endif
+
+linux${SFX}_vdso.so: linux${SFX}_vdso.so.o
+ ${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \
+ linux${SFX}_vdso.so.o ${.TARGET}
+ ${STRIPBIN} -N _binary_linux${SFX}_vdso_so_o_size ${.TARGET}
+
.if ${MACHINE_CPUARCH} == "amd64"
linux${SFX}_support.o: linux${SFX}_assym.h assym.inc
${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
${.IMPSRC} -o ${.TARGET}
-
-${VDSO}.so: linux${SFX}_locore.o
- ${OBJCOPY} --input-target binary --output-target elf64-x86-64-freebsd \
- --binary-architecture i386 linux${SFX}_locore.o ${.TARGET}
- ${STRIPBIN} -N _binary_linux${SFX}_locore_o_size ${.TARGET}
-.else
-${VDSO}.so: linux${SFX}_locore.o
- ${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd \
- --binary-architecture i386 linux${SFX}_locore.o ${.TARGET}
- ${STRIPBIN} -N _binary_linux_locore_o_size ${.TARGET}
.endif
linux${SFX}_genassym.o: offset.inc
diff --git a/sys/modules/linux64/Makefile b/sys/modules/linux64/Makefile
index 0be94033a494..0764d1b0dc99 100644
--- a/sys/modules/linux64/Makefile
+++ b/sys/modules/linux64/Makefile
@@ -5,8 +5,6 @@
.PATH: ${SRCTOP}/sys/x86/linux
.endif
-VDSO= linux_vdso
-
KMOD= linux64
SRCS= linux_elf64.c linux_fork.c linux_dummy_machdep.c linux_file.c \
linux_event.c linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
@@ -17,7 +15,7 @@ SRCS= linux_elf64.c linux_fork.c linux_dummy_machdep.c linux_file.c \
vnode_if.h device_if.h bus_if.h \
linux_support.s
.if ${MACHINE_CPUARCH} == "amd64"
-SRCS+= linux_dummy_x86.c
+SRCS+= linux_dummy_x86.c linux_vdso_tsc_selector_x86.c
.endif
DPSRCS= assym.inc linux_genassym.c
@@ -25,20 +23,44 @@ DPSRCS= assym.inc linux_genassym.c
SRCS+= opt_kstack_pages.h opt_nfs.h opt_hwpmc_hooks.h
CLEANFILES= linux_assym.h linux_genassym.o linux_locore.o \
- genassym.o
+ genassym.o linux_vdso_gtod.o linux_vdso.so.o
-OBJS= ${VDSO}.so
+OBJS= linux_vdso.so
linux_assym.h: linux_genassym.o
sh ${SYSDIR}/kern/genassym.sh linux_genassym.o > ${.TARGET}
-linux_locore.o: linux_locore.asm linux_assym.h
- ${CC} ${CCLDFLAGS} -x assembler-with-cpp -DLOCORE -shared -mcmodel=small \
- -pipe -I. -I${SYSDIR} ${WERROR} -Wall -fno-common -fPIC -nostdinc \
- -Wl,-T${SRCTOP}/sys/${MACHINE}/linux/${VDSO}.lds.s \
- -Wl,-soname=${VDSO}.so.1,-warn-common -nostdlib \
+.if ${MACHINE_CPUARCH} == "amd64"
+VDSOFLAGS=-mregparm=0 -mcmodel=small -msoft-float
+VDSODEPS=linux_vdso_gettc_x86.inc
+.elif ${MACHINE_CPUARCH} == "aarch64"
+# Linux uses the tiny memory model, but our ld does not know about
+# some of the relocation types that cc generates for it.
+VDSOFLAGS=-mgeneral-regs-only -mcmodel=small -ffixed-x18
+.endif
+
+linux_locore.o: linux_assym.h assym.inc
+ ${CC} -c -x assembler-with-cpp -DLOCORE \
+ -fPIC -pipe -O2 -Werror ${VDSOFLAGS} \
+ -nostdinc -fasynchronous-unwind-tables \
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \
${.IMPSRC} -o ${.TARGET}
+linux_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS}
+ ${CC} -c -fPIC -pipe -O2 -Werror ${VDSOFLAGS} \
+ -nostdinc -fasynchronous-unwind-tables \
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \
+ ${.IMPSRC} -o ${.TARGET}
+
+linux_vdso.so.o: linux_locore.o linux_vdso_gtod.o
+ ${LD} --shared --eh-frame-hdr -soname=linux-vdso.so.1 \
+ --no-undefined --hash-style=both -warn-common -nostdlib \
+ --strip-debug -s --build-id=sha1 -Bsymbolic \
+ -T${SRCTOP}/sys/${MACHINE}/linux/linux_vdso.lds.s \
+ -o ${.TARGET} ${.ALLSRC:M*.o}
+
.if ${MACHINE_CPUARCH} == "aarch64"
OBJCOPY_TARGET=--output-target elf64-littleaarch64 --binary-architecture aarch64
.elif ${MACHINE_CPUARCH} == "amd64"
@@ -46,10 +68,11 @@ OBJCOPY_TARGET=--output-target elf64-x86-64 --binary-architecture i386:x86-64
.else
.error ${MACHINE_CPUARCH} not yet supported by linux64
.endif
-${VDSO}.so: linux_locore.o
+
+linux_vdso.so: linux_vdso.so.o
${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \
- linux_locore.o ${.TARGET}
- ${STRIPBIN} -N _binary_linux_locore_o_size ${.TARGET}
+ linux_vdso.so.o ${.TARGET}
+ ${STRIPBIN} -N _binary_linux_vdso_so_o_size ${.TARGET}
linux_support.o: assym.inc linux_assym.h
${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
diff --git a/sys/x86/linux/linux_vdso_gettc_x86.inc b/sys/x86/linux/linux_vdso_gettc_x86.inc
new file mode 100644
index 000000000000..ade78a03486b
--- /dev/null
+++ b/sys/x86/linux/linux_vdso_gettc_x86.inc
@@ -0,0 +1,164 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2016, 2017, 2019 The FreeBSD Foundation
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#if defined(__i386__) || defined(COMPAT_LINUX32)
+#include <i386/include/atomic.h>
+#include <i386/include/cpufunc.h>
+#else
+#include <amd64/include/atomic.h>
+#include <amd64/include/cpufunc.h>
+#endif
+
+static inline u_int
+rdtsc_low(const struct vdso_timehands *th)
+{
+ u_int rv;
+ /* RDTSC leaves the counter in EDX:EAX; SHRD shifts EAX right by th_x86_shift, filling from EDX. */
+ __asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
+ : "=a" (rv) : "c" (th->th_x86_shift) : "edx");
+ return (rv);
+}
+
+static inline u_int
+rdtscp_low(const struct vdso_timehands *th)
+{
+ u_int rv;
+ /* RDTSCP overwrites ECX, so the shift count arrives in EDI and is copied to CL before SHRD. */
+ __asm __volatile("rdtscp; movl %%edi,%%ecx; shrd %%cl, %%edx, %0"
+ : "=a" (rv) : "D" (th->th_x86_shift) : "ecx", "edx");
+ return (rv);
+}
+
+static u_int
+rdtsc_low_mb_lfence(const struct vdso_timehands *th)
+{
+ lfence(); /* LFENCE is issued before the counter is sampled */
+ return (rdtsc_low(th));
+}
+
+static u_int
+rdtsc_low_mb_mfence(const struct vdso_timehands *th)
+{
+ mfence(); /* MFENCE is issued before the counter is sampled */
+ return (rdtsc_low(th));
+}
+
+static u_int
+rdtsc_low_mb_none(const struct vdso_timehands *th)
+{
+ return (rdtsc_low(th)); /* no fence; used by the "No SSE2" table slot */
+}
+
+static u_int
+rdtsc32_mb_lfence(void)
+{
+ lfence(); /* LFENCE is issued before the counter is sampled */
+ return (rdtsc32());
+}
+
+static u_int
+rdtsc32_mb_mfence(void)
+{
+ mfence(); /* MFENCE is issued before the counter is sampled */
+ return (rdtsc32());
+}
+
+static u_int
+rdtsc32_mb_none(void)
+{
+ return (rdtsc32()); /* no fence; used by the "No SSE2" table slot */
+}
+
+static u_int
+rdtscp32_(void)
+{
+ return (rdtscp32()); /* wrapper stored in the RDTSCP slot's ts_rdtsc32 pointer */
+}
+
+struct tsc_selector_tag {
+ u_int (*ts_rdtsc32)(void); /* called when th_x86_shift == 0 */
+ u_int (*ts_rdtsc_low)(const struct vdso_timehands *); /* called when th_x86_shift > 0 */
+};
+
+static const struct tsc_selector_tag tsc_selector[] = { /* indexed by kern_tsc_selector */
+ [0] = { /* Intel, LFENCE */
+ .ts_rdtsc32 = rdtsc32_mb_lfence,
+ .ts_rdtsc_low = rdtsc_low_mb_lfence,
+ },
+ [1] = { /* AMD, MFENCE */
+ .ts_rdtsc32 = rdtsc32_mb_mfence,
+ .ts_rdtsc_low = rdtsc_low_mb_mfence,
+ },
+ [2] = { /* No SSE2 */
+ .ts_rdtsc32 = rdtsc32_mb_none,
+ .ts_rdtsc_low = rdtsc_low_mb_none,
+ },
+ [3] = { /* RDTSCP */
+ .ts_rdtsc32 = rdtscp32_,
+ .ts_rdtsc_low = rdtscp_low,
+ },
+};
+
+static u_int
+__vdso_gettc_rdtsc_low(const struct vdso_timehands *th)
+{
+ /* Dispatch through the table; kern_tsc_selector is used as the index unchecked. */
+ return (tsc_selector[kern_tsc_selector].ts_rdtsc_low(th));
+}
+
+static u_int
+__vdso_gettc_rdtsc32(void)
+{
+ /* Dispatch through the table; kern_tsc_selector is used as the index unchecked. */
+ return (tsc_selector[kern_tsc_selector].ts_rdtsc32());
+}
+
+/* Sample the counter described by `th' into *tc; 0 on success, else ENOSYS. */
+int
+__vdso_gettc(const struct vdso_timehands *th, u_int *tc)
+{
+	if (th->th_algo != VDSO_TH_ALGO_X86_TSC) {
+		/* HPET is still a TODO; it is rejected like any other algo. */
+		return (ENOSYS);
+	}
+
+	if (th->th_x86_shift > 0)
+		*tc = __vdso_gettc_rdtsc_low(th);
+	else
+		*tc = __vdso_gettc_rdtsc32();
+	return (0);
+}
diff --git a/sys/x86/linux/linux_vdso_tsc_selector_x86.c b/sys/x86/linux/linux_vdso_tsc_selector_x86.c
new file mode 100644
index 000000000000..a3a65a6d337d
--- /dev/null
+++ b/sys/x86/linux/linux_vdso_tsc_selector_x86.c
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2016, 2017, 2019 The FreeBSD Foundation
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <x86/cputypes.h>
+#include <x86/x86_var.h>
+#include <x86/specialreg.h>
+
+#include <x86/linux/linux_x86.h>
+
+/* Pick the tsc_selector[] slot: 0 LFENCE, 1 MFENCE (AMD/Hygon), 2 no SSE2, 3 RDTSCP. */
+int
+linux_vdso_tsc_selector_idx(void)
+{
+	bool amd_cpu;
+
+	if (cpu_feature == 0)
+		return (2);	/* should not happen due to RDTSC */
+	amd_cpu = (cpu_vendor_id == CPU_VENDOR_AMD ||
+	    cpu_vendor_id == CPU_VENDOR_HYGON);
+
+	if ((amd_feature & AMDID_RDTSCP) != 0)
+		return (3);	/* RDTSCP wins regardless of vendor */
+	if ((cpu_feature & CPUID_SSE2) == 0)
+		return (2);	/* no SSE2 */
+	return (amd_cpu ? 1 : 0);
+}
diff --git a/sys/x86/linux/linux_x86.h b/sys/x86/linux/linux_x86.h
new file mode 100644
index 000000000000..73736eb7eb84
--- /dev/null
+++ b/sys/x86/linux/linux_x86.h
@@ -0,0 +1,33 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _X86_INCLUDE_LINUX_LINUX_X86_H_
+#define _X86_INCLUDE_LINUX_LINUX_X86_H_
+
+int linux_vdso_tsc_selector_idx(void); /* returns 0..3; the i386 sysvec stores it into the vDSO's kern_tsc_selector */
+
+#endif /* _X86_INCLUDE_LINUX_LINUX_X86_H_ */