src/sys/arch/amd64/amd64/machdep.c


/* $OpenBSD: machdep.c,v 1.290 2024/02/03 16:21:22 deraadt Exp $ */
/* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/extent.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/syscallargs.h>
#include <dev/cons.h>
#include <stand/boot/bootarg.h>
#include <net/if.h>
#include <uvm/uvm_extern.h>
#include <sys/sysctl.h>
#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/fpu.h>
#include <machine/biosvar.h>
#include <machine/mpbiosvar.h>
#include <machine/kcore.h>
#include <machine/tss.h>
#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
extern int db_console;
#endif
#include "isa.h"
#include "isadma.h"
#include "ksyms.h"
#include "acpi.h"
#if NACPI > 0
#include <dev/acpi/acpivar.h>
#endif
#include "com.h"
#if NCOM > 0
#include <sys/tty.h>
#include <dev/ic/comvar.h>
#include <dev/ic/comreg.h>
#endif
#include "efi.h"
#if NEFI > 0
#include <dev/efi/efi.h>
#endif
#include "softraid.h"
#if NSOFTRAID > 0
#include <dev/softraidvar.h>
#endif
#ifdef HIBERNATE
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */
#include "ukbd.h"
#include "pckbc.h"
#if NPCKBC > 0 && NUKBD > 0
#include <dev/ic/pckbcvar.h>
#endif
/* #define MACHDEP_DEBUG */
#ifdef MACHDEP_DEBUG
#define DPRINTF(x...) do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* MACHDEP_DEBUG */
/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;
/*
* switchto vectors
*/
void cpu_idle_cycle_hlt(void);
void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt;
/* the following is used externally for concurrent handlers */
int setperf_prio = 0;
#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 0;
#endif
char *ssym = 0, *esym = 0; /* start and end of symbol table */
dev_t bootdev = 0; /* device we booted from */
int biosbasemem = 0; /* base memory reported by BIOS */
u_int bootapiver = 0; /* /boot API version */
int physmem;
u_int64_t dumpmem_low;
u_int64_t dumpmem_high;
extern int boothowto;
int cpu_class;
paddr_t dumpmem_paddr;
vaddr_t dumpmem_vaddr;
psize_t dumpmem_sz;
vaddr_t kern_end;
vaddr_t msgbuf_vaddr;
paddr_t msgbuf_paddr;
vaddr_t idt_vaddr;
paddr_t idt_paddr;
vaddr_t lo32_vaddr;
paddr_t lo32_paddr;
paddr_t tramp_pdirpa;
int kbd_reset;
int lid_action = 1;
int pwr_action = 1;
int forceukbd;
/*
* safepri is a safe priority for sleep to set for a spin-wait
* during autoconfiguration or after a panic.
*/
int safepri = 0;
struct vm_map *exec_map = NULL;
struct vm_map *phys_map = NULL;
/* UVM constraint ranges. */
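/* isa_constraint: the 16MB reachable by ISA DMA. */
/* dma_constraint: the 4GB reachable by devices with 32-bit DMA addressing. */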
struct uvm_constraint_range isa_constraint = { 0x0, 0x00ffffffUL };
struct uvm_constraint_range dma_constraint = { 0x0, 0xffffffffUL };
struct uvm_constraint_range *uvm_md_constraints[] = {
&isa_constraint,
&dma_constraint,
NULL,
};
paddr_t avail_start;
paddr_t avail_end;
void (*delay_func)(int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
void (*startclock_func)(void) = i8254_start_both_clocks;
/*
* Format of boot information passed to us by 32-bit /boot
*/
typedef struct _boot_args32 {
int ba_type;
int ba_size;
int ba_nextX; /* a ptr in 32-bit world, but not here */
char ba_arg[1];
} bootarg32_t;
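/*
 * Each record is ba_size bytes long, with the next record starting
 * immediately after it; ba_nextX is only meaningful inside /boot.
 * getbootinfo() below walks the list using ba_size.
 */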
#define BOOTARGC_MAX NBPG /* one page */
bios_bootmac_t *bios_bootmac;
/* locore copies the arguments from /boot to here for us */
char bootinfo[BOOTARGC_MAX];
int bootinfo_size = BOOTARGC_MAX;
void getbootinfo(char *, int);
/* Data passed to us by /boot, filled in by getbootinfo() */
bios_diskinfo_t *bios_diskinfo;
bios_memmap_t *bios_memmap;
u_int32_t bios_cksumlen;
bios_efiinfo_t *bios_efiinfo;
bios_ucode_t *bios_ucode;
#if NEFI > 0
EFI_MEMORY_DESCRIPTOR *mmap;
#endif
/*
* Size of memory segments, before any memory is stolen.
*/
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt;
int cpu_dump(void);
int cpu_dumpsize(void);
u_long cpu_dump_mempagecnt(void);
void dumpsys(void);
void cpu_init_extents(void);
void map_tramps(void);
void init_x86_64(paddr_t);
void (*cpuresetfn)(void);
void enter_shared_special_pages(void);
#ifdef APERTURE
int allowaperture = 0;
#endif
/*
* Machine-dependent startup code
*/
void
cpu_startup(void)
{
vaddr_t minaddr, maxaddr;
msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
printf("%s", version);
startclocks();
rtcinit();
printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
ptoa((psize_t)physmem)/1024/1024);
/*
* Allocate a submap for exec arguments. This map effectively
* limits the number of processes exec'ing at any time.
*/
minaddr = vm_map_min(kernel_map);
exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
/*
* Allocate a submap for physio
*/
minaddr = vm_map_min(kernel_map);
phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
VM_PHYS_SIZE, 0, FALSE, NULL);
printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
ptoa((psize_t)uvmexp.free)/1024/1024);
bufinit();
if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
user_config();
#else
printf("kernel does not support -c; continuing..\n");
#endif
}
/* Safe for i/o port / memory space allocation to use malloc now. */
x86_bus_space_mallocok();
#ifndef SMALL_KERNEL
cpu_ucode_setup();
cpu_ucode_apply(&cpu_info_primary);
#endif
cpu_tsx_disable(&cpu_info_primary);
/* enter the IDT and trampoline code in the u-k maps */
enter_shared_special_pages();
/* initialize CPU0's TSS and GDT and put them in the u-k maps */
cpu_enter_pages(&cpu_info_full_primary);
}
/*
* enter_shared_special_pages
*
* Requests mapping of various special pages required in the Intel Meltdown
* case (to be entered into the U-K page table):
*
* 1 IDT page
* A variable number of pages covering the U-K ".kutext" section.  This
*  section contains code needed during trampoline operation.
* A variable number of pages covering the U-K ".kudata" section.  This
*  section contains data accessed by the trampoline before switching to
*  U+K (for example, shared global variables used by IPIs, etc).
*
* The linker script places the required symbols in the sections above.
*
* On CPUs not affected by Meltdown, the calls to pmap_enter_special below
* become no-ops.
*/
void
enter_shared_special_pages(void)
{
extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
extern char __text_page_start[], __text_page_end[];
extern char __kernel_kutext_page_phys[];
extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
vaddr_t va;
paddr_t pa;
/* idt */
pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
(uint64_t)idt_vaddr, (uint64_t)idt_paddr);
/* .kutext section */
va = (vaddr_t)__kutext_start;
pa = (paddr_t)__kernel_kutext_phys;
while (va < (vaddr_t)__kutext_end) {
pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
__func__, (uint64_t)va, (uint64_t)pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
/* .kutext.page section */
va = (vaddr_t)__text_page_start;
pa = (paddr_t)__kernel_kutext_page_phys;
while (va < (vaddr_t)__text_page_end) {
pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
__func__, (uint64_t)va, (uint64_t)pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
/* .kudata section */
va = (vaddr_t)__kudata_start;
pa = (paddr_t)__kernel_kudata_phys;
while (va < (vaddr_t)__kudata_end) {
pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
__func__, (uint64_t)va, (uint64_t)pa);
va += PAGE_SIZE;
pa += PAGE_SIZE;
}
}
/*
* Set up proc0's PCB and the cpu's TSS.
*/
void
x86_64_proc0_tss_ldt_init(void)
{
struct pcb *pcb;
cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
pcb->pcb_fsbase = 0;
pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;
ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
lldt(0);
}
bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
bios_diskinfo_t *pdi;
if (bios_diskinfo == NULL)
return NULL;
for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
if (pdi->bsd_dev == dev)
break;
} else {
if (pdi->bios_number == dev)
break;
}
}
if (pdi->bios_number == -1)
return NULL;
else
return pdi;
}
int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
bios_diskinfo_t *pdi;
int biosdev;
/* all sysctl names at this level except diskinfo are terminal */
if (namelen != 1 && name[0] != BIOS_DISKINFO)
return (ENOTDIR); /* overloaded */
if (!(bootapiver & BAPIV_VECTOR))
return EOPNOTSUPP;
switch (name[0]) {
case BIOS_DEV:
if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
return ENXIO;
biosdev = pdi->bios_number;
return sysctl_rdint(oldp, oldlenp, newp, biosdev);
case BIOS_DISKINFO:
if (namelen != 2)
return ENOTDIR;
if ((pdi = bios_getdiskinfo(name[1])) == NULL)
return ENXIO;
return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
case BIOS_CKSUMLEN:
return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
default:
return EOPNOTSUPP;
}
/* NOTREACHED */
}
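/*
 * Illustrative sketch (not compiled here): userland reads these records
 * via sysctl(2), e.g. the diskinfo for BIOS device 0x80:
 *
 *	int mib[4] = { CTL_MACHDEP, CPU_BIOS, BIOS_DISKINFO, 0x80 };
 *	bios_diskinfo_t di;
 *	size_t len = sizeof(di);
 *
 *	if (sysctl(mib, 4, &di, &len, NULL, 0) == -1)
 *		err(1, "machdep.bios.diskinfo");
 */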
extern int tsc_is_invariant;
extern int amd64_has_xcrypt;
extern int need_retpoline;
const struct sysctl_bounded_args cpuctl_vars[] = {
{ CPU_LIDACTION, &lid_action, 0, 2 },
{ CPU_PWRACTION, &pwr_action, 0, 2 },
{ CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
{ CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
{ CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
{ CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
{ CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
};
/*
* machine dependent system variables.
*/
int
cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
size_t newlen, struct proc *p)
{
extern uint64_t tsc_frequency;
dev_t consdev;
dev_t dev;
switch (name[0]) {
case CPU_CONSDEV:
if (namelen != 1)
return (ENOTDIR); /* overloaded */
if (cn_tab != NULL)
consdev = cn_tab->cn_dev;
else
consdev = NODEV;
return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
sizeof consdev));
case CPU_CHR2BLK:
if (namelen != 2)
return (ENOTDIR); /* overloaded */
dev = chrtoblk((dev_t)name[1]);
return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
case CPU_BIOS:
return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
newp, newlen, p);
case CPU_CPUVENDOR:
return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
case CPU_KBDRESET:
return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
&kbd_reset));
case CPU_ALLOWAPERTURE:
if (namelen != 1)
return (ENOTDIR); /* overloaded */
#ifdef APERTURE
if (securelevel > 0)
return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
&allowaperture));
else
return (sysctl_int(oldp, oldlenp, newp, newlen,
&allowaperture));
#else
return (sysctl_rdint(oldp, oldlenp, newp, 0));
#endif
#if NPCKBC > 0 && NUKBD > 0
case CPU_FORCEUKBD:
{
int error;
if (forceukbd)
return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));
error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
if (forceukbd)
pckbc_release_console();
return (error);
}
#endif
case CPU_TSCFREQ:
return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
default:
return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
name, namelen, oldp, oldlenp, newp, newlen));
}
/* NOTREACHED */
}
static inline void
maybe_enable_user_cet(struct proc *p)
{
#ifndef SMALL_KERNEL
/* Enable indirect-branch tracking if present and not disabled */
if ((xsave_mask & XFEATURE_CET_U) &&
(p->p_p->ps_flags & PS_NOBTCFI) == 0) {
uint64_t msr = rdmsr(MSR_U_CET);
wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN);
}
#endif
}
static inline void
initialize_thread_xstate(struct proc *p)
{
if (cpu_use_xsaves) {
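		/* Load the clean state straight into the registers */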
xrstors(fpu_cleandata, xsave_mask);
maybe_enable_user_cet(p);
} else {
/* Reset FPU state in PCB */
memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata,
fpu_save_len);
if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
/* state in CPU is obsolete; reset it */
fpureset();
}
}
/* The reset state _is_ the userspace state for this thread now */
curcpu()->ci_pflags |= CPUPF_USERXSTATE;
}
/*
* Copy out the FPU state, massaging it to be usable from userspace
* and acceptable to xrstor_user()
*/
static inline int
copyoutfpu(struct savefpu *sfp, char *sp, size_t len)
{
uint64_t bvs[2];
if (copyout(sfp, sp, len))
return 1;
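	/*
	 * Rewrite xstate_bv/xstate_xcomp_bv in the copy just written
	 * out, masked down to the user-visible XCR0 features, so that
	 * a later xrstor_user() of this image cannot fault on
	 * supervisor-only state bits.
	 */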
if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) {
sp += offsetof(struct savefpu, fp_xstate.xstate_bv);
len -= offsetof(struct savefpu, fp_xstate.xstate_bv);
bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK;
bvs[1] = sfp->fp_xstate.xstate_xcomp_bv &
(XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED);
if (copyout(bvs, sp, min(len, sizeof bvs)))
return 1;
}
return 0;
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode to call routine, followed by
* syscall to sigreturn routine below. After sigreturn resets the
* signal mask, the stack, and the frame pointer, it returns to the
* user specified pc.
*/
int
sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
int info, int onstack)
{
struct proc *p = curproc;
struct trapframe *tf = p->p_md.md_regs;
struct sigcontext ksc;
struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
register_t sp, scp, sip;
u_long sss;
memset(&ksc, 0, sizeof ksc);
ksc.sc_rdi = tf->tf_rdi;
ksc.sc_rsi = tf->tf_rsi;
ksc.sc_rdx = tf->tf_rdx;
ksc.sc_rcx = tf->tf_rcx;
ksc.sc_r8 = tf->tf_r8;
ksc.sc_r9 = tf->tf_r9;
ksc.sc_r10 = tf->tf_r10;
ksc.sc_r11 = tf->tf_r11;
ksc.sc_r12 = tf->tf_r12;
ksc.sc_r13 = tf->tf_r13;
ksc.sc_r14 = tf->tf_r14;
ksc.sc_r15 = tf->tf_r15;
ksc.sc_rbx = tf->tf_rbx;
ksc.sc_rax = tf->tf_rax;
ksc.sc_rbp = tf->tf_rbp;
ksc.sc_rip = tf->tf_rip;
ksc.sc_cs = tf->tf_cs;
ksc.sc_rflags = tf->tf_rflags;
ksc.sc_rsp = tf->tf_rsp;
ksc.sc_ss = tf->tf_ss;
ksc.sc_mask = mask;
/* Allocate space for the signal handler context. */
if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
!sigonstack(tf->tf_rsp) && onstack)
sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
else
sp = tf->tf_rsp - 128;
sp -= fpu_save_len;
if (cpu_use_xsaves)
sp &= ~63ULL; /* just in case */
else
sp &= ~15ULL; /* just in case */
/* Save FPU state to PCB if necessary, then copy it out */
if (curcpu()->ci_pflags & CPUPF_USERXSTATE)
fpusave(&p->p_addr->u_pcb.pcb_savefpu);
if (copyoutfpu(sfp, (void *)sp, fpu_save_len))
return 1;
initialize_thread_xstate(p);
ksc.sc_fpstate = (struct fxsave64 *)sp;
sss = (sizeof(ksc) + 15) & ~15;
sip = 0;
if (info) {
sip = sp - ((sizeof(*ksip) + 15) & ~15);
sss += (sizeof(*ksip) + 15) & ~15;
if (copyout(ksip, (void *)sip, sizeof(*ksip)))
return 1;
}
scp = sp - sss;
ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
if (copyout(&ksc, (void *)scp, sizeof(ksc)))
return 1;
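	/*
	 * The handler's stack frame now looks like this, from low to
	 * high addresses:
	 *
	 *	scp -> struct sigcontext (ksc)
	 *	sip -> siginfo_t copy (only if `info' was requested)
	 *	sp  -> FPU/xsave area (fpu_save_len bytes)
	 */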
/*
* Build context to run handler in.
*/
tf->tf_rax = (u_int64_t)catcher;
tf->tf_rdi = sig;
tf->tf_rsi = sip;
tf->tf_rdx = scp;
tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
tf->tf_rsp = scp;
tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
return 0;
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* psl to gain improper privileges or to cause
* a machine fault.
*/
int
sys_sigreturn(struct proc *p, void *v, register_t *retval)
{
struct sys_sigreturn_args /* {
syscallarg(struct sigcontext *) sigcntxp;
} */ *uap = v;
struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
struct trapframe *tf = p->p_md.md_regs;
struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
int error;
if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
sigexit(p, SIGILL);
return (EPERM);
}
if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
return (error);
if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
sigexit(p, SIGILL);
return (EFAULT);
}
/* Prevent reuse of the sigcontext cookie */
ksc.sc_cookie = 0;
(void)copyout(&ksc.sc_cookie, (caddr_t)scp +
offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));
if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
!USERMODE(ksc.sc_cs, ksc.sc_rflags))
return (EINVAL);
/* Current FPU state is obsolete; toss it and force a reload */
if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
fpureset();
}
/* Copy in the FPU state to restore */
if (__predict_true(ksc.sc_fpstate != NULL)) {
if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len)))
return error;
if (xrstor_user(sfp, xsave_mask)) {
memcpy(sfp, fpu_cleandata, fpu_save_len);
return EINVAL;
}
maybe_enable_user_cet(p);
curcpu()->ci_pflags |= CPUPF_USERXSTATE;
} else {
/* shouldn't happen, but handle it */
initialize_thread_xstate(p);
}
tf->tf_rdi = ksc.sc_rdi;
tf->tf_rsi = ksc.sc_rsi;
tf->tf_rdx = ksc.sc_rdx;
tf->tf_rcx = ksc.sc_rcx;
tf->tf_r8 = ksc.sc_r8;
tf->tf_r9 = ksc.sc_r9;
tf->tf_r10 = ksc.sc_r10;
tf->tf_r11 = ksc.sc_r11;
tf->tf_r12 = ksc.sc_r12;
tf->tf_r13 = ksc.sc_r13;
tf->tf_r14 = ksc.sc_r14;
tf->tf_r15 = ksc.sc_r15;
tf->tf_rbx = ksc.sc_rbx;
tf->tf_rax = ksc.sc_rax;
tf->tf_rbp = ksc.sc_rbp;
tf->tf_rip = ksc.sc_rip;
tf->tf_cs = ksc.sc_cs;
tf->tf_rflags = ksc.sc_rflags;
tf->tf_rsp = ksc.sc_rsp;
tf->tf_ss = ksc.sc_ss;
/* Restore signal mask. */
p->p_sigmask = ksc.sc_mask & ~sigcantmask;
/*
* sigreturn() needs to return to userspace via the 'iretq'
* method, so that if the process was interrupted (by tick,
* an IPI, whatever) as opposed to already being in the kernel
* when a signal was being delivered, the process will be
* completely restored, including the userland %rcx and %r11
* registers which the 'sysretq' instruction cannot restore.
* Also need to make sure we can handle faulting on xrstor.
*/
p->p_md.md_flags |= MDP_IRET;
return (EJUSTRETURN);
}
#ifdef MULTIPROCESSOR
/* force a CPU into the kernel, whether or not it's idle */
void
cpu_kick(struct cpu_info *ci)
{
/* only need to kick other CPUs */
if (ci != curcpu()) {
if (cpu_mwait_size > 0) {
/*
* If not idling, then send an IPI, else
* just clear the "keep idling" bit.
*/
if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
x86_send_ipi(ci, X86_IPI_NOP);
else
atomic_clearbits_int(&ci->ci_mwait,
MWAIT_KEEP_IDLING);
} else {
/* no mwait, so need an IPI */
x86_send_ipi(ci, X86_IPI_NOP);
}
}
}
#endif
/*
* Notify the current process (p) that it has a signal pending,
* process as soon as possible.
*/
void
signotify(struct proc *p)
{
aston(p);
cpu_kick(p->p_cpu);
}
#ifdef MULTIPROCESSOR
void
cpu_unidle(struct cpu_info *ci)
{
if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
/*
* Just clear the "keep idling" bit; if it wasn't
* idling then we didn't need to do anything anyway.
*/
atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
return;
}
if (ci != curcpu())
x86_send_ipi(ci, X86_IPI_NOP);
}
#endif
int waittime = -1;
struct pcb dumppcb;
__dead void
boot(int howto)
{
if ((howto & RB_POWERDOWN) != 0)
lid_action = 0;
if ((howto & RB_RESET) != 0)
goto doreset;
if (cold) {
if ((howto & RB_USERREQ) == 0)
howto |= RB_HALT;
goto haltsys;
}
boothowto = howto;
if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
waittime = 0;
vfs_shutdown(curproc);
if ((howto & RB_TIMEBAD) == 0) {
resettodr();
} else {
printf("WARNING: not updating battery clock\n");
}
}
if_downall();
uvm_shutdown();
splhigh();
cold = 1;
if ((howto & RB_DUMP) != 0)
dumpsys();
haltsys:
config_suspend_all(DVACT_POWERDOWN);
#ifdef MULTIPROCESSOR
x86_broadcast_ipi(X86_IPI_HALT);
#endif
if ((howto & RB_HALT) != 0) {
#if NACPI > 0 && !defined(SMALL_KERNEL)
extern int acpi_enabled;
if (acpi_enabled) {
delay(500000);
if ((howto & RB_POWERDOWN) != 0)
acpi_powerdown();
}
#endif
printf("\n");
printf("The operating system has halted.\n");
printf("Please press any key to reboot.\n\n");
cnpollc(1); /* for proper keyboard command handling */
cngetc();
cnpollc(0);
}
doreset:
printf("rebooting...\n");
if (cpureset_delay > 0)
delay(cpureset_delay * 1000);
cpu_reset();
for (;;)
continue;
/* NOTREACHED */
}
/*
* These variables are needed by /sbin/savecore
*/
u_long dumpmag = 0x8fca0101; /* magic number */
int dumpsize = 0; /* pages */
long dumplo = 0; /* blocks */
/*
* cpu_dump: dump the machine-dependent kernel core dump headers.
*/
int
cpu_dump(void)
{
int (*dump)(dev_t, daddr_t, caddr_t, size_t);
char buf[dbtob(1)];
kcore_seg_t *segp;
cpu_kcore_hdr_t *cpuhdrp;
phys_ram_seg_t *memsegp;
caddr_t va;
int i;
dump = bdevsw[major(dumpdev)].d_dump;
memset(buf, 0, sizeof buf);
segp = (kcore_seg_t *)buf;
cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
ALIGN(sizeof(*cpuhdrp))];
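	/*
	 * The entire header must fit in one disk block: a kcore_seg_t,
	 * then the cpu_kcore_hdr_t, then one phys_ram_seg_t per memory
	 * cluster, each ALIGN()ed; cpu_dumpsize() returns -1 if this
	 * ever outgrows dbtob(1).
	 */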
/*
* Generate a segment header.
*/
CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
/*
* Add the machine-dependent header info.
*/
cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
cpuhdrp->nmemsegs = mem_cluster_cnt;
/*
* Fill in the memory segment descriptors.
*/
for (i = 0; i < mem_cluster_cnt; i++) {
memsegp[i].start = mem_clusters[i].start;
memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
}
/*
 * If a dump bounce buffer was allocated, the kernel stack (and thus
 * buf) may sit in high memory, so copy the header out through the
 * low-memory bounce buffer.
 */
if (dumpmem_vaddr != 0) {
memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
va = (caddr_t)dumpmem_vaddr;
} else {
va = (caddr_t)buf;
}
return (dump(dumpdev, dumplo, va, dbtob(1)));
}
/*
* This is called by main to set dumplo and dumpsize.
* Dumps always skip the first PAGE_SIZE of disk space
* in case there might be a disk label stored there.
* If there is extra space, put dump at the end to
* reduce the chance that swapping trashes it.
*/
void
dumpconf(void)
{
int nblks, dumpblks; /* size of dump area */
if (dumpdev == NODEV ||
(nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
return;
if (nblks <= ctod(1))
return;
dumpblks = cpu_dumpsize();
if (dumpblks < 0)
return;
dumpblks += ctod(cpu_dump_mempagecnt());
/* If dump won't fit (incl. room for possible label), punt. */
if (dumpblks > (nblks - ctod(1)))
return;
/* Put dump at end of partition */
dumplo = nblks - dumpblks;
/* dumpsize is in page units, and doesn't include headers. */
dumpsize = cpu_dump_mempagecnt();
}
/*
* Doadump comes here after turning off memory management and
* getting on the dump stack, either when called above, or by
* the auto-restart code.
*/
#define BYTES_PER_DUMP MAXPHYS /* must be a multiple of pagesize */
void
dumpsys(void)
{
u_long totalbytesleft, bytes, i, n, memseg;
u_long maddr;
daddr_t blkno;
void *va;
int (*dump)(dev_t, daddr_t, caddr_t, size_t);
int error;
/* Save registers. */
savectx(&dumppcb);
if (dumpdev == NODEV)
return;
/*
 * For dumps during autoconfiguration: if the dump device has
 * already been configured, set the dump parameters up now.
 */
if (dumpsize == 0)
dumpconf();
if (dumplo <= 0 || dumpsize == 0) {
printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
minor(dumpdev));
return;
}
printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
minor(dumpdev), dumplo);
error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
printf("dump ");
if (error == -1) {
printf("area unavailable\n");
return;
}
if ((error = cpu_dump()) != 0)
goto err;
totalbytesleft = ptoa(cpu_dump_mempagecnt());
blkno = dumplo + cpu_dumpsize();
dump = bdevsw[major(dumpdev)].d_dump;
error = 0;
for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
maddr = mem_clusters[memseg].start;
bytes = mem_clusters[memseg].size;
for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
/* Print out how many MBs we have left to go. */
if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
printf("%ld ", totalbytesleft / (1024 * 1024));
/* Limit size for next transfer. */
n = bytes - i;
if (n > BYTES_PER_DUMP)
n = BYTES_PER_DUMP;
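			/*
			 * Pages above 4GB are copied through the low
			 * bounce buffer set up in init_x86_64(), since
			 * the device dump routine may only be able to
			 * address 32 bits.
			 */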
if (maddr > 0xffffffff) {
va = (void *)dumpmem_vaddr;
if (n > dumpmem_sz)
n = dumpmem_sz;
memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
} else {
va = (void *)PMAP_DIRECT_MAP(maddr);
}
error = (*dump)(dumpdev, blkno, va, n);
if (error)
goto err;
maddr += n;
blkno += btodb(n); /* XXX? */
#if 0 /* XXX this doesn't work. grr. */
/* operator aborting dump? */
if (sget() != NULL) {
error = EINTR;
break;
}
#endif
}
}
err:
switch (error) {
case ENXIO:
printf("device bad\n");
break;
case EFAULT:
printf("device not ready\n");
break;
case EINVAL:
printf("area improper\n");
break;
case EIO:
printf("i/o error\n");
break;
case EINTR:
printf("aborted from console\n");
break;
case 0:
printf("succeeded\n");
break;
default:
printf("error %d\n", error);
break;
}
printf("\n\n");
delay(5000000); /* 5 seconds */
}
/*
* Force the userspace FS.base to be reloaded from the PCB on return from
* the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
* to their expected userspace value.
*/
void
reset_segs(void)
{
/*
* This operates like the cpu_switchto() sequence: if we
* haven't reset %[defg]s already, do so now.
*/
if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
__asm volatile(
"movw %%ax,%%ds\n\t"
"movw %%ax,%%es\n\t"
"movw %%ax,%%fs\n\t"
"cli\n\t" /* block intr when on user GS.base */
"swapgs\n\t" /* swap from kernel to user GS.base */
"movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
"swapgs\n\t" /* back to kernel GS.base */
"sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
}
}
/*
* Clear registers on exec
*/
void
setregs(struct proc *p, struct exec_package *pack, u_long stack,
struct ps_strings *arginfo)
{
struct trapframe *tf;
initialize_thread_xstate(p);
/* To reset all registers we have to return via iretq */
p->p_md.md_flags |= MDP_IRET;
reset_segs();
p->p_addr->u_pcb.pcb_fsbase = 0;
tf = p->p_md.md_regs;
memset(tf, 0, sizeof *tf);
tf->tf_rip = pack->ep_entry;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
tf->tf_rflags = PSL_USERSET;
tf->tf_rsp = stack;
tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
}
/*
* Initialize segments and descriptor tables
*/
struct gate_descriptor *idt;
char idt_allocmap[NIDT];
struct user *proc0paddr = NULL;
void
setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
int sel)
{
gd->gd_looffset = (u_int64_t)func & 0xffff;
gd->gd_selector = sel;
gd->gd_ist = ist;
gd->gd_type = type;
gd->gd_dpl = dpl;
gd->gd_p = 1;
gd->gd_hioffset = (u_int64_t)func >> 16;
gd->gd_zero = 0;
gd->gd_xx1 = 0;
gd->gd_xx2 = 0;
gd->gd_xx3 = 0;
}
void
unsetgate(struct gate_descriptor *gd)
{
memset(gd, 0, sizeof (*gd));
}
void
setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
{
rd->rd_limit = limit;
rd->rd_base = (u_int64_t)base;
}
/*
* Note that the base and limit fields are ignored in long mode.
*/
void
set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
int type, int dpl, int gran, int def32, int is64)
{
sd->sd_lolimit = (unsigned)limit;
sd->sd_lobase = (unsigned long)base;
sd->sd_type = type;
sd->sd_dpl = dpl;
sd->sd_p = 1;
sd->sd_hilimit = (unsigned)limit >> 16;
sd->sd_avl = 0;
sd->sd_long = is64;
sd->sd_def32 = def32;
sd->sd_gran = gran;
sd->sd_hibase = (unsigned long)base >> 24;
}
void
set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
int type, int dpl, int gran)
{
memset(sd, 0, sizeof *sd);
sd->sd_lolimit = (unsigned)limit;
sd->sd_lobase = (u_int64_t)base;
sd->sd_type = type;
sd->sd_dpl = dpl;
sd->sd_p = 1;
sd->sd_hilimit = (unsigned)limit >> 16;
sd->sd_gran = gran;
sd->sd_hibase = (u_int64_t)base >> 24;
}
void cpu_init_idt(void)
{
struct region_descriptor region;
setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
lidt(&region);
}
void
cpu_init_extents(void)
{
extern struct extent *iomem_ex;
static int already_done;
int i;
/* We get called for each CPU, only first should do this */
if (already_done)
return;
/*
* Allocate the physical addresses used by RAM from the iomem
* extent map.
*/
for (i = 0; i < mem_cluster_cnt; i++) {
if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
mem_clusters[i].size, EX_NOWAIT)) {
/* XXX What should we do? */
printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
" FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
mem_clusters[i].start + mem_clusters[i].size - 1);
}
}
already_done = 1;
}
void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
(NACPI > 0 && !defined(SMALL_KERNEL))
struct pmap *kmp = pmap_kernel();
extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
extern u_char cpu_spinup_trampoline[];
extern u_char cpu_spinup_trampoline_end[];
extern u_char mp_tramp_data_start[];
extern u_char mp_tramp_data_end[];
extern u_int32_t mp_pdirpa;
#endif
/*
* The initial PML4 pointer must be below 4G, so if the
* current one isn't, use a "bounce buffer" and save it
* for tramps to use.
*/
if (kmp->pm_pdirpa > 0xffffffff) {
pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
tramp_pdirpa = lo32_paddr;
pmap_kremove(lo32_vaddr, PAGE_SIZE);
} else
tramp_pdirpa = kmp->pm_pdirpa;
#ifdef MULTIPROCESSOR
/* Map MP tramp code and data pages RW for copy */
pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE,
PROT_READ | PROT_WRITE);
pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA,
PROT_READ | PROT_WRITE);
memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);
memcpy((caddr_t)MP_TRAMPOLINE,
cpu_spinup_trampoline,
cpu_spinup_trampoline_end-cpu_spinup_trampoline);
memcpy((caddr_t)MP_TRAMP_DATA,
mp_tramp_data_start,
mp_tramp_data_end - mp_tramp_data_start);
/*
* We need to patch this after we copy the tramp data,
* the symbol points into the copied tramp data page.
*/
mp_pdirpa = tramp_pdirpa;
/* Unmap, will be remapped in cpu_start_secondary */
pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
#endif /* MULTIPROCESSOR */
#endif
}
#define IDTVEC(name) __CONCAT(X, name)
typedef void (vector)(void);
extern vector *IDTVEC(exceptions)[];
paddr_t early_pte_pages;
void
init_x86_64(paddr_t first_avail)
{
struct region_descriptor region;
bios_memmap_t *bmp;
int x, ist;
uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
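	/*
	 * Each PML4 slot maps 512GB, so the direct map can cover at
	 * most NUM_L4_SLOT_DIRECT * 512GB of physical address space.
	 */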
/*
* locore0 mapped 3 pages for use before the pmap is initialized
* starting at first_avail. These pages are currently used by
* efifb to create early-use VAs for the framebuffer before efifb
* is attached.
*/
early_pte_pages = first_avail;
first_avail += 3 * NBPG;
cpu_init_msrs(&cpu_info_primary);
proc0.p_addr = proc0paddr;
cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;
x86_bus_space_init();
i8254_startclock();
/*
* Initialize PAGE_SIZE-dependent variables.
*/
uvm_setpagesize();
/*
* Boot arguments are in a single page specified by /boot.
*
* We require the "new" vector form, as well as memory ranges
* to be given in bytes rather than KB.
*
* locore copies the data into bootinfo[] for us.
*/
if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
(BAPIV_VECTOR | BAPIV_BMEMMAP)) {
if (bootinfo_size >= sizeof(bootinfo))
panic("boot args too big");
getbootinfo(bootinfo, bootinfo_size);
} else
panic("invalid /boot");
cninit();
/*
* Memory on the AMD64 port is described by three different things.
*
* 1. biosbasemem - This is outdated, and should really only be used to
* sanitize the other values. This is what we get back from the BIOS
* using the legacy routines, describing memory below 640KB.
*
* 2. bios_memmap[] - This is the memory map as the bios has returned
* it to us. It includes memory the kernel occupies, etc.
*
* 3. mem_cluster[] - This is the massaged free memory segments after
* taking into account the contents of bios_memmap, biosbasemem,
* and locore/machdep/pmap kernel allocations of physical
* pages.
*
* The other thing is that the physical page *RANGE* is described by
* three more variables:
*
* avail_start - This is a physical address of the start of available
* pages, until IOM_BEGIN. This is basically the start
* of the UVM managed range of memory, with some holes...
*
* avail_end - This is the end of physical pages. All physical pages
* that UVM manages are between avail_start and avail_end.
* There are holes...
*
* first_avail - This is the first available physical page after the
* kernel, page tables, etc.
*
* We skip the first few pages for trampolines, hibernate, and to avoid
* buggy SMI implementations that could corrupt the first 64KB.
*/
avail_start = 16*PAGE_SIZE;
#ifdef MULTIPROCESSOR
if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
avail_start = MP_TRAMPOLINE + PAGE_SIZE;
if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
avail_start = MP_TRAMP_DATA + PAGE_SIZE;
#endif
#if (NACPI > 0 && !defined(SMALL_KERNEL))
if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
#endif
#ifdef HIBERNATE
if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
#endif /* HIBERNATE */
/*
* We need to go through the BIOS memory map given, and
* fill out mem_clusters and mem_cluster_cnt stuff, taking
* into account all the points listed above.
*/
avail_end = mem_cluster_cnt = 0;
for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
paddr_t s1, s2, e1, e2;
/* Ignore non-free memory */
if (bmp->type != BIOS_MAP_FREE)
continue;
if (bmp->size < PAGE_SIZE)
continue;
/* Init our segment(s), round/trunc to pages */
s1 = round_page(bmp->addr);
e1 = trunc_page(bmp->addr + bmp->size);
s2 = e2 = 0;
/*
* XXX Some buggy ACPI BIOSes use memory that they
* declare as free. Current worst offender is
* Supermicro 5019D-FTN4. Typically the affected memory
* areas are small blocks between areas reserved for
* ACPI and other BIOS goo. So skip areas smaller
* than 32 MB above the 16 MB boundary (to avoid
* affecting legacy stuff).
*/
if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
continue;
/* Check and adjust our segment(s) */
/* Nuke low pages */
if (s1 < avail_start) {
s1 = avail_start;
if (s1 > e1)
continue;
}
/*
* The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
* memory, so discard anything above that.
*/
if (e1 >= max_dm_size) {
e1 = max_dm_size;
if (s1 > e1)
continue;
}
/* Crop stuff into "640K hole" */
if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
e1 = IOM_BEGIN;
if (s1 < biosbasemem && e1 > biosbasemem)
e1 = biosbasemem;
/* Split any segments straddling the 16MB boundary */
if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
e2 = e1;
s2 = e1 = 16*1024*1024;
}
/* Store segment(s) */
if (e1 - s1 >= PAGE_SIZE) {
mem_clusters[mem_cluster_cnt].start = s1;
mem_clusters[mem_cluster_cnt].size = e1 - s1;
mem_cluster_cnt++;
}
if (e2 - s2 >= PAGE_SIZE) {
mem_clusters[mem_cluster_cnt].start = s2;
mem_clusters[mem_cluster_cnt].size = e2 - s2;
mem_cluster_cnt++;
}
if (avail_end < e1) avail_end = e1;
if (avail_end < e2) avail_end = e2;
}
/*
* Call pmap initialization to make new kernel address space.
* We must do this before loading pages into the VM system.
*/
first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));
#if NEFI > 0
/* Relocate the EFI memory map. */
if (bios_efiinfo && bios_efiinfo->mmap_start) {
mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail);
memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start),
bios_efiinfo->mmap_size);
first_avail += round_page(bios_efiinfo->mmap_size);
}
#endif
/* Allocate these out of the 640KB base memory */
if (avail_start != PAGE_SIZE)
avail_start = pmap_prealloc_lowmem_ptps(avail_start);
cpu_init_extents();
/* Make sure the end of the space used by the kernel is rounded. */
first_avail = round_page(first_avail);
kern_end = KERNBASE + first_avail;
/*
* Now, load the memory clusters (which have already been
* flensed) into the VM system.
*/
for (x = 0; x < mem_cluster_cnt; x++) {
paddr_t seg_start = mem_clusters[x].start;
paddr_t seg_end = seg_start + mem_clusters[x].size;
if (seg_start < first_avail) seg_start = first_avail;
if (seg_start > seg_end) continue;
if (seg_end - seg_start < PAGE_SIZE) continue;
physmem += atop(mem_clusters[x].size);
#if DEBUG_MEMLOAD
printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
seg_start, seg_end, atop(seg_start), atop(seg_end));
#endif
uvm_page_physload(atop(seg_start), atop(seg_end),
atop(seg_start), atop(seg_end), 0);
}
/*
* Now, load the memory between the end of I/O memory "hole"
* and the kernel.
*/
{
paddr_t seg_start = round_page(IOM_END);
paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);
if (seg_start < seg_end) {
#if DEBUG_MEMLOAD
printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
#endif
uvm_page_physload(atop(seg_start), atop(seg_end),
atop(seg_start), atop(seg_end), 0);
}
}
#if DEBUG_MEMLOAD
printf("avail_start = 0x%lx\n", avail_start);
printf("avail_end = 0x%lx\n", avail_end);
printf("first_avail = 0x%lx\n", first_avail);
#endif
/*
* Steal memory for the message buffer (at end of core).
*/
{
struct vm_physseg *vps = NULL;
psize_t sz = round_page(MSGBUFSIZE);
psize_t reqsz = sz;
for (x = 0; x < vm_nphysseg; x++) {
vps = &vm_physmem[x];
if (ptoa(vps->avail_end) == avail_end)
break;
}
if (x == vm_nphysseg)
panic("init_x86_64: can't find end of memory");
/* Shrink so it'll fit in the last segment. */
if ((vps->avail_end - vps->avail_start) < atop(sz))
sz = ptoa(vps->avail_end - vps->avail_start);
vps->avail_end -= atop(sz);
vps->end -= atop(sz);
msgbuf_paddr = ptoa(vps->avail_end);
/* Remove the last segment if it now has no pages. */
if (vps->start == vps->end) {
for (vm_nphysseg--; x < vm_nphysseg; x++)
vm_physmem[x] = vm_physmem[x + 1];
}
/* Now find where the new avail_end is. */
for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
if (vm_physmem[x].avail_end > avail_end)
avail_end = vm_physmem[x].avail_end;
avail_end = ptoa(avail_end);
/* Warn if the message buffer had to be shrunk. */
if (sz != reqsz)
printf("WARNING: %ld bytes not available for msgbuf "
"in last cluster (%ld used)\n", reqsz, sz);
}
/*
* Steal some memory for a dump bouncebuffer if we have memory over
* the 32-bit barrier.
*/
if (avail_end > 0xffffffff) {
struct vm_physseg *vps = NULL;
psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));
/* XXX assumes segments are ordered */
for (x = 0; x < vm_nphysseg; x++) {
vps = &vm_physmem[x];
/* Find something between 16meg and 4gig */
if (ptoa(vps->avail_end) <= 0xffffffff &&
ptoa(vps->avail_start) >= 0xffffff)
break;
}
if (x == vm_nphysseg)
panic("init_x86_64: no memory between "
"0xffffff-0xffffffff");
/* Shrink so it'll fit in the segment. */
if ((vps->avail_end - vps->avail_start) < atop(sz))
sz = ptoa(vps->avail_end - vps->avail_start);
vps->avail_end -= atop(sz);
vps->end -= atop(sz);
dumpmem_paddr = ptoa(vps->avail_end);
dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
dumpmem_sz = sz;
/* Remove the last segment if it now has no pages. */
if (vps->start == vps->end) {
for (vm_nphysseg--; x < vm_nphysseg; x++)
vm_physmem[x] = vm_physmem[x + 1];
}
}
pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
idt = (struct gate_descriptor *)idt_vaddr;
cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;
/* make gdt gates and memory segments */
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE32_SEL), 0,
atop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
SDT_SYS386TSS, SEL_KPL, 0);
/* exceptions */
for (x = 0; x < 32; x++) {
/* trap2 == NMI, trap8 == double fault */
ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
(x == 3) ? SEL_UPL : SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
idt_allocmap[x] = 1;
}
setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
lgdt(&region);
cpu_init_idt();
intr_default_setup();
fpuinit(&cpu_info_primary);
softintr_init();
splraise(IPL_IPI);
intr_enable();
#ifdef DDB
db_machine_init();
ddb_init();
if (boothowto & RB_KDB)
db_enter();
#endif
}
void
cpu_reset(void)
{
intr_disable();
if (cpuresetfn)
(*cpuresetfn)();
/*
* The keyboard controller has 4 random output pins, one of which is
* connected to the RESET pin on the CPU in many PCs. We tell the
* keyboard controller to pulse this line a couple of times.
*/
outb(IO_KBD + KBCMDP, KBC_PULSE0);
delay(100000);
outb(IO_KBD + KBCMDP, KBC_PULSE0);
delay(100000);
/*
* Try to cause a triple fault and watchdog reset by making the IDT
* invalid and causing a fault.
*/
memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
__asm volatile("divl %0,%1" : : "q" (0), "a" (0));
for (;;)
continue;
/* NOTREACHED */
}
/*
* cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
*/
int
cpu_dumpsize(void)
{
int size;
size = ALIGN(sizeof(kcore_seg_t)) +
ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
if (roundup(size, dbtob(1)) != dbtob(1))
return (-1);
return (1);
}
/*
* cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
*/
u_long
cpu_dump_mempagecnt(void)
{
u_long i, n;
n = 0;
for (i = 0; i < mem_cluster_cnt; i++)
n += atop(mem_clusters[i].size);
return (n);
}
/*
* Figure out which portions of memory are used by the kernel/system.
*/
int
amd64_pa_used(paddr_t addr)
{
struct vm_page *pg;
/* Kernel manages these */
if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
return 1;
/* Kernel is loaded here */
if (addr > IOM_END && addr < (kern_end - KERNBASE))
return 1;
/* Low memory used for various bootstrap things */
if (addr < avail_start)
return 1;
/*
* The only regions I can think of that are left are the things
* we steal away from UVM. The message buffer?
* XXX - ignore these for now.
*/
return 0;
}
void
cpu_initclocks(void)
{
(*initclock_func)();
}
void
cpu_startclock(void)
{
(*startclock_func)();
}
void
need_resched(struct cpu_info *ci)
{
ci->ci_want_resched = 1;
/* There's a risk we'll be called before the idle threads start */
if (ci->ci_curproc) {
aston(ci->ci_curproc);
cpu_kick(ci);
}
}
/*
* Allocate an IDT vector slot within the given range.
* XXX needs locking to avoid MP allocation races.
*/
int
idt_vec_alloc(int low, int high)
{
int vec;
for (vec = low; vec <= high; vec++) {
if (idt_allocmap[vec] == 0) {
idt_allocmap[vec] = 1;
return vec;
}
}
return 0;
}
int
idt_vec_alloc_range(int low, int high, int num)
{
int i, vec;
KASSERT(powerof2(num));
low = (low + num - 1) & ~(num - 1);
high = ((high + 1) & ~(num - 1)) - 1;
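	/*
	 * Round low up and high down to multiples of num so that each
	 * probe below starts on a naturally aligned group of vectors.
	 */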
for (vec = low; vec <= high; vec += num) {
for (i = 0; i < num; i++) {
if (idt_allocmap[vec + i] != 0)
break;
}
if (i == num) {
for (i = 0; i < num; i++)
idt_allocmap[vec + i] = 1;
return vec;
}
}
return 0;
}
void
idt_vec_set(int vec, void (*function)(void))
{
/*
* Vector should be allocated, so no locking needed.
*/
KASSERT(idt_allocmap[vec] == 1);
setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
}
void
idt_vec_free(int vec)
{
unsetgate(&idt[vec]);
idt_allocmap[vec] = 0;
}
#ifdef DIAGNOSTIC
void
splassert_check(int wantipl, const char *func)
{
int cpl = curcpu()->ci_ilevel;
int floor = curcpu()->ci_handled_intr_level;
if (cpl < wantipl) {
splassert_fail(wantipl, cpl, func);
}
if (floor > wantipl) {
splassert_fail(wantipl, floor, func);
}
}
#endif
int
copyin32(const uint32_t *uaddr, uint32_t *kaddr)
{
if ((vaddr_t)uaddr & 0x3)
return EFAULT;
/* copyin(9) is atomic */
return copyin(uaddr, kaddr, sizeof(uint32_t));
}
void
getbootinfo(char *bootinfo, int bootinfo_size)
{
bootarg32_t *q;
bios_ddb_t *bios_ddb;
bios_bootduid_t *bios_bootduid;
bios_bootsr_t *bios_bootsr;
#undef BOOTINFO_DEBUG
#ifdef BOOTINFO_DEBUG
printf("bootargv:");
#endif
for (q = (bootarg32_t *)bootinfo;
(q->ba_type != BOOTARG_END) &&
((((char *)q) - bootinfo) < bootinfo_size);
q = (bootarg32_t *)(((char *)q) + q->ba_size)) {
switch (q->ba_type) {
case BOOTARG_MEMMAP:
bios_memmap = (bios_memmap_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
printf(" memmap %p", bios_memmap);
#endif
break;
case BOOTARG_DISKINFO:
bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
printf(" diskinfo %p", bios_diskinfo);
#endif
break;
case BOOTARG_APMINFO:
/* generated by i386 boot loader */
break;
case BOOTARG_CKSUMLEN:
bios_cksumlen = *(u_int32_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
printf(" cksumlen %d", bios_cksumlen);
#endif
break;
case BOOTARG_PCIINFO:
/* generated by i386 boot loader */
break;
case BOOTARG_CONSDEV: {
#if NCOM > 0
bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg;
static const int ports[] =
{ 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
int unit = minor(cdp->consdev);
uint64_t consaddr = cdp->consaddr;
if (consaddr == -1 && unit >= 0 && unit < nitems(ports))
consaddr = ports[unit];
if (major(cdp->consdev) == 8 && consaddr != -1) {
comconsunit = unit;
comconsaddr = consaddr;
comconsrate = cdp->conspeed;
comconsfreq = cdp->consfreq;
comcons_reg_width = cdp->reg_width;
comcons_reg_shift = cdp->reg_shift;
if (cdp->flags & BCD_MMIO)
comconsiot = X86_BUS_SPACE_MEM;
else
comconsiot = X86_BUS_SPACE_IO;
}
#endif
#ifdef BOOTINFO_DEBUG
printf(" console 0x%x:%d", cdp->consdev, cdp->conspeed);
#endif
break;
}
case BOOTARG_BOOTMAC:
bios_bootmac = (bios_bootmac_t *)q->ba_arg;
break;
case BOOTARG_DDB:
bios_ddb = (bios_ddb_t *)q->ba_arg;
#ifdef DDB
db_console = bios_ddb->db_console;
#endif
break;
case BOOTARG_BOOTDUID:
bios_bootduid = (bios_bootduid_t *)q->ba_arg;
memcpy(bootduid, bios_bootduid, sizeof(bootduid));
break;
case BOOTARG_BOOTSR:
bios_bootsr = (bios_bootsr_t *)q->ba_arg;
#if NSOFTRAID > 0
memcpy(&sr_bootuuid, &bios_bootsr->uuid,
sizeof(sr_bootuuid));
memcpy(&sr_bootkey, &bios_bootsr->maskkey,
sizeof(sr_bootkey));
#endif
explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
break;
case BOOTARG_EFIINFO:
bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
break;
case BOOTARG_UCODE:
bios_ucode = (bios_ucode_t *)q->ba_arg;
break;
default:
#ifdef BOOTINFO_DEBUG
printf(" unsupported arg (%d) %p", q->ba_type,
q->ba_arg);
#endif
break;
}
}
#ifdef BOOTINFO_DEBUG
printf("\n");
#endif
}
int
check_context(const struct reg *regs, struct trapframe *tf)
{
uint16_t sel;
if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
return EINVAL;
sel = regs->r_ss & 0xffff;
if (!VALID_USER_DSEL(sel))
return EINVAL;
sel = regs->r_cs & 0xffff;
if (!VALID_USER_CSEL(sel))
return EINVAL;
if (regs->r_rip >= VM_MAXUSER_ADDRESS)
return EINVAL;
return 0;
}
int amd64_delay_quality;
void
delay_init(void(*fn)(int), int fn_quality)
{
if (fn_quality > amd64_delay_quality) {
delay_func = fn;
amd64_delay_quality = fn_quality;
}
}
void
delay_fini(void (*fn)(int))
{
if (fn == delay_func) {
delay_func = i8254_delay;
amd64_delay_quality = 0;
}
}
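/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * a driver with a better busy-wait, e.g. the TSC code, can register
 * it with a higher quality value
 *
 *	delay_init(tsc_delay, 5000);
 *
 * and later call delay_fini(tsc_delay) to drop back to i8254_delay
 * if the source turns out to be unreliable.
 */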