/* $OpenBSD: machdep.c,v 1.290 2024/02/03 16:21:22 deraadt Exp $ */ /* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include extern int db_console; #endif #include "isa.h" #include "isadma.h" #include "ksyms.h" #include "acpi.h" #if NACPI > 0 #include #endif #include "com.h" #if NCOM > 0 #include #include #include #endif #include "efi.h" #if NEFI > 0 #include #endif #include "softraid.h" #if NSOFTRAID > 0 #include #endif #ifdef HIBERNATE #include #endif /* HIBERNATE */ #include "ukbd.h" #include "pckbc.h" #if NPCKBC > 0 && NUKBD > 0 #include #endif /* #define MACHDEP_DEBUG */ #ifdef MACHDEP_DEBUG #define DPRINTF(x...) do { printf(x); } while(0) #else #define DPRINTF(x...) #endif /* MACHDEP_DEBUG */ /* the following is used externally (sysctl_hw) */ char machine[] = MACHINE; /* * switchto vectors */ void cpu_idle_cycle_hlt(void); void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt; /* the following is used externally for concurrent handlers */ int setperf_prio = 0; #ifdef CPURESET_DELAY int cpureset_delay = CPURESET_DELAY; #else int cpureset_delay = 0; #endif char *ssym = 0, *esym = 0; /* start and end of symbol table */ dev_t bootdev = 0; /* device we booted from */ int biosbasemem = 0; /* base memory reported by BIOS */ u_int bootapiver = 0; /* /boot API version */ int physmem; u_int64_t dumpmem_low; u_int64_t dumpmem_high; extern int boothowto; int cpu_class; paddr_t dumpmem_paddr; vaddr_t dumpmem_vaddr; psize_t dumpmem_sz; vaddr_t kern_end; vaddr_t msgbuf_vaddr; paddr_t msgbuf_paddr; vaddr_t idt_vaddr; paddr_t idt_paddr; vaddr_t lo32_vaddr; paddr_t lo32_paddr; paddr_t tramp_pdirpa; int kbd_reset; int lid_action = 1; int pwr_action = 1; int forceukbd; /* * safepri is a safe priority for sleep to set for a spin-wait * during autoconfiguration or after a panic. */ int safepri = 0; struct vm_map *exec_map = NULL; struct vm_map *phys_map = NULL; /* UVM constraint ranges. */ struct uvm_constraint_range isa_constraint = { 0x0, 0x00ffffffUL }; struct uvm_constraint_range dma_constraint = { 0x0, 0xffffffffUL }; struct uvm_constraint_range *uvm_md_constraints[] = { &isa_constraint, &dma_constraint, NULL, }; paddr_t avail_start; paddr_t avail_end; void (*delay_func)(int) = i8254_delay; void (*initclock_func)(void) = i8254_initclocks; void (*startclock_func)(void) = i8254_start_both_clocks; /* * Format of boot information passed to us by 32-bit /boot */ typedef struct _boot_args32 { int ba_type; int ba_size; int ba_nextX; /* a ptr in 32-bit world, but not here */ char ba_arg[1]; } bootarg32_t; #define BOOTARGC_MAX NBPG /* one page */ bios_bootmac_t *bios_bootmac; /* locore copies the arguments from /boot to here for us */ char bootinfo[BOOTARGC_MAX]; int bootinfo_size = BOOTARGC_MAX; void getbootinfo(char *, int); /* Data passed to us by /boot, filled in by getbootinfo() */ bios_diskinfo_t *bios_diskinfo; bios_memmap_t *bios_memmap; u_int32_t bios_cksumlen; bios_efiinfo_t *bios_efiinfo; bios_ucode_t *bios_ucode; #if NEFI > 0 EFI_MEMORY_DESCRIPTOR *mmap; #endif /* * Size of memory segments, before any memory is stolen. 
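 *
 * Each cluster is a { start, size } pair in bytes (at most VM_PHYSSEG_MAX
 * of them), filled in by init_x86_64() from the BIOS memory map.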
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int	mem_cluster_cnt;

int	cpu_dump(void);
int	cpu_dumpsize(void);
u_long	cpu_dump_mempagecnt(void);
void	dumpsys(void);
void	cpu_init_extents(void);
void	map_tramps(void);
void	init_x86_64(paddr_t);
void	(*cpuresetfn)(void);
void	enter_shared_special_pages(void);

#ifdef APERTURE
int allowaperture = 0;
#endif

/*
 * Machine-dependent startup code
 */
void
cpu_startup(void)
{
	vaddr_t minaddr, maxaddr;

	msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);
	startclocks();
	rtcinit();

	printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
	    ptoa((psize_t)physmem)/1024/1024);

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	minaddr = vm_map_min(kernel_map);
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	minaddr = vm_map_min(kernel_map);
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, FALSE, NULL);

	printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
	    ptoa((psize_t)uvmexp.free)/1024/1024);

	bufinit();

	if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
		user_config();
#else
		printf("kernel does not support -c; continuing..\n");
#endif
	}

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();

#ifndef SMALL_KERNEL
	cpu_ucode_setup();
	cpu_ucode_apply(&cpu_info_primary);
#endif
	cpu_tsx_disable(&cpu_info_primary);

	/* enter the IDT and trampoline code in the u-k maps */
	enter_shared_special_pages();

	/* initialize CPU0's TSS and GDT and put them in the u-k maps */
	cpu_enter_pages(&cpu_info_full_primary);
}

/*
 * enter_shared_special_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 IDT page
 *  Various number of pages covering the U-K ".kutext" section. This section
 *   contains code needed during trampoline operation
 *  Various number of pages covering the U-K ".kudata" section. This section
 *   contains data accessed by the trampoline, before switching to U+K
 *   (for example, various shared global variables used by IPIs, etc)
 *
 * The linker script places the required symbols in the sections above.
 *
 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
 * become no-ops.
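 *
 * As a rough illustration (the exact file and layout are an assumption
 * here; they live in the kernel linker script), the bracketing symbols
 * for ".kutext" are produced along these lines:
 *
 *	__kutext_start = .;
 *	*(.kutext)
 *	__kutext_end = .;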
 */
void
enter_shared_special_pages(void)
{
	extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
	extern char __text_page_start[], __text_page_end[];
	extern char __kernel_kutext_page_phys[];
	extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
	vaddr_t va;
	paddr_t pa;

	/* idt */
	pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
	DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)idt_vaddr, (uint64_t)idt_paddr);

	/* .kutext section */
	va = (vaddr_t)__kutext_start;
	pa = (paddr_t)__kernel_kutext_phys;
	while (va < (vaddr_t)__kutext_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kutext.page section */
	va = (vaddr_t)__text_page_start;
	pa = (paddr_t)__kernel_kutext_page_phys;
	while (va < (vaddr_t)__text_page_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kudata section */
	va = (vaddr_t)__kudata_start;
	pa = (paddr_t)__kernel_kudata_phys;
	while (va < (vaddr_t)__kudata_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
		DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
}

/*
 * Set up proc0's PCB and the cpu's TSS.
 */
void
x86_64_proc0_tss_ldt_init(void)
{
	struct pcb *pcb;

	cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
	pcb->pcb_fsbase = 0;
	pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
	proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;

	ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
	lldt(0);
}

bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
	bios_diskinfo_t *pdi;

	if (bios_diskinfo == NULL)
		return NULL;

	for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
		if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
			if (pdi->bsd_dev == dev)
				break;
		} else {
			if (pdi->bios_number == dev)
				break;
		}
	}

	if (pdi->bios_number == -1)
		return NULL;
	else
		return pdi;
}

int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	bios_diskinfo_t *pdi;
	int biosdev;

	/* all sysctl names at this level except diskinfo are terminal */
	if (namelen != 1 && name[0] != BIOS_DISKINFO)
		return (ENOTDIR);		/* overloaded */

	if (!(bootapiver & BAPIV_VECTOR))
		return EOPNOTSUPP;

	switch (name[0]) {
	case BIOS_DEV:
		if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
			return ENXIO;
		biosdev = pdi->bios_number;
		return sysctl_rdint(oldp, oldlenp, newp, biosdev);
	case BIOS_DISKINFO:
		if (namelen != 2)
			return ENOTDIR;
		if ((pdi = bios_getdiskinfo(name[1])) == NULL)
			return ENXIO;
		return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
	case BIOS_CKSUMLEN:
		return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
	default:
		return EOPNOTSUPP;
	}
	/* NOTREACHED */
}

extern int tsc_is_invariant;
extern int amd64_has_xcrypt;
extern int need_retpoline;

const struct sysctl_bounded_args cpuctl_vars[] = {
	{ CPU_LIDACTION, &lid_action, 0, 2 },
	{ CPU_PWRACTION, &pwr_action, 0, 2 },
	{ CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
	{ CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
	{ CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
	{ CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
	{ CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
};

/*
 * machine dependent system variables.
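 *
 * For context, a userland reader of one of these nodes (a hypothetical,
 * minimal example; not part of this file) would look roughly like:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <machine/cpu.h>
 *	#include <err.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int mib[2] = { CTL_MACHDEP, CPU_TSCFREQ };
 *	uint64_t freq;
 *	size_t len = sizeof(freq);
 *
 *	if (sysctl(mib, 2, &freq, &len, NULL, 0) == -1)
 *		err(1, "sysctl machdep.tscfreq");
 *	printf("%llu\n", (unsigned long long)freq);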
*/ int cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen, struct proc *p) { extern uint64_t tsc_frequency; dev_t consdev; dev_t dev; switch (name[0]) { case CPU_CONSDEV: if (namelen != 1) return (ENOTDIR); /* overloaded */ if (cn_tab != NULL) consdev = cn_tab->cn_dev; else consdev = NODEV; return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev, sizeof consdev)); case CPU_CHR2BLK: if (namelen != 2) return (ENOTDIR); /* overloaded */ dev = chrtoblk((dev_t)name[1]); return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev)); case CPU_BIOS: return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p); case CPU_CPUVENDOR: return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor)); case CPU_KBDRESET: return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen, &kbd_reset)); case CPU_ALLOWAPERTURE: if (namelen != 1) return (ENOTDIR); /* overloaded */ #ifdef APERTURE if (securelevel > 0) return (sysctl_int_lower(oldp, oldlenp, newp, newlen, &allowaperture)); else return (sysctl_int(oldp, oldlenp, newp, newlen, &allowaperture)); #else return (sysctl_rdint(oldp, oldlenp, newp, 0)); #endif #if NPCKBC > 0 && NUKBD > 0 case CPU_FORCEUKBD: { int error; if (forceukbd) return (sysctl_rdint(oldp, oldlenp, newp, forceukbd)); error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd); if (forceukbd) pckbc_release_console(); return (error); } #endif case CPU_TSCFREQ: return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency)); default: return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars), name, namelen, oldp, oldlenp, newp, newlen)); } /* NOTREACHED */ } static inline void maybe_enable_user_cet(struct proc *p) { #ifndef SMALL_KERNEL /* Enable indirect-branch tracking if present and not disabled */ if ((xsave_mask & XFEATURE_CET_U) && (p->p_p->ps_flags & PS_NOBTCFI) == 0) { uint64_t msr = rdmsr(MSR_U_CET); wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN); } #endif } static inline void initialize_thread_xstate(struct proc *p) { if (cpu_use_xsaves) { xrstors(fpu_cleandata, xsave_mask); maybe_enable_user_cet(p); } else { /* Reset FPU state in PCB */ memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata, fpu_save_len); if (curcpu()->ci_pflags & CPUPF_USERXSTATE) { /* state in CPU is obsolete; reset it */ fpureset(); } } /* The reset state _is_ the userspace state for this thread now */ curcpu()->ci_pflags |= CPUPF_USERXSTATE; } /* * Copy out the FPU state, massaging it to be usable from userspace * and acceptable to xrstor_user() */ static inline int copyoutfpu(struct savefpu *sfp, char *sp, size_t len) { uint64_t bvs[2]; if (copyout(sfp, sp, len)) return 1; if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) { sp += offsetof(struct savefpu, fp_xstate.xstate_bv); len -= offsetof(struct savefpu, fp_xstate.xstate_bv); bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK; bvs[1] = sfp->fp_xstate.xstate_xcomp_bv & (XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED); if (copyout(bvs, sp, min(len, sizeof bvs))) return 1; } return 0; } /* * Send an interrupt to process. * * Stack is set up to allow sigcode to call routine, followed by * syscall to sigreturn routine below. After sigreturn resets the * signal mask, the stack, and the frame pointer, it returns to the * user specified pc. 
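 *
 * As set up by the code below, the user stack then looks roughly like
 * (higher addresses first):
 *
 *	[ previous %rsp / sigaltstack top, 128-byte red zone skipped ]
 *	sp  -> FPU/xsave area (fpu_save_len bytes, 64-byte aligned for xsaves)
 *	sip -> siginfo_t (only when siginfo was requested)
 *	scp -> struct sigcontext (sc_cookie binds it to this process)
 *
 * sigcode receives (sig, sip, scp) in %rdi/%rsi/%rdx, and %rax carries
 * the handler address.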
*/ int sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip, int info, int onstack) { struct proc *p = curproc; struct trapframe *tf = p->p_md.md_regs; struct sigcontext ksc; struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu; register_t sp, scp, sip; u_long sss; memset(&ksc, 0, sizeof ksc); ksc.sc_rdi = tf->tf_rdi; ksc.sc_rsi = tf->tf_rsi; ksc.sc_rdx = tf->tf_rdx; ksc.sc_rcx = tf->tf_rcx; ksc.sc_r8 = tf->tf_r8; ksc.sc_r9 = tf->tf_r9; ksc.sc_r10 = tf->tf_r10; ksc.sc_r11 = tf->tf_r11; ksc.sc_r12 = tf->tf_r12; ksc.sc_r13 = tf->tf_r13; ksc.sc_r14 = tf->tf_r14; ksc.sc_r15 = tf->tf_r15; ksc.sc_rbx = tf->tf_rbx; ksc.sc_rax = tf->tf_rax; ksc.sc_rbp = tf->tf_rbp; ksc.sc_rip = tf->tf_rip; ksc.sc_cs = tf->tf_cs; ksc.sc_rflags = tf->tf_rflags; ksc.sc_rsp = tf->tf_rsp; ksc.sc_ss = tf->tf_ss; ksc.sc_mask = mask; /* Allocate space for the signal handler context. */ if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 && !sigonstack(tf->tf_rsp) && onstack) sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size); else sp = tf->tf_rsp - 128; sp -= fpu_save_len; if (cpu_use_xsaves) sp &= ~63ULL; /* just in case */ else sp &= ~15ULL; /* just in case */ /* Save FPU state to PCB if necessary, then copy it out */ if (curcpu()->ci_pflags & CPUPF_USERXSTATE) fpusave(&p->p_addr->u_pcb.pcb_savefpu); if (copyoutfpu(sfp, (void *)sp, fpu_save_len)) return 1; initialize_thread_xstate(p); ksc.sc_fpstate = (struct fxsave64 *)sp; sss = (sizeof(ksc) + 15) & ~15; sip = 0; if (info) { sip = sp - ((sizeof(*ksip) + 15) & ~15); sss += (sizeof(*ksip) + 15) & ~15; if (copyout(ksip, (void *)sip, sizeof(*ksip))) return 1; } scp = sp - sss; ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie; if (copyout(&ksc, (void *)scp, sizeof(ksc))) return 1; /* * Build context to run handler in. */ tf->tf_rax = (u_int64_t)catcher; tf->tf_rdi = sig; tf->tf_rsi = sip; tf->tf_rdx = scp; tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode; tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC); tf->tf_rsp = scp; tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); return 0; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. 
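 *
 * The sigcontext cookie written by sendsig() is verified here before
 * anything else is trusted; conceptually:
 *
 *	sendsig:    ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
 *	sigreturn:  if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie))
 *			sigexit(p, SIGILL);
 *
 * so a sigcontext copied to a different address, or forged without
 * knowledge of the per-process cookie, is rejected.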
*/ int sys_sigreturn(struct proc *p, void *v, register_t *retval) { struct sys_sigreturn_args /* { syscallarg(struct sigcontext *) sigcntxp; } */ *uap = v; struct sigcontext ksc, *scp = SCARG(uap, sigcntxp); struct trapframe *tf = p->p_md.md_regs; struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu; int error; if (PROC_PC(p) != p->p_p->ps_sigcoderet) { sigexit(p, SIGILL); return (EPERM); } if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc))) return (error); if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) { sigexit(p, SIGILL); return (EFAULT); } /* Prevent reuse of the sigcontext cookie */ ksc.sc_cookie = 0; (void)copyout(&ksc.sc_cookie, (caddr_t)scp + offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie)); if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 || !USERMODE(ksc.sc_cs, ksc.sc_eflags)) return (EINVAL); /* Current FPU state is obsolete; toss it and force a reload */ if (curcpu()->ci_pflags & CPUPF_USERXSTATE) { curcpu()->ci_pflags &= ~CPUPF_USERXSTATE; fpureset(); } /* Copy in the FPU state to restore */ if (__predict_true(ksc.sc_fpstate != NULL)) { if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len))) return error; if (xrstor_user(sfp, xsave_mask)) { memcpy(sfp, fpu_cleandata, fpu_save_len); return EINVAL; } maybe_enable_user_cet(p); curcpu()->ci_pflags |= CPUPF_USERXSTATE; } else { /* shouldn't happen, but handle it */ initialize_thread_xstate(p); } tf->tf_rdi = ksc.sc_rdi; tf->tf_rsi = ksc.sc_rsi; tf->tf_rdx = ksc.sc_rdx; tf->tf_rcx = ksc.sc_rcx; tf->tf_r8 = ksc.sc_r8; tf->tf_r9 = ksc.sc_r9; tf->tf_r10 = ksc.sc_r10; tf->tf_r11 = ksc.sc_r11; tf->tf_r12 = ksc.sc_r12; tf->tf_r13 = ksc.sc_r13; tf->tf_r14 = ksc.sc_r14; tf->tf_r15 = ksc.sc_r15; tf->tf_rbx = ksc.sc_rbx; tf->tf_rax = ksc.sc_rax; tf->tf_rbp = ksc.sc_rbp; tf->tf_rip = ksc.sc_rip; tf->tf_cs = ksc.sc_cs; tf->tf_rflags = ksc.sc_rflags; tf->tf_rsp = ksc.sc_rsp; tf->tf_ss = ksc.sc_ss; /* Restore signal mask. */ p->p_sigmask = ksc.sc_mask & ~sigcantmask; /* * sigreturn() needs to return to userspace via the 'iretq' * method, so that if the process was interrupted (by tick, * an IPI, whatever) as opposed to already being in the kernel * when a signal was being delivered, the process will be * completely restored, including the userland %rcx and %r11 * registers which the 'sysretq' instruction cannot restore. * Also need to make sure we can handle faulting on xrstor. */ p->p_md.md_flags |= MDP_IRET; return (EJUSTRETURN); } #ifdef MULTIPROCESSOR /* force a CPU into the kernel, whether or not it's idle */ void cpu_kick(struct cpu_info *ci) { /* only need to kick other CPUs */ if (ci != curcpu()) { if (cpu_mwait_size > 0) { /* * If not idling, then send an IPI, else * just clear the "keep idling" bit. */ if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0) x86_send_ipi(ci, X86_IPI_NOP); else atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING); } else { /* no mwait, so need an IPI */ x86_send_ipi(ci, X86_IPI_NOP); } } } #endif /* * Notify the current process (p) that it has a signal pending, * process as soon as possible. */ void signotify(struct proc *p) { aston(p); cpu_kick(p->p_cpu); } #ifdef MULTIPROCESSOR void cpu_unidle(struct cpu_info *ci) { if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) { /* * Just clear the "keep idling" bit; if it wasn't * idling then we didn't need to do anything anyway. 
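 *
 * This pairs with the mwait idle loop (which lives elsewhere; described
 * loosely here, not quoted): the idling CPU advertises itself with
 * MWAIT_IN_IDLE, monitors &ci->ci_mwait, and only re-arms mwait while
 * MWAIT_KEEP_IDLING is still set, so clearing that bit is enough to wake
 * it without an IPI.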
*/ atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING); return; } if (ci != curcpu()) x86_send_ipi(ci, X86_IPI_NOP); } #endif int waittime = -1; struct pcb dumppcb; __dead void boot(int howto) { if ((howto & RB_POWERDOWN) != 0) lid_action = 0; if ((howto & RB_RESET) != 0) goto doreset; if (cold) { if ((howto & RB_USERREQ) == 0) howto |= RB_HALT; goto haltsys; } boothowto = howto; if ((howto & RB_NOSYNC) == 0 && waittime < 0) { waittime = 0; vfs_shutdown(curproc); if ((howto & RB_TIMEBAD) == 0) { resettodr(); } else { printf("WARNING: not updating battery clock\n"); } } if_downall(); uvm_shutdown(); splhigh(); cold = 1; if ((howto & RB_DUMP) != 0) dumpsys(); haltsys: config_suspend_all(DVACT_POWERDOWN); #ifdef MULTIPROCESSOR x86_broadcast_ipi(X86_IPI_HALT); #endif if ((howto & RB_HALT) != 0) { #if NACPI > 0 && !defined(SMALL_KERNEL) extern int acpi_enabled; if (acpi_enabled) { delay(500000); if ((howto & RB_POWERDOWN) != 0) acpi_powerdown(); } #endif printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cnpollc(1); /* for proper keyboard command handling */ cngetc(); cnpollc(0); } doreset: printf("rebooting...\n"); if (cpureset_delay > 0) delay(cpureset_delay * 1000); cpu_reset(); for (;;) continue; /* NOTREACHED */ } /* * These variables are needed by /sbin/savecore */ u_long dumpmag = 0x8fca0101; /* magic number */ int dumpsize = 0; /* pages */ long dumplo = 0; /* blocks */ /* * cpu_dump: dump the machine-dependent kernel core dump headers. */ int cpu_dump(void) { int (*dump)(dev_t, daddr_t, caddr_t, size_t); char buf[dbtob(1)]; kcore_seg_t *segp; cpu_kcore_hdr_t *cpuhdrp; phys_ram_seg_t *memsegp; caddr_t va; int i; dump = bdevsw[major(dumpdev)].d_dump; memset(buf, 0, sizeof buf); segp = (kcore_seg_t *)buf; cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))]; memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) + ALIGN(sizeof(*cpuhdrp))]; /* * Generate a segment header. */ CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU); segp->c_size = dbtob(1) - ALIGN(sizeof(*segp)); /* * Add the machine-dependent header info. */ cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3; cpuhdrp->nmemsegs = mem_cluster_cnt; /* * Fill in the memory segment descriptors. */ for (i = 0; i < mem_cluster_cnt; i++) { memsegp[i].start = mem_clusters[i].start; memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK; } /* * If we have dump memory then assume the kernel stack is in high * memory and bounce */ if (dumpmem_vaddr != 0) { memcpy((char *)dumpmem_vaddr, buf, sizeof(buf)); va = (caddr_t)dumpmem_vaddr; } else { va = (caddr_t)buf; } return (dump(dumpdev, dumplo, va, dbtob(1))); } /* * This is called by main to set dumplo and dumpsize. * Dumps always skip the first PAGE_SIZE of disk space * in case there might be a disk label stored there. * If there is extra space, put dump at the end to * reduce the chance that swapping trashes it. */ void dumpconf(void) { int nblks, dumpblks; /* size of dump area */ if (dumpdev == NODEV || (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0) return; if (nblks <= ctod(1)) return; dumpblks = cpu_dumpsize(); if (dumpblks < 0) return; dumpblks += ctod(cpu_dump_mempagecnt()); /* If dump won't fit (incl. room for possible label), punt. */ if (dumpblks > (nblks - ctod(1))) return; /* Put dump at end of partition */ dumplo = nblks - dumpblks; /* dumpsize is in page units, and doesn't include headers. 
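 *
 * Putting the pieces together, for a dump partition of nblks blocks:
 *
 *	dumpblks = cpu_dumpsize() + ctod(cpu_dump_mempagecnt());
 *	dumplo   = nblks - dumpblks;	   (dump sits at the end)
 *	dumpsize = cpu_dump_mempagecnt();  (pages, headers excluded)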
*/ dumpsize = cpu_dump_mempagecnt(); } /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. */ #define BYTES_PER_DUMP MAXPHYS /* must be a multiple of pagesize */ void dumpsys(void) { u_long totalbytesleft, bytes, i, n, memseg; u_long maddr; daddr_t blkno; void *va; int (*dump)(dev_t, daddr_t, caddr_t, size_t); int error; /* Save registers. */ savectx(&dumppcb); if (dumpdev == NODEV) return; /* * For dumps during autoconfiguration, * if dump device has already configured... */ if (dumpsize == 0) dumpconf(); if (dumplo <= 0 || dumpsize == 0) { printf("\ndump to dev %u,%u not possible\n", major(dumpdev), minor(dumpdev)); return; } printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev), minor(dumpdev), dumplo); error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev); printf("dump "); if (error == -1) { printf("area unavailable\n"); return; } if ((error = cpu_dump()) != 0) goto err; totalbytesleft = ptoa(cpu_dump_mempagecnt()); blkno = dumplo + cpu_dumpsize(); dump = bdevsw[major(dumpdev)].d_dump; error = 0; for (memseg = 0; memseg < mem_cluster_cnt; memseg++) { maddr = mem_clusters[memseg].start; bytes = mem_clusters[memseg].size; for (i = 0; i < bytes; i += n, totalbytesleft -= n) { /* Print out how many MBs we have left to go. */ if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP) printf("%ld ", totalbytesleft / (1024 * 1024)); /* Limit size for next transfer. */ n = bytes - i; if (n > BYTES_PER_DUMP) n = BYTES_PER_DUMP; if (maddr > 0xffffffff) { va = (void *)dumpmem_vaddr; if (n > dumpmem_sz) n = dumpmem_sz; memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n); } else { va = (void *)PMAP_DIRECT_MAP(maddr); } error = (*dump)(dumpdev, blkno, va, n); if (error) goto err; maddr += n; blkno += btodb(n); /* XXX? */ #if 0 /* XXX this doesn't work. grr. */ /* operator aborting dump? */ if (sget() != NULL) { error = EINTR; break; } #endif } } err: switch (error) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; case 0: printf("succeeded\n"); break; default: printf("error %d\n", error); break; } printf("\n\n"); delay(5000000); /* 5 seconds */ } /* * Force the userspace FS.base to be reloaded from the PCB on return from * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs) * to their expected userspace value. */ void reset_segs(void) { /* * This operates like the cpu_switchto() sequence: if we * haven't reset %[defg]s already, do so now. 
*/ if (curcpu()->ci_pflags & CPUPF_USERSEGS) { curcpu()->ci_pflags &= ~CPUPF_USERSEGS; __asm volatile( "movw %%ax,%%ds\n\t" "movw %%ax,%%es\n\t" "movw %%ax,%%fs\n\t" "cli\n\t" /* block intr when on user GS.base */ "swapgs\n\t" /* swap from kernel to user GS.base */ "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */ "swapgs\n\t" /* back to kernel GS.base */ "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL))); } } /* * Clear registers on exec */ void setregs(struct proc *p, struct exec_package *pack, u_long stack, struct ps_strings *arginfo) { struct trapframe *tf; initialize_thread_xstate(p); /* To reset all registers we have to return via iretq */ p->p_md.md_flags |= MDP_IRET; reset_segs(); p->p_addr->u_pcb.pcb_fsbase = 0; tf = p->p_md.md_regs; memset(tf, 0, sizeof *tf); tf->tf_rip = pack->ep_entry; tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); tf->tf_rflags = PSL_USERSET; tf->tf_rsp = stack; tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); } /* * Initialize segments and descriptor tables */ struct gate_descriptor *idt; char idt_allocmap[NIDT]; struct user *proc0paddr = NULL; void setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel) { gd->gd_looffset = (u_int64_t)func & 0xffff; gd->gd_selector = sel; gd->gd_ist = ist; gd->gd_type = type; gd->gd_dpl = dpl; gd->gd_p = 1; gd->gd_hioffset = (u_int64_t)func >> 16; gd->gd_zero = 0; gd->gd_xx1 = 0; gd->gd_xx2 = 0; gd->gd_xx3 = 0; } void unsetgate(struct gate_descriptor *gd) { memset(gd, 0, sizeof (*gd)); } void setregion(struct region_descriptor *rd, void *base, u_int16_t limit) { rd->rd_limit = limit; rd->rd_base = (u_int64_t)base; } /* * Note that the base and limit fields are ignored in long mode. */ void set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, int type, int dpl, int gran, int def32, int is64) { sd->sd_lolimit = (unsigned)limit; sd->sd_lobase = (unsigned long)base; sd->sd_type = type; sd->sd_dpl = dpl; sd->sd_p = 1; sd->sd_hilimit = (unsigned)limit >> 16; sd->sd_avl = 0; sd->sd_long = is64; sd->sd_def32 = def32; sd->sd_gran = gran; sd->sd_hibase = (unsigned long)base >> 24; } void set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, int type, int dpl, int gran) { memset(sd, 0, sizeof *sd); sd->sd_lolimit = (unsigned)limit; sd->sd_lobase = (u_int64_t)base; sd->sd_type = type; sd->sd_dpl = dpl; sd->sd_p = 1; sd->sd_hilimit = (unsigned)limit >> 16; sd->sd_gran = gran; sd->sd_hibase = (u_int64_t)base >> 24; } void cpu_init_idt(void) { struct region_descriptor region; setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); lidt(®ion); } void cpu_init_extents(void) { extern struct extent *iomem_ex; static int already_done; int i; /* We get called for each CPU, only first should do this */ if (already_done) return; /* * Allocate the physical addresses used by RAM from the iomem * extent map. */ for (i = 0; i < mem_cluster_cnt; i++) { if (extent_alloc_region(iomem_ex, mem_clusters[i].start, mem_clusters[i].size, EX_NOWAIT)) { /* XXX What should we do? 
*/ printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)" " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start, mem_clusters[i].start + mem_clusters[i].size - 1); } } already_done = 1; } void map_tramps(void) { #if defined(MULTIPROCESSOR) || \ (NACPI > 0 && !defined(SMALL_KERNEL)) struct pmap *kmp = pmap_kernel(); extern paddr_t tramp_pdirpa; #ifdef MULTIPROCESSOR extern u_char cpu_spinup_trampoline[]; extern u_char cpu_spinup_trampoline_end[]; extern u_char mp_tramp_data_start[]; extern u_char mp_tramp_data_end[]; extern u_int32_t mp_pdirpa; #endif /* * The initial PML4 pointer must be below 4G, so if the * current one isn't, use a "bounce buffer" and save it * for tramps to use. */ if (kmp->pm_pdirpa > 0xffffffff) { pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE); memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE); tramp_pdirpa = lo32_paddr; pmap_kremove(lo32_vaddr, PAGE_SIZE); } else tramp_pdirpa = kmp->pm_pdirpa; #ifdef MULTIPROCESSOR /* Map MP tramp code and data pages RW for copy */ pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, PROT_READ | PROT_WRITE); pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, PROT_READ | PROT_WRITE); memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE); memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE); memcpy((caddr_t)MP_TRAMPOLINE, cpu_spinup_trampoline, cpu_spinup_trampoline_end-cpu_spinup_trampoline); memcpy((caddr_t)MP_TRAMP_DATA, mp_tramp_data_start, mp_tramp_data_end - mp_tramp_data_start); /* * We need to patch this after we copy the tramp data, * the symbol points into the copied tramp data page. */ mp_pdirpa = tramp_pdirpa; /* Unmap, will be remapped in cpu_start_secondary */ pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE); pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE); #endif /* MULTIPROCESSOR */ #endif } #define IDTVEC(name) __CONCAT(X, name) typedef void (vector)(void); extern vector *IDTVEC(exceptions)[]; paddr_t early_pte_pages; void init_x86_64(paddr_t first_avail) { struct region_descriptor region; bios_memmap_t *bmp; int x, ist; uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30; /* * locore0 mapped 3 pages for use before the pmap is initialized * starting at first_avail. These pages are currently used by * efifb to create early-use VAs for the framebuffer before efifb * is attached. */ early_pte_pages = first_avail; first_avail += 3 * NBPG; cpu_init_msrs(&cpu_info_primary); proc0.p_addr = proc0paddr; cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb; x86_bus_space_init(); i8254_startclock(); /* * Initialize PAGE_SIZE-dependent variables. */ uvm_setpagesize(); /* * Boot arguments are in a single page specified by /boot. * * We require the "new" vector form, as well as memory ranges * to be given in bytes rather than KB. * * locore copies the data into bootinfo[] for us. */ if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) == (BAPIV_VECTOR | BAPIV_BMEMMAP)) { if (bootinfo_size >= sizeof(bootinfo)) panic("boot args too big"); getbootinfo(bootinfo, bootinfo_size); } else panic("invalid /boot"); cninit(); /* * Memory on the AMD64 port is described by three different things. * * 1. biosbasemem - This is outdated, and should really only be used to * sanitize the other values. This is what we get back from the BIOS * using the legacy routines, describing memory below 640KB. * * 2. bios_memmap[] - This is the memory map as the bios has returned * it to us. It includes memory the kernel occupies, etc. * * 3. 
mem_cluster[] - This is the massaged free memory segments after * taking into account the contents of bios_memmap, biosbasemem, * and locore/machdep/pmap kernel allocations of physical * pages. * * The other thing is that the physical page *RANGE* is described by * three more variables: * * avail_start - This is a physical address of the start of available * pages, until IOM_BEGIN. This is basically the start * of the UVM managed range of memory, with some holes... * * avail_end - This is the end of physical pages. All physical pages * that UVM manages are between avail_start and avail_end. * There are holes... * * first_avail - This is the first available physical page after the * kernel, page tables, etc. * * We skip the first few pages for trampolines, hibernate, and to avoid * buggy SMI implementations that could corrupt the first 64KB. */ avail_start = 16*PAGE_SIZE; #ifdef MULTIPROCESSOR if (avail_start < MP_TRAMPOLINE + PAGE_SIZE) avail_start = MP_TRAMPOLINE + PAGE_SIZE; if (avail_start < MP_TRAMP_DATA + PAGE_SIZE) avail_start = MP_TRAMP_DATA + PAGE_SIZE; #endif #if (NACPI > 0 && !defined(SMALL_KERNEL)) if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE) avail_start = ACPI_TRAMPOLINE + PAGE_SIZE; if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE) avail_start = ACPI_TRAMP_DATA + PAGE_SIZE; #endif #ifdef HIBERNATE if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE) avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE; #endif /* HIBERNATE */ /* * We need to go through the BIOS memory map given, and * fill out mem_clusters and mem_cluster_cnt stuff, taking * into account all the points listed above. */ avail_end = mem_cluster_cnt = 0; for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) { paddr_t s1, s2, e1, e2; /* Ignore non-free memory */ if (bmp->type != BIOS_MAP_FREE) continue; if (bmp->size < PAGE_SIZE) continue; /* Init our segment(s), round/trunc to pages */ s1 = round_page(bmp->addr); e1 = trunc_page(bmp->addr + bmp->size); s2 = e2 = 0; /* * XXX Some buggy ACPI BIOSes use memory that they * declare as free. Current worst offender is * Supermicro 5019D-FTN4. Typically the affected memory * areas are small blocks between areas reserved for * ACPI and other BIOS goo. So skip areas smaller * than 32 MB above the 16 MB boundary (to avoid * affecting legacy stuff). */ if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024) continue; /* Check and adjust our segment(s) */ /* Nuke low pages */ if (s1 < avail_start) { s1 = avail_start; if (s1 > e1) continue; } /* * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of * memory, so discard anything above that. */ if (e1 >= max_dm_size) { e1 = max_dm_size; if (s1 > e1) continue; } /* Crop stuff into "640K hole" */ if (s1 < IOM_BEGIN && e1 > IOM_BEGIN) e1 = IOM_BEGIN; if (s1 < biosbasemem && e1 > biosbasemem) e1 = biosbasemem; /* Split any segments straddling the 16MB boundary */ if (s1 < 16*1024*1024 && e1 > 16*1024*1024) { e2 = e1; s2 = e1 = 16*1024*1024; } /* Store segment(s) */ if (e1 - s1 >= PAGE_SIZE) { mem_clusters[mem_cluster_cnt].start = s1; mem_clusters[mem_cluster_cnt].size = e1 - s1; mem_cluster_cnt++; } if (e2 - s2 >= PAGE_SIZE) { mem_clusters[mem_cluster_cnt].start = s2; mem_clusters[mem_cluster_cnt].size = e2 - s2; mem_cluster_cnt++; } if (avail_end < e1) avail_end = e1; if (avail_end < e2) avail_end = e2; } /* * Call pmap initialization to make new kernel address space. * We must do this before loading pages into the VM system. 
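 *
 * Once pmap_bootstrap() returns, physical memory below the direct-map
 * limit can be reached through PMAP_DIRECT_MAP(), which is what the EFI
 * memory map relocation just below relies on, e.g.:
 *
 *	memcpy((void *)PMAP_DIRECT_MAP(dst_pa),
 *	    (void *)PMAP_DIRECT_MAP(src_pa), len);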
*/ first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end)); #if NEFI > 0 /* Relocate the EFI memory map. */ if (bios_efiinfo && bios_efiinfo->mmap_start) { mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail); memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start), bios_efiinfo->mmap_size); first_avail += round_page(bios_efiinfo->mmap_size); } #endif /* Allocate these out of the 640KB base memory */ if (avail_start != PAGE_SIZE) avail_start = pmap_prealloc_lowmem_ptps(avail_start); cpu_init_extents(); /* Make sure the end of the space used by the kernel is rounded. */ first_avail = round_page(first_avail); kern_end = KERNBASE + first_avail; /* * Now, load the memory clusters (which have already been * flensed) into the VM system. */ for (x = 0; x < mem_cluster_cnt; x++) { paddr_t seg_start = mem_clusters[x].start; paddr_t seg_end = seg_start + mem_clusters[x].size; if (seg_start < first_avail) seg_start = first_avail; if (seg_start > seg_end) continue; if (seg_end - seg_start < PAGE_SIZE) continue; physmem += atop(mem_clusters[x].size); #if DEBUG_MEMLOAD printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n", seg_start, seg_end, atop(seg_start), atop(seg_end)); #endif uvm_page_physload(atop(seg_start), atop(seg_end), atop(seg_start), atop(seg_end), 0); } /* * Now, load the memory between the end of I/O memory "hole" * and the kernel. */ { paddr_t seg_start = round_page(IOM_END); paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE); if (seg_start < seg_end) { #if DEBUG_MEMLOAD printf("loading 0x%lx-0x%lx\n", seg_start, seg_end); #endif uvm_page_physload(atop(seg_start), atop(seg_end), atop(seg_start), atop(seg_end), 0); } } #if DEBUG_MEMLOAD printf("avail_start = 0x%lx\n", avail_start); printf("avail_end = 0x%lx\n", avail_end); printf("first_avail = 0x%lx\n", first_avail); #endif /* * Steal memory for the message buffer (at end of core). */ { struct vm_physseg *vps = NULL; psize_t sz = round_page(MSGBUFSIZE); psize_t reqsz = sz; for (x = 0; x < vm_nphysseg; x++) { vps = &vm_physmem[x]; if (ptoa(vps->avail_end) == avail_end) break; } if (x == vm_nphysseg) panic("init_x86_64: can't find end of memory"); /* Shrink so it'll fit in the last segment. */ if ((vps->avail_end - vps->avail_start) < atop(sz)) sz = ptoa(vps->avail_end - vps->avail_start); vps->avail_end -= atop(sz); vps->end -= atop(sz); msgbuf_paddr = ptoa(vps->avail_end); /* Remove the last segment if it now has no pages. */ if (vps->start == vps->end) { for (vm_nphysseg--; x < vm_nphysseg; x++) vm_physmem[x] = vm_physmem[x + 1]; } /* Now find where the new avail_end is. */ for (avail_end = 0, x = 0; x < vm_nphysseg; x++) if (vm_physmem[x].avail_end > avail_end) avail_end = vm_physmem[x].avail_end; avail_end = ptoa(avail_end); /* Warn if the message buffer had to be shrunk. */ if (sz != reqsz) printf("WARNING: %ld bytes not available for msgbuf " "in last cluster (%ld used)\n", reqsz, sz); } /* * Steal some memory for a dump bouncebuffer if we have memory over * the 32-bit barrier. */ if (avail_end > 0xffffffff) { struct vm_physseg *vps = NULL; psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1))); /* XXX assumes segments are ordered */ for (x = 0; x < vm_nphysseg; x++) { vps = &vm_physmem[x]; /* Find something between 16meg and 4gig */ if (ptoa(vps->avail_end) <= 0xffffffff && ptoa(vps->avail_start) >= 0xffffff) break; } if (x == vm_nphysseg) panic("init_x86_64: no memory between " "0xffffff-0xffffffff"); /* Shrink so it'll fit in the segment. 
*/ if ((vps->avail_end - vps->avail_start) < atop(sz)) sz = ptoa(vps->avail_end - vps->avail_start); vps->avail_end -= atop(sz); vps->end -= atop(sz); dumpmem_paddr = ptoa(vps->avail_end); dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr); dumpmem_sz = sz; /* Remove the last segment if it now has no pages. */ if (vps->start == vps->end) { for (vm_nphysseg--; x < vm_nphysseg; x++) vm_physmem[x] = vm_physmem[x + 1]; } } pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE); idt = (struct gate_descriptor *)idt_vaddr; cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss; cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt; /* make gdt gates and memory segments */ set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0, 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0, 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE32_SEL), 0, atop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0, atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0, atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL), cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1, SDT_SYS386TSS, SEL_KPL, 0); /* exceptions */ for (x = 0; x < 32; x++) { /* trap2 == NMI, trap8 == double fault */ ist = (x == 2) ? 2 : (x == 8) ? 1 : 0; setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT, (x == 3) ? SEL_UPL : SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); idt_allocmap[x] = 1; } setregion(®ion, cpu_info_primary.ci_gdt, GDT_SIZE - 1); lgdt(®ion); cpu_init_idt(); intr_default_setup(); fpuinit(&cpu_info_primary); softintr_init(); splraise(IPL_IPI); intr_enable(); #ifdef DDB db_machine_init(); ddb_init(); if (boothowto & RB_KDB) db_enter(); #endif } void cpu_reset(void) { intr_disable(); if (cpuresetfn) (*cpuresetfn)(); /* * The keyboard controller has 4 random output pins, one of which is * connected to the RESET pin on the CPU in many PCs. We tell the * keyboard controller to pulse this line a couple of times. */ outb(IO_KBD + KBCMDP, KBC_PULSE0); delay(100000); outb(IO_KBD + KBCMDP, KBC_PULSE0); delay(100000); /* * Try to cause a triple fault and watchdog reset by making the IDT * invalid and causing a fault. */ memset((caddr_t)idt, 0, NIDT * sizeof(idt[0])); __asm volatile("divl %0,%1" : : "q" (0), "a" (0)); for (;;) continue; /* NOTREACHED */ } /* * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers. */ int cpu_dumpsize(void) { int size; size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); if (roundup(size, dbtob(1)) != dbtob(1)) return (-1); return (1); } /* * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped. */ u_long cpu_dump_mempagecnt(void) { u_long i, n; n = 0; for (i = 0; i < mem_cluster_cnt; i++) n += atop(mem_clusters[i].size); return (n); } /* * Figure out which portions of memory are used by the kernel/system. 
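 *
 * A caller walking physical memory (image-building code such as
 * hibernate is the sort of consumer intended, though that is an
 * assumption here) simply skips pages for which this returns nonzero:
 *
 *	if (amd64_pa_used(pa))
 *		continue;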
*/ int amd64_pa_used(paddr_t addr) { struct vm_page *pg; /* Kernel manages these */ if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0) return 1; /* Kernel is loaded here */ if (addr > IOM_END && addr < (kern_end - KERNBASE)) return 1; /* Low memory used for various bootstrap things */ if (addr < avail_start) return 1; /* * The only regions I can think of that are left are the things * we steal away from UVM. The message buffer? * XXX - ignore these for now. */ return 0; } void cpu_initclocks(void) { (*initclock_func)(); } void cpu_startclock(void) { (*startclock_func)(); } void need_resched(struct cpu_info *ci) { ci->ci_want_resched = 1; /* There's a risk we'll be called before the idle threads start */ if (ci->ci_curproc) { aston(ci->ci_curproc); cpu_kick(ci); } } /* * Allocate an IDT vector slot within the given range. * XXX needs locking to avoid MP allocation races. */ int idt_vec_alloc(int low, int high) { int vec; for (vec = low; vec <= high; vec++) { if (idt_allocmap[vec] == 0) { idt_allocmap[vec] = 1; return vec; } } return 0; } int idt_vec_alloc_range(int low, int high, int num) { int i, vec; KASSERT(powerof2(num)); low = (low + num - 1) & ~(num - 1); high = ((high + 1) & ~(num - 1)) - 1; for (vec = low; vec <= high; vec += num) { for (i = 0; i < num; i++) { if (idt_allocmap[vec + i] != 0) break; } if (i == num) { for (i = 0; i < num; i++) idt_allocmap[vec + i] = 1; return vec; } } return 0; } void idt_vec_set(int vec, void (*function)(void)) { /* * Vector should be allocated, so no locking needed. */ KASSERT(idt_allocmap[vec] == 1); setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); } void idt_vec_free(int vec) { unsetgate(&idt[vec]); idt_allocmap[vec] = 0; } #ifdef DIAGNOSTIC void splassert_check(int wantipl, const char *func) { int cpl = curcpu()->ci_ilevel; int floor = curcpu()->ci_handled_intr_level; if (cpl < wantipl) { splassert_fail(wantipl, cpl, func); } if (floor > wantipl) { splassert_fail(wantipl, floor, func); } } #endif int copyin32(const uint32_t *uaddr, uint32_t *kaddr) { if ((vaddr_t)uaddr & 0x3) return EFAULT; /* copyin(9) is atomic */ return copyin(uaddr, kaddr, sizeof(uint32_t)); } void getbootinfo(char *bootinfo, int bootinfo_size) { bootarg32_t *q; bios_ddb_t *bios_ddb; bios_bootduid_t *bios_bootduid; bios_bootsr_t *bios_bootsr; #undef BOOTINFO_DEBUG #ifdef BOOTINFO_DEBUG printf("bootargv:"); #endif for (q = (bootarg32_t *)bootinfo; (q->ba_type != BOOTARG_END) && ((((char *)q) - bootinfo) < bootinfo_size); q = (bootarg32_t *)(((char *)q) + q->ba_size)) { switch (q->ba_type) { case BOOTARG_MEMMAP: bios_memmap = (bios_memmap_t *)q->ba_arg; #ifdef BOOTINFO_DEBUG printf(" memmap %p", bios_memmap); #endif break; case BOOTARG_DISKINFO: bios_diskinfo = (bios_diskinfo_t *)q->ba_arg; #ifdef BOOTINFO_DEBUG printf(" diskinfo %p", bios_diskinfo); #endif break; case BOOTARG_APMINFO: /* generated by i386 boot loader */ break; case BOOTARG_CKSUMLEN: bios_cksumlen = *(u_int32_t *)q->ba_arg; #ifdef BOOTINFO_DEBUG printf(" cksumlen %d", bios_cksumlen); #endif break; case BOOTARG_PCIINFO: /* generated by i386 boot loader */ break; case BOOTARG_CONSDEV: { #if NCOM > 0 bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg; static const int ports[] = { 0x3f8, 0x2f8, 0x3e8, 0x2e8 }; int unit = minor(cdp->consdev); uint64_t consaddr = cdp->consaddr; if (consaddr == -1 && unit >= 0 && unit < nitems(ports)) consaddr = ports[unit]; if (major(cdp->consdev) == 8 && consaddr != -1) { comconsunit = unit; comconsaddr = consaddr; comconsrate = 
				    cdp->conspeed;
				comconsfreq = cdp->consfreq;
				comcons_reg_width = cdp->reg_width;
				comcons_reg_shift = cdp->reg_shift;
				if (cdp->flags & BCD_MMIO)
					comconsiot = X86_BUS_SPACE_MEM;
				else
					comconsiot = X86_BUS_SPACE_IO;
			}
#endif
#ifdef BOOTINFO_DEBUG
			printf(" console 0x%x:%d",
			    cdp->consdev, cdp->conspeed);
#endif
			break;
		}
		case BOOTARG_BOOTMAC:
			bios_bootmac = (bios_bootmac_t *)q->ba_arg;
			break;

		case BOOTARG_DDB:
			bios_ddb = (bios_ddb_t *)q->ba_arg;
#ifdef DDB
			db_console = bios_ddb->db_console;
#endif
			break;

		case BOOTARG_BOOTDUID:
			bios_bootduid = (bios_bootduid_t *)q->ba_arg;
			memcpy(bootduid, bios_bootduid, sizeof(bootduid));
			break;

		case BOOTARG_BOOTSR:
			bios_bootsr = (bios_bootsr_t *)q->ba_arg;
#if NSOFTRAID > 0
			memcpy(&sr_bootuuid, &bios_bootsr->uuid,
			    sizeof(sr_bootuuid));
			memcpy(&sr_bootkey, &bios_bootsr->maskkey,
			    sizeof(sr_bootkey));
#endif
			explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
			break;

		case BOOTARG_EFIINFO:
			bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
			break;

		case BOOTARG_UCODE:
			bios_ucode = (bios_ucode_t *)q->ba_arg;
			break;

		default:
#ifdef BOOTINFO_DEBUG
			printf(" unsupported arg (%d) %p", q->ba_type,
			    q->ba_arg);
#endif
			break;
		}
	}
#ifdef BOOTINFO_DEBUG
	printf("\n");
#endif
}

int
check_context(const struct reg *regs, struct trapframe *tf)
{
	uint16_t sel;

	if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

	sel = regs->r_ss & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

	sel = regs->r_cs & 0xffff;
	if (!VALID_USER_CSEL(sel))
		return EINVAL;

	if (regs->r_rip >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}

int amd64_delay_quality;

void
delay_init(void(*fn)(int), int fn_quality)
{
	if (fn_quality > amd64_delay_quality) {
		delay_func = fn;
		amd64_delay_quality = fn_quality;
	}
}

void
delay_fini(void (*fn)(int))
{
	if (fn == delay_func) {
		delay_func = i8254_delay;
		amd64_delay_quality = 0;
	}
}
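
/*
 * delay_func starts out as i8254_delay and is upgraded by better timers
 * as they attach: each provider calls delay_init() with a quality value
 * and the highest quality registered wins.  For instance (illustrative
 * names and values only; the real callers and constants live with the
 * respective drivers):
 *
 *	delay_init(tsc_delay, 5000);	register an invariant-TSC delay
 *	delay_fini(tsc_delay);		withdraw it, falling back to i8254
 */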