3424 lines
82 KiB
C
3424 lines
82 KiB
C
/* $OpenBSD: pmap.c,v 1.165 2023/12/29 13:23:27 jca Exp $ */
|
|
/* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */
|
|
|
|
/*
|
|
* Copyright (c) 1997 Charles D. Cranor and Washington University.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* Copyright 2001 (c) Wasabi Systems, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Written by Frank van der Linden for Wasabi Systems, Inc.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgement:
|
|
* This product includes software developed for the NetBSD Project by
|
|
* Wasabi Systems, Inc.
|
|
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
|
|
* or promote products derived from this software without specific prior
|
|
* written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This is the i386 pmap modified and generalized to support x86-64
|
|
* as well. The idea is to hide the upper N levels of the page tables
|
|
* inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
|
|
* is mostly untouched, except that it uses some more generalized
|
|
* macros and interfaces.
|
|
*
|
|
* This pmap has been tested on the i386 as well, and it can be easily
|
|
* adapted to PAE.
|
|
*
|
|
* fvdl@wasabisystems.com 18-Jun-2001
|
|
*/
|
|
|
|
/*
|
|
* pmap.c: i386 pmap module rewrite
|
|
* Chuck Cranor <chuck@ccrc.wustl.edu>
|
|
* 11-Aug-97
|
|
*
|
|
* history of this pmap module: in addition to my own input, i used
|
|
* the following references for this rewrite of the i386 pmap:
|
|
*
|
|
* [1] the NetBSD i386 pmap. this pmap appears to be based on the
|
|
* BSD hp300 pmap done by Mike Hibler at University of Utah.
|
|
* it was then ported to the i386 by William Jolitz of UUNET
|
|
* Technologies, Inc. Then Charles M. Hannum of the NetBSD
|
|
* project fixed some bugs and provided some speed ups.
|
|
*
|
|
* [2] the FreeBSD i386 pmap. this pmap seems to be the
|
|
* Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
|
|
* and David Greenman.
|
|
*
|
|
* [3] the Mach pmap. this pmap, from CMU, seems to have migrated
|
|
* between several processors. the VAX version was done by
|
|
* Avadis Tevanian, Jr., and Michael Wayne Young. the i386
|
|
* version was done by Lance Berc, Mike Kupfer, Bob Baron,
|
|
* David Golub, and Richard Draves. the alpha version was
|
|
* done by Alessandro Forin (CMU/Mach) and Chris Demetriou
|
|
* (NetBSD/alpha).
|
|
*/
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/atomic.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/pool.h>
|
|
#include <sys/user.h>
|
|
#include <sys/mutex.h>
|
|
|
|
#include <uvm/uvm.h>
|
|
|
|
#include <machine/cpu.h>
|
|
#ifdef MULTIPROCESSOR
|
|
#include <machine/i82489reg.h>
|
|
#include <machine/i82489var.h>
|
|
#endif
|
|
|
|
#include "vmm.h"
|
|
|
|
#if NVMM > 0
|
|
#include <machine/vmmvar.h>
|
|
#endif /* NVMM > 0 */
|
|
|
|
#include "acpi.h"
|
|
|
|
/* #define PMAP_DEBUG */

/*
 * DPRINTF: debug tracing helper.  It forwards its arguments to printf()
 * when PMAP_DEBUG is defined and expands to nothing otherwise.
 */
#ifndef PMAP_DEBUG
#define DPRINTF(x...)
#else
#define DPRINTF(x...)	do { printf(x); } while(0)
#endif /* !PMAP_DEBUG */
|
|
|
|
|
|
/*
|
|
* general info:
|
|
*
|
|
* - for an explanation of how the i386 MMU hardware works see
|
|
* the comments in <machine/pte.h>.
|
|
*
|
|
* - for an explanation of the general memory structure used by
|
|
* this pmap (including the recursive mapping), see the comments
|
|
* in <machine/pmap.h>.
|
|
*
|
|
* this file contains the code for the "pmap module." the module's
|
|
* job is to manage the hardware's virtual to physical address mappings.
|
|
* note that there are two levels of mapping in the VM system:
|
|
*
|
|
* [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
|
|
* to map ranges of virtual address space to objects/files. for
|
|
* example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
|
|
* to the file /bin/ls starting at offset zero." note that
|
|
* the upper layer mapping is not concerned with how individual
|
|
* vm_pages are mapped.
|
|
*
|
|
* [2] the lower layer of the VM system (the pmap) maintains the mappings
* from virtual addresses to physical pages. it is concerned with which vm_page is
|
|
* mapped where. for example, when you run /bin/ls and start
|
|
* at page 0x1000 the fault routine may lookup the correct page
|
|
* of the /bin/ls file and then ask the pmap layer to establish
|
|
* a mapping for it.
|
|
*
|
|
* note that information in the lower layer of the VM system can be
|
|
* thrown away since it can easily be reconstructed from the info
|
|
* in the upper layer.
|
|
*
|
|
* data structures we use include:
|
|
* - struct pmap: describes the address space of one process
|
|
* - struct pv_entry: describes one <PMAP,VA> mapping of a PA
|
|
* - struct pg_to_free: a list of page table pages (vm_pages) waiting to be
* freed. freeing is delayed until all TLB shootdown has been done.
|
|
*/
|
|
|
|
/*
|
|
* memory allocation
|
|
*
|
|
* - there are three data structures that we must dynamically allocate:
|
|
*
|
|
* [A] new process' page directory page (PDP)
|
|
* - plan 1: done at pmap_create() we use
|
|
* pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation.
|
|
*
|
|
* if we are low in free physical memory then we sleep in
|
|
* pool_get() -- in this case this is ok since we are creating
|
|
* a new pmap and should not be holding any locks.
|
|
*
|
|
* XXX: the fork code currently has no way to return an "out of
|
|
* memory, try again" error code since uvm_fork [fka vm_fork]
|
|
* is a void function.
|
|
*
|
|
* [B] new page tables pages (PTP)
|
|
* call uvm_pagealloc()
|
|
* => success: zero page, add to pm_pdir
|
|
* => failure: we are out of free vm_pages, let pmap_enter()
|
|
* tell UVM about it.
|
|
*
|
|
* note: for kernel PTPs, we start with NKPTP of them. as we map
|
|
* kernel memory (at uvm_map time) we check to see if we've grown
|
|
* the kernel pmap. if so, we call the optional function
|
|
* pmap_growkernel() to grow the kernel PTPs in advance.
|
|
*
|
|
* [C] pv_entry structures
|
|
* - try to allocate one from the pool.
|
|
* If we fail, we simply let pmap_enter() tell UVM about it.
|
|
*/
|
|
|
|
long nkptp[] = NKPTP_INITIALIZER;
|
|
|
|
const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
|
|
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
|
|
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
|
|
const long nbpd[] = NBPD_INITIALIZER;
|
|
pd_entry_t *const normal_pdes[] = PDES_INITIALIZER;
|
|
|
|
#define pmap_pte_set(p, n) atomic_swap_64(p, n)
|
|
#define pmap_pte_clearbits(p, b) x86_atomic_clearbits_u64(p, b)
|
|
#define pmap_pte_setbits(p, b) x86_atomic_setbits_u64(p, b)
|
|
|
|
/*
|
|
* global data structures
|
|
*/
|
|
|
|
struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
|
|
|
|
/*
|
|
* pg_nx: NX PTE bit (if CPU supports)
|
|
* pg_g_kern: PG_G if global pages should be used in kernel mappings,
|
|
* 0 otherwise (for insecure CPUs)
|
|
*/
|
|
pt_entry_t pg_nx = 0;
|
|
pt_entry_t pg_g_kern = 0;
|
|
|
|
/* pg_xo: XO PTE bits, set to PKU key1 (if cpu supports PKU) */
|
|
pt_entry_t pg_xo;
|
|
|
|
/*
|
|
* pmap_pg_wc: if our processor supports PAT then we set this
|
|
* to be the pte bits for Write Combining. Else we fall back to
|
|
* UC- so mtrrs can override the cacheability;
|
|
*/
|
|
int pmap_pg_wc = PG_UCMINUS;
|
|
|
|
/*
|
|
* pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID)
|
|
*
|
|
* The next three are zero unless and until PCID support is enabled so code
|
|
* can just 'or' them in as needed without tests.
|
|
* cr3_pcid: CR3_REUSE_PCID
|
|
* cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP
|
|
*/
|
|
#if PCID_KERN != 0
|
|
# error "pmap.c assumes PCID_KERN is zero"
|
|
#endif
|
|
int pmap_use_pcid;
|
|
static u_int cr3_pcid_proc;
|
|
static u_int cr3_pcid_temp;
|
|
/* these two are accessed from locore.o */
|
|
paddr_t cr3_reuse_pcid;
|
|
paddr_t cr3_pcid_proc_intel;
|
|
|
|
/*
|
|
* other data structures
|
|
*/
|
|
|
|
pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
|
|
int pmap_initialized = 0; /* pmap_init done yet? */
|
|
|
|
/*
|
|
* pv management structures.
|
|
*/
|
|
struct pool pmap_pv_pool;
|
|
|
|
/*
|
|
* linked list of all non-kernel pmaps
|
|
*/
|
|
|
|
struct pmap_head pmaps;
|
|
struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
|
|
|
|
/*
|
|
* pool that pmap structures are allocated from
|
|
*/
|
|
|
|
struct pool pmap_pmap_pool;
|
|
|
|
/*
|
|
* When we're freeing a ptp, we need to delay the freeing until all
|
|
* tlb shootdown has been done. This is the list of the to-be-freed pages.
|
|
*/
|
|
TAILQ_HEAD(pg_to_free, vm_page);
|
|
|
|
/*
|
|
* pool that PDPs are allocated from
|
|
*/
|
|
|
|
struct pool pmap_pdp_pool;
|
|
void pmap_pdp_ctor(pd_entry_t *);
|
|
void pmap_pdp_ctor_intel(pd_entry_t *);
|
|
|
|
extern vaddr_t msgbuf_vaddr;
|
|
extern paddr_t msgbuf_paddr;
|
|
|
|
extern vaddr_t idt_vaddr; /* we allocate IDT early */
|
|
extern paddr_t idt_paddr;
|
|
|
|
extern vaddr_t lo32_vaddr;
|
|
extern vaddr_t lo32_paddr;
|
|
|
|
vaddr_t virtual_avail;
|
|
extern int end;
|
|
|
|
/*
|
|
* local prototypes
|
|
*/
|
|
|
|
void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
|
|
vaddr_t, struct vm_page *);
|
|
struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t);
|
|
struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
|
|
int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
|
|
void pmap_free_ptp(struct pmap *, struct vm_page *,
|
|
vaddr_t, struct pg_to_free *);
|
|
void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
|
|
#ifdef MULTIPROCESSOR
|
|
static int pmap_is_active(struct pmap *, struct cpu_info *);
|
|
#endif
|
|
paddr_t pmap_map_ptes(struct pmap *);
|
|
struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
|
|
void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
|
|
void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
|
|
void pmap_do_remove_ept(struct pmap *, vaddr_t);
|
|
int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
|
|
int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
|
|
vaddr_t, int, struct pv_entry **);
|
|
void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
|
|
vaddr_t, vaddr_t, int, struct pv_entry **);
|
|
#define PMAP_REMOVE_ALL 0 /* remove all mappings */
|
|
#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */
|
|
|
|
void pmap_unmap_ptes(struct pmap *, paddr_t);
|
|
int pmap_get_physpage(vaddr_t, int, paddr_t *);
|
|
int pmap_pdes_valid(vaddr_t, pd_entry_t *);
|
|
void pmap_alloc_level(vaddr_t, int, long *);
|
|
|
|
static inline
|
|
void pmap_sync_flags_pte(struct vm_page *, u_long);
|
|
|
|
void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
|
|
void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
|
|
void pmap_tlb_shoottlb(struct pmap *, int);
|
|
#ifdef MULTIPROCESSOR
|
|
void pmap_tlb_shootwait(void);
|
|
#else
|
|
#define pmap_tlb_shootwait() do { } while (0)
|
|
#endif
|
|
|
|
/*
|
|
* p m a p i n l i n e h e l p e r f u n c t i o n s
|
|
*/
|
|
|
|
/*
|
|
* pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
|
|
* of course the kernel is always loaded
|
|
*/
|
|
|
|
static inline int
|
|
pmap_is_curpmap(struct pmap *pmap)
|
|
{
|
|
return((pmap == pmap_kernel()) ||
|
|
(pmap->pm_pdirpa == (rcr3() & CR3_PADDR)));
|
|
}
|
|
|
|
/*
 * pmap_is_active: report whether `pmap' is in use on processor `ci'.
 *
 * => true for the kernel pmap, and for the pmap recorded in
 *    ci->ci_proc_pmap
 */

#ifdef MULTIPROCESSOR
static inline int
pmap_is_active(struct pmap *pmap, struct cpu_info *ci)
{
	if (pmap == pmap_kernel())
		return 1;

	return ci->ci_proc_pmap == pmap;
}
#endif
|
|
|
|
static inline u_int
|
|
pmap_pte2flags(u_long pte)
|
|
{
|
|
return (((pte & PG_U) ? PG_PMAP_REF : 0) |
|
|
((pte & PG_M) ? PG_PMAP_MOD : 0));
|
|
}
|
|
|
|
static inline void
|
|
pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
|
|
{
|
|
if (pte & (PG_U|PG_M)) {
|
|
atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pmap_map_ptes: map a pmap's PTEs into KVM
|
|
*
|
|
* This should not be done for EPT pmaps
|
|
*/
|
|
paddr_t
|
|
pmap_map_ptes(struct pmap *pmap)
|
|
{
|
|
paddr_t cr3;
|
|
|
|
KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
|
|
|
|
/* the kernel's pmap is always accessible */
|
|
if (pmap == pmap_kernel())
|
|
return 0;
|
|
|
|
/*
|
|
* Lock the target map before switching to its page tables to
|
|
* guarantee other CPUs have finished changing the tables before
|
|
* we potentially start caching table and TLB entries.
|
|
*/
|
|
mtx_enter(&pmap->pm_mtx);
|
|
|
|
cr3 = rcr3();
|
|
KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
|
|
(cr3 & CR3_PCID) == PCID_PROC);
|
|
if (pmap->pm_pdirpa == (cr3 & CR3_PADDR))
|
|
cr3 = 0;
|
|
else {
|
|
cr3 |= cr3_reuse_pcid;
|
|
lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
|
|
}
|
|
|
|
return cr3;
|
|
}
|
|
|
|
void
|
|
pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
|
|
{
|
|
if (pmap != pmap_kernel())
|
|
mtx_leave(&pmap->pm_mtx);
|
|
|
|
if (save_cr3 != 0)
|
|
lcr3(save_cr3);
|
|
}
|
|
|
|
int
|
|
pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
|
|
{
|
|
u_long mask, shift;
|
|
pd_entry_t pde;
|
|
paddr_t pdpa;
|
|
int lev;
|
|
|
|
pdpa = pm->pm_pdirpa;
|
|
shift = L4_SHIFT;
|
|
mask = L4_MASK;
|
|
for (lev = PTP_LEVELS; lev > 0; lev--) {
|
|
*pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
|
|
*offs = (VA_SIGN_POS(va) & mask) >> shift;
|
|
pde = (*pd)[*offs];
|
|
|
|
/* Large pages are different, break early if we run into one. */
|
|
if ((pde & (PG_PS|PG_V)) != PG_V)
|
|
return (lev - 1);
|
|
|
|
pdpa = ((*pd)[*offs] & PG_FRAME);
|
|
/* 4096/8 == 512 == 2^9 entries per level */
|
|
shift -= 9;
|
|
mask >>= 9;
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* p m a p k e n t e r f u n c t i o n s
|
|
*
|
|
* functions to quickly enter/remove pages from the kernel address
|
|
* space. pmap_kremove is exported to MI kernel. we make use of
|
|
* the recursive PTE mappings.
|
|
*/
|
|
|
|
/*
|
|
* pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
|
|
*
|
|
* => no need to lock anything, assume va is already allocated
|
|
* => should be faster than normal pmap enter function
|
|
*/
|
|
|
|
void
|
|
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
|
|
{
|
|
pt_entry_t *pte, opte, npte;
|
|
|
|
pte = kvtopte(va);
|
|
|
|
npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
|
|
((pa & PMAP_NOCACHE) ? PG_N : 0) |
|
|
((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V;
|
|
|
|
/* special 1:1 mappings in the first 2MB must not be global */
|
|
if (va >= (vaddr_t)NBPD_L2)
|
|
npte |= pg_g_kern;
|
|
|
|
if (!(prot & PROT_EXEC))
|
|
npte |= pg_nx;
|
|
opte = pmap_pte_set(pte, npte);
|
|
#ifdef LARGEPAGES
|
|
/* XXX For now... */
|
|
if (opte & PG_PS)
|
|
panic("%s: PG_PS", __func__);
|
|
#endif
|
|
if (pmap_valid_entry(opte)) {
|
|
if (pa & PMAP_NOCACHE && (opte & PG_N) == 0)
|
|
wbinvd_on_all_cpus();
|
|
/* This shouldn't happen */
|
|
pmap_tlb_shootpage(pmap_kernel(), va, 1);
|
|
pmap_tlb_shootwait();
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
|
|
*
|
|
* => no need to lock anything
|
|
* => caller must dispose of any vm_page mapped in the va range
|
|
* => note: not an inline function
|
|
* => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
|
|
* => we assume kernel only unmaps valid addresses and thus don't bother
|
|
* checking the valid bit before doing TLB flushing
|
|
*/
|
|
|
|
void
|
|
pmap_kremove(vaddr_t sva, vsize_t len)
|
|
{
|
|
pt_entry_t *pte, opte;
|
|
vaddr_t va, eva;
|
|
|
|
eva = sva + len;
|
|
|
|
for (va = sva; va != eva; va += PAGE_SIZE) {
|
|
pte = kvtopte(va);
|
|
|
|
opte = pmap_pte_set(pte, 0);
|
|
#ifdef LARGEPAGES
|
|
KASSERT((opte & PG_PS) == 0);
|
|
#endif
|
|
KASSERT((opte & PG_PVLIST) == 0);
|
|
}
|
|
|
|
pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
|
|
pmap_tlb_shootwait();
|
|
}
|
|
|
|
/*
|
|
* pmap_set_pml4_early
|
|
*
|
|
* Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned
|
|
* is the pml4 entry for 'early mappings' (see pmap.h). This function is used
|
|
* by display drivers that need to map their framebuffers early, before the
|
|
* pmap is fully initialized (eg, to show panic messages).
|
|
*
|
|
* Users of this function must call pmap_clear_pml4_early to remove the
|
|
* mapping when finished.
|
|
*
|
|
* Parameters:
|
|
* pa: phys addr to map
|
|
*
|
|
* Return value:
|
|
* VA mapping to 'pa'. This mapping is 2GB in size and starts at the base
|
|
* of the 2MB region containing 'va'.
|
|
*/
|
|
vaddr_t
|
|
pmap_set_pml4_early(paddr_t pa)
|
|
{
|
|
extern paddr_t early_pte_pages;
|
|
pt_entry_t *pml4e, *pte;
|
|
int i, j, off;
|
|
paddr_t curpa;
|
|
vaddr_t va;
|
|
|
|
pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
|
|
pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW;
|
|
|
|
off = pa & PAGE_MASK_L2;
|
|
curpa = pa & L2_FRAME;
|
|
|
|
pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
|
|
memset(pte, 0, 3 * NBPG);
|
|
|
|
pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW;
|
|
pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW;
|
|
|
|
pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG);
|
|
for (i = 0; i < 2; i++) {
|
|
/* 2 early pages of mappings */
|
|
for (j = 0; j < 512; j++) {
|
|
/* j[0..511] : 2MB mappings per page */
|
|
pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS;
|
|
curpa += (2 * 1024 * 1024);
|
|
}
|
|
}
|
|
|
|
va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off;
|
|
return VA_SIGN_NEG(va);
|
|
}
|
|
|
|
/*
|
|
* pmap_clear_pml4_early
|
|
*
|
|
* Clears the mapping previously established with pmap_set_pml4_early.
|
|
*/
|
|
void
|
|
pmap_clear_pml4_early(void)
|
|
{
|
|
extern paddr_t early_pte_pages;
|
|
pt_entry_t *pml4e, *pte;
|
|
|
|
pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
|
|
memset(pte, 0, 3 * NBPG);
|
|
|
|
pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir;
|
|
pml4e[PDIR_SLOT_EARLY] = 0;
|
|
tlbflush();
|
|
}
|
|
|
|
/*
|
|
* p m a p i n i t f u n c t i o n s
|
|
*
|
|
* pmap_bootstrap and pmap_init are called during system startup
|
|
* to init the pmap module. pmap_bootstrap() does a low level
|
|
* init just to get things rolling. pmap_init() finishes the job.
|
|
*/
|
|
|
|
/*
|
|
* pmap_bootstrap: get the system in a state where it can run with VM
|
|
* properly enabled (called before main()). the VM system is
|
|
* fully init'd later...
|
|
*/
|
|
|
|
paddr_t
|
|
pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
|
|
{
|
|
vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
|
|
struct pmap *kpm;
|
|
int curslot, i, j, p;
|
|
long ndmpdp;
|
|
paddr_t dmpd, dmpdp, start_cur, cur_pa;
|
|
vaddr_t kva, kva_end;
|
|
pt_entry_t *pml3, *pml2;
|
|
|
|
/*
|
|
* define the boundaries of the managed kernel virtual address
|
|
* space.
|
|
*/
|
|
|
|
virtual_avail = kva_start; /* first free KVA */
|
|
|
|
/*
|
|
* If PKU is available, initialize PROT_EXEC entry correctly,
|
|
* and enable the feature before it gets used
|
|
* XXX Some Hypervisors forget to save/restore PKU
|
|
*/
|
|
if (cpuid_level >= 0x7) {
|
|
uint32_t ecx, dummy;
|
|
|
|
CPUID_LEAF(0x7, 0, dummy, dummy, ecx, dummy);
|
|
if (ecx & SEFF0ECX_PKU) {
|
|
lcr4(rcr4() | CR4_PKE);
|
|
pg_xo = PG_XO;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* set up protection_codes: we need to be able to convert from
|
|
* a MI protection code (some combo of VM_PROT...) to something
|
|
* we can jam into a i386 PTE.
|
|
*/
|
|
|
|
protection_codes[PROT_NONE] = pg_nx; /* --- */
|
|
protection_codes[PROT_EXEC] = pg_xo; ; /* --x */
|
|
protection_codes[PROT_READ] = PG_RO | pg_nx; /* -r- */
|
|
protection_codes[PROT_READ | PROT_EXEC] = PG_RO; /* -rx */
|
|
protection_codes[PROT_WRITE] = PG_RW | pg_nx; /* w-- */
|
|
protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW; /* w-x */
|
|
protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */
|
|
protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW; /* wrx */
|
|
|
|
/*
|
|
* now we init the kernel's pmap
|
|
*
|
|
* the kernel pmap's pm_obj is not used for much. however, in
|
|
* user pmaps the pm_obj contains the list of active PTPs.
|
|
* the pm_obj currently does not have a pager.
|
|
*/
|
|
|
|
kpm = pmap_kernel();
|
|
for (i = 0; i < PTP_LEVELS - 1; i++) {
|
|
uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
|
|
kpm->pm_ptphint[i] = NULL;
|
|
}
|
|
memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
|
|
kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
|
|
kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
|
|
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
|
|
atop(kva_start - VM_MIN_KERNEL_ADDRESS);
|
|
/*
|
|
* the above is just a rough estimate and not critical to the proper
|
|
* operation of the system.
|
|
*/
|
|
|
|
kpm->pm_type = PMAP_TYPE_NORMAL;
|
|
|
|
curpcb->pcb_pmap = kpm; /* proc0's pcb */
|
|
|
|
/*
|
|
* Configure and enable PCID use if supported.
|
|
* Currently we require INVPCID support.
|
|
*/
|
|
if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) {
|
|
uint32_t ebx, dummy;
|
|
CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy);
|
|
if (ebx & SEFF0EBX_INVPCID) {
|
|
pmap_use_pcid = 1;
|
|
/*
|
|
* We cannot use global mappings because
|
|
* invpcid function 0 does not invalidate global
|
|
* mappings. The hardware can cache kernel
|
|
* mappings based on PCID_KERN, i.e. there is no
|
|
* need for global mappings.
|
|
*/
|
|
pg_g_kern = 0;
|
|
lcr4( rcr4() | CR4_PCIDE );
|
|
cr3_pcid_proc = PCID_PROC;
|
|
cr3_pcid_temp = PCID_TEMP;
|
|
cr3_reuse_pcid = CR3_REUSE_PCID;
|
|
cr3_pcid_proc_intel = PCID_PROC_INTEL;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Add PG_G attribute to already mapped kernel pages. pg_g_kern
|
|
* is calculated in locore0.S and may be set to:
|
|
*
|
|
* 0 if this CPU does not safely support global pages in the kernel
|
|
* (Intel/Meltdown)
|
|
* PG_G if this CPU does safely support global pages in the kernel
|
|
* (AMD)
|
|
*/
|
|
#if KERNBASE == VM_MIN_KERNEL_ADDRESS
|
|
for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
|
|
#else
|
|
kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
|
|
for (kva = KERNBASE; kva < kva_end ;
|
|
#endif
|
|
kva += PAGE_SIZE) {
|
|
unsigned long p1i = pl1_i(kva);
|
|
if (pmap_valid_entry(PTE_BASE[p1i]))
|
|
PTE_BASE[p1i] |= pg_g_kern;
|
|
}
|
|
|
|
/*
|
|
* Map the direct map. The first 4GB were mapped in locore, here
|
|
* we map the rest if it exists. We actually use the direct map
|
|
* here to set up the page tables, we're assuming that we're still
|
|
* operating in the lower 4GB of memory.
|
|
*
|
|
* Map (up to) the first 512GB of physical memory first. This part
|
|
* is handled differently than physical memory > 512GB since we have
|
|
* already mapped part of this range in locore0.
|
|
*/
|
|
ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
|
|
if (ndmpdp < NDML2_ENTRIES)
|
|
ndmpdp = NDML2_ENTRIES; /* At least 4GB */
|
|
if (ndmpdp > 512)
|
|
ndmpdp = 512; /* At most 512GB */
|
|
|
|
dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME;
|
|
|
|
dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
|
|
|
|
for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
|
|
paddr_t pdp;
|
|
vaddr_t va;
|
|
|
|
pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
|
|
va = PMAP_DIRECT_MAP(pdp);
|
|
|
|
*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
|
|
*((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
|
|
PG_M | pg_nx;
|
|
}
|
|
|
|
for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
|
|
paddr_t pdp;
|
|
vaddr_t va;
|
|
|
|
pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
|
|
va = PMAP_DIRECT_MAP(pdp);
|
|
|
|
*((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
|
|
*((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx;
|
|
}
|
|
|
|
kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
|
|
PG_M | pg_nx;
|
|
|
|
/* Map any remaining physical memory > 512GB */
|
|
for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) {
|
|
/*
|
|
* Start of current range starts at PA (curslot) * 512GB
|
|
*/
|
|
start_cur = (paddr_t)(curslot * NBPD_L4);
|
|
if (max_pa > start_cur) {
|
|
/* Next 512GB, new PML4e and L3(512GB) page */
|
|
dmpd = first_avail; first_avail += PAGE_SIZE;
|
|
pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
|
|
kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd |
|
|
PG_KW | PG_V | PG_U | PG_M | pg_nx;
|
|
|
|
/* Calculate full 1GB pages in this 512GB region */
|
|
p = ((max_pa - start_cur) >> L3_SHIFT);
|
|
|
|
/* Check if a partial (<1GB) page remains */
|
|
if (max_pa & L2_MASK)
|
|
p++;
|
|
|
|
/*
|
|
* Handle the case where this range is full and there
|
|
* is still more memory after (p would be > 512).
|
|
*/
|
|
if (p > NPDPG)
|
|
p = NPDPG;
|
|
|
|
/* Allocate 'p' L2(1GB) pages and populate */
|
|
for (i = 0; i < p; i++) {
|
|
dmpd = first_avail; first_avail += PAGE_SIZE;
|
|
pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
|
|
pml3[i] = dmpd |
|
|
PG_RW | PG_V | PG_U | PG_M | pg_nx;
|
|
|
|
cur_pa = start_cur + (i << L3_SHIFT);
|
|
j = 0;
|
|
|
|
while (cur_pa < max_pa && j < NPDPG) {
|
|
pml2[j] = curslot * NBPD_L4 +
|
|
(uint64_t)i * NBPD_L3 +
|
|
(uint64_t)j * NBPD_L2;
|
|
pml2[j] |= PG_RW | PG_V | pg_g_kern |
|
|
PG_U | PG_M | pg_nx | PG_PS;
|
|
cur_pa += NBPD_L2;
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
tlbflush();
|
|
|
|
msgbuf_vaddr = virtual_avail;
|
|
virtual_avail += round_page(MSGBUFSIZE);
|
|
|
|
idt_vaddr = virtual_avail;
|
|
virtual_avail += 2 * PAGE_SIZE;
|
|
idt_paddr = first_avail; /* steal a page */
|
|
first_avail += 2 * PAGE_SIZE;
|
|
|
|
#if defined(MULTIPROCESSOR) || \
|
|
(NACPI > 0 && !defined(SMALL_KERNEL))
|
|
/*
|
|
* Grab a page below 4G for things that need it (i.e.
|
|
* having an initial %cr3 for the MP trampoline).
|
|
*/
|
|
lo32_vaddr = virtual_avail;
|
|
virtual_avail += PAGE_SIZE;
|
|
lo32_paddr = first_avail;
|
|
first_avail += PAGE_SIZE;
|
|
#endif
|
|
|
|
/*
|
|
* init the global lists.
|
|
*/
|
|
LIST_INIT(&pmaps);
|
|
|
|
/*
|
|
* initialize the pmap pools.
|
|
*/
|
|
|
|
pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0,
|
|
"pmappl", NULL);
|
|
pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
|
|
"pvpl", &pool_allocator_single);
|
|
pool_sethiwat(&pmap_pv_pool, 32 * 1024);
|
|
|
|
/*
|
|
* initialize the PDE pool.
|
|
*/
|
|
|
|
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0,
|
|
"pdppl", &pool_allocator_single);
|
|
|
|
kpm->pm_pdir_intel = NULL;
|
|
kpm->pm_pdirpa_intel = 0;
|
|
|
|
/*
|
|
* ensure the TLB is sync'd with reality by flushing it...
|
|
*/
|
|
|
|
tlbflush();
|
|
|
|
return first_avail;
|
|
}
|
|
|
|
void
|
|
pmap_init_percpu(void)
|
|
{
|
|
pool_cache_init(&pmap_pv_pool);
|
|
}
|
|
|
|
/*
|
|
* pmap_randomize
|
|
*
|
|
* Randomizes the location of the kernel pmap
|
|
*/
|
|
void
|
|
pmap_randomize(void)
|
|
{
|
|
pd_entry_t *pml4va, *oldpml4va;
|
|
paddr_t pml4pa;
|
|
int i;
|
|
|
|
pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
|
|
if (pml4va == NULL)
|
|
panic("%s: km_alloc failed", __func__);
|
|
|
|
/* Copy old PML4 page to new one */
|
|
oldpml4va = pmap_kernel()->pm_pdir;
|
|
memcpy(pml4va, oldpml4va, PAGE_SIZE);
|
|
|
|
/* Switch to new PML4 */
|
|
pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa);
|
|
lcr3(pml4pa);
|
|
|
|
/* Fixup pmap_kernel and proc0's %cr3 */
|
|
pmap_kernel()->pm_pdirpa = pml4pa;
|
|
pmap_kernel()->pm_pdir = pml4va;
|
|
proc0.p_addr->u_pcb.pcb_cr3 = pml4pa;
|
|
|
|
/* Fixup recursive PTE PML4E slot. We are only changing the PA */
|
|
pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~PG_FRAME);
|
|
|
|
for (i = 0; i < NPDPG; i++) {
|
|
/* PTE slot already handled earlier */
|
|
if (i == PDIR_SLOT_PTE)
|
|
continue;
|
|
|
|
if (pml4va[i] & PG_FRAME)
|
|
pmap_randomize_level(&pml4va[i], 3);
|
|
}
|
|
|
|
/* Wipe out bootstrap PML4 */
|
|
memset(oldpml4va, 0, PAGE_SIZE);
|
|
tlbflush();
|
|
}
|
|
|
|
void
|
|
pmap_randomize_level(pd_entry_t *pde, int level)
|
|
{
|
|
pd_entry_t *new_pd_va;
|
|
paddr_t old_pd_pa, new_pd_pa;
|
|
vaddr_t old_pd_va;
|
|
struct vm_page *pg;
|
|
int i;
|
|
|
|
if (level == 0)
|
|
return;
|
|
|
|
if (level < PTP_LEVELS - 1 && (*pde & PG_PS))
|
|
return;
|
|
|
|
new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
|
|
if (new_pd_va == NULL)
|
|
panic("%s: cannot allocate page for L%d page directory",
|
|
__func__, level);
|
|
|
|
old_pd_pa = *pde & PG_FRAME;
|
|
old_pd_va = PMAP_DIRECT_MAP(old_pd_pa);
|
|
pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa);
|
|
memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE);
|
|
*pde = new_pd_pa | (*pde & ~PG_FRAME);
|
|
|
|
tlbflush();
|
|
memset((void *)old_pd_va, 0, PAGE_SIZE);
|
|
|
|
pg = PHYS_TO_VM_PAGE(old_pd_pa);
|
|
if (pg != NULL) {
|
|
pg->wire_count--;
|
|
pmap_kernel()->pm_stats.resident_count--;
|
|
if (pg->wire_count <= 1)
|
|
uvm_pagefree(pg);
|
|
}
|
|
|
|
for (i = 0; i < NPDPG; i++)
|
|
if (new_pd_va[i] & PG_FRAME)
|
|
pmap_randomize_level(&new_pd_va[i], level - 1);
|
|
}
|
|
|
|
/*
|
|
* Pre-allocate PTPs for low memory, so that 1:1 mappings for various
|
|
* trampoline code can be entered.
|
|
*/
|
|
paddr_t
|
|
pmap_prealloc_lowmem_ptps(paddr_t first_avail)
|
|
{
|
|
pd_entry_t *pdes;
|
|
int level;
|
|
paddr_t newp;
|
|
|
|
pdes = pmap_kernel()->pm_pdir;
|
|
level = PTP_LEVELS;
|
|
for (;;) {
|
|
newp = first_avail; first_avail += PAGE_SIZE;
|
|
memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
|
|
pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
|
|
level--;
|
|
if (level <= 1)
|
|
break;
|
|
pdes = normal_pdes[level - 2];
|
|
}
|
|
|
|
return first_avail;
|
|
}
|
|
|
|
/*
|
|
* pmap_init: no further initialization required on this platform
|
|
*/
|
|
void
|
|
pmap_init(void)
|
|
{
|
|
pmap_initialized = 1;
|
|
}
|
|
|
|
/*
|
|
* p v _ e n t r y f u n c t i o n s
|
|
*/
|
|
|
|
/*
|
|
* main pv_entry manipulation functions:
|
|
* pmap_enter_pv: enter a mapping onto a pv list
|
|
* pmap_remove_pv: remove a mapping from a pv list
|
|
*/
|
|
|
|
/*
|
|
* pmap_enter_pv: enter a mapping onto a pv list
|
|
*
|
|
* => caller should adjust ptp's wire_count before calling
|
|
*
|
|
* pve: preallocated pve for us to use
|
|
* ptp: PTP in pmap that maps this VA
|
|
*/
|
|
|
|
void
|
|
pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
|
|
vaddr_t va, struct vm_page *ptp)
|
|
{
|
|
pve->pv_pmap = pmap;
|
|
pve->pv_va = va;
|
|
pve->pv_ptp = ptp; /* NULL for kernel pmap */
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
pve->pv_next = pg->mdpage.pv_list; /* add to ... */
|
|
pg->mdpage.pv_list = pve; /* ... list */
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
}
|
|
|
|
/*
|
|
* pmap_remove_pv: try to remove a mapping from a pv_list
|
|
*
|
|
* => caller should adjust ptp's wire_count and free PTP if needed
|
|
* => we return the removed pve
|
|
*/
|
|
|
|
struct pv_entry *
|
|
pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
|
|
{
|
|
struct pv_entry *pve, **prevptr;
|
|
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
prevptr = &pg->mdpage.pv_list;
|
|
while ((pve = *prevptr) != NULL) {
|
|
if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */
|
|
*prevptr = pve->pv_next; /* remove it! */
|
|
break;
|
|
}
|
|
prevptr = &pve->pv_next; /* previous pointer */
|
|
}
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
return(pve); /* return removed pve */
|
|
}
|
|
|
|
/*
|
|
* p t p f u n c t i o n s
|
|
*/
|
|
|
|
struct vm_page *
|
|
pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
|
|
{
|
|
int lidx = level - 1;
|
|
struct vm_page *pg;
|
|
|
|
if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
|
|
pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
|
|
return (pmap->pm_ptphint[lidx]);
|
|
|
|
pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
|
|
|
|
return pg;
|
|
}
|
|
|
|
void
|
|
pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
|
|
struct pg_to_free *pagelist)
|
|
{
|
|
int lidx;
|
|
struct uvm_object *obj;
|
|
|
|
lidx = level - 1;
|
|
|
|
obj = &pmap->pm_obj[lidx];
|
|
pmap->pm_stats.resident_count--;
|
|
if (pmap->pm_ptphint[lidx] == ptp)
|
|
pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt);
|
|
ptp->wire_count = 0;
|
|
uvm_pagerealloc(ptp, NULL, 0);
|
|
TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
|
|
}
|
|
|
|
void
|
|
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
|
|
struct pg_to_free *pagelist)
|
|
{
|
|
unsigned long index;
|
|
int level;
|
|
vaddr_t invaladdr;
|
|
|
|
level = 1;
|
|
do {
|
|
pmap_freepage(pmap, ptp, level, pagelist);
|
|
index = pl_i(va, level + 1);
|
|
pmap_pte_set(&normal_pdes[level - 1][index], 0);
|
|
if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) {
|
|
/* Zap special meltdown PML4e */
|
|
pmap_pte_set(&pmap->pm_pdir_intel[index], 0);
|
|
DPRINTF("%s: cleared meltdown PML4e @ index %lu "
|
|
"(va range start 0x%llx)\n", __func__, index,
|
|
(uint64_t)(index << L4_SHIFT));
|
|
}
|
|
invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
|
|
(vaddr_t)normal_pdes[level - 2];
|
|
pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
|
|
pmap_is_curpmap(curpcb->pcb_pmap));
|
|
if (level < PTP_LEVELS - 1) {
|
|
ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
|
|
ptp->wire_count--;
|
|
if (ptp->wire_count > 1)
|
|
break;
|
|
}
|
|
} while (++level < PTP_LEVELS);
|
|
}
|
|
|
|
/*
|
|
* pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
|
|
*
|
|
* => pmap should NOT be pmap_kernel()
|
|
*/
|
|
|
|
struct vm_page *
|
|
pmap_get_ptp(struct pmap *pmap, vaddr_t va)
|
|
{
|
|
struct vm_page *ptp, *pptp;
|
|
int i;
|
|
unsigned long index;
|
|
pd_entry_t *pva, *pva_intel;
|
|
paddr_t ppa, pa;
|
|
struct uvm_object *obj;
|
|
|
|
ptp = NULL;
|
|
pa = (paddr_t)-1;
|
|
|
|
/*
|
|
* Loop through all page table levels seeing if we need to
|
|
* add a new page to that level.
|
|
*/
|
|
for (i = PTP_LEVELS; i > 1; i--) {
|
|
/*
|
|
* Save values from previous round.
|
|
*/
|
|
pptp = ptp;
|
|
ppa = pa;
|
|
|
|
index = pl_i(va, i);
|
|
pva = normal_pdes[i - 2];
|
|
|
|
if (pmap_valid_entry(pva[index])) {
|
|
ppa = pva[index] & PG_FRAME;
|
|
ptp = NULL;
|
|
continue;
|
|
}
|
|
|
|
obj = &pmap->pm_obj[i-2];
|
|
ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
|
|
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
|
|
|
|
if (ptp == NULL)
|
|
return NULL;
|
|
|
|
atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
|
|
ptp->wire_count = 1;
|
|
pmap->pm_ptphint[i - 2] = ptp;
|
|
pa = VM_PAGE_TO_PHYS(ptp);
|
|
pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
|
|
|
|
/*
|
|
* Meltdown Special case - if we are adding a new PML4e for
|
|
* usermode addresses, just copy the PML4e to the U-K page
|
|
* table.
|
|
*/
|
|
if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS &&
|
|
va < VM_MAXUSER_ADDRESS) {
|
|
pva_intel = pmap->pm_pdir_intel;
|
|
pva_intel[index] = pva[index];
|
|
DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
|
|
"from 0x%llx -> 0x%llx\n", __func__, pva[index],
|
|
(uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
|
|
}
|
|
|
|
pmap->pm_stats.resident_count++;
|
|
/*
|
|
* If we're not in the top level, increase the
|
|
* wire count of the parent page.
|
|
*/
|
|
if (i < PTP_LEVELS) {
|
|
if (pptp == NULL)
|
|
pptp = pmap_find_ptp(pmap, va, ppa, i);
|
|
#ifdef DIAGNOSTIC
|
|
if (pptp == NULL)
|
|
panic("%s: pde page disappeared", __func__);
|
|
#endif
|
|
pptp->wire_count++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ptp is not NULL if we just allocated a new ptp. If it's
|
|
* still NULL, we must look up the existing one.
|
|
*/
|
|
if (ptp == NULL) {
|
|
ptp = pmap_find_ptp(pmap, va, ppa, 1);
|
|
#ifdef DIAGNOSTIC
|
|
if (ptp == NULL) {
|
|
printf("va %lx ppa %lx\n", (unsigned long)va,
|
|
(unsigned long)ppa);
|
|
panic("%s: unmanaged user PTP", __func__);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
pmap->pm_ptphint[0] = ptp;
|
|
return(ptp);
|
|
}
|
|
|
|
/*
|
|
* p m a p l i f e c y c l e f u n c t i o n s
|
|
*/
|
|
|
|
/*
|
|
* pmap_pdp_ctor: constructor for the PDP cache.
|
|
*/
|
|
|
|
void
|
|
pmap_pdp_ctor(pd_entry_t *pdir)
|
|
{
|
|
paddr_t pdirpa;
|
|
int npde, i;
|
|
struct pmap *kpm = pmap_kernel();
|
|
|
|
/* fetch the physical address of the page directory. */
|
|
(void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa);
|
|
|
|
/* zero init area */
|
|
memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
|
|
|
|
/* put in recursive PDE to map the PTEs */
|
|
pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx;
|
|
|
|
npde = nkptp[PTP_LEVELS - 1];
|
|
|
|
/* put in kernel VM PDEs */
|
|
memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
|
|
npde * sizeof(pd_entry_t));
|
|
|
|
/* zero the rest */
|
|
memset(&pdir[PDIR_SLOT_KERN + npde], 0,
|
|
(NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
|
|
|
|
for (i = 0; i < NUM_L4_SLOT_DIRECT; i++)
|
|
pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i];
|
|
|
|
#if VM_MIN_KERNEL_ADDRESS != KERNBASE
|
|
pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
|
|
#endif
|
|
}
|
|
|
|
void
|
|
pmap_pdp_ctor_intel(pd_entry_t *pdir)
|
|
{
|
|
struct pmap *kpm = pmap_kernel();
|
|
|
|
/* Copy PML4es from pmap_kernel's U-K view */
|
|
memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
|
|
}
|
|
|
|
/*
|
|
* pmap_create: create a pmap
|
|
*
|
|
* => note: old pmap interface took a "size" args which allowed for
|
|
* the creation of "software only" pmaps (not in bsd).
|
|
*/
|
|
|
|
struct pmap *
|
|
pmap_create(void)
|
|
{
|
|
struct pmap *pmap;
|
|
int i;
|
|
|
|
pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
|
|
|
|
mtx_init(&pmap->pm_mtx, IPL_VM);
|
|
|
|
/* init uvm_object */
|
|
for (i = 0; i < PTP_LEVELS - 1; i++) {
|
|
uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1);
|
|
pmap->pm_ptphint[i] = NULL;
|
|
}
|
|
pmap->pm_stats.wired_count = 0;
|
|
pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */
|
|
pmap->pm_type = PMAP_TYPE_NORMAL;
|
|
pmap->eptp = 0;
|
|
|
|
/* allocate PDP */
|
|
|
|
/*
|
|
* note that there is no need to splvm to protect us from
|
|
* malloc since malloc allocates out of a submap and we should
|
|
* have already allocated kernel PTPs to cover the range...
|
|
*/
|
|
|
|
pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
|
|
pmap_pdp_ctor(pmap->pm_pdir);
|
|
|
|
pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
|
|
|
|
/*
|
|
* Intel CPUs need a special page table to be used during usermode
|
|
* execution, one that lacks all kernel mappings.
|
|
*/
|
|
if (cpu_meltdown) {
|
|
pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
|
|
pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
|
|
pmap->pm_stats.resident_count++;
|
|
if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
|
|
&pmap->pm_pdirpa_intel))
|
|
panic("%s: unknown PA mapping for meltdown PML4",
|
|
__func__);
|
|
} else {
|
|
pmap->pm_pdir_intel = NULL;
|
|
pmap->pm_pdirpa_intel = 0;
|
|
}
|
|
|
|
mtx_enter(&pmaps_lock);
|
|
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
|
|
mtx_leave(&pmaps_lock);
|
|
return (pmap);
|
|
}
|
|
|
|
/*
|
|
* pmap_destroy: drop reference count on pmap. free pmap if
|
|
* reference count goes to zero.
|
|
*/
|
|
|
|
void
|
|
pmap_destroy(struct pmap *pmap)
|
|
{
|
|
struct vm_page *pg;
|
|
int refs;
|
|
int i;
|
|
|
|
/*
|
|
* drop reference count
|
|
*/
|
|
|
|
refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs);
|
|
if (refs > 0) {
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* remove it from global list of pmaps
|
|
*/
|
|
mtx_enter(&pmaps_lock);
|
|
LIST_REMOVE(pmap, pm_list);
|
|
mtx_leave(&pmaps_lock);
|
|
|
|
/*
|
|
* free any remaining PTPs
|
|
*/
|
|
|
|
for (i = 0; i < PTP_LEVELS - 1; i++) {
|
|
while ((pg = RBT_ROOT(uvm_objtree,
|
|
&pmap->pm_obj[i].memt)) != NULL) {
|
|
KASSERT((pg->pg_flags & PG_BUSY) == 0);
|
|
|
|
pg->wire_count = 0;
|
|
pmap->pm_stats.resident_count--;
|
|
|
|
uvm_pagefree(pg);
|
|
}
|
|
}
|
|
|
|
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
|
|
|
|
if (pmap->pm_pdir_intel != NULL) {
|
|
pmap->pm_stats.resident_count--;
|
|
pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
|
|
}
|
|
|
|
pool_put(&pmap_pmap_pool, pmap);
|
|
}
|
|
|
|
/*
|
|
* Add a reference to the specified pmap.
|
|
*/
|
|
|
|
void
|
|
pmap_reference(struct pmap *pmap)
|
|
{
|
|
atomic_inc_int(&pmap->pm_obj[0].uo_refs);
|
|
}
|
|
|
|
/*
|
|
* pmap_activate: activate a process' pmap (fill in %cr3)
|
|
*
|
|
* => called from cpu_fork() and when switching pmaps during exec
|
|
* => if p is the curproc, then load it into the MMU
|
|
*/
|
|
|
|
void
|
|
pmap_activate(struct proc *p)
|
|
{
|
|
struct pcb *pcb = &p->p_addr->u_pcb;
|
|
struct pmap *pmap = p->p_vmspace->vm_map.pmap;
|
|
|
|
pcb->pcb_pmap = pmap;
|
|
pcb->pcb_cr3 = pmap->pm_pdirpa;
|
|
pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc :
|
|
(PCID_KERN | cr3_reuse_pcid);
|
|
|
|
if (p != curproc)
|
|
return;
|
|
|
|
if ((p->p_flag & P_SYSTEM) == 0) {
|
|
struct cpu_info *self = curcpu();
|
|
|
|
/* mark the pmap in use by this processor */
|
|
self->ci_proc_pmap = pmap;
|
|
|
|
/* in case we return to userspace without context switching */
|
|
if (cpu_meltdown) {
|
|
self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid;
|
|
self->ci_user_cr3 = pmap->pm_pdirpa_intel |
|
|
cr3_pcid_proc_intel;
|
|
}
|
|
}
|
|
|
|
lcr3(pcb->pcb_cr3);
|
|
}
|
|
|
|
/*
|
|
* pmap_deactivate: deactivate a process' pmap
|
|
*/
|
|
|
|
void
|
|
pmap_deactivate(struct proc *p)
|
|
{
|
|
if ((p->p_flag & P_SYSTEM) == 0) {
|
|
struct cpu_info *self = curcpu();
|
|
|
|
/*
|
|
* mark the pmap no longer in use by this processor.
|
|
*/
|
|
KASSERT(self->ci_proc_pmap == p->p_vmspace->vm_map.pmap);
|
|
self->ci_proc_pmap = NULL;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* end of lifecycle functions
|
|
*/
|
|
|
|
/*
|
|
* some misc. functions
|
|
*/
|
|
|
|
int
|
|
pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
|
|
{
|
|
int i;
|
|
unsigned long index;
|
|
pd_entry_t pde;
|
|
|
|
for (i = PTP_LEVELS; i > 1; i--) {
|
|
index = pl_i(va, i);
|
|
pde = normal_pdes[i - 2][index];
|
|
if (!pmap_valid_entry(pde))
|
|
return 0;
|
|
}
|
|
if (lastpde != NULL)
|
|
*lastpde = pde;
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* pmap_extract: extract a PA for the given VA
|
|
*/
|
|
|
|
int
|
|
pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
|
|
{
|
|
pt_entry_t *ptes, pte;
|
|
int level, offs;
|
|
|
|
if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
|
|
va < PMAP_DIRECT_END) {
|
|
*pap = va - PMAP_DIRECT_BASE;
|
|
return 1;
|
|
}
|
|
|
|
if (pmap != pmap_kernel())
|
|
mtx_enter(&pmap->pm_mtx);
|
|
|
|
level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
|
|
pte = ptes[offs];
|
|
|
|
if (pmap != pmap_kernel())
|
|
mtx_leave(&pmap->pm_mtx);
|
|
|
|
if (__predict_true(level == 0 && pmap_valid_entry(pte))) {
|
|
if (pap != NULL)
|
|
*pap = (pte & PG_FRAME) | (va & PAGE_MASK);
|
|
return 1;
|
|
}
|
|
if (level == 1 && (pte & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
|
|
if (pap != NULL)
|
|
*pap = (pte & PG_LGFRAME) | (va & PAGE_MASK_L2);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* pmap_zero_page: zero a page
|
|
*/
|
|
|
|
void
|
|
pmap_zero_page(struct vm_page *pg)
|
|
{
|
|
pagezero(pmap_map_direct(pg));
|
|
}
|
|
|
|
/*
|
|
* pmap_flush_cache: flush the cache for a virtual address.
|
|
*/
|
|
void
|
|
pmap_flush_cache(vaddr_t addr, vsize_t len)
|
|
{
|
|
vaddr_t i;
|
|
|
|
if (curcpu()->ci_cflushsz == 0) {
|
|
wbinvd_on_all_cpus();
|
|
return;
|
|
}
|
|
|
|
/* all cpus that have clflush also have mfence. */
|
|
mfence();
|
|
for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
|
|
clflush(i);
|
|
mfence();
|
|
}
|
|
|
|
/*
|
|
* pmap_copy_page: copy a page
|
|
*/
|
|
|
|
void
|
|
pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
|
|
{
|
|
vaddr_t srcva = pmap_map_direct(srcpg);
|
|
vaddr_t dstva = pmap_map_direct(dstpg);
|
|
|
|
memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
|
|
}
|
|
|
|
/*
|
|
* p m a p r e m o v e f u n c t i o n s
|
|
*
|
|
* functions that remove mappings
|
|
*/
|
|
|
|
/*
|
|
* pmap_remove_ptes: remove PTEs from a PTP
|
|
*
|
|
* => PTP must be mapped into KVA
|
|
* => PTP should be null if pmap == pmap_kernel()
|
|
*/
|
|
|
|
void
|
|
pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
|
|
vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
|
|
{
|
|
struct pv_entry *pve;
|
|
pt_entry_t *pte = (pt_entry_t *) ptpva;
|
|
struct vm_page *pg;
|
|
pt_entry_t opte;
|
|
|
|
/*
|
|
* note that ptpva points to the PTE that maps startva. this may
|
|
* or may not be the first PTE in the PTP.
|
|
*
|
|
* we loop through the PTP while there are still PTEs to look at
|
|
* and the wire_count is greater than 1 (because we use the wire_count
|
|
* to keep track of the number of real PTEs in the PTP).
|
|
*/
|
|
|
|
for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
|
|
; pte++, startva += PAGE_SIZE) {
|
|
if (!pmap_valid_entry(*pte))
|
|
continue; /* VA not mapped */
|
|
if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
|
|
continue;
|
|
}
|
|
|
|
/* atomically save the old PTE and zap! it */
|
|
opte = pmap_pte_set(pte, 0);
|
|
|
|
if (opte & PG_W)
|
|
pmap->pm_stats.wired_count--;
|
|
pmap->pm_stats.resident_count--;
|
|
|
|
if (ptp != NULL)
|
|
ptp->wire_count--; /* dropping a PTE */
|
|
|
|
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
|
|
|
|
/*
|
|
* if we are not on a pv list we are done.
|
|
*/
|
|
|
|
if ((opte & PG_PVLIST) == 0) {
|
|
#ifdef DIAGNOSTIC
|
|
if (pg != NULL)
|
|
panic("%s: managed page without PG_PVLIST: "
|
|
"va 0x%lx, opte 0x%llx", __func__,
|
|
startva, opte);
|
|
#endif
|
|
continue;
|
|
}
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (pg == NULL)
|
|
panic("%s: unmanaged page marked PG_PVLIST: "
|
|
"va 0x%lx, opte 0x%llx", __func__,
|
|
startva, opte);
|
|
#endif
|
|
|
|
/* sync R/M bits */
|
|
pmap_sync_flags_pte(pg, opte);
|
|
pve = pmap_remove_pv(pg, pmap, startva);
|
|
if (pve != NULL) {
|
|
pve->pv_next = *free_pvs;
|
|
*free_pvs = pve;
|
|
}
|
|
|
|
/* end of "for" loop: time for next pte */
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pmap_remove_pte: remove a single PTE from a PTP
|
|
*
|
|
* => PTP must be mapped into KVA
|
|
* => PTP should be null if pmap == pmap_kernel()
|
|
* => returns true if we removed a mapping
|
|
*/
|
|
|
|
int
|
|
pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
|
|
vaddr_t va, int flags, struct pv_entry **free_pvs)
|
|
{
|
|
struct pv_entry *pve;
|
|
struct vm_page *pg;
|
|
pt_entry_t opte;
|
|
|
|
if (!pmap_valid_entry(*pte))
|
|
return 0; /* VA not mapped */
|
|
if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
|
|
return 0;
|
|
}
|
|
|
|
/* atomically save the old PTE and zap! it */
|
|
opte = pmap_pte_set(pte, 0);
|
|
|
|
if (opte & PG_W)
|
|
pmap->pm_stats.wired_count--;
|
|
pmap->pm_stats.resident_count--;
|
|
|
|
if (ptp != NULL)
|
|
ptp->wire_count--; /* dropping a PTE */
|
|
|
|
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
|
|
|
|
/*
|
|
* if we are not on a pv list we are done.
|
|
*/
|
|
if ((opte & PG_PVLIST) == 0) {
|
|
#ifdef DIAGNOSTIC
|
|
if (pg != NULL)
|
|
panic("%s: managed page without PG_PVLIST: "
|
|
"va 0x%lx, opte 0x%llx", __func__, va, opte);
|
|
#endif
|
|
return 1;
|
|
}
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (pg == NULL)
|
|
panic("%s: unmanaged page marked PG_PVLIST: "
|
|
"va 0x%lx, opte 0x%llx", __func__, va, opte);
|
|
#endif
|
|
|
|
/* sync R/M bits */
|
|
pmap_sync_flags_pte(pg, opte);
|
|
pve = pmap_remove_pv(pg, pmap, va);
|
|
if (pve != NULL) {
|
|
pve->pv_next = *free_pvs;
|
|
*free_pvs = pve;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* pmap_remove: top level mapping removal function
|
|
*
|
|
* => caller should not be holding any pmap locks
|
|
*/
|
|
|
|
void
|
|
pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
|
|
{
|
|
if (pmap->pm_type == PMAP_TYPE_EPT)
|
|
pmap_remove_ept(pmap, sva, eva);
|
|
else
|
|
pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
|
|
}
|
|
|
|
/*
|
|
* pmap_do_remove: mapping removal guts
|
|
*
|
|
* => caller should not be holding any pmap locks
|
|
*/
|
|
|
|
void
|
|
pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
|
|
{
|
|
pd_entry_t pde;
|
|
int result;
|
|
paddr_t ptppa;
|
|
vaddr_t blkendva;
|
|
struct vm_page *ptp;
|
|
struct pv_entry *pve;
|
|
struct pv_entry *free_pvs = NULL;
|
|
vaddr_t va;
|
|
int shootall = 0, shootself;
|
|
struct pg_to_free empty_ptps;
|
|
paddr_t scr3;
|
|
|
|
TAILQ_INIT(&empty_ptps);
|
|
|
|
scr3 = pmap_map_ptes(pmap);
|
|
shootself = (scr3 == 0);
|
|
|
|
/*
|
|
* removing one page? take shortcut function.
|
|
*/
|
|
|
|
if (sva + PAGE_SIZE == eva) {
|
|
if (pmap_pdes_valid(sva, &pde)) {
|
|
|
|
/* PA of the PTP */
|
|
ptppa = pde & PG_FRAME;
|
|
|
|
/* get PTP if non-kernel mapping */
|
|
|
|
if (pmap == pmap_kernel()) {
|
|
/* we never free kernel PTPs */
|
|
ptp = NULL;
|
|
} else {
|
|
ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
|
|
#ifdef DIAGNOSTIC
|
|
if (ptp == NULL)
|
|
panic("%s: unmanaged PTP detected "
|
|
"in shortcut path", __func__);
|
|
#endif
|
|
}
|
|
|
|
/* do it! */
|
|
result = pmap_remove_pte(pmap, ptp,
|
|
&PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs);
|
|
|
|
/*
|
|
* if mapping removed and the PTP is no longer
|
|
* being used, free it!
|
|
*/
|
|
|
|
if (result && ptp && ptp->wire_count <= 1)
|
|
pmap_free_ptp(pmap, ptp, sva, &empty_ptps);
|
|
pmap_tlb_shootpage(pmap, sva, shootself);
|
|
pmap_unmap_ptes(pmap, scr3);
|
|
pmap_tlb_shootwait();
|
|
} else {
|
|
pmap_unmap_ptes(pmap, scr3);
|
|
}
|
|
|
|
goto cleanup;
|
|
}
|
|
|
|
if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
|
|
shootall = 1;
|
|
|
|
for (va = sva; va < eva; va = blkendva) {
|
|
/* determine range of block */
|
|
blkendva = x86_round_pdr(va + 1);
|
|
if (blkendva > eva)
|
|
blkendva = eva;
|
|
|
|
/*
|
|
* XXXCDC: our PTE mappings should never be removed
|
|
* with pmap_remove! if we allow this (and why would
|
|
* we?) then we end up freeing the pmap's page
|
|
* directory page (PDP) before we are finished using
|
|
* it when we hit it in the recursive mapping. this
|
|
* is BAD.
|
|
*
|
|
* long term solution is to move the PTEs out of user
|
|
* address space. and into kernel address space (up
|
|
* with APTE). then we can set VM_MAXUSER_ADDRESS to
|
|
* be VM_MAX_ADDRESS.
|
|
*/
|
|
|
|
if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
|
|
/* XXXCDC: ugly hack to avoid freeing PDP here */
|
|
continue;
|
|
|
|
if (!pmap_pdes_valid(va, &pde))
|
|
continue;
|
|
|
|
/* PA of the PTP */
|
|
ptppa = pde & PG_FRAME;
|
|
|
|
/* get PTP if non-kernel mapping */
|
|
if (pmap == pmap_kernel()) {
|
|
/* we never free kernel PTPs */
|
|
ptp = NULL;
|
|
} else {
|
|
ptp = pmap_find_ptp(pmap, va, ptppa, 1);
|
|
#ifdef DIAGNOSTIC
|
|
if (ptp == NULL)
|
|
panic("%s: unmanaged PTP detected", __func__);
|
|
#endif
|
|
}
|
|
pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)],
|
|
va, blkendva, flags, &free_pvs);
|
|
|
|
/* if PTP is no longer being used, free it! */
|
|
if (ptp && ptp->wire_count <= 1) {
|
|
pmap_free_ptp(pmap, ptp, va, &empty_ptps);
|
|
}
|
|
}
|
|
|
|
if (shootall)
|
|
pmap_tlb_shoottlb(pmap, shootself);
|
|
else
|
|
pmap_tlb_shootrange(pmap, sva, eva, shootself);
|
|
|
|
pmap_unmap_ptes(pmap, scr3);
|
|
pmap_tlb_shootwait();
|
|
|
|
cleanup:
|
|
while ((pve = free_pvs) != NULL) {
|
|
free_pvs = pve->pv_next;
|
|
pool_put(&pmap_pv_pool, pve);
|
|
}
|
|
|
|
while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
|
|
TAILQ_REMOVE(&empty_ptps, ptp, pageq);
|
|
uvm_pagefree(ptp);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pmap_page_remove: remove a managed vm_page from all pmaps that map it
|
|
*
|
|
* => R/M bits are sync'd back to attrs
|
|
*/
|
|
|
|
void
|
|
pmap_page_remove(struct vm_page *pg)
|
|
{
|
|
struct pv_entry *pve;
|
|
struct pmap *pm;
|
|
pt_entry_t opte;
|
|
#ifdef DIAGNOSTIC
|
|
pd_entry_t pde;
|
|
#endif
|
|
struct pg_to_free empty_ptps;
|
|
struct vm_page *ptp;
|
|
paddr_t scr3;
|
|
int shootself;
|
|
|
|
TAILQ_INIT(&empty_ptps);
|
|
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
while ((pve = pg->mdpage.pv_list) != NULL) {
|
|
pmap_reference(pve->pv_pmap);
|
|
pm = pve->pv_pmap;
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
|
|
/* XXX use direct map? */
|
|
scr3 = pmap_map_ptes(pm); /* locks pmap */
|
|
shootself = (scr3 == 0);
|
|
|
|
/*
|
|
* We dropped the pvlist lock before grabbing the pmap
|
|
* lock to avoid lock ordering problems. This means
|
|
* we have to check the pvlist again since somebody
|
|
* else might have modified it. All we care about is
|
|
* that the pvlist entry matches the pmap we just
|
|
* locked. If it doesn't, unlock the pmap and try
|
|
* again.
|
|
*/
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
if ((pve = pg->mdpage.pv_list) == NULL ||
|
|
pve->pv_pmap != pm) {
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
pmap_unmap_ptes(pm, scr3); /* unlocks pmap */
|
|
pmap_destroy(pm);
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
continue;
|
|
}
|
|
|
|
pg->mdpage.pv_list = pve->pv_next;
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) &&
|
|
(pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
|
|
printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
|
|
pg, pve->pv_va, pve->pv_ptp);
|
|
printf("%s: PTP's phys addr: "
|
|
"actual=%lx, recorded=%lx\n", __func__,
|
|
(unsigned long)(pde & PG_FRAME),
|
|
VM_PAGE_TO_PHYS(pve->pv_ptp));
|
|
panic("%s: mapped managed page has "
|
|
"invalid pv_ptp field", __func__);
|
|
}
|
|
#endif
|
|
|
|
/* atomically save the old PTE and zap it */
|
|
opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0);
|
|
|
|
if (opte & PG_W)
|
|
pve->pv_pmap->pm_stats.wired_count--;
|
|
pve->pv_pmap->pm_stats.resident_count--;
|
|
|
|
pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
|
|
|
|
pmap_sync_flags_pte(pg, opte);
|
|
|
|
/* update the PTP reference count. free if last reference. */
|
|
if (pve->pv_ptp != NULL) {
|
|
pve->pv_ptp->wire_count--;
|
|
if (pve->pv_ptp->wire_count <= 1) {
|
|
pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
|
|
pve->pv_va, &empty_ptps);
|
|
}
|
|
}
|
|
pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */
|
|
pmap_destroy(pve->pv_pmap);
|
|
pool_put(&pmap_pv_pool, pve);
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
}
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
|
|
pmap_tlb_shootwait();
|
|
|
|
while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
|
|
TAILQ_REMOVE(&empty_ptps, ptp, pageq);
|
|
uvm_pagefree(ptp);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* p m a p a t t r i b u t e f u n c t i o n s
|
|
* functions that test/change managed page's attributes
|
|
* since a page can be mapped multiple times we must check each PTE that
|
|
* maps it by going down the pv lists.
|
|
*/
|
|
|
|
/*
|
|
* pmap_test_attrs: test a page's attributes
|
|
*/
|
|
|
|
int
|
|
pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
|
|
{
|
|
struct pv_entry *pve;
|
|
pt_entry_t *ptes;
|
|
int level, offs;
|
|
u_long mybits, testflags;
|
|
|
|
testflags = pmap_pte2flags(testbits);
|
|
|
|
if (pg->pg_flags & testflags)
|
|
return 1;
|
|
|
|
mybits = 0;
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
|
|
pve = pve->pv_next) {
|
|
level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
|
|
&offs);
|
|
mybits |= (ptes[offs] & testbits);
|
|
}
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
|
|
if (mybits == 0)
|
|
return 0;
|
|
|
|
atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* pmap_clear_attrs: change a page's attributes
|
|
*
|
|
* => we return 1 if we cleared one of the bits we were asked to
|
|
*/
|
|
|
|
int
|
|
pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
|
|
{
|
|
struct pv_entry *pve;
|
|
pt_entry_t *ptes, opte;
|
|
u_long clearflags;
|
|
int result, level, offs;
|
|
|
|
clearflags = pmap_pte2flags(clearbits);
|
|
|
|
result = pg->pg_flags & clearflags;
|
|
if (result)
|
|
atomic_clearbits_int(&pg->pg_flags, clearflags);
|
|
|
|
mtx_enter(&pg->mdpage.pv_mtx);
|
|
for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
|
|
level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
|
|
&offs);
|
|
opte = ptes[offs];
|
|
if (opte & clearbits) {
|
|
result = 1;
|
|
pmap_pte_clearbits(&ptes[offs], (opte & clearbits));
|
|
pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
|
|
pmap_is_curpmap(pve->pv_pmap));
|
|
}
|
|
}
|
|
mtx_leave(&pg->mdpage.pv_mtx);
|
|
|
|
pmap_tlb_shootwait();
|
|
|
|
return (result != 0);
|
|
}
|
|
|
|
/*
|
|
* p m a p p r o t e c t i o n f u n c t i o n s
|
|
*/
|
|
|
|
/*
|
|
* pmap_page_protect: change the protection of all recorded mappings
|
|
* of a managed page
|
|
*
|
|
* => NOTE: this is an inline function in pmap.h
|
|
*/
|
|
|
|
/* see pmap.h */
|
|
|
|
/*
|
|
* pmap_protect: set the protection of the pages in a pmap
|
|
*
|
|
* => NOTE: this is an inline function in pmap.h
|
|
*/
|
|
|
|
/* see pmap.h */
|
|
|
|
/*
|
|
* pmap_write_protect: write-protect pages in a pmap
|
|
*/
|
|
|
|
void
|
|
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
|
|
{
|
|
pt_entry_t *spte, *epte;
|
|
pt_entry_t clear = 0, set = 0;
|
|
vaddr_t blockend;
|
|
int shootall = 0, shootself;
|
|
vaddr_t va;
|
|
paddr_t scr3;
|
|
|
|
scr3 = pmap_map_ptes(pmap);
|
|
shootself = (scr3 == 0);
|
|
|
|
/* should be ok, but just in case ... */
|
|
sva &= PG_FRAME;
|
|
eva &= PG_FRAME;
|
|
|
|
if (!(prot & PROT_READ))
|
|
set |= pg_xo;
|
|
if (!(prot & PROT_WRITE))
|
|
clear = PG_RW;
|
|
if (!(prot & PROT_EXEC))
|
|
set |= pg_nx;
|
|
|
|
if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
|
|
shootall = 1;
|
|
|
|
for (va = sva; va < eva ; va = blockend) {
|
|
blockend = (va & L2_FRAME) + NBPD_L2;
|
|
if (blockend > eva)
|
|
blockend = eva;
|
|
|
|
/*
|
|
* XXXCDC: our PTE mappings should never be write-protected!
|
|
*
|
|
* long term solution is to move the PTEs out of user
|
|
* address space. and into kernel address space (up
|
|
* with APTE). then we can set VM_MAXUSER_ADDRESS to
|
|
* be VM_MAX_ADDRESS.
|
|
*/
|
|
|
|
/* XXXCDC: ugly hack to avoid freeing PDP here */
|
|
if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
|
|
continue;
|
|
|
|
/* empty block? */
|
|
if (!pmap_pdes_valid(va, NULL))
|
|
continue;
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
|
|
panic("%s: PTE space", __func__);
|
|
#endif
|
|
|
|
spte = &PTE_BASE[pl1_i(va)];
|
|
epte = &PTE_BASE[pl1_i(blockend)];
|
|
|
|
for (/*null */; spte < epte ; spte++) {
|
|
if (!pmap_valid_entry(*spte))
|
|
continue;
|
|
pmap_pte_clearbits(spte, clear);
|
|
pmap_pte_setbits(spte, set);
|
|
}
|
|
}
|
|
|
|
if (shootall)
|
|
pmap_tlb_shoottlb(pmap, shootself);
|
|
else
|
|
pmap_tlb_shootrange(pmap, sva, eva, shootself);
|
|
|
|
pmap_unmap_ptes(pmap, scr3);
|
|
pmap_tlb_shootwait();
|
|
}
|
|
|
|
/*
|
|
* end of protection functions
|
|
*/
|
|
|
|
/*
|
|
* pmap_unwire: clear the wired bit in the PTE
|
|
*
|
|
* => mapping should already be in map
|
|
*/
|
|
|
|
void
|
|
pmap_unwire(struct pmap *pmap, vaddr_t va)
|
|
{
|
|
pt_entry_t *ptes;
|
|
int level, offs;
|
|
|
|
level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
|
|
|
|
if (level == 0) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (!pmap_valid_entry(ptes[offs]))
|
|
panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
|
|
#endif
|
|
if (__predict_true((ptes[offs] & PG_W) != 0)) {
|
|
pmap_pte_clearbits(&ptes[offs], PG_W);
|
|
pmap->pm_stats.wired_count--;
|
|
}
|
|
#ifdef DIAGNOSTIC
|
|
else {
|
|
printf("%s: wiring for pmap %p va 0x%lx "
|
|
"didn't change!\n", __func__, pmap, va);
|
|
}
|
|
#endif
|
|
}
|
|
#ifdef DIAGNOSTIC
|
|
else {
|
|
panic("%s: invalid PDE", __func__);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#if 0
/*
 * pmap_collect: free resources held by a pmap
 *
 * => optional function.
 * => called when a process is swapped out to free memory.
 * => compiled out; kept for reference only
 */
void
pmap_collect(struct pmap *pmap)
{
	/*
	 * Removing the physical mappings for the whole address space
	 * frees all of the pt pages; wired mappings are skipped.
	 */
	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
	    PMAP_REMOVE_SKIPWIRED);
}
#endif
|
|
|
|
void
|
|
pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
|
|
{
|
|
uint64_t l4idx, l3idx, l2idx, l1idx;
|
|
pd_entry_t *pd, *ptp;
|
|
paddr_t npa;
|
|
struct pmap *pmap = pmap_kernel();
|
|
pt_entry_t *ptes;
|
|
int level, offs;
|
|
|
|
/* If CPU is secure, no need to do anything */
|
|
if (!cpu_meltdown)
|
|
return;
|
|
|
|
/* Must be kernel VA */
|
|
if (va < VM_MIN_KERNEL_ADDRESS)
|
|
panic("%s: invalid special mapping va 0x%lx requested",
|
|
__func__, va);
|
|
|
|
if (pmap->pm_pdir_intel == NULL)
|
|
pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
|
|
PR_WAITOK | PR_ZERO);
|
|
|
|
l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
|
|
l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
|
|
l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
|
|
l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
|
|
|
|
DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
|
|
"l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
|
|
(uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
|
|
|
|
/* Start at PML4 / top level */
|
|
pd = pmap->pm_pdir_intel;
|
|
|
|
if (pd == NULL)
|
|
panic("%s: PML4 not initialized for pmap @ %p", __func__,
|
|
pmap);
|
|
|
|
/* npa = physaddr of PDPT */
|
|
npa = pd[l4idx] & PMAP_PA_MASK;
|
|
|
|
/* Valid PML4e for the 512GB region containing va? */
|
|
if (!npa) {
|
|
/* No valid PML4E - allocate PDPT page and set PML4E */
|
|
|
|
ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
|
|
|
|
if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
|
|
panic("%s: can't locate PDPT page", __func__);
|
|
|
|
pd[l4idx] = (npa | PG_RW | PG_V);
|
|
|
|
DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
|
|
"setting PML4e[%lld] = 0x%llx\n", __func__,
|
|
(uint64_t)npa, l4idx, pd[l4idx]);
|
|
}
|
|
|
|
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
|
|
if (pd == NULL)
|
|
panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
|
|
(uint64_t)npa);
|
|
|
|
/* npa = physaddr of PD page */
|
|
npa = pd[l3idx] & PMAP_PA_MASK;
|
|
|
|
/* Valid PDPTe for the 1GB region containing va? */
|
|
if (!npa) {
|
|
/* No valid PDPTe - allocate PD page and set PDPTe */
|
|
|
|
ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
|
|
|
|
if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
|
|
panic("%s: can't locate PD page", __func__);
|
|
|
|
pd[l3idx] = (npa | PG_RW | PG_V);
|
|
|
|
DPRINTF("%s: allocated new PD page at phys 0x%llx, "
|
|
"setting PDPTe[%lld] = 0x%llx\n", __func__,
|
|
(uint64_t)npa, l3idx, pd[l3idx]);
|
|
}
|
|
|
|
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
|
|
if (pd == NULL)
|
|
panic("%s: can't locate PD page @ pa=0x%llx", __func__,
|
|
(uint64_t)npa);
|
|
|
|
/* npa = physaddr of PT page */
|
|
npa = pd[l2idx] & PMAP_PA_MASK;
|
|
|
|
/* Valid PDE for the 2MB region containing va? */
|
|
if (!npa) {
|
|
/* No valid PDE - allocate PT page and set PDE */
|
|
|
|
ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
|
|
|
|
if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
|
|
panic("%s: can't locate PT page", __func__);
|
|
|
|
pd[l2idx] = (npa | PG_RW | PG_V);
|
|
|
|
DPRINTF("%s: allocated new PT page at phys 0x%llx, "
|
|
"setting PDE[%lld] = 0x%llx\n", __func__,
|
|
(uint64_t)npa, l2idx, pd[l2idx]);
|
|
}
|
|
|
|
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
|
|
if (pd == NULL)
|
|
panic("%s: can't locate PT page @ pa=0x%llx", __func__,
|
|
(uint64_t)npa);
|
|
|
|
DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
|
|
"0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
|
|
(uint64_t)prot, (uint64_t)pd[l1idx]);
|
|
|
|
pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W;
|
|
|
|
/*
|
|
* Look up the corresponding U+K entry. If we're installing the
|
|
* same PA into the U-K map then set the PG_G bit on both and copy
|
|
* the cache-control bits from the U+K entry to the U-K entry.
|
|
*/
|
|
level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
|
|
if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
|
|
if (((pd[l1idx] ^ ptes[offs]) & PG_FRAME) == 0) {
|
|
pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT));
|
|
ptes[offs] |= PG_G;
|
|
} else {
|
|
DPRINTF("%s: special diffing mapping at %llx\n",
|
|
__func__, (long long)va);
|
|
}
|
|
} else
|
|
DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
|
|
|
|
DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
|
|
}
|
|
|
|
void
|
|
pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
|
|
{
|
|
vaddr_t v;
|
|
#if NVMM > 0
|
|
struct vmx_invept_descriptor vid;
|
|
#endif /* NVMM > 0 */
|
|
|
|
DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
|
|
(uint64_t)egpa);
|
|
for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
|
|
pmap_do_remove_ept(pmap, v);
|
|
|
|
#if NVMM > 0
|
|
if (pmap->eptp != 0) {
|
|
memset(&vid, 0, sizeof(vid));
|
|
vid.vid_eptp = pmap->eptp;
|
|
DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
|
|
vid.vid_eptp);
|
|
invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid);
|
|
}
|
|
#endif /* NVMM > 0 */
|
|
}
|
|
|
|
void
|
|
pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
|
|
{
|
|
uint64_t l4idx, l3idx, l2idx, l1idx;
|
|
struct vm_page *pg3, *pg2, *pg1;
|
|
paddr_t npa3, npa2, npa1;
|
|
pd_entry_t *pd4, *pd3, *pd2, *pd1;
|
|
pd_entry_t *pptes;
|
|
|
|
l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
|
|
l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
|
|
l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
|
|
l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
|
|
|
|
/* Start at PML4 / top level */
|
|
pd4 = (pd_entry_t *)pmap->pm_pdir;
|
|
|
|
if (pd4 == NULL)
|
|
return;
|
|
|
|
/* npa3 = physaddr of PDPT */
|
|
npa3 = pd4[l4idx] & PMAP_PA_MASK;
|
|
if (!npa3)
|
|
return;
|
|
pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
|
|
pg3 = PHYS_TO_VM_PAGE(npa3);
|
|
|
|
/* npa2 = physaddr of PD page */
|
|
npa2 = pd3[l3idx] & PMAP_PA_MASK;
|
|
if (!npa2)
|
|
return;
|
|
pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
|
|
pg2 = PHYS_TO_VM_PAGE(npa2);
|
|
|
|
/* npa1 = physaddr of PT page */
|
|
npa1 = pd2[l2idx] & PMAP_PA_MASK;
|
|
if (!npa1)
|
|
return;
|
|
pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
|
|
pg1 = PHYS_TO_VM_PAGE(npa1);
|
|
|
|
if (pd1[l1idx] == 0)
|
|
return;
|
|
|
|
pd1[l1idx] = 0;
|
|
pg1->wire_count--;
|
|
pmap->pm_stats.resident_count--;
|
|
|
|
if (pg1->wire_count > 1)
|
|
return;
|
|
|
|
pg1->wire_count = 0;
|
|
pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
|
|
pptes[l2idx] = 0;
|
|
uvm_pagefree(pg1);
|
|
pmap->pm_stats.resident_count--;
|
|
|
|
pg2->wire_count--;
|
|
if (pg2->wire_count > 1)
|
|
return;
|
|
|
|
pg2->wire_count = 0;
|
|
pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
|
|
pptes[l3idx] = 0;
|
|
uvm_pagefree(pg2);
|
|
pmap->pm_stats.resident_count--;
|
|
|
|
pg3->wire_count--;
|
|
if (pg3->wire_count > 1)
|
|
return;
|
|
|
|
pg3->wire_count = 0;
|
|
pptes = pd4;
|
|
pptes[l4idx] = 0;
|
|
uvm_pagefree(pg3);
|
|
pmap->pm_stats.resident_count--;
|
|
}
|
|
|
|
int
|
|
pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
|
|
{
|
|
uint64_t l4idx, l3idx, l2idx, l1idx;
|
|
pd_entry_t *pd, npte;
|
|
struct vm_page *ptp, *pptp;
|
|
paddr_t npa;
|
|
struct uvm_object *obj;
|
|
|
|
if (gpa > MAXDSIZ)
|
|
return ENOMEM;
|
|
|
|
l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
|
|
l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
|
|
l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
|
|
l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
|
|
|
|
/* Start at PML4 / top level */
|
|
pd = (pd_entry_t *)pmap->pm_pdir;
|
|
|
|
if (pd == NULL)
|
|
return ENOMEM;
|
|
|
|
/* npa = physaddr of PDPT */
|
|
npa = pd[l4idx] & PMAP_PA_MASK;
|
|
|
|
/* Valid PML4e for the 512GB region containing gpa? */
|
|
if (!npa) {
|
|
/* No valid PML4e - allocate PDPT page and set PML4e */
|
|
obj = &pmap->pm_obj[2]; /* PML4 UVM object */
|
|
ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL,
|
|
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
|
|
|
|
if (ptp == NULL)
|
|
return ENOMEM;
|
|
|
|
/*
|
|
* New PDPT page - we are setting the first entry, so set
|
|
* the wired count to 1
|
|
*/
|
|
ptp->wire_count = 1;
|
|
|
|
/* Calculate phys address of this new PDPT page */
|
|
npa = VM_PAGE_TO_PHYS(ptp);
|
|
|
|
/*
|
|
* Higher levels get full perms; specific permissions are
|
|
* entered at the lowest level.
|
|
*/
|
|
pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X);
|
|
|
|
pmap->pm_stats.resident_count++;
|
|
|
|
pptp = ptp;
|
|
} else {
|
|
/* Already allocated PML4e */
|
|
pptp = PHYS_TO_VM_PAGE(npa);
|
|
}
|
|
|
|
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
|
|
if (pd == NULL)
|
|
panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
|
|
(uint64_t)npa);
|
|
|
|
/* npa = physaddr of PD page */
|
|
npa = pd[l3idx] & PMAP_PA_MASK;
|
|
|
|
/* Valid PDPTe for the 1GB region containing gpa? */
|
|
if (!npa) {
|
|
/* No valid PDPTe - allocate PD page and set PDPTe */
|
|
obj = &pmap->pm_obj[1]; /* PDPT UVM object */
|
|
ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL,
|
|
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
|
|
|
|
if (ptp == NULL)
|
|
return ENOMEM;
|
|
|
|
/*
|
|
* New PD page - we are setting the first entry, so set
|
|
* the wired count to 1
|
|
*/
|
|
ptp->wire_count = 1;
|
|
pptp->wire_count++;
|
|
|
|
npa = VM_PAGE_TO_PHYS(ptp);
|
|
|
|
/*
|
|
* Higher levels get full perms; specific permissions are
|
|
* entered at the lowest level.
|
|
*/
|
|
pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X);
|
|
|
|
pmap->pm_stats.resident_count++;
|
|
|
|
pptp = ptp;
|
|
} else {
|
|
/* Already allocated PDPTe */
|
|
pptp = PHYS_TO_VM_PAGE(npa);
|
|
}
|
|
|
|
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
|
|
if (pd == NULL)
|
|
panic("%s: can't locate PD page @ pa=0x%llx", __func__,
|
|
(uint64_t)npa);
|
|
|
|
/* npa = physaddr of PT page */
|
|
npa = pd[l2idx] & PMAP_PA_MASK;
|
|
|
|
/* Valid PDE for the 2MB region containing gpa? */
|
|
if (!npa) {
|
|
/* No valid PDE - allocate PT page and set PDE */
|
|
obj = &pmap->pm_obj[0]; /* PDE UVM object */
|
|
ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL,
|
|
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
|
|
|
|
if (ptp == NULL)
|
|
return ENOMEM;
|
|
|
|
pptp->wire_count++;
|
|
|
|
npa = VM_PAGE_TO_PHYS(ptp);
|
|
|
|
/*
|
|
* Higher level get full perms; specific permissions are
|
|
* entered at the lowest level.
|
|
*/
|
|
pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X);
|
|
|
|
pmap->pm_stats.resident_count++;
|
|
|
|
} else {
|
|
/* Find final ptp */
|
|
ptp = PHYS_TO_VM_PAGE(npa);
|
|
if (ptp == NULL)
|
|
panic("%s: ptp page vanished?", __func__);
|
|
}
|
|
|
|
pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
|
|
if (pd == NULL)
|
|
panic("%s: can't locate PT page @ pa=0x%llx", __func__,
|
|
(uint64_t)npa);
|
|
|
|
npte = hpa | EPT_WB;
|
|
if (prot & PROT_READ)
|
|
npte |= EPT_R;
|
|
if (prot & PROT_WRITE)
|
|
npte |= EPT_W;
|
|
if (prot & PROT_EXEC)
|
|
npte |= EPT_X;
|
|
|
|
if (pd[l1idx] == 0) {
|
|
ptp->wire_count++;
|
|
pmap->pm_stats.resident_count++;
|
|
} else {
|
|
/* XXX flush ept */
|
|
}
|
|
|
|
pd[l1idx] = npte;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* pmap_enter: enter a mapping into a pmap
|
|
*
|
|
* => must be done "now" ... no lazy-evaluation
|
|
*/
|
|
|
|
int
|
|
pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
|
|
{
|
|
pt_entry_t opte, npte;
|
|
struct vm_page *ptp, *pg = NULL;
|
|
struct pv_entry *pve, *opve = NULL;
|
|
int ptpdelta, wireddelta, resdelta;
|
|
int wired = (flags & PMAP_WIRED) != 0;
|
|
int nocache = (pa & PMAP_NOCACHE) != 0;
|
|
int wc = (pa & PMAP_WC) != 0;
|
|
int error, shootself;
|
|
paddr_t scr3;
|
|
|
|
if (pmap->pm_type == PMAP_TYPE_EPT)
|
|
return pmap_enter_ept(pmap, va, pa, prot);
|
|
|
|
KASSERT(!(wc && nocache));
|
|
pa &= PMAP_PA_MASK;
|
|
|
|
#ifdef DIAGNOSTIC
|
|
if (va == (vaddr_t) PDP_BASE)
|
|
panic("%s: trying to map over PDP!", __func__);
|
|
|
|
/* sanity check: kernel PTPs should already have been pre-allocated */
|
|
if (va >= VM_MIN_KERNEL_ADDRESS &&
|
|
!pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
|
|
panic("%s: missing kernel PTP for va %lx!", __func__, va);
|
|
|
|
#endif
|
|
|
|
pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
|
|
if (pve == NULL) {
|
|
if (flags & PMAP_CANFAIL) {
|
|
error = ENOMEM;
|
|
goto out;
|
|
}
|
|
panic("%s: no pv entries available", __func__);
|
|
}
|
|
|
|
/*
|
|
* map in ptes and get a pointer to our PTP (unless we are the kernel)
|
|
*/
|
|
|
|
scr3 = pmap_map_ptes(pmap);
|
|
shootself = (scr3 == 0);
|
|
if (pmap == pmap_kernel()) {
|
|
ptp = NULL;
|
|
} else {
|
|
ptp = pmap_get_ptp(pmap, va);
|
|
if (ptp == NULL) {
|
|
if (flags & PMAP_CANFAIL) {
|
|
pmap_unmap_ptes(pmap, scr3);
|
|
error = ENOMEM;
|
|
goto out;
|
|
}
|
|
panic("%s: get ptp failed", __func__);
|
|
}
|
|
}
|
|
opte = PTE_BASE[pl1_i(va)]; /* old PTE */
|
|
|
|
/*
|
|
* is there currently a valid mapping at our VA?
|
|
*/
|
|
|
|
if (pmap_valid_entry(opte)) {
|
|
/*
|
|
* first, calculate pm_stats updates. resident count will not
|
|
* change since we are replacing/changing a valid mapping.
|
|
* wired count might change...
|
|
*/
|
|
|
|
resdelta = 0;
|
|
if (wired && (opte & PG_W) == 0)
|
|
wireddelta = 1;
|
|
else if (!wired && (opte & PG_W) != 0)
|
|
wireddelta = -1;
|
|
else
|
|
wireddelta = 0;
|
|
ptpdelta = 0;
|
|
|
|
/*
|
|
* is the currently mapped PA the same as the one we
|
|
* want to map?
|
|
*/
|
|
|
|
if ((opte & PG_FRAME) == pa) {
|
|
|
|
/* if this is on the PVLIST, sync R/M bit */
|
|
if (opte & PG_PVLIST) {
|
|
pg = PHYS_TO_VM_PAGE(pa);
|
|
#ifdef DIAGNOSTIC
|
|
if (pg == NULL)
|
|
panic("%s: same pa, PG_PVLIST "
|
|
"mapping with unmanaged page: "
|
|
"va 0x%lx, opte 0x%llx, pa 0x%lx",
|
|
__func__, va, opte, pa);
|
|
#endif
|
|
pmap_sync_flags_pte(pg, opte);
|
|
} else {
|
|
#ifdef DIAGNOSTIC
|
|
if (PHYS_TO_VM_PAGE(pa) != NULL)
|
|
panic("%s: same pa, no PG_PVLIST "
|
|
"mapping with managed page: "
|
|
"va 0x%lx, opte 0x%llx, pa 0x%lx",
|
|
__func__, va, opte, pa);
|
|
#endif
|
|
}
|
|
goto enter_now;
|
|
}
|
|
|
|
/*
|
|
* changing PAs: we must remove the old one first
|
|
*/
|
|
|
|
/*
|
|
* if current mapping is on a pvlist,
|
|
* remove it (sync R/M bits)
|
|
*/
|
|
|
|
if (opte & PG_PVLIST) {
|
|
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
|
|
#ifdef DIAGNOSTIC
|
|
if (pg == NULL)
|
|
panic("%s: PG_PVLIST mapping with unmanaged "
|
|
"page: va 0x%lx, opte 0x%llx, pa 0x%lx",
|
|
__func__, va, opte, pa);
|
|
#endif
|
|
pmap_sync_flags_pte(pg, opte);
|
|
opve = pmap_remove_pv(pg, pmap, va);
|
|
pg = NULL; /* This is not the page we are looking for */
|
|
}
|
|
} else { /* opte not valid */
|
|
resdelta = 1;
|
|
if (wired)
|
|
wireddelta = 1;
|
|
else
|
|
wireddelta = 0;
|
|
if (ptp != NULL)
|
|
ptpdelta = 1;
|
|
else
|
|
ptpdelta = 0;
|
|
}
|
|
|
|
/*
|
|
* pve is either NULL or points to a now-free pv_entry structure
|
|
* (the latter case is if we called pmap_remove_pv above).
|
|
*
|
|
* if this entry is to be on a pvlist, enter it now.
|
|
*/
|
|
|
|
if (pmap_initialized)
|
|
pg = PHYS_TO_VM_PAGE(pa);
|
|
|
|
if (pg != NULL) {
|
|
pmap_enter_pv(pg, pve, pmap, va, ptp);
|
|
pve = NULL;
|
|
}
|
|
|
|
enter_now:
|
|
/*
|
|
* at this point pg is !NULL if we want the PG_PVLIST bit set
|
|
*/
|
|
|
|
pmap->pm_stats.resident_count += resdelta;
|
|
pmap->pm_stats.wired_count += wireddelta;
|
|
if (ptp != NULL)
|
|
ptp->wire_count += ptpdelta;
|
|
|
|
KASSERT(pg == PHYS_TO_VM_PAGE(pa));
|
|
|
|
npte = pa | protection_codes[prot] | PG_V;
|
|
if (pg != NULL) {
|
|
npte |= PG_PVLIST;
|
|
/*
|
|
* make sure that if the page is write combined all
|
|
* instances of pmap_enter make it so.
|
|
*/
|
|
if (pg->pg_flags & PG_PMAP_WC) {
|
|
KASSERT(nocache == 0);
|
|
wc = 1;
|
|
}
|
|
}
|
|
if (wc)
|
|
npte |= pmap_pg_wc;
|
|
if (wired)
|
|
npte |= PG_W;
|
|
if (nocache)
|
|
npte |= PG_N;
|
|
if (va < VM_MAXUSER_ADDRESS)
|
|
npte |= ((flags & PMAP_EFI) ? 0 : PG_u);
|
|
else if (va < VM_MAX_ADDRESS)
|
|
npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
|
|
if (pmap == pmap_kernel())
|
|
npte |= pg_g_kern;
|
|
|
|
/*
|
|
* If the old entry wasn't valid, we can just update it and
|
|
* go. If it was valid, and this isn't a read->write
|
|
* transition, then we can safely just update it and flush
|
|
* any old TLB entries.
|
|
*
|
|
* If it _was_ valid and this _is_ a read->write transition,
|
|
* then this could be a CoW resolution and we need to make
|
|
* sure no CPU can see the new writable mapping while another
|
|
* still has the old mapping in its TLB, so insert a correct
|
|
* but unwritable mapping, flush any old TLB entries, then
|
|
* make it writable.
|
|
*/
|
|
if (! pmap_valid_entry(opte)) {
|
|
PTE_BASE[pl1_i(va)] = npte;
|
|
} else if ((opte | (npte ^ PG_RW)) & PG_RW) {
|
|
/* previously writable or not making writable */
|
|
PTE_BASE[pl1_i(va)] = npte;
|
|
if (nocache && (opte & PG_N) == 0)
|
|
wbinvd_on_all_cpus();
|
|
pmap_tlb_shootpage(pmap, va, shootself);
|
|
} else {
|
|
PTE_BASE[pl1_i(va)] = npte ^ PG_RW;
|
|
if (nocache && (opte & PG_N) == 0) /* XXX impossible? */
|
|
wbinvd_on_all_cpus();
|
|
pmap_tlb_shootpage(pmap, va, shootself);
|
|
pmap_tlb_shootwait();
|
|
PTE_BASE[pl1_i(va)] = npte;
|
|
}
|
|
|
|
pmap_unmap_ptes(pmap, scr3);
|
|
pmap_tlb_shootwait();
|
|
|
|
error = 0;
|
|
|
|
out:
|
|
if (pve != NULL)
|
|
pool_put(&pmap_pv_pool, pve);
|
|
if (opve != NULL)
|
|
pool_put(&pmap_pv_pool, opve);
|
|
|
|
return error;
|
|
}
|
|
|
|
int
|
|
pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
|
|
{
|
|
struct vm_page *ptp;
|
|
struct pmap *kpm = pmap_kernel();
|
|
|
|
if (uvm.page_init_done == 0) {
|
|
vaddr_t va;
|
|
|
|
/*
|
|
* we're growing the kernel pmap early (from
|
|
* uvm_pageboot_alloc()). this case must be
|
|
* handled a little differently.
|
|
*/
|
|
|
|
va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
|
|
*paddrp = PMAP_DIRECT_UNMAP(va);
|
|
} else {
|
|
ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
|
|
ptp_va2o(va, level), NULL,
|
|
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
|
|
if (ptp == NULL)
|
|
panic("%s: out of memory", __func__);
|
|
atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
|
|
ptp->wire_count = 1;
|
|
*paddrp = VM_PAGE_TO_PHYS(ptp);
|
|
}
|
|
kpm->pm_stats.resident_count++;
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Allocate the amount of specified ptps for a ptp level, and populate
|
|
* all levels below accordingly, mapping virtual addresses starting at
|
|
* kva.
|
|
*
|
|
* Used by pmap_growkernel.
|
|
*/
|
|
void
|
|
pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps)
|
|
{
|
|
unsigned long i;
|
|
vaddr_t va;
|
|
paddr_t pa;
|
|
unsigned long index, endindex;
|
|
int level;
|
|
pd_entry_t *pdep;
|
|
|
|
for (level = lvl; level > 1; level--) {
|
|
if (level == PTP_LEVELS)
|
|
pdep = pmap_kernel()->pm_pdir;
|
|
else
|
|
pdep = normal_pdes[level - 2];
|
|
va = kva;
|
|
index = pl_i(kva, level);
|
|
endindex = index + needed_ptps[level - 1];
|
|
/*
|
|
* XXX special case for first time call.
|
|
*/
|
|
if (nkptp[level - 1] != 0)
|
|
index++;
|
|
else
|
|
endindex--;
|
|
|
|
for (i = index; i <= endindex; i++) {
|
|
pmap_get_physpage(va, level - 1, &pa);
|
|
pdep[i] = pa | PG_RW | PG_V | pg_nx;
|
|
nkptp[level - 1]++;
|
|
va += nbpd[level - 1];
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pmap_growkernel: increase usage of KVM space
|
|
*
|
|
* => we allocate new PTPs for the kernel and install them in all
|
|
* the pmaps on the system.
|
|
*/
|
|
|
|
static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
|
|
|
|
vaddr_t
|
|
pmap_growkernel(vaddr_t maxkvaddr)
|
|
{
|
|
struct pmap *kpm = pmap_kernel(), *pm;
|
|
int s, i;
|
|
unsigned newpdes;
|
|
long needed_kptp[PTP_LEVELS], target_nptp, old;
|
|
|
|
if (maxkvaddr <= pmap_maxkvaddr)
|
|
return pmap_maxkvaddr;
|
|
|
|
maxkvaddr = x86_round_pdr(maxkvaddr);
|
|
old = nkptp[PTP_LEVELS - 1];
|
|
/*
|
|
* This loop could be optimized more, but pmap_growkernel()
|
|
* is called infrequently.
|
|
*/
|
|
for (i = PTP_LEVELS - 1; i >= 1; i--) {
|
|
target_nptp = pl_i(maxkvaddr, i + 1) -
|
|
pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
|
|
/*
|
|
* XXX only need to check toplevel.
|
|
*/
|
|
if (target_nptp > nkptpmax[i])
|
|
panic("%s: out of KVA space", __func__);
|
|
needed_kptp[i] = target_nptp - nkptp[i] + 1;
|
|
}
|
|
|
|
|
|
s = splhigh(); /* to be safe */
|
|
pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
|
|
|
|
/*
|
|
* If the number of top level entries changed, update all
|
|
* pmaps.
|
|
*/
|
|
if (needed_kptp[PTP_LEVELS - 1] != 0) {
|
|
newpdes = nkptp[PTP_LEVELS - 1] - old;
|
|
mtx_enter(&pmaps_lock);
|
|
LIST_FOREACH(pm, &pmaps, pm_list) {
|
|
memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
|
|
&kpm->pm_pdir[PDIR_SLOT_KERN + old],
|
|
newpdes * sizeof (pd_entry_t));
|
|
}
|
|
mtx_leave(&pmaps_lock);
|
|
}
|
|
pmap_maxkvaddr = maxkvaddr;
|
|
splx(s);
|
|
|
|
return maxkvaddr;
|
|
}
|
|
|
|
vaddr_t
|
|
pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
|
|
{
|
|
int segno;
|
|
u_int npg;
|
|
vaddr_t va;
|
|
paddr_t pa;
|
|
struct vm_physseg *seg;
|
|
|
|
size = round_page(size);
|
|
npg = atop(size);
|
|
|
|
for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
|
|
if (seg->avail_end - seg->avail_start < npg)
|
|
continue;
|
|
/*
|
|
* We can only steal at an ``unused'' segment boundary,
|
|
* i.e. either at the start or at the end.
|
|
*/
|
|
if (seg->avail_start == seg->start ||
|
|
seg->avail_end == seg->end)
|
|
break;
|
|
}
|
|
if (segno == vm_nphysseg) {
|
|
panic("%s: out of memory", __func__);
|
|
} else {
|
|
if (seg->avail_start == seg->start) {
|
|
pa = ptoa(seg->avail_start);
|
|
seg->avail_start += npg;
|
|
seg->start += npg;
|
|
} else {
|
|
pa = ptoa(seg->avail_end) - size;
|
|
seg->avail_end -= npg;
|
|
seg->end -= npg;
|
|
}
|
|
/*
|
|
* If all the segment has been consumed now, remove it.
|
|
* Note that the crash dump code still knows about it
|
|
* and will dump it correctly.
|
|
*/
|
|
if (seg->start == seg->end) {
|
|
if (vm_nphysseg-- == 1)
|
|
panic("%s: out of memory", __func__);
|
|
while (segno < vm_nphysseg) {
|
|
seg[0] = seg[1]; /* struct copy */
|
|
seg++;
|
|
segno++;
|
|
}
|
|
}
|
|
|
|
va = PMAP_DIRECT_MAP(pa);
|
|
memset((void *)va, 0, size);
|
|
}
|
|
|
|
if (start != NULL)
|
|
*start = virtual_avail;
|
|
if (end != NULL)
|
|
*end = VM_MAX_KERNEL_ADDRESS;
|
|
|
|
return (va);
|
|
}
|
|
|
|
void
|
|
pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
|
|
{
|
|
*vstartp = virtual_avail;
|
|
*vendp = VM_MAX_KERNEL_ADDRESS;
|
|
}
|
|
|
|
/*
|
|
* pmap_convert
|
|
*
|
|
* Converts 'pmap' to the new 'mode'.
|
|
*
|
|
* Parameters:
|
|
* pmap: the pmap to convert
|
|
* mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
|
|
*/
|
|
void
|
|
pmap_convert(struct pmap *pmap, int mode)
|
|
{
|
|
pt_entry_t *pte;
|
|
|
|
pmap->pm_type = mode;
|
|
|
|
if (mode == PMAP_TYPE_EPT) {
|
|
/* Clear PML4 */
|
|
pte = (pt_entry_t *)pmap->pm_pdir;
|
|
memset(pte, 0, PAGE_SIZE);
|
|
|
|
/* Give back the meltdown pdir */
|
|
if (pmap->pm_pdir_intel != NULL) {
|
|
pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
|
|
pmap->pm_pdir_intel = NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef MULTIPROCESSOR
|
|
/*
|
|
* Locking for tlb shootdown.
|
|
*
|
|
* We lock by setting tlb_shoot_wait to the number of cpus that will
|
|
* receive our tlb shootdown. After sending the IPIs, we don't need to
|
|
* worry about locking order or interrupts spinning for the lock because
|
|
* the call that grabs the "lock" isn't the one that releases it. And
|
|
* there is nothing that can block the IPI that releases the lock.
|
|
*
|
|
* The functions are organized so that we first count the number of
|
|
* cpus we need to send the IPI to, then we grab the counter, then
|
|
* we send the IPIs, then we finally do our own shootdown.
|
|
*
|
|
* Our shootdown is last to make it parallel with the other cpus
|
|
* to shorten the spin time.
|
|
*
|
|
* Notice that we depend on failures to send IPIs only being able to
|
|
* happen during boot. If they happen later, the above assumption
|
|
* doesn't hold since we can end up in situations where no one will
|
|
* release the lock if we get an interrupt in a bad moment.
|
|
*/
|
|
#ifdef MP_LOCKDEBUG
|
|
#include <ddb/db_output.h>
|
|
extern int __mp_lock_spinout;
|
|
#endif
|
|
|
|
volatile long tlb_shoot_wait __attribute__((section(".kudata")));
|
|
|
|
volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
|
|
volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
|
|
volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));
|
|
|
|
|
|
/* Obtain the "lock" for TLB shooting */
|
|
static inline int
|
|
pmap_start_tlb_shoot(long wait, const char *func)
|
|
{
|
|
int s = splvm();
|
|
|
|
while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
|
|
#ifdef MP_LOCKDEBUG
|
|
int nticks = __mp_lock_spinout;
|
|
#endif
|
|
while (tlb_shoot_wait != 0) {
|
|
CPU_BUSY_CYCLE();
|
|
#ifdef MP_LOCKDEBUG
|
|
if (--nticks <= 0) {
|
|
db_printf("%s: spun out", func);
|
|
db_enter();
|
|
nticks = __mp_lock_spinout;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
void
|
|
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
|
|
{
|
|
struct cpu_info *ci, *self = curcpu();
|
|
CPU_INFO_ITERATOR cii;
|
|
long wait = 0;
|
|
u_int64_t mask = 0;
|
|
int is_kva = va >= VM_MIN_KERNEL_ADDRESS;
|
|
|
|
CPU_INFO_FOREACH(cii, ci) {
|
|
if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
|
|
continue;
|
|
if (!is_kva && !pmap_is_active(pm, ci))
|
|
continue;
|
|
mask |= (1ULL << ci->ci_cpuid);
|
|
wait++;
|
|
}
|
|
|
|
if (wait > 0) {
|
|
int s = pmap_start_tlb_shoot(wait, __func__);
|
|
|
|
tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
|
|
tlb_shoot_addr1 = va;
|
|
CPU_INFO_FOREACH(cii, ci) {
|
|
if ((mask & (1ULL << ci->ci_cpuid)) == 0)
|
|
continue;
|
|
if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
|
|
panic("%s: ipi failed", __func__);
|
|
}
|
|
splx(s);
|
|
}
|
|
|
|
if (!pmap_use_pcid) {
|
|
if (shootself)
|
|
pmap_update_pg(va);
|
|
} else if (is_kva) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
invpcid(INVPCID_ADDR, PCID_KERN, va);
|
|
} else if (shootself) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
if (cpu_meltdown)
|
|
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
|
|
}
|
|
}
|
|
|
|
void
|
|
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
|
|
{
|
|
struct cpu_info *ci, *self = curcpu();
|
|
CPU_INFO_ITERATOR cii;
|
|
long wait = 0;
|
|
u_int64_t mask = 0;
|
|
int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
|
|
vaddr_t va;
|
|
|
|
CPU_INFO_FOREACH(cii, ci) {
|
|
if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
|
|
continue;
|
|
if (!is_kva && !pmap_is_active(pm, ci))
|
|
continue;
|
|
mask |= (1ULL << ci->ci_cpuid);
|
|
wait++;
|
|
}
|
|
|
|
if (wait > 0) {
|
|
int s = pmap_start_tlb_shoot(wait, __func__);
|
|
|
|
tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
|
|
tlb_shoot_addr1 = sva;
|
|
tlb_shoot_addr2 = eva;
|
|
CPU_INFO_FOREACH(cii, ci) {
|
|
if ((mask & (1ULL << ci->ci_cpuid)) == 0)
|
|
continue;
|
|
if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
|
|
panic("%s: ipi failed", __func__);
|
|
}
|
|
splx(s);
|
|
}
|
|
|
|
if (!pmap_use_pcid) {
|
|
if (shootself) {
|
|
for (va = sva; va < eva; va += PAGE_SIZE)
|
|
pmap_update_pg(va);
|
|
}
|
|
} else if (is_kva) {
|
|
for (va = sva; va < eva; va += PAGE_SIZE) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
invpcid(INVPCID_ADDR, PCID_KERN, va);
|
|
}
|
|
} else if (shootself) {
|
|
if (cpu_meltdown) {
|
|
for (va = sva; va < eva; va += PAGE_SIZE) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
|
|
}
|
|
} else {
|
|
for (va = sva; va < eva; va += PAGE_SIZE)
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
pmap_tlb_shoottlb(struct pmap *pm, int shootself)
|
|
{
|
|
struct cpu_info *ci, *self = curcpu();
|
|
CPU_INFO_ITERATOR cii;
|
|
long wait = 0;
|
|
u_int64_t mask = 0;
|
|
|
|
KASSERT(pm != pmap_kernel());
|
|
|
|
CPU_INFO_FOREACH(cii, ci) {
|
|
if (ci == self || !pmap_is_active(pm, ci) ||
|
|
!(ci->ci_flags & CPUF_RUNNING))
|
|
continue;
|
|
mask |= (1ULL << ci->ci_cpuid);
|
|
wait++;
|
|
}
|
|
|
|
if (wait) {
|
|
int s = pmap_start_tlb_shoot(wait, __func__);
|
|
|
|
CPU_INFO_FOREACH(cii, ci) {
|
|
if ((mask & (1ULL << ci->ci_cpuid)) == 0)
|
|
continue;
|
|
if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
|
|
panic("%s: ipi failed", __func__);
|
|
}
|
|
splx(s);
|
|
}
|
|
|
|
if (shootself) {
|
|
if (!pmap_use_pcid)
|
|
tlbflush();
|
|
else {
|
|
invpcid(INVPCID_PCID, PCID_PROC, 0);
|
|
if (cpu_meltdown)
|
|
invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
pmap_tlb_shootwait(void)
|
|
{
|
|
#ifdef MP_LOCKDEBUG
|
|
int nticks = __mp_lock_spinout;
|
|
#endif
|
|
while (tlb_shoot_wait != 0) {
|
|
CPU_BUSY_CYCLE();
|
|
#ifdef MP_LOCKDEBUG
|
|
if (--nticks <= 0) {
|
|
db_printf("%s: spun out", __func__);
|
|
db_enter();
|
|
nticks = __mp_lock_spinout;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
#else /* MULTIPROCESSOR */
|
|
|
|
void
|
|
pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
|
|
{
|
|
if (!pmap_use_pcid) {
|
|
if (shootself)
|
|
pmap_update_pg(va);
|
|
} else if (va >= VM_MIN_KERNEL_ADDRESS) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
invpcid(INVPCID_ADDR, PCID_KERN, va);
|
|
} else if (shootself) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
if (cpu_meltdown)
|
|
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
|
|
}
|
|
}
|
|
|
|
void
|
|
pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
|
|
{
|
|
vaddr_t va;
|
|
|
|
if (!pmap_use_pcid) {
|
|
if (shootself) {
|
|
for (va = sva; va < eva; va += PAGE_SIZE)
|
|
pmap_update_pg(va);
|
|
}
|
|
} else if (sva >= VM_MIN_KERNEL_ADDRESS) {
|
|
for (va = sva; va < eva; va += PAGE_SIZE) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
invpcid(INVPCID_ADDR, PCID_KERN, va);
|
|
}
|
|
} else if (shootself) {
|
|
if (cpu_meltdown) {
|
|
for (va = sva; va < eva; va += PAGE_SIZE) {
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
|
|
}
|
|
} else {
|
|
for (va = sva; va < eva; va += PAGE_SIZE)
|
|
invpcid(INVPCID_ADDR, PCID_PROC, va);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
pmap_tlb_shoottlb(struct pmap *pm, int shootself)
|
|
{
|
|
if (shootself) {
|
|
if (!pmap_use_pcid)
|
|
tlbflush();
|
|
else {
|
|
invpcid(INVPCID_PCID, PCID_PROC, 0);
|
|
if (cpu_meltdown)
|
|
invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
|
|
}
|
|
}
|
|
}
|
|
#endif /* MULTIPROCESSOR */
|