src/sys/uvm/uvm_swap.c

/*	$OpenBSD: uvm_swap.c,v 1.170 2024/04/16 10:06:37 claudio Exp $	*/
/*	$NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/extent.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/disk.h>
#include <sys/task.h>
#include <sys/pledge.h>
#if defined(NFSCLIENT)
#include <sys/socket.h>
#include <netinet/in.h>
#include <nfs/nfsproto.h>
#include <nfs/nfsdiskless.h>
#endif

#include <uvm/uvm.h>
#ifdef UVM_SWAP_ENCRYPT
#include <uvm/uvm_swap_encrypt.h>
#endif

#include <sys/specdev.h>

#include "vnd.h"

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * TAILQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (mutex): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes disklabel]
 *
 * NOTE(review): there is no swd_mapsize member in this structure; the
 * second invariant looks historical -- confirm what it should refer to.
 *
 * The swd_dev/swd_flags/swd_priority/swd_inuse/swd_nblks names are
 * aliases for members of the embedded swapent, which is the part that
 * is copied out to userland by SWAP_STATS.
 */
struct swapdev {
	struct swapent	swd_se;
#define	swd_dev		swd_se.se_dev		/* device id */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
#define	swd_inuse	swd_se.se_inuse		/* blocks used */
#define	swd_nblks	swd_se.se_nblks		/* total blocks */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	int			swd_active;	/* # of active i/o reqs */
	struct bufq		swd_bufq;
	struct ucred		*swd_cred;	/* cred for file access */
#ifdef UVM_SWAP_ENCRYPT
#define SWD_KEY_SHIFT		7		/* One key per 0.5 MByte */
/*
 * SWD_KEY(x,y): address of the key covering drum slot `y' of swapdev `x'.
 * The whole expansion is parenthesized so that the macro can be used as
 * an operand of any operator without the leading `&' rebinding.
 */
#define SWD_KEY(x,y)		\
	(&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT]))
/* SWD_KEY_SIZE(x): number of keys needed for `x' pages (rounded up) */
#define	SWD_KEY_SIZE(x)	(((x) + (1 << SWD_KEY_SHIFT) - 1) >> SWD_KEY_SHIFT)

/*
 * The decrypt bitmap holds one bit per page, packed in 32-bit words:
 * SWD_DCRYPT_OFF(x) is the word index of page `x', SWD_DCRYPT_BIT(x) its
 * bit number inside that word, and SWD_DCRYPT_SIZE(x) the number of
 * bytes needed for a bitmap of `x' pages.
 */
#define SWD_DCRYPT_SHIFT	5
#define SWD_DCRYPT_BITS		32
#define SWD_DCRYPT_MASK		(SWD_DCRYPT_BITS - 1)
#define SWD_DCRYPT_OFF(x)	((x) >> SWD_DCRYPT_SHIFT)
#define SWD_DCRYPT_BIT(x)	((x) & SWD_DCRYPT_MASK)
#define SWD_DCRYPT_SIZE(x)	(SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
	u_int32_t		*swd_decrypt;	/* bitmap for decryption */
	struct swap_key		*swd_keys;	/* keys for different parts */
#endif
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'
 * (ascending: swaplist_insert() places a new entry in front of the first
 * entry whose priority is greater than or equal to the new one).
 *
 * Entries whose tailq has become empty are released by swaplist_trim().
 */
struct swappri {
	int			spi_priority;     /* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 *
 * A vndxfer is the head of one transfer: it points back at the parent
 * buffer handed to swstrategy() and at the swapdev the i/o goes to.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;	/* swapdev this transfer targets */
	int		vx_error;	/* error to pass up to vx_bp */
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;	/* VX_* below */
#define VX_BUSY		1	/* set when sw_reg_strategy() sets up the xfer */
#define VX_DEAD		2	/* NOTE(review): not used in the code seen here */
};

/*
 * One auxiliary buffer of a transfer.  vb_buf must stay the first member:
 * sw_reg_strategy() relies on being able to cast between a pointer to the
 * vndbuf and a pointer to its embedded buf.
 */
struct vndbuf {
	struct buf	vb_buf;		/* the buf itself (must be first) */
	struct vndxfer	*vb_vnx;	/* transfer this buffer belongs to */
	struct task	vb_task;	/* presumably defers i/o completion -- confirm */
};

/*
 * We keep a pool of vndbuf and a pool of vndxfer structures; both are
 * set up by uvm_swap_init().
 */
struct pool vndxfer_pool;
struct pool vndbuf_pool;


/*
 * local variables
 */
struct extent *swapmap;		/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
struct swap_priority swap_priority;	/* [S] */

/* locks */
struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");

/*
 * Emergency page reserve for the page daemon.  `oompps' is filled once by
 * uvm_swap_init(); uvm_swap_allocpages() hands it out to the page daemon
 * when a regular allocation fails and sets `oom', and
 * uvm_swap_freepages() clears `oom' again and wakes up any waiter.
 * `oom' is read and written with `oommtx' held.
 */
struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM);
struct vm_page *oompps[SWCLUSTPAGES];
int oom = 0;

/*
 * prototypes
 */
void		 swapdrum_add(struct swapdev *, int);
struct swapdev	*swapdrum_getsdp(int);

struct swapdev	*swaplist_find(struct vnode *, int);
void		 swaplist_insert(struct swapdev *,
 				     struct swappri *, int);
void		 swaplist_trim(void);

int swap_on(struct proc *, struct swapdev *);
int swap_off(struct proc *, struct swapdev *);

void sw_reg_strategy(struct swapdev *, struct buf *, int);
void sw_reg_iodone(struct buf *);
void sw_reg_iodone_internal(void *);
void sw_reg_start(struct swapdev *);

int uvm_swap_io(struct vm_page **, int, int, int);

void swapmount(void);
int uvm_swap_allocpages(struct vm_page **, int, int);

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
void uvm_swap_initcrypt(struct swapdev *, int);
#endif

/*
 * uvm_swap_init: bring up the global state of the swap subsystem
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 * => panics when the drum vnode or the swapmap extent cannot be had
 */
void
uvm_swap_init(void)
{
	int error;

	/* we start out with an empty priority list and no swap devices. */
	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;

	/*
	 * get a handle on the vnode for /dev/drum by using its dev_t
	 * number ("swapdev", from MD conf.c), unless one is already set.
	 */
	if (swapdev_vp == NULL) {
		if (bdevvp(swapdev, &swapdev_vp) != 0)
			panic("uvm_swap_init: can't get vnode for swap device");
	}

	/*
	 * the swap block extent maps /dev/drum.  it spans 1 to INT_MAX,
	 * which allows 2 gigablocks of swap space.  block 0 stays out of
	 * the extent: it is reserved to mean "allocation failure" or
	 * "no allocation".
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX, M_VMSWAP, NULL, 0,
	    EX_NOWAIT);
	if (swapmap == NULL)
		panic("uvm_swap_init: extent_create failed");

	/* pools for the structures used when swapping to regular files. */
	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, IPL_BIO, 0,
	    "swp vnx", NULL);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO, 0,
	    "swp vnd", NULL);

	/* set aside the page reserve used in OOM situations. */
	error = uvm_swap_allocpages(oompps, SWCLUSTPAGES, UVM_PLA_NOWAIT);
	KASSERT(error == 0);

	/* finally, set up the initial swap partition. */
	swapmount();
}

#ifdef UVM_SWAP_ENCRYPT
/*
 * uvm_swap_initcrypt_all: allocate the encryption state of every swap
 *	device on the priority list that does not have it yet.
 */
void
uvm_swap_initcrypt_all(void)
{
	struct swappri *pri;
	struct swapdev *dev;

	LIST_FOREACH(pri, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(dev, &pri->spi_swapdev, swd_next) {
			/* state already present: leave the device alone */
			if (dev->swd_decrypt != NULL)
				continue;

			/* the state is sized by the device's page count */
			uvm_swap_initcrypt(dev,
			    dbtob((uint64_t)dev->swd_nblks) >> PAGE_SHIFT);
		}
	}
}

/*
 * uvm_swap_initcrypt: allocate the zeroed decrypt bitmap and key array
 *	for a swap device of "npages" pages.
 *
 * => may sleep (M_WAITOK)
 */
void
uvm_swap_initcrypt(struct swapdev *sdp, int npages)
{
	/*
	 * The bitmap records, per page, whether data read back from the
	 * swap device has to be decrypted.  Everything is allocated up
	 * front: ASYNC puts may not call malloc with M_WAITOK, so we
	 * cannot chance a malloc later.  The bitmap costs only 8KB of
	 * memory for a 256MB swap partition.
	 */
	sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP,
	    M_WAITOK | M_ZERO);

	/* one key for each group of (1 << SWD_KEY_SHIFT) pages */
	sdp->swd_keys = mallocarray(SWD_KEY_SIZE(npages),
	    sizeof(*sdp->swd_keys), M_VMSWAP, M_WAITOK | M_ZERO);
}

#endif /* UVM_SWAP_ENCRYPT */

int
uvm_swap_allocpages(struct vm_page **pps, int npages, int flags)
{
	struct pglist	pgl;
	int error, i;

	KASSERT(npages <= SWCLUSTPAGES);

	TAILQ_INIT(&pgl);
again:
	error = uvm_pglistalloc(npages * PAGE_SIZE, dma_constraint.ucr_low,
	    dma_constraint.ucr_high, 0, 0, &pgl, npages, flags);
	if (error && (curproc == uvm.pagedaemon_proc)) {
		mtx_enter(&oommtx);
		if (oom) {
			msleep_nsec(&oom, &oommtx, PVM | PNORELOCK,
			 "oom", INFSLP);
			goto again;
		}
		oom = 1;
		for (i = 0; i < npages; i++) {
			pps[i] = oompps[i];
			atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
		}
		mtx_leave(&oommtx);
		return 0;
	}
	if (error)
		return error;

	for (i = 0; i < npages; i++) {
		pps[i] = TAILQ_FIRST(&pgl);
		/* *sigh* */
		atomic_setbits_int(&pps[i]->pg_flags, PG_BUSY);
		TAILQ_REMOVE(&pgl, pps[i], pageq);
	}

	return 0;
}

/*
 * uvm_swap_freepages: give back pages obtained from uvm_swap_allocpages()
 *
 * => ordinary pages are freed with the page queues locked
 * => if pps[] is the OOM reserve (recognised by its first page), the
 *	pages are only cleaned, the reserve is marked available again and
 *	anybody sleeping on it is woken up
 */
void
uvm_swap_freepages(struct vm_page **pps, int npages)
{
	int i;

	if (pps[0] != oompps[0]) {
		uvm_lock_pageq();
		for (i = 0; i < npages; i++)
			uvm_pagefree(pps[i]);
		uvm_unlock_pageq();
		return;
	}

	/* the reserve stays allocated: clean the pages, do not free them */
	for (i = 0; i < npages; i++)
		uvm_pageclean(pps[i]);

	mtx_enter(&oommtx);
	KASSERT(oom == 1);
	oom = 0;
	mtx_leave(&oommtx);
	wakeup(&oom);
}

#ifdef UVM_SWAP_ENCRYPT
/*
 * uvm_swap_markdecrypt: record whether a run of swap pages needs
 *	decryption when read back.
 *
 * => "startslot" is a drum slot; it is converted to a page index on the
 *	device by subtracting swd_drumoffset
 * => "decrypt" non-zero sets the bits of the "npages" pages, zero clears
 *	them
 * => a NULL sdp is silently ignored
 */

void
uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
    int decrypt)
{
	int pagestart, i;
	int off, bit;

	if (!sdp)
		return;

	pagestart = startslot - sdp->swd_drumoffset;
	for (i = 0; i < npages; i++, pagestart++) {
		off = SWD_DCRYPT_OFF(pagestart);
		bit = SWD_DCRYPT_BIT(pagestart);
		/*
		 * bit ranges over 0..31: the mask has to be built from an
		 * unsigned constant, shifting a signed 1 into bit 31 is
		 * undefined.
		 */
		if (decrypt)
			/* pages read need decryption */
			sdp->swd_decrypt[off] |= 1U << bit;
		else
			/* pages read do not need decryption */
			sdp->swd_decrypt[off] &= ~(1U << bit);
	}
}

/*
 * uvm_swap_needdecrypt: check if the page that we got from disk needs to
 *	be decrypted.
 *
 * => "off" is a drum slot; it is converted to a page index on the device
 *	by subtracting swd_drumoffset
 * => returns TRUE when the page's bit is set in the decrypt bitmap,
 *	FALSE otherwise or when sdp is NULL
 */

boolean_t
uvm_swap_needdecrypt(struct swapdev *sdp, int off)
{
	if (!sdp)
		return FALSE;

	off -= sdp->swd_drumoffset;
	/* unsigned mask: the bit number can be 31 */
	return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] &
	    (1U << SWD_DCRYPT_BIT(off)) ? TRUE : FALSE;
}

/*
 * uvm_swap_finicrypt_all: delete every swap key that is still referenced,
 *	on all swap devices that have encryption state.
 *
 * => keys are visited from the last one down to the first
 * => the walk is bounded by an index, so that no pointer in front of the
 *	key array is ever formed and a device whose key count works out
 *	to zero is skipped
 */
void
uvm_swap_finicrypt_all(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct swap_key *key;
	unsigned int npages, i;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			/* no encryption state on this device */
			if (sdp->swd_decrypt == NULL)
				continue;

			npages = dbtob((uint64_t)sdp->swd_nblks) >> PAGE_SHIFT;
			for (i = SWD_KEY_SIZE(npages); i > 0; i--) {
				key = &sdp->swd_keys[i - 1];
				if (key->refcount != 0)
					swap_key_delete(key);
			}
		}
	}
}
#endif /* UVM_SWAP_ENCRYPT */

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: put swap device "sdp" on the global list at "priority"
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly allocated swappri structure; it is
 *	linked into the list when "priority" has no entry yet and freed
 *	here otherwise (this is to prevent allocation blocking here
 *	while adding swap)
 */
void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *cur, *prev = NULL;

	KASSERT(rw_write_held(&swap_syscall_lock));
	MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);

	/*
	 * stop at the first entry whose priority is not below ours,
	 * remembering the entry in front of it.
	 */
	LIST_FOREACH(cur, &swap_priority, spi_swappri) {
		if (cur->spi_priority >= priority)
			break;
		prev = cur;
	}

	if (cur != NULL && cur->spi_priority == priority) {
		/* the priority exists already: newspp is not needed */
		free(newspp, M_VMSWAP, sizeof(*newspp));
	} else {
		/* new priority: set up newspp and link it in sorted order */
		newspp->spi_priority = priority;
		TAILQ_INIT(&newspp->spi_swapdev);

		if (prev == NULL)
			LIST_INSERT_HEAD(&swap_priority, newspp, spi_swappri);
		else
			LIST_INSERT_AFTER(prev, newspp, spi_swappri);
		cur = newspp;
	}

	/*
	 * "cur" is now the entry for our priority: queue the device at
	 * its tail and account for one more swapdev.
	 */
	sdp->swd_priority = priority;
	TAILQ_INSERT_TAIL(&cur->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: look up the swap device backed by vnode "vp" on the
 *	global list, optionally unlinking it.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => returns the swapdev found, or NULL if there is none
 * => with "remove" set, the swapdev returned has been taken off its
 *	priority tailq and uvmexp.nswapdev has been decremented; a
 *	priority entry left empty is not freed here (see swaplist_trim)
 */
struct swapdev *
swaplist_find(struct vnode *vp, boolean_t remove)
{
	struct swappri *pri;
	struct swapdev *dev;

	KASSERT(rw_write_held(&swap_syscall_lock));
	MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);

	LIST_FOREACH(pri, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(dev, &pri->spi_swapdev, swd_next) {
			if (dev->swd_vp == vp) {
				if (remove) {
					TAILQ_REMOVE(&pri->spi_swapdev, dev,
					    swd_next);
					uvmexp.nswapdev--;
				}
				return (dev);
			}
		}
	}

	/* no swap device uses this vnode */
	return (NULL);
}


/*
 * swaplist_trim: unlink and free every priority entry that has no swap
 *	device left on it.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
void
swaplist_trim(void)
{
	struct swappri *pri, *next;

	KASSERT(rw_write_held(&swap_syscall_lock));
	MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);

	/* the _SAFE variant is needed: entries go away while we walk */
	LIST_FOREACH_SAFE(pri, &swap_priority, spi_swappri, next) {
		if (TAILQ_EMPTY(&pri->spi_swapdev)) {
			LIST_REMOVE(pri, spi_swappri);
			free(pri, M_VMSWAP, sizeof(*pri));
		}
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => uvm_swap_data_lock should be unlocked (we may sleep)
 */
void
swapdrum_add(struct swapdev *sdp, int npages)
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: map a page offset in /dev/drum back to the "swapdev"
 *	owning that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum, namely
 *	[swd_drumoffset, swd_drumoffset + swd_drumsize)
 * => caller must hold uvm_swap_data_lock
 * => returns NULL when no swapdev covers "pgno"
 */
struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swappri *pri;
	struct swapdev *dev;
	int first;

	MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);

	LIST_FOREACH(pri, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(dev, &pri->spi_swapdev, swd_next) {
			first = dev->swd_drumoffset;
			if (pgno < first)
				continue;
			if (pgno < first + dev->swd_drumsize)
				return dev;
		}
	}
	return NULL;
}


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 * 	[with two helper functions: swap_on and swap_off]
 *
 * => the whole call runs with swap_syscall_lock write-held
 * => SWAP_NSWAP and SWAP_STATS need no privilege and return their result
 *	through *retval; every other command requires suser() and takes a
 *	pathname in "arg"
 * => returns 0 on success or an errno value
 */
int
sys_swapctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	char	userpath[MAXPATHLEN];
	size_t	len;
	int	count, error, misc;
	int	priority;

	misc = SCARG(uap, misc);

	/* pledge(2) check comes first, before any lock is taken */
	if ((error = pledge_swapctl(p, SCARG(uap, cmd))))
		return error;

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter_write(&swap_syscall_lock);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 *
	 * "arg" is the user's array of swapent structures and "misc" the
	 * number of entries it can hold; *retval is the number filled in.
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		LIST_FOREACH(spp, &swap_priority, spi_swappri) {
			TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
				/* user array is full: skip what is left */
				if (count >= misc)
					continue;

				/* refresh the block count from the page count */
				sdp->swd_inuse =
				    btodb((u_int64_t)sdp->swd_npginuse <<
				    PAGE_SHIFT);
				error = copyout(&sdp->swd_se, sep,
				    sizeof(struct swapent));
				if (error)
					goto out;

				/* now copy out the path if necessary */
				error = copyoutstr(sdp->swd_path,
				    sep->se_path, sizeof(sep->se_path), NULL);
				if (error)
					goto out;

				count++;
				sep++;
			}
		}

		*retval = count;
		error = 0;
		goto out;
	}

	/* all other requests require superuser privs.   verify. */
	if ((error = suser(p)))
		goto out;

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * note that "len" is the length reported by copyinstr(), taken
	 * before disk_map() gets a chance to rewrite userpath in place.
	 */
	error = copyinstr(SCARG(uap, arg), userpath, sizeof(userpath), &len);
	if (error)
		goto out;
	disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK);
	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p);
	if ((error = namei(&nd)))
		goto out;
	vp = nd.ni_vp;
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {
	case SWAP_DUMPDEV:
		/* make the named block device the dump device */
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		dumpdev = vp->v_rdev;
		break;
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 *
		 * spp is allocated before the mutex is taken; on success
		 * swaplist_insert() either links or frees it, on ENOENT
		 * we free it ourselves below.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		mtx_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mtx_leave(&uvm_swap_data_lock);
		if (error)
			free(spp, M_VMSWAP, sizeof(*spp));
		break;
	case SWAP_ON:
		/*
		 * If the device is a regular file, make sure the filesystem
		 * can be used for swapping.
		 */
		if (vp->v_type == VREG &&
		    (vp->v_mount->mnt_flag & MNT_SWAPPABLE) == 0) {
			error = ENOTSUP;
			break;
		}

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;

		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG) {
			sdp->swd_cred = crdup(p->p_ucred);
		}

		mtx_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, 0) != NULL) {
			/* already a swap device: undo the allocations */
			error = EBUSY;
			mtx_leave(&uvm_swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp, M_VMSWAP, sizeof *sdp);
			free(spp, M_VMSWAP, sizeof *spp);
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mtx_leave(&uvm_swap_data_lock);

		/* keep a copy of the pathname for SWAP_STATS */
		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		strlcpy(sdp->swd_path, userpath, len);

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(p, sdp)) != 0) {
			mtx_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, 1);  /* kill fake entry */
			swaplist_trim();
			mtx_leave(&uvm_swap_data_lock);
			if (vp->v_type == VREG) {
				crfree(sdp->swd_cred);
			}
			free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
			free(sdp, M_VMSWAP, sizeof(*sdp));
			break;
		}
		break;
	case SWAP_OFF:
		mtx_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			mtx_leave(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 * (as written, the request is refused only when neither
		 * SWF_INUSE nor SWF_ENABLE is set.)
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mtx_leave(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.  swap_off() is entered with
		 * uvm_swap_data_lock held and releases it itself.
		 */
		error = swap_off(p, sdp);
		break;
	default:
		error = EINVAL;
	}

	/* done!  release the ref gained by namei() and unlock. */
	vput(vp);

out:
	rw_exit_write(&swap_syscall_lock);

	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 * => returns 0 on success, in which case SWF_FAKE has been replaced by
 *	SWF_INUSE|SWF_ENABLE and an extra reference is held on the vnode
 * => returns an errno on failure; the vnode has then been closed again
 *	(unless it is rootvp) and "sdp" is left for the caller to remove
 *	from the list and free
 */
int
swap_on(struct proc *p, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	struct vattr va;
#if defined(NFSCLIENT)
	extern const struct vops nfs_vops;
#endif /* defined(NFSCLIENT) */
	dev_t dev;

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

#if NVND > 0
	/*
	 * no swapping to vnds.  only block devices have a dev_t worth
	 * looking up: for regular files swd_dev is NODEV, and bdevsw
	 * must not be indexed with its major number.
	 */
	if (vp->v_type == VBLK &&
	    bdevsw[major(dev)].d_strategy == vndstrategy)
		return (EOPNOTSUPP);
#endif

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblocks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#if defined(NFSCLIENT)
		if (vp->v_op == &nfs_vops)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* defined(NFSCLIENT) */
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * nothing below can fail anymore.  only now set up the i/o queue
	 * used for swapping to a regular file: the caller frees sdp when
	 * we return an error, and swap_off() is the only place that
	 * destroys the queue again, so it must not be initialised on a
	 * path that can still bail out.
	 */
	if (vp->v_type == VREG)
		bufq_init(&sdp->swd_bufq, BUFQ_FIFO);

	/*
	 * now we need to allocate a blist to manage this swap device
	 */
	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

#ifdef HIBERNATE
	/*
	 * Lock down the last region of primary disk swap, in case
	 * hibernate needs to place a signature there.
	 */
	if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3 ) {
		if (blist_fill(sdp->swd_blist, npages - 1, 1) != 1)
			panic("hibernate reserve");
	}
#endif

	/* add a ref to vp to reflect usage as a swap device. */
	vref(vp);

#ifdef UVM_SWAP_ENCRYPT
	if (uvm_doswapencrypt)
		uvm_swap_initcrypt(sdp, npages);
#endif
	/* now add the new swapdev to the drum and enable. */
	swapdrum_add(sdp, npages);
	sdp->swd_npages = size;
	mtx_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	mtx_leave(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 * => caller must hold swap_syscall_lock
 * => returns 0 once the device has been removed and "sdp" freed;
 *	returns ENOMEM when paging the device's contents back in failed
 *	and EBUSY when pages other than bad ones are still in use, in
 *	both cases with the device enabled again
 */
int
swap_off(struct proc *p, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	KASSERT(rw_write_held(&swap_syscall_lock));
	MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	mtx_leave(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		/* could not empty the device: put it back in service */
		mtx_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		mtx_leave(&uvm_swap_data_lock);
		return error;
	}

	/*
	 * done with the vnode and saved creds.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	if (sdp->swd_vp->v_type == VREG) {
		crfree(sdp->swd_cred);
		bufq_destroy(&sdp->swd_bufq);
	}
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	}

	mtx_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;

	if (swaplist_find(sdp->swd_vp, 1) == NULL)
		panic("swap_off: swapdev not in list");
	swaplist_trim();
	mtx_leave(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
		    EX_WAITOK);
	blist_destroy(sdp->swd_blist);
	/*
	 * the saved pathname belongs to the swapdev and used to be leaked
	 * here.  sys_swapctl() allocates it from M_VMSWAP with a size of
	 * swd_pathlen; this assumes every other creator of a swapdev does
	 * the same (or leaves swd_path NULL) -- TODO confirm for
	 * swapmount().
	 *
	 * NOTE(review): with UVM_SWAP_ENCRYPT, swd_decrypt and swd_keys
	 * are not released here either.
	 */
	free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
	free(sdp, M_VMSWAP, sizeof(*sdp));
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	int s, pageno, bn;

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mtx_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mtx_leave(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		s = splbio();
		biodone(bp);
		splx(s);
		return;
	}

	/* convert drum page number to block number on this swapdev. */
	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */
	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:
		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		s = splbio();
		buf_replacevnode(bp, sdp->swd_vp);

		bp->b_blkno = bn;
      		splx(s);
		VOP_STRATEGY(bp->b_vp, bp);
		return;
	case VREG:
		/* delegate to sw_reg_strategy function. */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * sw_reg_strategy: handle swap i/o to regular files
 *
 * => "bp" is the parent buffer for the whole transfer and "bn" is its
 *	starting block number, converted to a byte offset with dbtob()
 * => the transfer is cut into pieces, one per VOP_BMAP() result; each
 *	piece gets its own vndbuf which is queued on sdp->swd_bufq and
 *	started through sw_reg_start()
 * => a vndxfer ("vnx") ties the pieces to the parent: vx_pending counts
 *	pieces in flight and VX_BUSY stays set while this function is
 *	still issuing them
 * => the parent is completed with biodone() here only if nothing is
 *	pending when we finish; otherwise sw_reg_iodone_internal() does it
 */
void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t	nbn;
	caddr_t		addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((u_int64_t)bn);

	/* "resid" is what is left to issue; each pass issues "sz" bytes. */
	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;
		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
				 	&vp, &nbn, &nra);

		if (error == 0 && nbn == -1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("sw_reg_strategy: swap to sparse file");
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			/* "out" expects to be entered at splbio. */
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes):
		 * from "off" within the first block to the end of the
		 * last read-ahead block, capped at what is still left.
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structure easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_bq       = NULL;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_proc     = bp->b_proc;
		nbp->vb_buf.b_iodone   = sw_reg_iodone;
		nbp->vb_buf.b_vp       = NULLVP;
		nbp->vb_buf.b_vnbufs.le_next = NOLIST;

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 *
		 * (bp->b_bcount - resid) is how far into the parent this
		 * piece starts, so the parent's ranges are shifted by it.
		 *
		 * NOTE(review): the clamping to 0 relies on max()/min()
		 * comparing signed values; if they are the unsigned
		 * variants a negative difference is not clamped -- confirm.
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		/* patch it back to the vnx */
		nbp->vb_vnx = vnx;
		task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp);

		s = splbio();
		/*
		 * an earlier piece may already have failed; if so do not
		 * issue any more i/o, just drop this piece and wind down.
		 */
		if (vnx->vx_error != 0) {
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* start I/O if we are not over our limit */
		bufq_queue(&sdp->swd_bufq, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	/* we are done issuing; completions may now finish the parent. */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		/* nothing in flight, so the parent is finished here. */
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		pool_put(&vndxfer_pool, vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: push queued buffers of a swapdev down to its vnode
 *
 * => dequeues from sdp->swd_bufq and hands each buffer to VOP_STRATEGY()
 *	until the queue is empty or swd_active reaches swd_maxactive
 * => SWF_BUSY is set for the duration so that a nested call returns
 *	immediately instead of re-entering the loop
 */
void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*next;

	/* XXX: recursion control */
	if (sdp->swd_flags & SWF_BUSY)
		return;

	sdp->swd_flags |= SWF_BUSY;

	for (;;) {
		/* respect the limit on simultaneously active buffers */
		if (sdp->swd_active >= sdp->swd_maxactive)
			break;

		next = bufq_dequeue(&sdp->swd_bufq);
		if (next == NULL)
			break;

		sdp->swd_active++;

		/* writes are accounted for on the vnode before starting */
		if (!(next->b_flags & B_READ))
			next->b_vp->v_numoutput++;

		VOP_STRATEGY(next->b_vp, next);
	}

	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: b_iodone hook for the buffers issued by sw_reg_strategy
 *
 * => the buf is the first member of its vndbuf, so the buf pointer can
 *	simply be cast back to the enclosing structure
 * => no cleanup happens here; the vndbuf's task is scheduled on systq
 *	and the real work is done from there
 *
 * XXX:
 * The detour through a task exists only because of the maxactive game:
 * finishing a buffer means calling sw_reg_start(), which calls back into
 * VOP_STRATEGY(), and there we must be able to sleep.
 */
void
sw_reg_iodone(struct buf *bp)
{
	task_add(systq, &((struct vndbuf *)bp)->vb_task);
}

/*
 * sw_reg_iodone_internal: task callback that finishes one vndbuf
 *
 * => "xvbp" is the struct vndbuf whose i/o has completed
 * => folds the byte count and error of this piece into the parent
 *	buffer, releases the piece, and calls biodone() on the parent
 *	once the transfer is over
 * => finally drops the swapdev's active count and starts more queued
 *	i/o; everything runs at splbio
 */
void
sw_reg_iodone_internal(void *xvbp)
{
	struct vndbuf *vbp = xvbp;
	struct vndxfer *vnx = vbp->vb_vnx;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int resid, s;

	s = splbio();

	/*
	 * despite its name, "resid" holds the number of bytes this piece
	 * actually transferred; take it off the parent's residual.
	 */
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	/* pass error upward */
	if (vbp->vb_buf.b_error)
		vnx->vx_error = vbp->vb_buf.b_error;

	/* disassociate this buffer from the vnode (if any). */
	if (vbp->vb_buf.b_vp != NULL) {
		brelvp(&vbp->vb_buf);
	}

	/* kill vbp structure */
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 *
	 * in both cases the parent is left alone while VX_BUSY is set:
	 * sw_reg_strategy() is then still running and finishes the
	 * parent itself.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pool_put(&vndxfer_pool, vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
		/* every byte is accounted for, so nothing can be pending. */
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			pool_put(&vndxfer_pool, vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => walks the priority list and, within each priority, the devices in
 *	tail queue order; the device that satisfies the request is moved
 *	to the tail of its queue so allocations go "round robin"
 * => on success returns the drum slot number of the first slot; on
 *	failure returns 0 (the invalid slot)
 * => if "lessok" is set and a multi-slot request cannot be met, *nslots
 *	is rewritten to 1 and the search is repeated once
 * => space can be freed with uvm_swap_free
 * => the kernel lock must be held; we take uvm_swap_data_lock ourselves
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots, boolean_t lessok)
{
	struct swappri *spp;
	struct swapdev *sdp;

	/* without any swap device there is nothing to search. */
	if (uvmexp.nswapdev < 1)
		return 0;

	KERNEL_ASSERT_LOCKED();
	mtx_enter(&uvm_swap_data_lock);

	for (;;) {
		LIST_FOREACH(spp, &swap_priority, spi_swappri) {
			TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
				swblk_t result;

				/* only enabled devices can be swapped to */
				if (!(sdp->swd_flags & SWF_ENABLE))
					continue;

				/* cheap capacity check before the blist */
				if (sdp->swd_npginuse + *nslots >
				    sdp->swd_npages)
					continue;

				result = blist_alloc(sdp->swd_blist, *nslots);
				if (result == SWAPBLK_NONE)
					continue;
				KASSERT(result < sdp->swd_drumsize);

				/* got it: rotate this device to the tail */
				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
				TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp,
				    swd_next);

				sdp->swd_npginuse += *nslots;
				uvmexp.swpginuse += *nslots;
				mtx_leave(&uvm_swap_data_lock);

				/* device-relative block -> drum slot */
				return result + sdp->swd_drumoffset;
			}
		}

		/* XXXMRG: BEGIN HACK */
		if (*nslots <= 1 || !lessok)
			break;
		/* XXXMRG: ugh!  blist should support this for us */
		*nslots = 1;
		/* XXXMRG: END HACK */
	}

	mtx_leave(&uvm_swap_data_lock);
	return 0;		/* failed */
}

/*
 * uvm_swapisfilled: return true if the number of swap slots not in use
 * is at most the size of a cluster (SWCLUSTPAGES).
 *
 * As long as some swap slots are being used by pages currently in memory,
 * it is possible to reuse them.  Even if the swap space has been completely
 * filled we do not consider it full.
 *
 * => we lock uvm_swap_data_lock
 */
int
uvm_swapisfilled(void)
{
	int filled;

	mtx_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpginuse <= uvmexp.swpages);
	filled = (uvmexp.swpages <= uvmexp.swpginuse + SWCLUSTPAGES);
	mtx_leave(&uvm_swap_data_lock);

	return filled;
}

/*
 * uvm_swapisfull: return true if the pages that live only in swap
 * (uvmexp.swpgonly) account for at least 99% of the total swap space,
 * the 99% being computed with integer arithmetic.
 *
 * => we lock uvm_swap_data_lock
 */
int
uvm_swapisfull(void)
{
	long limit;
	int full;

	mtx_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	limit = (long)uvmexp.swpages * 99 / 100;
	full = (uvmexp.swpgonly >= limit);
	mtx_leave(&uvm_swap_data_lock);

	return full;
}

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => only a per-device count of bad pages is kept (swd_npgbad), so that
 *	everything adds up in swap_off()
 * => the whole range is charged to the device that owns "startslot";
 *	the range is assumed not to span devices
 * => a slot that maps to no device is ignored
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *owner;

	mtx_enter(&uvm_swap_data_lock);

	owner = swapdrum_getsdp(startslot);
	if (owner == NULL) {
		mtx_leave(&uvm_swap_data_lock);
		return;
	}

	owner->swd_npgbad += nslots;
	mtx_leave(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => requests naming SWSLOT_BAD are silently dropped
 * => the slots are handed back to the owning device's blist and the
 *	per-device and global in-use counters are reduced
 * => with UVM_SWAP_ENCRYPT, key references held for the slots are
 *	dropped and the range is marked as not needing decryption
 * => we take the kernel lock and, for the accounting part,
 *	uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;

	if (startslot == SWSLOT_BAD)
		return;

	KERNEL_LOCK();

	/*
	 * map the drum slot back to its device and release the blocks;
	 * both the lookup and the blist need the data lock.
	 */
	mtx_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mtx_leave(&uvm_swap_data_lock);

#ifdef UVM_SWAP_ENCRYPT
	if (swap_encrypt_initialized) {
		int n;

		/* Dereference keys */
		for (n = 0; n < nslots; n++) {
			struct swap_key *key;

			if (!uvm_swap_needdecrypt(sdp, startslot + n))
				continue;

			key = SWD_KEY(sdp, startslot + n);
			if (key->refcount != 0)
				SWAP_KEY_PUT(sdp, key);
		}

		/* Mark range as not decrypt */
		uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
	}
#endif /* UVM_SWAP_ENCRYPT */

	KERNEL_UNLOCK();
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => thin wrapper that issues a write through uvm_swap_io()
 * => the write is synchronous if PGO_SYNCIO is set in "flags",
 *	asynchronous (B_ASYNC) otherwise
 * => returns whatever uvm_swap_io() returns
 */
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int	ioflags = B_WRITE;

	if ((flags & PGO_SYNCIO) == 0)
		ioflags |= B_ASYNC;

	return (uvm_swap_io(ppsp, swslot, npages, ioflags));
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault); PGO_SYNCIO is asserted
 * => bumps the uvmexp.nswget counter on every call
 * => returns VM_PAGER_ERROR right away for SWSLOT_BAD, otherwise the
 *	result of the read done by uvm_swap_io() under the kernel lock
 * => when the read succeeded (or is pending) the page is no longer
 *	only in swap, so uvmexp.swpgonly is decremented
 */
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int	rv;

	atomic_inc_int(&uvmexp.nswget);
	KASSERT(flags & PGO_SYNCIO);

	if (swslot == SWSLOT_BAD)
		return VM_PAGER_ERROR;

	KERNEL_LOCK();
	rv = uvm_swap_io(&page, swslot, 1, B_READ);
	KERNEL_UNLOCK();

	/* anything but OK/PEND leaves the accounting untouched. */
	if (rv != VM_PAGER_OK && rv != VM_PAGER_PEND)
		return (rv);

	atomic_dec_int(&uvmexp.swpgonly);
	return (rv);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 *
 * => "pps" holds the "npages" pages to transfer, "startslot" is the
 *	first drum slot, "flags" carries B_READ and/or B_ASYNC
 * => caller must hold the kernel lock (asserted below)
 * => returns VM_PAGER_AGAIN if a buf, a pager mapping or the bounce
 *	pages could not be obtained, VM_PAGER_PEND once an async write
 *	has been started, otherwise VM_PAGER_OK or VM_PAGER_ERROR
 * => reads are always done synchronously, B_ASYNC is stripped for them
 * => pages outside dma_constraint (and, with UVM_SWAP_ENCRYPT, every
 *	encrypted write) go through freshly allocated bounce pages; these
 *	are kept in an on-stack array of SWCLUSTPAGES entries, so "npages"
 *	must not exceed SWCLUSTPAGES
 */

int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	result, s, mapinflags, pflag, bounce = 0, i;
	boolean_t write, async;
	vaddr_t bouncekva = 0;
	struct vm_page *tpps[SWCLUSTPAGES];
	int pdaemon = (curproc == uvm.pagedaemon_proc);
#ifdef UVM_SWAP_ENCRYPT
	struct swapdev *sdp;
	int	encrypt = 0;
#endif

	KERNEL_ASSERT_LOCKED();

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/* convert starting drum slot to block number */
	startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);

	/* async requests and the page daemon must not sleep for a buf */
	pflag = (async || pdaemon) ? PR_NOWAIT : PR_WAITOK;
	bp = pool_get(&bufpool, pflag | PR_ZERO);
	if (bp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * map the pages into the kernel (XXX: currently required
	 * by buffer system).
	 */
	mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
	if (!async)
		mapinflags |= UVMPAGER_MAPIN_WAITOK;
	kva = uvm_pagermapin(pps, npages, mapinflags);
	if (kva == 0) {
		pool_put(&bufpool, bp);
		return (VM_PAGER_AGAIN);
	}

#ifdef UVM_SWAP_ENCRYPT
	if (write) {
		/*
		 * Check if we need to do swap encryption on old pages.
		 * Later we need a different scheme, that swap encrypts
		 * all pages of a process that had at least one page swap
		 * encrypted.  Then we might not need to copy all pages
		 * in the cluster, and avoid the memory overheard in
		 * swapping.
		 */
		if (uvm_doswapencrypt)
			encrypt = 1;
	}

	if (swap_encrypt_initialized || encrypt) {
		/*
		 * we need to know the swap device that we are swapping to/from
		 * to see if the pages need to be marked for decryption or
		 * actually need to be decrypted.
		 * XXX - does this information stay the same over the whole
		 * execution of this function?
		 */
		mtx_enter(&uvm_swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		mtx_leave(&uvm_swap_data_lock);
	}

	/*
	 * Check that we are dma capable for read (write always bounces
	 * through the swapencrypt anyway...
	 */
	if (write && encrypt) {
		bounce = 1; /* bounce through swapencrypt always */
	} else {
#else
	{
#endif

		for (i = 0; i < npages; i++) {
			if (VM_PAGE_TO_PHYS(pps[i]) < dma_constraint.ucr_low ||
			   VM_PAGE_TO_PHYS(pps[i]) > dma_constraint.ucr_high) {
				bounce = 1;
				break;
			}
		}
	}

	if (bounce)  {
		int swmapflags, plaflags;

		/* We always need write access. */
		swmapflags = UVMPAGER_MAPIN_READ;
		plaflags = UVM_PLA_NOWAIT;
		if (!async) {
			swmapflags |= UVMPAGER_MAPIN_WAITOK;
			plaflags = UVM_PLA_WAITOK;
		}
		if (uvm_swap_allocpages(tpps, npages, plaflags)) {
			pool_put(&bufpool, bp);
			uvm_pagermapout(kva, npages);
			return (VM_PAGER_AGAIN);
		}

		bouncekva = uvm_pagermapin(tpps, npages, swmapflags);
		if (bouncekva == 0) {
			pool_put(&bufpool, bp);
			uvm_pagermapout(kva, npages);
			uvm_swap_freepages(tpps, npages);
			return (VM_PAGER_AGAIN);
		}
	}

	/* encrypt to swap */
	if (write && bounce) {
		int opages;
		caddr_t src, dst;
		u_int64_t block;

		src = (caddr_t) kva;
		dst = (caddr_t) bouncekva;
		block = startblk;
		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			if (encrypt) {
				key = SWD_KEY(sdp, startslot + i);
				SWAP_KEY_GET(sdp, key);	/* add reference */

				swap_encrypt(key, src, dst, block, PAGE_SIZE);
				block += btodb(PAGE_SIZE);
			} else {
#else
			{
#endif /* UVM_SWAP_ENCRYPT */
				memcpy(dst, src, PAGE_SIZE);
			}
			/* this just tells async callbacks to free */
			atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
			src += PAGE_SIZE;
			dst += PAGE_SIZE;
		}

		/* the data now lives in the bounce pages only */
		uvm_pagermapout(kva, npages);

		/* dispose of pages we dont use anymore */
		opages = npages;
		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
				      PGO_PDFREECLUST);

		/* from here on "kva" names the bounce mapping */
		kva = bouncekva;
	}

	/*
	 * prevent ASYNC reads.
	 * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get
	 * assumes that all gets are SYNCIO.  Just make sure here.
	 * XXXARTUBC - might not be true anymore.
	 */
	if (!write) {
		flags &= ~B_ASYNC;
		async = 0;
	}

	/*
	 * fill in the bp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	if (bounce)
		bp->b_data = (caddr_t)bouncekva;
	else
		bp->b_data = (caddr_t)kva;
	bp->b_bq = NULL;
	bp->b_blkno = startblk;
	s = splbio();
	bp->b_vp = NULL;
	buf_replacevnode(bp, swapdev_vp);
	splx(s);
	bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if (write) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages << PAGE_SHIFT;
#ifdef UVM_SWAP_ENCRYPT
		/* mark the pages in the drum for decryption */
		if (swap_encrypt_initialized)
			uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
#endif
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/* for async ops we must set up the iodone handler. */
	if (async) {
		bp->b_flags |= B_CALL | (pdaemon ? B_PDAEMON : 0);
		bp->b_iodone = uvm_aio_biodone;
	}

	/* now we start the I/O, and if async, return. */
	VOP_STRATEGY(bp->b_vp, bp);
	if (async)
		return (VM_PAGER_PEND);

	/* must be sync i/o.   wait for it to finish */
	(void) biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/* decrypt swap */
	if (!write && !(bp->b_flags & B_ERROR)) {
		caddr_t data = (caddr_t)kva;
		caddr_t dst = (caddr_t)kva;
		u_int64_t block = startblk;

		/* a bounced read landed in the bounce pages, not in "kva" */
		if (bounce)
			data = (caddr_t)bouncekva;

		for (i = 0; i < npages; i++) {
#ifdef UVM_SWAP_ENCRYPT
			struct swap_key *key;

			/* Check if we need to decrypt */
			if (swap_encrypt_initialized &&
			    uvm_swap_needdecrypt(sdp, startslot + i)) {
				key = SWD_KEY(sdp, startslot + i);
				if (key->refcount == 0) {
					result = VM_PAGER_ERROR;
					break;
				}
				swap_decrypt(key, data, dst, block, PAGE_SIZE);
			} else if (bounce) {
#else
			if (bounce) {
#endif
				memcpy(dst, data, PAGE_SIZE);
			}
			data += PAGE_SIZE;
			dst += PAGE_SIZE;
			block += btodb(PAGE_SIZE);
		}
	}

	/*
	 * a bounced read still has its bounce mapping (for writes "kva"
	 * already names it).  tear it down whether or not the i/o
	 * succeeded: the bounce pages are freed below in both cases and
	 * must not stay mapped, nor may the mapping be leaked on error.
	 */
	if (!write && bounce)
		uvm_pagermapout(bouncekva, npages);

	/* kill the pager mapping */
	uvm_pagermapout(kva, npages);

	/*  Not anymore needed, free after encryption/bouncing */
	if (!write && bounce)
		uvm_swap_freepages(tpps, npages);

	/* now dispose of the buf */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);

	if (write && bp->b_vp)
		vwakeup(bp->b_vp);
	pool_put(&bufpool, bp);
	splx(s);

	/* finally return. */
	return (result);
}

/*
 * swapmount: set up swapping on the device named by swdevt[0].sw_dev
 *
 * => does nothing if that device is NODEV
 * => with NFSCLIENT and a device of NETDEV the vnode and the name come
 *	from nfs_diskless; otherwise the vnode is obtained with bdevvp()
 *	and a path is synthesized from the device number
 * => a swapdev flagged SWF_FAKE is inserted into the swap list and
 *	handed to swap_on(); if that fails the entry is taken out again
 *	and its resources are released
 * => swap_syscall_lock is write-held for the whole operation
 */
void
swapmount(void)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	dev_t swap_dev = swdevt[0].sw_dev;
	char *nam;
	char path[MNAMELEN + 1];

	if (swap_dev == NODEV)
		return;

	rw_enter_write(&swap_syscall_lock);

#if defined(NFSCLIENT)
	if (swap_dev == NETDEV) {
		extern struct nfs_diskless nfs_diskless;

		/* the "path" recorded for network swap is the host name. */
		snprintf(path, sizeof(path), "%s",
		    nfs_diskless.nd_swap.ndm_host);
		vp = nfs_diskless.sw_vp;
		goto gotit;
	} else
#endif
	/* note: with NFSCLIENT this "if" is the body of the "else" above. */
	if (bdevvp(swap_dev, &vp)) {
		rw_exit_write(&swap_syscall_lock);
		return;
	}

	/* Construct a potential path to swap */
	if ((nam = findblkname(major(swap_dev))))
		snprintf(path, sizeof(path), "/dev/%s%d%c", nam,
		    DISKUNIT(swap_dev), 'a' + DISKPART(swap_dev));
	else
		snprintf(path, sizeof(path), "blkdev0x%x",
		    swap_dev);

#if defined(NFSCLIENT)
gotit:
#endif
	sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK|M_ZERO);
	/*
	 * NOTE(review): spp is handed to swaplist_insert() and never
	 * touched here again -- presumably that function takes ownership
	 * of it; confirm.
	 */
	spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);

	sdp->swd_flags = SWF_FAKE;
	sdp->swd_dev = swap_dev;

	/* keep a private copy of the path; swd_pathlen includes the NUL. */
	sdp->swd_pathlen = strlen(path) + 1;
	sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK | M_ZERO);
	strlcpy(sdp->swd_path, path, sdp->swd_pathlen);

	sdp->swd_vp = vp;

	/* third argument is presumably the swap priority -- confirm. */
	mtx_enter(&uvm_swap_data_lock);
	swaplist_insert(sdp, spp, 0);
	mtx_leave(&uvm_swap_data_lock);

	if (swap_on(curproc, sdp)) {
		/*
		 * enabling failed: undo the insertion above.
		 * NOTE(review): swaplist_find(vp, 1) is presumably the
		 * "find and remove" form -- confirm.
		 */
		mtx_enter(&uvm_swap_data_lock);
		swaplist_find(vp, 1);
		swaplist_trim();
		vput(sdp->swd_vp);
		mtx_leave(&uvm_swap_data_lock);
		rw_exit_write(&swap_syscall_lock);
		free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
		free(sdp, M_VMSWAP, sizeof(*sdp));
		return;
	}
	rw_exit_write(&swap_syscall_lock);
}

#ifdef HIBERNATE
/*
 * uvm_hibswap: find a gap in the swap device "dev" for hibernate
 *
 * => only the device recorded in swdevt[0] is accepted, and it must be
 *	on the swap list and enabled
 * => on success returns 0 with the gap stored in *sp and *ep as the
 *	inclusive range [sp,ep]
 * => returns 1 if there is no usable device or no gap was found
 */
int
uvm_hibswap(dev_t dev, u_long *sp, u_long *ep)
{
	struct swappri *pri;
	struct swapdev *cur, *found = NULL;

	/* no swap devices configured yet? */
	if (uvmexp.nswapdev < 1)
		return (1);
	if (dev != swdevt[0].sw_dev)
		return (1);

	/* the whole list is walked; the last match is the one kept. */
	LIST_FOREACH(pri, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(cur, &pri->spi_swapdev, swd_next) {
			if (cur->swd_dev == dev)
				found = cur;
		}
	}

	if (found == NULL)
		return (1);
	if (!(found->swd_flags & SWF_ENABLE))
		return (1);

	blist_gapfind(found->swd_blist, sp, ep);

	/* an empty range means no gap was found */
	if (*ep == *sp)
		return (1);

	/*
	 * blist_gapfind returns the gap as [sp,ep[ ,
	 * whereas [sp,ep] is expected from uvm_hibswap().
	 */
	(*ep)--;

	return (0);
}
#endif /* HIBERNATE */

#ifdef DDB
/*
 * swap_print_all: dump every configured swap device through "pr"
 *
 * => for each device on the swap list prints its address, path and
 *	flags, followed by its blist
 * => with HIBERNATE also prints the gap reported by uvm_hibswap(), or
 *	a note that none was found
 */
void
swap_print_all(int (*pr)(const char *, ...))
{
	struct swappri *pri;
	struct swapdev *dev;

	LIST_FOREACH(pri, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(dev, &pri->spi_swapdev, swd_next) {
#ifdef HIBERNATE
			u_long gap_lo = 0, gap_hi = 0;
#endif

			pr("swap %p path \"%s\" flags 0x%x\n", dev,
			    dev->swd_path, dev->swd_flags);

			blist_print(dev->swd_blist);

#ifdef HIBERNATE
			if (uvm_hibswap(dev->swd_dev, &gap_lo, &gap_hi) != 0) {
				pr("hibernate gap: not found\n");
				continue;
			}
			pr("hibernate gap: [0x%lx, 0x%lx] size=%lu\n",
			    gap_lo, gap_hi, (gap_hi - gap_lo + 1));
#endif
		}
	}
}
#endif /* DDB */