diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index f9449037af99..534dfb2cd682 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -132,6 +132,7 @@
 #include
 #ifdef _KERNEL
 #include
+#include <sys/racct.h>
 #endif
 #include
 #include
@@ -4503,6 +4504,14 @@ top:
 	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses);
 #ifdef _KERNEL
+#ifdef RACCT
+	if (racct_enable) {
+		PROC_LOCK(curproc);
+		racct_add_force(curproc, RACCT_READBPS, size);
+		racct_add_force(curproc, RACCT_READIOPS, 1);
+		PROC_UNLOCK(curproc);
+	}
+#endif /* RACCT */
 	curthread->td_ru.ru_inblock++;
 #endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index b60236f6ff17..af8d36650f6e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -47,6 +47,7 @@
 #include
 #include
 #ifdef _KERNEL
+#include <sys/racct.h>
 #include
 #include
 #endif
@@ -427,6 +428,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	}
 
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+#if defined(_KERNEL) && defined(RACCT)
+	if (racct_enable && !read) {
+		PROC_LOCK(curproc);
+		racct_add_force(curproc, RACCT_WRITEBPS, length);
+		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+		PROC_UNLOCK(curproc);
+	}
+#endif
+
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
@@ -1422,7 +1432,15 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
 	    DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
 #ifdef _KERNEL
 		curthread->td_ru.ru_oublock++;
-#endif
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
+#endif /* _KERNEL */
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
diff --git a/sys/fs/ext2fs/ext2_bmap.c b/sys/fs/ext2fs/ext2_bmap.c
index 8e5e9863aeae..7966b9bfb8b3 100644
--- a/sys/fs/ext2fs/ext2_bmap.c
+++ b/sys/fs/ext2fs/ext2_bmap.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 
@@ -247,6 +248,13 @@ ext2_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, int *runp, int *runb)
 			vfs_busy_pages(bp, 0);
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif
 			curthread->td_ru.ru_inblock++;
 			error = bufwait(bp);
 			if (error) {
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
index a1483865fe5a..31d8f1660fe7 100644
--- a/sys/kern/kern_physio.c
+++ b/sys/kern/kern_physio.c
@@ -27,6 +27,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 
@@ -109,6 +110,22 @@ physio(struct cdev *dev, struct uio *uio, int ioflag)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	error = 0;
 	for (i = 0; i < uio->uio_iovcnt; i++) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			if (uio->uio_rw == UIO_READ) {
+				racct_add_force(curproc, RACCT_READBPS,
+				    uio->uio_iov[i].iov_len);
+				racct_add_force(curproc, RACCT_READIOPS, 1);
+			} else {
+				racct_add_force(curproc, RACCT_WRITEBPS,
+				    uio->uio_iov[i].iov_len);
+				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			}
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
+
 		while (uio->uio_iov[i].iov_len) {
 			g_reset_bio(bp);
 			if (uio->uio_rw == UIO_READ) {
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index bbd50ca6e213..438a249a323c 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_sched.h"
 
 #include
+#include <sys/buf.h>
 #include
 #include
 #include
@@ -177,7 +178,15 @@ int racct_types[] = {
 	[RACCT_WALLCLOCK] =
 		RACCT_IN_MILLIONS,
 	[RACCT_PCTCPU] =
-		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
+	[RACCT_READBPS] =
+		RACCT_DECAYING,
+	[RACCT_WRITEBPS] =
+		RACCT_DECAYING,
+	[RACCT_READIOPS] =
+		RACCT_DECAYING,
+	[RACCT_WRITEIOPS] =
+		RACCT_DECAYING };
 
 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
 
@@ -634,6 +643,28 @@ racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
 	RACCT_UNLOCK();
 }
 
+/*
+ * Account for disk I/O resource consumption.  Checks for limits,
+ * but never fails, because disk I/O limits are not deniable.
+ */
+void
+racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
+{
+
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	RACCT_LOCK();
+	if (is_write) {
+		racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
+		racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
+	} else {
+		racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
+		racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
+	}
+	RACCT_UNLOCK();
+}
+
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 {
@@ -655,7 +686,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 	 * The diffs may be negative.
 	 */
 	diff_proc = amount - old_amount;
-	if (RACCT_IS_DECAYING(resource)) {
+	if (resource == RACCT_PCTCPU) {
 		/*
 		 * Resources in per-credential racct containers may decay.
 		 * If this is the case, we need to calculate the difference
@@ -1043,14 +1074,19 @@ racct_move(struct racct *dest, struct racct *src)
 	RACCT_UNLOCK();
 }
 
-static void
-racct_proc_throttle(struct proc *p)
+/*
+ * Make the process sleep in userret() for 'timeout' ticks.  Setting
+ * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
+ */
+void
+racct_proc_throttle(struct proc *p, int timeout)
 {
 	struct thread *td;
 #ifdef SMP
 	int cpuid;
 #endif
 
+	KASSERT(timeout != 0, ("timeout %d", timeout));
 	ASSERT_RACCT_ENABLED();
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
@@ -1058,10 +1094,13 @@ racct_proc_throttle(struct proc *p)
 	 * Do not block kernel processes.  Also do not block processes with
 	 * low %cpu utilization to improve interactivity.
 	 */
-	if (((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) ||
-	    (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+	if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
 		return;
-	p->p_throttled = 1;
+
+	if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
+		return;
+
+	p->p_throttled = timeout;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
@@ -1102,7 +1141,7 @@ racct_proc_wakeup(struct proc *p)
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
-	if (p->p_throttled) {
+	if (p->p_throttled != 0) {
 		p->p_throttled = 0;
 		wakeup(p->p_racct);
 	}
@@ -1116,6 +1155,13 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
 	ASSERT_RACCT_ENABLED();
 	RACCT_LOCK_ASSERT();
 
+#ifdef RCTL
+	rctl_throttle_decay(racct, RACCT_READBPS);
+	rctl_throttle_decay(racct, RACCT_WRITEBPS);
+	rctl_throttle_decay(racct, RACCT_READIOPS);
+	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
+#endif
+
 	r_old = racct->r_resources[RACCT_PCTCPU];
 
 	/* If there is nothing to decay, just exit. */
@@ -1206,6 +1252,12 @@ racctd(void)
 			pct_estimate = 0;
 		pct = racct_getpcpu(p, pct_estimate);
 		RACCT_LOCK();
+#ifdef RCTL
+		rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+		rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+		rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+		rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+#endif
 		racct_set_locked(p, RACCT_PCTCPU, pct, 1);
 		racct_set_locked(p, RACCT_CPU, runtime, 0);
 		racct_set_locked(p, RACCT_WALLCLOCK,
@@ -1228,10 +1280,13 @@
 			continue;
 		}
 
-		if (racct_pcpu_available(p) <= 0)
-			racct_proc_throttle(p);
-		else if (p->p_throttled)
+		if (racct_pcpu_available(p) <= 0) {
+			if (p->p_racct->r_resources[RACCT_PCTCPU] >
+			    pcpu_threshold)
+				racct_proc_throttle(p, -1);
+		} else if (p->p_throttled == -1) {
 			racct_proc_wakeup(p);
+		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
index 7f6a7ad5075f..8f301b8c20f6 100644
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -77,9 +77,13 @@ FEATURE(rctl, "Resource Limits");
 
 #define	RCTL_PCPU_SHIFT		(10 * 1000000)
 
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
 static int rctl_log_rate_limit = 10;
 static int rctl_devctl_rate_limit = 10;
+static unsigned int rctl_throttle_min = 0;
+static unsigned int rctl_throttle_max = 0;
+static unsigned int rctl_throttle_pct = 0;
+static unsigned int rctl_throttle_pct2 = 0;
 
 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
@@ -88,6 +92,16 @@ SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
+    &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
+    &rctl_throttle_max, 0, "Longest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
+    &rctl_throttle_pct, 0,
+    "Throttling penalty for process consumption, in percent");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
+    &rctl_throttle_pct2, 0,
+    "Throttling penalty for container consumption, in percent");
 
 /*
  * 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -134,6 +148,10 @@ static struct dict resourcenames[] = {
 	{ "shmsize", RACCT_SHMSIZE },
 	{ "wallclock", RACCT_WALLCLOCK },
 	{ "pcpu", RACCT_PCTCPU },
+	{ "readbps", RACCT_READBPS },
+	{ "writebps", RACCT_WRITEBPS },
+	{ "readiops", RACCT_READIOPS },
+	{ "writeiops", RACCT_WRITEIOPS },
 	{ NULL, -1 }};
 
 static struct dict actionnames[] = {
@@ -171,6 +189,7 @@ static struct dict actionnames[] = {
 	{ "deny", RCTL_ACTION_DENY },
 	{ "log", RCTL_ACTION_LOG },
 	{ "devctl", RCTL_ACTION_DEVCTL },
+	{ "throttle", RCTL_ACTION_THROTTLE },
 	{ NULL, -1 }};
 
 static void rctl_init(void);
@@ -274,23 +293,53 @@ rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
 }
 
 /*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit isn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit.  This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
  */
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
-    int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
 {
-	int64_t available;
+	struct rctl_rule *rule;
+	struct rctl_rule_link *link;
+	int64_t minavailable;
 
 	ASSERT_RACCT_ENABLED();
-	RCTL_LOCK_ASSERT();
 
-	available = rctl_available_resource(p, rule);
-	if (available >= amount)
-		return (0);
+	minavailable = INT64_MAX;
 
-	return (1);
+	RCTL_RLOCK();
+
+	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+		rule = link->rrl_rule;
+
+		if (rule->rr_resource != resource)
+			continue;
+		if (rule->rr_action != RCTL_ACTION_THROTTLE)
+			continue;
+
+		if (rule->rr_amount < minavailable)
+			minavailable = rule->rr_amount;
+	}
+
+	RCTL_RUNLOCK();
+
+	if (racct->r_resources[resource] < minavailable) {
+		racct->r_resources[resource] = 0;
+	} else {
+		/*
+		 * Cap utilization counter at ten times the limit.  Otherwise,
+		 * if the rule was changed to lower the allowed amount, it
+		 * could take an unreasonably long time for the accumulated
+		 * resource usage to drop.
+		 */
+		if (racct->r_resources[resource] > minavailable * 10)
+			racct->r_resources[resource] = minavailable * 10;
+
+		racct->r_resources[resource] -= minavailable;
+	}
 }
 
 /*
@@ -340,6 +389,38 @@ rctl_pcpu_available(const struct proc *p) {
 	return (minavailable);
 }
 
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+	uint64_t c;
+
+	c = a + b;
+
+	/*
+	 * Detect overflow.
+	 */
+	if (c < a || c < b)
+		return (UINT64_MAX);
+
+	return (c);
+}
+
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+	uint64_t c;
+
+	if (a == 0 || b == 0)
+		return (0);
+
+	c = a * b;
+
+	if (c < a || c < b)
+		return (UINT64_MAX);
+
+	return (c);
+}
+
 /*
  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
  * to what it keeps allocated now.  Returns non-zero if the allocation should
@@ -353,9 +434,12 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
 	struct rctl_rule *rule;
 	struct rctl_rule_link *link;
 	struct sbuf sb;
+	int64_t available;
+	uint64_t sleep_ms, sleep_ratio;
 	int should_deny = 0;
 	char *buf;
 
+	ASSERT_RACCT_ENABLED();
 	RCTL_RLOCK();
@@ -368,7 +452,9 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
 		rule = link->rrl_rule;
 		if (rule->rr_resource != resource)
 			continue;
-		if (!rctl_would_exceed(p, rule, amount)) {
+
+		available = rctl_available_resource(p, rule);
+		if (available >= (int64_t)amount) {
 			link->rrl_exceeded = 0;
 			continue;
 		}
@@ -421,7 +507,7 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
 			if (p->p_state != PRS_NORMAL)
 				continue;
-			
+
 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
 			    rctl_devctl_rate_limit))
 				continue;
@@ -444,6 +530,69 @@ rctl_enforce(struct proc *p, int resource, uint64_t amount)
 			free(buf, M_RCTL);
 			link->rrl_exceeded = 1;
 			continue;
+		case RCTL_ACTION_THROTTLE:
+			if (p->p_state != PRS_NORMAL)
+				continue;
+
+			/*
+			 * Make the process sleep for a fraction of a second
+			 * proportional to the ratio of the process' resource
+			 * utilization compared to the limit.  The point is
+			 * to penalize resource hogs: processes that consume
+			 * more of the available resources sleep for longer.
+			 *
+			 * We're trying to defer division until the very end,
+			 * to minimize the rounding effects.  The following
+			 * calculation could have been written in a clearer
+			 * way like this:
+			 *
+			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
+			 *     rule->rr_amount;
+			 * sleep_ms *= rctl_throttle_pct / 100;
+			 * if (sleep_ms < rctl_throttle_min)
+			 *     sleep_ms = rctl_throttle_min;
+			 *
+			 */
+			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+			sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+				sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+			/*
+			 * Multiply that by the ratio of the resource
+			 * consumption for the container compared to the limit,
+			 * squared.  In other words, a process in a container
+			 * that is two times over the limit will be throttled
+			 * four times as much for hitting the same rule.  The
+			 * point is to penalize processes more if the container
+			 * itself (e.g., a certain UID or jail) is above the limit.
+			 */
+			if (available < 0)
+				sleep_ratio = -available / rule->rr_amount;
+			else
+				sleep_ratio = 0;
+			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+			/*
+			 * Finally the division.
+			 */
+			sleep_ms /= rule->rr_amount;
+
+			if (sleep_ms > rctl_throttle_max)
+				sleep_ms = rctl_throttle_max;
+#if 0
+			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+			    __func__, p->p_pid, p->p_comm,
+			    p->p_racct->r_resources[resource],
+			    rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
+			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
+			racct_proc_throttle(p, sleep_ms);
+			continue;
 		default:
 			if (link->rrl_exceeded != 0)
 				continue;
@@ -1073,20 +1222,32 @@ rctl_rule_add(struct rctl_rule *rule)
 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
 
 	/*
-	 * Some rules just don't make sense.  Note that the one below
-	 * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
-	 * for example, is not deniable in the racct sense, but the
-	 * limit is enforced in a different way, so "deny" rules for %CPU
-	 * do make sense.
+	 * Some rules just don't make sense, like a "deny" rule for an
+	 * undeniable resource.  The exceptions are the RSS and %CPU
+	 * resources; they are not deniable in the racct sense, but the
+	 * limit is enforced in a different way.
 	 */
 	if (rule->rr_action == RCTL_ACTION_DENY &&
-	    (rule->rr_resource == RACCT_CPU ||
-	    rule->rr_resource == RACCT_WALLCLOCK))
+	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
+	    rule->rr_resource != RACCT_RSS &&
+	    rule->rr_resource != RACCT_PCTCPU) {
 		return (EOPNOTSUPP);
+	}
+
+	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+	    !RACCT_IS_DECAYING(rule->rr_resource)) {
+		return (EOPNOTSUPP);
+	}
+
+	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+	    rule->rr_resource == RACCT_PCTCPU) {
+		return (EOPNOTSUPP);
+	}
 
 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
-	    RACCT_IS_SLOPPY(rule->rr_resource))
+	    RACCT_IS_SLOPPY(rule->rr_resource)) {
 		return (EOPNOTSUPP);
+	}
 
 	/*
 	 * Make sure there are no duplicated rules.
Also, for the "deny" @@ -1960,6 +2121,15 @@ rctl_init(void) UMA_ALIGN_PTR, UMA_ZONE_NOFREE); rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + + if (rctl_throttle_min <= 0) + rctl_throttle_min = 1; + if (rctl_throttle_max <= 0) + rctl_throttle_max = 2 * hz; + if (rctl_throttle_pct <= 0) + rctl_throttle_pct = 100; + if (rctl_throttle_pct2 <= 0) + rctl_throttle_pct2 = 100; } #else /* !RCTL */ diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index a3714563ccac..6d1ac709febb 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -172,10 +172,14 @@ userret(struct thread *td, struct trapframe *frame) (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A")); #endif #ifdef RACCT - if (racct_enable && p->p_throttled == 1) { + if (racct_enable && p->p_throttled != 0) { PROC_LOCK(p); - while (p->p_throttled == 1) - msleep(p->p_racct, &p->p_mtx, 0, "racct", 0); + while (p->p_throttled != 0) { + msleep(p->p_racct, &p->p_mtx, 0, "racct", + p->p_throttled < 0 ? 0 : p->p_throttled); + if (p->p_throttled > 0) + p->p_throttled = 0; + } PROC_UNLOCK(p); } #endif diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index e272f9de557a..b7b9641662bd 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -1784,8 +1785,16 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { - if (!TD_IS_IDLETHREAD(curthread)) + if (!TD_IS_IDLETHREAD(curthread)) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, rabp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; + } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; @@ -1829,8 +1838,16 @@ breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - if (!TD_IS_IDLETHREAD(curthread)) + if (!TD_IS_IDLETHREAD(curthread)) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; + } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; @@ -1926,8 +1943,16 @@ bufwrite(struct buf *bp) bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); - if (!TD_IS_IDLETHREAD(curthread)) + if (!TD_IS_IDLETHREAD(curthread)) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_oublock++; + } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 9871a505631d..40dc0c0c3203 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -241,6 +242,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; } @@ -294,6 +302,13 @@ cluster_read(struct vnode *vp, u_quad_t 
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index a3714563ccac..6d1ac709febb 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -172,10 +172,14 @@ userret(struct thread *td, struct trapframe *frame)
 	    (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
 #endif
 #ifdef RACCT
-	if (racct_enable && p->p_throttled == 1) {
+	if (racct_enable && p->p_throttled != 0) {
 		PROC_LOCK(p);
-		while (p->p_throttled == 1)
-			msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+		while (p->p_throttled != 0) {
+			msleep(p->p_racct, &p->p_mtx, 0, "racct",
+			    p->p_throttled < 0 ? 0 : p->p_throttled);
+			if (p->p_throttled > 0)
+				p->p_throttled = 0;
+		}
 		PROC_UNLOCK(p);
 	}
 #endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index e272f9de557a..b7b9641662bd 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 #include
@@ -1784,8 +1785,16 @@ breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 
 		if ((rabp->b_flags & B_CACHE) == 0) {
-			if (!TD_IS_IDLETHREAD(curthread))
+			if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+				if (racct_enable) {
+					PROC_LOCK(curproc);
+					racct_add_buf(curproc, rabp, 0);
+					PROC_UNLOCK(curproc);
+				}
+#endif /* RACCT */
 				curthread->td_ru.ru_inblock++;
+			}
 			rabp->b_flags |= B_ASYNC;
 			rabp->b_flags &= ~B_INVAL;
 			rabp->b_ioflags &= ~BIO_ERROR;
@@ -1829,8 +1838,16 @@ breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
-		if (!TD_IS_IDLETHREAD(curthread))
+		if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
+		}
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
@@ -1926,8 +1943,16 @@ bufwrite(struct buf *bp)
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
-	if (!TD_IS_IDLETHREAD(curthread))
+	if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_oublock++;
+	}
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 9871a505631d..40dc0c0c3203 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 #include
@@ -241,6 +242,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 		BUF_KERNPROC(bp);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 	}
 
@@ -294,6 +302,13 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 		BUF_KERNPROC(rbp);
 		rbp->b_iooffset = dbtob(rbp->b_blkno);
 		bstrategy(rbp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, rbp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 	}
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 2d1769e43091..d2b617c469dd 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -623,7 +623,7 @@ struct proc {
 						   after fork. */
 	uint64_t	p_prev_runtime;	/* (c) Resource usage accounting. */
 	struct racct	*p_racct;	/* (b) Resource accounting. */
-	u_char		p_throttled;	/* (c) Flag for racct pcpu throttling */
+	int		p_throttled;	/* (c) Flag for racct pcpu throttling */
 	struct vm_domain_policy p_vm_dom_policy;	/* (c) process default VM domain, or -1 */
 	/*
 	 * An orphan is the child that has beed re-parented to the
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
index 8d1f2fa2cec2..5330c637bf36 100644
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -42,6 +42,7 @@
 #include
 #include
 
+struct buf;
 struct proc;
 struct rctl_rule_link;
 struct ucred;
@@ -71,7 +72,11 @@ struct ucred;
 #define	RACCT_SHMSIZE		18
 #define	RACCT_WALLCLOCK		19
 #define	RACCT_PCTCPU		20
-#define	RACCT_MAX		RACCT_PCTCPU
+#define	RACCT_READBPS		21
+#define	RACCT_WRITEBPS		22
+#define	RACCT_READIOPS		23
+#define	RACCT_WRITEIOPS		24
+#define	RACCT_MAX		RACCT_WRITEIOPS
 
 /*
  * Resource properties.
@@ -153,6 +158,7 @@ SYSCTL_DECL(_kern_racct);
 int	racct_add(struct proc *p, int resource, uint64_t amount);
 void	racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
 void	racct_add_force(struct proc *p, int resource, uint64_t amount);
+void	racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
 int	racct_set(struct proc *p, int resource, uint64_t amount);
 void	racct_set_force(struct proc *p, int resource, uint64_t amount);
 void	racct_sub(struct proc *p, int resource, uint64_t amount);
@@ -170,6 +176,7 @@ void	racct_proc_exit(struct proc *p);
 void	racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
 	    struct ucred *newcred);
 void	racct_move(struct racct *dest, struct racct *src);
+void	racct_proc_throttle(struct proc *p, int timeout);
 
 #else
diff --git a/sys/sys/rctl.h b/sys/sys/rctl.h
index e1a45a4949fa..b9e6cd6d17f7 100644
--- a/sys/sys/rctl.h
+++ b/sys/sys/rctl.h
@@ -129,7 +129,8 @@ struct rctl_rule {
 #define	RCTL_ACTION_DENY	(RCTL_ACTION_SIGNAL_MAX + 1)
 #define	RCTL_ACTION_LOG		(RCTL_ACTION_SIGNAL_MAX + 2)
 #define	RCTL_ACTION_DEVCTL	(RCTL_ACTION_SIGNAL_MAX + 3)
-#define	RCTL_ACTION_MAX		RCTL_ACTION_DEVCTL
+#define	RCTL_ACTION_THROTTLE	(RCTL_ACTION_SIGNAL_MAX + 4)
+#define	RCTL_ACTION_MAX		RCTL_ACTION_THROTTLE
 
 #define	RCTL_AMOUNT_UNDEFINED	-1
 
@@ -140,6 +141,7 @@ void	rctl_rule_release(struct rctl_rule *rule);
 int	rctl_rule_add(struct rctl_rule *rule);
 int	rctl_rule_remove(struct rctl_rule *filter);
 int	rctl_enforce(struct proc *p, int resource, uint64_t amount);
+void	rctl_throttle_decay(struct racct *racct, int resource);
 int64_t	rctl_pcpu_available(const struct proc *p);
 uint64_t rctl_get_limit(struct proc *p, int resource);
 uint64_t rctl_get_available(struct proc *p, int resource);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index c8dac1b41514..020282097cff 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 #include
@@ -659,6 +660,13 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 	vp = ITOV(ip);
 	bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
 	if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;	/* pay for read */
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index bedc8e1b04f1..bc0693a65aa6 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 #include
@@ -6229,6 +6230,13 @@ setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 		error = bufwait(bp);
 		if (error) {
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index 9819ef5eb8e5..768298f61a5a 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 
@@ -223,6 +224,13 @@ ufs_bmaparray(vp, bn, bnp, nbp, runp, runb)
 			vfs_busy_pages(bp, 0);
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 			error = bufwait(bp);
 			if (error) {
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index a7e3d3760e56..13a5757aadca 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include <sys/racct.h>
 #include
 #include
 #include
@@ -994,6 +995,21 @@ vnode_locked:
 	if (hardfault) {
 		PCPU_INC(cnt.v_io_faults);
 		curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+		if (racct_enable && fs.object->type == OBJT_VNODE) {
+			PROC_LOCK(curproc);
+			if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+				racct_add_force(curproc, RACCT_WRITEBPS,
+				    PAGE_SIZE + behind * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			} else {
+				racct_add_force(curproc, RACCT_READBPS,
+				    PAGE_SIZE + ahead * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_READIOPS, 1);
+			}
+			PROC_UNLOCK(curproc);
+		}
+#endif
 	} else
 		curthread->td_ru.ru_minflt++;
diff --git a/usr.bin/rctl/rctl.8 b/usr.bin/rctl/rctl.8
index ec97623a56b3..2d92d5446dcf 100644
--- a/usr.bin/rctl/rctl.8
+++ b/usr.bin/rctl/rctl.8
@@ -25,7 +25,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd November 29, 2015
+.Dd January 30, 2016
 .Dt RCTL 8
 .Os
 .Sh NAME
@@ -204,14 +204,22 @@ resource would be
 .It Sy shmsize Ta "SysV shared memory size, in bytes"
 .It Sy wallclock Ta "wallclock time, in seconds"
 .It Sy pcpu Ta "%CPU, in percents of a single CPU core"
+.It Sy readbps Ta "filesystem reads, in bytes per second"
+.It Sy writebps Ta "filesystem writes, in bytes per second"
+.It Sy readiops Ta "filesystem reads, in operations per second"
+.It Sy writeiops Ta "filesystem writes, in operations per second"
 .El
 .Sh ACTIONS
 .Bl -column -offset 3n "pseudoterminals"
 .It Em action
 .It Sy deny Ta deny the allocation; not supported for
-.Sy cputime
+.Sy cputime ,
+.Sy wallclock ,
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
 and
-.Sy wallclock
+.Sy writeiops
 .It Sy log Ta "log a warning to the console"
 .It Sy devctl Ta "send notification to"
 .Xr devd 8
@@ -228,6 +236,12 @@ send a signal to the offending process.
 See
 .Xr signal 3
 for a list of supported signals
+.It Sy throttle Ta "slow down process execution"; only supported for
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
+and
+.Sy writeiops .
 .El
 .Pp
 Not all actions are supported for all resources.
@@ -287,3 +301,22 @@ under sponsorship from the FreeBSD Foundation.
 Limiting
 .Sy memoryuse
 may kill the machine due to thrashing.
+.Pp
+The
+.Sy readiops
+and
+.Sy writeiops
+counters are only approximations.
+Like
+.Sy readbps
+and
+.Sy writebps ,
+they are calculated in the filesystem layer, where it is difficult
+or even impossible to observe actual disk device operations.
+.Pp
+The
+.Sy writebps
+and
+.Sy writeiops
+resources generally account for writes to the filesystem cache,
+not to actual devices.
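For context on how the new resources and the throttle action are meant to be used from userland, the commands below follow the rule syntax documented earlier in this manual page. They are an illustrative sketch: the subjects, amounts, and tunable values are made up, the tunable names come from the read-only sysctls added in kern_rctl.c above, and racct itself has to be enabled (kern.racct.enable=1 in loader.conf) for any of this to take effect.

# Throttle processes owned by user "www" reading more than 10 MB/s or
# issuing more than 1000 write operations per second (example subjects
# and amounts).
rctl -a user:www:readbps:throttle=10m
rctl -a user:www:writeiops:throttle=1000

# The same idea per jail, plus a look at current utilization.
rctl -a jail:webjail:writebps:throttle=5m
rctl -u jail:webjail

# The throttling knobs are RDTUN sysctls, so they are set from
# loader.conf; rctl_init() falls back to throttle_min=1 tick,
# throttle_max=2*hz, throttle_pct=throttle_pct2=100.
kern.racct.rctl.throttle_pct=200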