/* $OpenBSD: kern_clockintr.c,v 1.70 2024/02/25 19:15:50 cheloha Exp $ */

/*
* Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
* Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
* Copyright (c) 2020-2024 Scott Cheloha <cheloha@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_cancel_locked(struct clockintr *);
void clockintr_hardclock(struct clockrequest *, void *, void *);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockqueue *,
const struct intrclock *);
void clockqueue_intrclock_reprogram(struct clockqueue *);
uint64_t clockqueue_next(const struct clockqueue *);
void clockqueue_pend_delete(struct clockqueue *, struct clockintr *);
void clockqueue_pend_insert(struct clockqueue *, struct clockintr *,
uint64_t);
void intrclock_rearm(struct intrclock *, uint64_t);
void intrclock_trigger(struct intrclock *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
* Ready the calling CPU for clockintr_dispatch(). If this is our
* first time here, install the intrclock, if any, and set necessary
* flags. Advance the schedule as needed.
*/
void
clockintr_cpu_init(const struct intrclock *ic)
{
uint64_t multiplier = 0;
struct cpu_info *ci = curcpu();
struct clockqueue *cq = &ci->ci_queue;
struct schedstate_percpu *spc = &ci->ci_schedstate;
int reset_cq_intrclock = 0;
if (ic != NULL)
clockqueue_intrclock_install(cq, ic);
/* TODO: Remove this from struct clockqueue. */
if (CPU_IS_PRIMARY(ci) && cq->cq_hardclock.cl_expiration == 0) {
clockintr_bind(&cq->cq_hardclock, ci, clockintr_hardclock,
NULL);
}
/*
* Mask CQ_INTRCLOCK while we're advancing the internal clock
* interrupts. We don't want the intrclock to fire until this
* thread reaches clockintr_trigger().
*/
if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
CLR(cq->cq_flags, CQ_INTRCLOCK);
reset_cq_intrclock = 1;
}
/*
* Until we understand scheduler lock contention better, stagger
* the hardclock and statclock so they don't all happen at once.
* If we have no intrclock it doesn't matter, we have no control
* anyway. The primary CPU's starting offset is always zero, so
* leave the multiplier zero.
*/
if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
multiplier = CPU_INFO_UNIT(ci);
/*
* The first time we do this, the primary CPU cannot skip any
* hardclocks. We can skip hardclocks on subsequent calls because
* the global tick value is advanced during inittodr(9) on our
* behalf.
*/
if (CPU_IS_PRIMARY(ci)) {
if (cq->cq_hardclock.cl_expiration == 0)
clockintr_schedule(&cq->cq_hardclock, 0);
else
clockintr_advance(&cq->cq_hardclock, hardclock_period);
}
/*
* We can always advance the statclock. There is no reason to
* stagger a randomized statclock.
*/
if (!statclock_is_randomized) {
if (spc->spc_statclock.cl_expiration == 0) {
clockintr_stagger(&spc->spc_statclock, statclock_avg,
multiplier, MAXCPUS);
}
}
clockintr_advance(&spc->spc_statclock, statclock_avg);
/*
* XXX Need to find a better place to do this. We can't do it in
* sched_init_cpu() because initclocks() runs after it.
*/
if (spc->spc_itimer.cl_expiration == 0) {
clockintr_stagger(&spc->spc_itimer, hardclock_period,
multiplier, MAXCPUS);
}
if (spc->spc_profclock.cl_expiration == 0) {
clockintr_stagger(&spc->spc_profclock, profclock_period,
multiplier, MAXCPUS);
}
if (spc->spc_roundrobin.cl_expiration == 0) {
clockintr_stagger(&spc->spc_roundrobin, hardclock_period,
multiplier, MAXCPUS);
}
clockintr_advance(&spc->spc_roundrobin, roundrobin_period);
if (reset_cq_intrclock)
SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
* If we have an intrclock, trigger it to start the dispatch cycle.
*/
void
clockintr_trigger(void)
{
struct clockqueue *cq = &curcpu()->ci_queue;
KASSERT(ISSET(cq->cq_flags, CQ_INIT));
if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
intrclock_trigger(&cq->cq_intrclock);
}

/*
* Run all expired events scheduled on the calling CPU.
*/
int
clockintr_dispatch(void *frame)
{
uint64_t lateness, run = 0, start;
struct cpu_info *ci = curcpu();
struct clockintr *cl;
struct clockqueue *cq = &ci->ci_queue;
struct clockrequest *request = &cq->cq_request;
void *arg;
void (*func)(struct clockrequest *, void *, void *);
uint32_t ogen;
if (cq->cq_dispatch != 0)
panic("%s: recursive dispatch", __func__);
cq->cq_dispatch = 1;
splassert(IPL_CLOCK);
KASSERT(ISSET(cq->cq_flags, CQ_INIT));
mtx_enter(&cq->cq_mtx);
/*
* If nothing is scheduled or we arrived too early, we have
* nothing to do.
*/
start = nsecuptime();
cq->cq_uptime = start;
if (TAILQ_EMPTY(&cq->cq_pend))
goto stats;
if (cq->cq_uptime < clockqueue_next(cq))
goto rearm;
lateness = start - clockqueue_next(cq);
/*
* Dispatch expired events.
*/
for (;;) {
cl = TAILQ_FIRST(&cq->cq_pend);
if (cl == NULL)
break;
if (cq->cq_uptime < cl->cl_expiration) {
/* Double-check the time before giving up. */
cq->cq_uptime = nsecuptime();
if (cq->cq_uptime < cl->cl_expiration)
break;
}
/*
* This clockintr has expired. Execute it.
*/
clockqueue_pend_delete(cq, cl);
request->cr_expiration = cl->cl_expiration;
arg = cl->cl_arg;
func = cl->cl_func;
cq->cq_running = cl;
mtx_leave(&cq->cq_mtx);
func(request, frame, arg);
mtx_enter(&cq->cq_mtx);
cq->cq_running = NULL;
if (ISSET(cq->cq_flags, CQ_IGNORE_REQUEST)) {
CLR(cq->cq_flags, CQ_IGNORE_REQUEST);
CLR(request->cr_flags, CR_RESCHEDULE);
}
if (ISSET(request->cr_flags, CR_RESCHEDULE)) {
CLR(request->cr_flags, CR_RESCHEDULE);
clockqueue_pend_insert(cq, cl, request->cr_expiration);
}
if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
CLR(cq->cq_flags, CQ_NEED_WAKEUP);
mtx_leave(&cq->cq_mtx);
wakeup(&cq->cq_running);
mtx_enter(&cq->cq_mtx);
}
run++;
}
/*
* Dispatch complete.
*/
rearm:
/* Rearm the interrupt clock if we have one. */
if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
if (!TAILQ_EMPTY(&cq->cq_pend)) {
intrclock_rearm(&cq->cq_intrclock,
clockqueue_next(cq) - cq->cq_uptime);
}
}
stats:
/* Update our stats. */
ogen = cq->cq_gen;
cq->cq_gen = 0;
membar_producer();
cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
if (run > 0) {
cq->cq_stat.cs_lateness += lateness;
cq->cq_stat.cs_prompt++;
cq->cq_stat.cs_run += run;
} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
cq->cq_stat.cs_early++;
cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
} else
cq->cq_stat.cs_spurious++;
membar_producer();
cq->cq_gen = MAX(1, ogen + 1);
mtx_leave(&cq->cq_mtx);
if (cq->cq_dispatch != 1)
panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
cq->cq_dispatch = 0;
return run > 0;
}
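
/*
 * Advance cl's expiration time in steps of period nanoseconds until
 * it lies in the future, then reschedule it.  Returns the number of
 * periods that have elapsed.
 */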
uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
uint64_t count, expiration;
struct clockqueue *cq = cl->cl_queue;
mtx_enter(&cq->cq_mtx);
expiration = cl->cl_expiration;
count = nsec_advance(&expiration, period, nsecuptime());
clockintr_schedule_locked(cl, expiration);
mtx_leave(&cq->cq_mtx);
return count;
}
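
/*
 * Like clockintr_advance(), but for the per-queue request state used
 * by dispatch callbacks: advance cr's expiration past the queue's
 * cached uptime and flag the running clockintr for rescheduling.
 * Only valid during clockintr_dispatch().
 */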
uint64_t
clockrequest_advance(struct clockrequest *cr, uint64_t period)
{
struct clockqueue *cq = cr->cr_queue;
KASSERT(cr == &cq->cq_request);
SET(cr->cr_flags, CR_RESCHEDULE);
return nsec_advance(&cr->cr_expiration, period, cq->cq_uptime);
}
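
/*
 * Like clockrequest_advance(), but each step has a pseudorandom
 * length: min plus a nonzero offset drawn from (random() & mask).
 * Used by randomized clock interrupts such as the statclock.
 */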
uint64_t
clockrequest_advance_random(struct clockrequest *cr, uint64_t min,
uint32_t mask)
{
uint64_t count = 0;
struct clockqueue *cq = cr->cr_queue;
uint32_t off;
KASSERT(cr == &cq->cq_request);
while (cr->cr_expiration <= cq->cq_uptime) {
while ((off = (random() & mask)) == 0)
continue;
cr->cr_expiration += min + off;
count++;
}
SET(cr->cr_flags, CR_RESCHEDULE);
return count;
}
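
/*
 * Cancel cl: remove it from the pending queue if it is scheduled.
 * If cl's callback is currently running, any rescheduling request
 * it makes is discarded.
 */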
void
clockintr_cancel(struct clockintr *cl)
{
struct clockqueue *cq = cl->cl_queue;
mtx_enter(&cq->cq_mtx);
clockintr_cancel_locked(cl);
mtx_leave(&cq->cq_mtx);
}
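
/*
 * Cancel cl with the queue mutex held.  If cl was at the head of the
 * pending queue, reprogram the interrupt clock for the new head, but
 * only if we are running on the queue's own CPU.  If cl is the
 * clockintr being dispatched, discard any rescheduling request from
 * its callback.
 */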
void
clockintr_cancel_locked(struct clockintr *cl)
{
struct clockqueue *cq = cl->cl_queue;
int was_next;
MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
if (ISSET(cl->cl_flags, CLST_PENDING)) {
was_next = cl == TAILQ_FIRST(&cq->cq_pend);
clockqueue_pend_delete(cq, cl);
if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
if (cq == &curcpu()->ci_queue)
clockqueue_intrclock_reprogram(cq);
}
}
}
if (cl == cq->cq_running)
SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}
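
/*
 * Bind cl to the given CPU's queue with callback func and argument
 * arg.  A clockintr must be bound before it can be scheduled.  The
 * caller must be at IPL_NONE.
 */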
void
clockintr_bind(struct clockintr *cl, struct cpu_info *ci,
void (*func)(struct clockrequest *, void *, void *), void *arg)
{
struct clockqueue *cq = &ci->ci_queue;
splassert(IPL_NONE);
KASSERT(cl->cl_queue == NULL);
mtx_enter(&cq->cq_mtx);
cl->cl_arg = arg;
cl->cl_func = func;
cl->cl_queue = cq;
TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
mtx_leave(&cq->cq_mtx);
}
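
/*
 * Disconnect cl from its queue, cancelling it first if it is pending.
 * With CL_BARRIER set in flags, sleep until cl's callback has
 * returned if it is currently running.
 */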
void
clockintr_unbind(struct clockintr *cl, uint32_t flags)
{
struct clockqueue *cq = cl->cl_queue;
KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
mtx_enter(&cq->cq_mtx);
clockintr_cancel_locked(cl);
cl->cl_arg = NULL;
cl->cl_func = NULL;
cl->cl_queue = NULL;
TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);
if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
SET(cq->cq_flags, CQ_NEED_WAKEUP);
msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
"clkbar", INFSLP);
} else
mtx_leave(&cq->cq_mtx);
}
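
/*
 * Schedule cl to expire at the given absolute uptime, in nanoseconds.
 */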
void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
struct clockqueue *cq = cl->cl_queue;
mtx_enter(&cq->cq_mtx);
clockintr_schedule_locked(cl, expiration);
mtx_leave(&cq->cq_mtx);
}
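
/*
 * Schedule cl with the queue mutex held.  If cl becomes the earliest
 * pending clockintr on the local CPU, reprogram the interrupt clock.
 * An explicit schedule overrides any rescheduling request made by
 * cl's currently running callback.
 */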
void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
struct clockqueue *cq = cl->cl_queue;
MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
if (ISSET(cl->cl_flags, CLST_PENDING))
clockqueue_pend_delete(cq, cl);
clockqueue_pend_insert(cq, cl, expiration);
if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
if (cl == TAILQ_FIRST(&cq->cq_pend)) {
if (cq == &curcpu()->ci_queue)
clockqueue_intrclock_reprogram(cq);
}
}
if (cl == cq->cq_running)
SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}
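
/*
 * Set cl's first expiration to the fraction numer/denom of period so
 * that identical clock interrupts on different CPUs do not all expire
 * at the same moment.  For example, with numer == 2 and denom ==
 * MAXCPUS, cl first expires at period / MAXCPUS * 2 nanoseconds.
 * Panics if cl is already pending.
 */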
void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t numer,
uint32_t denom)
{
struct clockqueue *cq = cl->cl_queue;
KASSERT(numer < denom);
mtx_enter(&cq->cq_mtx);
if (ISSET(cl->cl_flags, CLST_PENDING))
panic("%s: clock interrupt pending", __func__);
cl->cl_expiration = period / denom * numer;
mtx_leave(&cq->cq_mtx);
}
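
/*
 * Dispatch callback for hardclock(9): advance the request by
 * hardclock_period and run hardclock() once for every period that
 * has elapsed.
 */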
void
clockintr_hardclock(struct clockrequest *cr, void *frame, void *arg)
{
uint64_t count, i;
count = clockrequest_advance(cr, hardclock_period);
for (i = 0; i < count; i++)
hardclock(frame);
}
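
/*
 * One-time initialization of a CPU's clock interrupt queue.
 */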
void
clockqueue_init(struct clockqueue *cq)
{
if (ISSET(cq->cq_flags, CQ_INIT))
return;
cq->cq_request.cr_queue = cq;
mtx_init(&cq->cq_mtx, IPL_CLOCK);
TAILQ_INIT(&cq->cq_all);
TAILQ_INIT(&cq->cq_pend);
cq->cq_gen = 1;
SET(cq->cq_flags, CQ_INIT);
}
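
/*
 * Install the interrupt clock ic on queue cq.  Only the first
 * intrclock offered to a given queue is kept.
 */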
void
clockqueue_intrclock_install(struct clockqueue *cq,
const struct intrclock *ic)
{
mtx_enter(&cq->cq_mtx);
if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
cq->cq_intrclock = *ic;
SET(cq->cq_flags, CQ_INTRCLOCK);
}
mtx_leave(&cq->cq_mtx);
}
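
/*
 * Return the expiration of the earliest pending clockintr.  The
 * pending queue must not be empty.
 */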
uint64_t
clockqueue_next(const struct clockqueue *cq)
{
MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}
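
/*
 * Remove cl from the pending queue and clear its pending flag.
 */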
void
clockqueue_pend_delete(struct clockqueue *cq, struct clockintr *cl)
{
MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
KASSERT(ISSET(cl->cl_flags, CLST_PENDING));
TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
CLR(cl->cl_flags, CLST_PENDING);
}
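
/*
 * Insert cl into the pending queue, which is kept sorted by
 * ascending expiration time, and mark it pending.
 */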
void
clockqueue_pend_insert(struct clockqueue *cq, struct clockintr *cl,
uint64_t expiration)
{
struct clockintr *elm;
MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));
cl->cl_expiration = expiration;
TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
if (cl->cl_expiration < elm->cl_expiration)
break;
}
if (elm == NULL)
TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
else
TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
SET(cl->cl_flags, CLST_PENDING);
}
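
/*
 * Rearm the interrupt clock to fire when the earliest pending
 * clockintr expires.  If that deadline has already passed, trigger
 * the interrupt immediately instead.
 */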
void
clockqueue_intrclock_reprogram(struct clockqueue *cq)
{
uint64_t exp, now;
MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));
exp = clockqueue_next(cq);
now = nsecuptime();
if (now < exp)
intrclock_rearm(&cq->cq_intrclock, exp - now);
else
intrclock_trigger(&cq->cq_intrclock);
}

void
intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
{
ic->ic_rearm(ic->ic_cookie, nsecs);
}

void
intrclock_trigger(struct intrclock *ic)
{
ic->ic_trigger(ic->ic_cookie);
}

/*
* Advance *next in increments of period until it exceeds now.
* Returns the number of increments *next was advanced.
*
* We check the common cases first to avoid division if possible.
* This does no overflow checking.
*/
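/*
 * For example (hypothetical values): with *next = 100, period = 30,
 * and now = 175, three increments have elapsed, so *next becomes 190
 * and 3 is returned.
 */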
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
uint64_t elapsed;
if (now < *next)
return 0;
if (now < *next + period) {
*next += period;
return 1;
}
elapsed = (now - *next) / period + 1;
*next += period * elapsed;
return elapsed;
}
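
/*
 * Handle the kern.clockintr sysctl(2) node.  For KERN_CLOCKINTR_STATS,
 * sum the per-CPU dispatch statistics.  The cq_gen generation number
 * lets us copy each CPU's counters without taking its mutex: retry if
 * the generation is zero (an update is in progress) or changes during
 * the copy.
 */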
int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
struct clockintr_stat sum, tmp;
struct clockqueue *cq;
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
uint32_t gen;
if (namelen != 1)
return ENOTDIR;
switch (name[0]) {
case KERN_CLOCKINTR_STATS:
memset(&sum, 0, sizeof sum);
CPU_INFO_FOREACH(cii, ci) {
cq = &ci->ci_queue;
if (!ISSET(cq->cq_flags, CQ_INIT))
continue;
do {
gen = cq->cq_gen;
membar_consumer();
tmp = cq->cq_stat;
membar_consumer();
} while (gen == 0 || gen != cq->cq_gen);
sum.cs_dispatched += tmp.cs_dispatched;
sum.cs_early += tmp.cs_early;
sum.cs_earliness += tmp.cs_earliness;
sum.cs_lateness += tmp.cs_lateness;
sum.cs_prompt += tmp.cs_prompt;
sum.cs_run += tmp.cs_run;
sum.cs_spurious += tmp.cs_spurious;
}
return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
default:
break;
}
return EINVAL;
}

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);
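
/*
 * ddb(4) "show all clockintr": print the system uptime and every
 * clockintr on every initialized CPU queue.
 */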
void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
struct timespec now;
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
int width = sizeof(long) * 2 + 2; /* +2 for "0x" prefix */
nanouptime(&now);
db_printf("%20s\n", "UPTIME");
db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
db_printf("\n");
db_printf("%20s %5s %3s %*s %s\n",
"EXPIRATION", "STATE", "CPU", width, "ARG", "NAME");
CPU_INFO_FOREACH(cii, ci) {
if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
db_show_clockintr_cpu(ci);
}
}
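
/*
 * Print the running, pending, and idle clockintrs bound to the given
 * CPU.
 */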
void
db_show_clockintr_cpu(struct cpu_info *ci)
{
struct clockintr *elm;
struct clockqueue *cq = &ci->ci_queue;
u_int cpu = CPU_INFO_UNIT(ci);
if (cq->cq_running != NULL)
db_show_clockintr(cq->cq_running, "run", cpu);
TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
db_show_clockintr(elm, "pend", cpu);
TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
if (!ISSET(elm->cl_flags, CLST_PENDING))
db_show_clockintr(elm, "idle", cpu);
}
}
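
/*
 * Print one line describing cl: its expiration, state, CPU, argument,
 * and the name of its callback.
 */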
void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
struct timespec ts;
char *name;
db_expr_t offset;
int width = sizeof(long) * 2;
NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
if (name == NULL)
name = "?";
db_printf("%10lld.%09ld %5s %3u 0x%0*lx %s\n",
ts.tv_sec, ts.tv_nsec, state, cpu,
width, (unsigned long)cl->cl_arg, name);
}
#endif /* DDB */