author     Cyril Zhang <cyril@freebsdfoundation.org>    2025-08-05 23:20:56 +0000
committer  Mark Johnston <markj@FreeBSD.org>            2025-08-05 23:33:55 +0000
commit     c72188d85a793c7610208beafb83af544de6e3b7 (patch)
tree       ac8f47afb7b23c0b8f845cfb9d32d9216c72fa54
parent     b4894eca1ade4e9cec9afbe76021305d23bf914a (diff)
The previous scheme would inflate the CPU consumption of short-lived
processes. For containers (e.g., processes, jails), the total pcpu
usage was computed as a sum of the pcpu usage of all constituent
threads, which makes little sense for a decaying average.
Instead, aggregate the CPU time and wallclock time of the container and
compute the pcpu resource as a decaying average of the ratio between the
two. This gives much more reasonable and accurate values in various
simple tests.
PR: 235556
Reviewed by: markj
MFC after: 1 month
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D30878
-rw-r--r--   sys/kern/kern_racct.c | 307
-rw-r--r--   sys/sys/proc.h        |   1
-rw-r--r--   sys/sys/racct.h       |   6
3 files changed, 156 insertions, 158 deletions
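
To make the new scheme concrete before the diff itself, here is a minimal user-space sketch of the per-interval calculation that the patch introduces in racct_updatepcpu_racct_locked(). FSCALE is assumed to be 2048 (1 << FSHIFT, as in sys/param.h) and RACCT_DECAY_FACTOR roughly 0.3 * FSCALE, matching the comment in the new kernel code; the function name update_pcpu() is illustrative, not a kernel symbol.

#include <stdint.h>

#define FSCALE             2048 /* assumed: 1 << FSHIFT */
#define RACCT_DECAY_FACTOR 614  /* assumed: ~0.3 * FSCALE, per the in-tree comment */

/*
 * %CPU is stored as percent * 10^6 (RACCT_PCTCPU units).  runtime_us is the
 * CPU time consumed since the last update, elapsed_us the wallclock time
 * since then, and ncpus the CPU count used for the clamp (mp_ncpus in the
 * kernel code).
 */
static uint64_t
update_pcpu(uint64_t oldpcpu, uint64_t runtime_us, uint64_t elapsed_us,
    uint64_t ncpus)
{
	uint64_t newpcpu, pcpu;

	/* Instantaneous usage over the last interval: CPU time / wallclock. */
	newpcpu = runtime_us * 100 * 1000000 / elapsed_us;

	/* Decaying average: 0.7 * new + 0.3 * old, in fixed point. */
	pcpu = ((FSCALE - RACCT_DECAY_FACTOR) * newpcpu +
	    RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;

	/* Clamp to 100% per CPU. */
	if (pcpu > 100 * 1000000 * ncpus)
		pcpu = 100 * 1000000 * ncpus;
	return (pcpu);
}

For example, 0.5 s of CPU time over a 1 s interval with a previous estimate of 80% yields roughly 0.7 * 50% + 0.3 * 80% ≈ 59%, clamped to 100% per CPU.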
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
index 7ee3b9e2048a..7351e9cb6313 100644
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -96,6 +96,13 @@ static void racct_sub_cred_locked(struct ucred *cred, int resource,
 	    uint64_t amount);
 static void racct_add_cred_locked(struct ucred *cred, int resource,
 	    uint64_t amount);
+static int racct_set_locked(struct proc *p, int resource, uint64_t amount,
+	    int force);
+static void racct_updatepcpu_locked(struct proc *p);
+static void racct_updatepcpu_racct_locked(struct racct *racct);
+static void racct_updatepcpu_containers(void);
+static void racct_settime_locked(struct proc *p, bool exit);
+static void racct_zeropcpu_locked(struct proc *p);
 
 SDT_PROVIDER_DEFINE(racct);
 SDT_PROBE_DEFINE3(racct, , rusage, add,
@@ -308,68 +315,6 @@ fixpt_t ccpu_exp[] = {
 
 #define CCPU_EXP_MAX	110
 
-/*
- * This function is analogical to the getpcpu() function in the ps(1) command.
- * They should both calculate in the same way so that the racct %cpu
- * calculations are consistent with the values shown by the ps(1) tool.
- * The calculations are more complex in the 4BSD scheduler because of the value
- * of the ccpu variable.  In ULE it is defined to be zero which saves us some
- * work.
- */
-static uint64_t
-racct_getpcpu(struct proc *p, u_int pcpu)
-{
-	u_int swtime;
-#ifdef SCHED_4BSD
-	fixpt_t pctcpu, pctcpu_next;
-#endif
-	fixpt_t p_pctcpu;
-	struct thread *td;
-
-	ASSERT_RACCT_ENABLED();
-	KASSERT((p->p_flag & P_IDLEPROC) == 0,
-	    ("racct_getpcpu: idle process %p", p));
-
-	swtime = (ticks - p->p_swtick) / hz;
-
-	/*
-	 * For short-lived processes, the sched_pctcpu() returns small
-	 * values even for cpu intensive processes.  Therefore we use
-	 * our own estimate in this case.
-	 */
-	if (swtime < RACCT_PCPU_SECS)
-		return (pcpu);
-
-	p_pctcpu = 0;
-	FOREACH_THREAD_IN_PROC(p, td) {
-		thread_lock(td);
-#ifdef SCHED_4BSD
-		pctcpu = sched_pctcpu(td);
-		/* Count also the yet unfinished second. */
-		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
-		pctcpu_next += sched_pctcpu_delta(td);
-		p_pctcpu += max(pctcpu, pctcpu_next);
-#else
-		/*
-		 * In ULE the %cpu statistics are updated on every
-		 * sched_pctcpu() call.  So special calculations to
-		 * account for the latest (unfinished) second are
-		 * not needed.
-		 */
-		p_pctcpu += sched_pctcpu(td);
-#endif
-		thread_unlock(td);
-	}
-
-#ifdef SCHED_4BSD
-	if (swtime <= CCPU_EXP_MAX)
-		return ((100 * (uint64_t)p_pctcpu * 1000000) /
-		    (FSCALE - ccpu_exp[swtime]));
-#endif
-
-	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
-}
-
 static void
 racct_add_racct(struct racct *dest, const struct racct *src)
 {
@@ -499,19 +444,6 @@ racct_adjust_resource(struct racct *racct, int resource,
 		    ("%s: resource %d usage < 0", __func__, resource));
 		racct->r_resources[resource] = 0;
 	}
-
-	/*
-	 * There are some cases where the racct %cpu resource would grow
-	 * beyond 100% per core.  For example in racct_proc_exit() we add
-	 * the process %cpu usage to the ucred racct containers.  If too
-	 * many processes terminated in a short time span, the ucred %cpu
-	 * resource could grow too much.  Also, the 4BSD scheduler sometimes
-	 * returns for a thread more than 100% cpu usage.  So we set a sane
-	 * boundary here to 100% * the maximum number of CPUs.
-	 */
-	if ((resource == RACCT_PCTCPU) &&
-	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
-		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
 }
 
 static int
@@ -635,10 +567,44 @@ racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
 	RACCT_UNLOCK();
 }
 
+static void
+racct_settime_locked(struct proc *p, bool exit)
+{
+	struct thread *td;
+	struct timeval wallclock;
+	uint64_t runtime;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	if (exit) {
+		/*
+		 * proc_reap() has already calculated rux
+		 * and added crux to rux.
+		 */
+		runtime = cputick2usec(p->p_rux.rux_runtime -
+		    p->p_crux.rux_runtime);
+	} else {
+		PROC_STATLOCK(p);
+		FOREACH_THREAD_IN_PROC(p, td)
+			ruxagg(p, td);
+		PROC_STATUNLOCK(p);
+		runtime = cputick2usec(p->p_rux.rux_runtime);
+	}
+	microuptime(&wallclock);
+	timevalsub(&wallclock, &p->p_stats->p_start);
+
+	racct_set_locked(p, RACCT_CPU, runtime, 0);
+	racct_set_locked(p, RACCT_WALLCLOCK,
+	    (uint64_t)wallclock.tv_sec * 1000000 +
+	    wallclock.tv_usec, 0);
+}
+
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 {
-	int64_t old_amount, decayed_amount, diff_proc, diff_cred;
+	int64_t old_amount, diff_proc, diff_cred;
 #ifdef RCTL
 	int error;
 #endif
@@ -655,17 +621,7 @@ racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 	 * The diffs may be negative.
 	 */
 	diff_proc = amount - old_amount;
-	if (resource == RACCT_PCTCPU) {
-		/*
-		 * Resources in per-credential racct containers may decay.
-		 * If this is the case, we need to calculate the difference
-		 * between the new amount and the proportional value of the
-		 * old amount that has decayed in the ucred racct containers.
-		 */
-		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
-		diff_cred = amount - decayed_amount;
-	} else
-		diff_cred = diff_proc;
+	diff_cred = diff_proc;
 #ifdef notyet
 	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
 	    ("%s: usage of non-droppable resource %d dropping", __func__,
@@ -908,8 +864,6 @@ racct_proc_fork(struct proc *parent, struct proc *child)
 		goto out;
 #endif
 
-	/* Init process cpu time. */
-	child->p_prev_runtime = 0;
 	child->p_throttled = 0;
 
 	/*
@@ -964,37 +918,16 @@ racct_proc_fork_done(struct proc *child)
 void
 racct_proc_exit(struct proc *p)
 {
-	struct timeval wallclock;
-	uint64_t pct_estimate, pct, runtime;
 	int i;
 
 	if (!racct_enable)
 		return;
 
 	PROC_LOCK(p);
-	/*
-	 * We don't need to calculate rux, proc_reap() has already done this.
-	 */
-	runtime = cputick2usec(p->p_rux.rux_runtime);
-#ifdef notyet
-	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
-#else
-	if (runtime < p->p_prev_runtime)
-		runtime = p->p_prev_runtime;
-#endif
-	microuptime(&wallclock);
-	timevalsub(&wallclock, &p->p_stats->p_start);
-	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
-		pct_estimate = (1000000 * runtime * 100) /
-		    ((uint64_t)wallclock.tv_sec * 1000000 +
-		    wallclock.tv_usec);
-	} else
-		pct_estimate = 0;
-	pct = racct_getpcpu(p, pct_estimate);
-
 	RACCT_LOCK();
-	racct_set_locked(p, RACCT_CPU, runtime, 0);
-	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
+
+	racct_settime_locked(p, true);
+	racct_zeropcpu_locked(p);
 
 	KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
 	    ("process reaped with %ju allocated for RSS\n",
@@ -1068,6 +1001,10 @@ racct_move(struct racct *dest, struct racct *src)
 	RACCT_LOCK();
 	racct_add_racct(dest, src);
 	racct_sub_racct(src, src);
+	dest->r_runtime = src->r_runtime;
+	dest->r_time = src->r_time;
+	src->r_runtime = 0;
+	timevalsub(&src->r_time, &src->r_time);
 	RACCT_UNLOCK();
 }
 
@@ -1170,8 +1107,6 @@ racct_proc_wakeup(struct proc *p)
 static void
 racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
 {
-	int64_t r_old, r_new;
-
 	ASSERT_RACCT_ENABLED();
 	RACCT_LOCK_ASSERT();
 
@@ -1181,15 +1116,6 @@ racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
 	rctl_throttle_decay(racct, RACCT_READIOPS);
 	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
 #endif
-
-	r_old = racct->r_resources[RACCT_PCTCPU];
-
-	/* If there is nothing to decay, just exit. */
-	if (r_old <= 0)
-		return;
-
-	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
-	racct->r_resources[RACCT_PCTCPU] = r_new;
 }
 
 static void
@@ -1221,15 +1147,105 @@ racct_decay(void)
 }
 
 static void
+racct_updatepcpu_racct_locked(struct racct *racct)
+{
+	struct timeval diff;
+	uint64_t elapsed;
+	uint64_t runtime;
+	uint64_t newpcpu;
+	uint64_t oldpcpu;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+
+	/* Difference between now and previously-recorded time. */
+	microuptime(&diff);
+	timevalsub(&diff, &racct->r_time);
+	elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec;
+
+	/* Difference between current and previously-recorded runtime. */
+	runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime;
+
+	newpcpu = runtime * 100 * 1000000 / elapsed;
+	oldpcpu = racct->r_resources[RACCT_PCTCPU];
+	/*
+	 * This calculation is equivalent to
+	 * (1 - 0.3) * newpcpu + 0.3 * oldpcpu
+	 * where RACCT_DECAY_FACTOR = 0.3 * FSCALE.
+	 */
+	racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) *
+	    newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;
+	if (racct->r_resources[RACCT_PCTCPU] >
+	    100 * 1000000 * (uint64_t)mp_ncpus)
+		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 *
+		    (uint64_t)mp_ncpus;
+
+	/* Record current times. */
+	racct->r_runtime = racct->r_resources[RACCT_CPU];
+	timevaladd(&racct->r_time, &diff);
+}
+
+static void
+racct_zeropcpu_locked(struct proc *p)
+{
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	p->p_racct->r_resources[RACCT_PCTCPU] = 0;
+}
+
+static void
+racct_updatepcpu_locked(struct proc *p)
+{
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	racct_updatepcpu_racct_locked(p->p_racct);
+}
+
+static void
+racct_updatepcpu_pre(void)
+{
+
+	RACCT_LOCK();
+}
+
+static void
+racct_updatepcpu_post(void)
+{
+
+	RACCT_UNLOCK();
+}
+
+static void
+racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void *dummy2)
+{
+	racct_updatepcpu_racct_locked(racct);
+}
+
+static void
+racct_updatepcpu_containers(void)
+{
+	ASSERT_RACCT_ENABLED();
+
+	ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+	    racct_updatepcpu_post, NULL, NULL);
+	loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+	    racct_updatepcpu_post, NULL, NULL);
+	prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
+	    racct_updatepcpu_post, NULL, NULL);
+}
+
+static void
 racctd(void)
 {
-	struct thread *td;
 	struct proc *p;
-	struct timeval wallclock;
-	uint64_t pct, pct_estimate, runtime;
+	struct proc *idle;
 
 	ASSERT_RACCT_ENABLED();
 
+	idle = STAILQ_FIRST(&cpuhead)->pc_idlethread->td_proc;
+
 	for (;;) {
 		racct_decay();
 
@@ -1237,36 +1253,16 @@ racctd(void)
 
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
+			if (p == idle) {
+				PROC_UNLOCK(p);
+				continue;
+			}
 			if (p->p_state != PRS_NORMAL ||
 			    (p->p_flag & P_IDLEPROC) != 0) {
-				if (p->p_state == PRS_ZOMBIE)
-					racct_set(p, RACCT_PCTCPU, 0);
 				PROC_UNLOCK(p);
 				continue;
 			}
 
-			microuptime(&wallclock);
-			timevalsub(&wallclock, &p->p_stats->p_start);
-			PROC_STATLOCK(p);
-			FOREACH_THREAD_IN_PROC(p, td)
-				ruxagg(p, td);
-			runtime = cputick2usec(p->p_rux.rux_runtime);
-			PROC_STATUNLOCK(p);
-#ifdef notyet
-			KASSERT(runtime >= p->p_prev_runtime,
-			    ("runtime < p_prev_runtime"));
-#else
-			if (runtime < p->p_prev_runtime)
-				runtime = p->p_prev_runtime;
-#endif
-			p->p_prev_runtime = runtime;
-			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
-				pct_estimate = (1000000 * runtime * 100) /
-				    ((uint64_t)wallclock.tv_sec * 1000000 +
-				    wallclock.tv_usec);
-			} else
-				pct_estimate = 0;
-			pct = racct_getpcpu(p, pct_estimate);
 			RACCT_LOCK();
 #ifdef RCTL
 			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
@@ -1274,11 +1270,8 @@ racctd(void)
 			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
 			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
 #endif
-			racct_set_locked(p, RACCT_PCTCPU, pct, 1);
-			racct_set_locked(p, RACCT_CPU, runtime, 0);
-			racct_set_locked(p, RACCT_WALLCLOCK,
-			    (uint64_t)wallclock.tv_sec * 1000000 +
-			    wallclock.tv_usec, 0);
+			racct_settime_locked(p, false);
+			racct_updatepcpu_locked(p);
 			RACCT_UNLOCK();
 			PROC_UNLOCK(p);
 		}
@@ -1306,6 +1299,8 @@ racctd(void)
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
+
+		racct_updatepcpu_containers();
 		pause("-", hz);
 	}
 }
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index af9cafa99dd0..9140cee56885 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -763,7 +763,6 @@ struct proc {
 	LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/
 	struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */
 	struct cv	p_pwait;	/* (*) wait cv for exit/exec. */
-	uint64_t	p_prev_runtime;	/* (c) Resource usage accounting. */
 	struct racct	*p_racct;	/* (b) Resource accounting. */
 	int		p_throttled;	/* (c) Flag for racct pcpu throttling */
 	/*
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
index c6020b82c865..92b50353774e 100644
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -141,13 +141,17 @@ extern bool racct_enable;
 
 /*
  * The 'racct' structure defines resource consumption for a particular
- * subject, such as process or jail.
+ * subject, such as process or jail.  It also contains the total
+ * cpu time and real time of the subject, recorded at the most recent
+ * time that RACCT_PCPU was updated.
  *
 * This structure must be filled with zeroes initially.
 */
 struct racct {
 	int64_t			r_resources[RACCT_MAX + 1];
 	LIST_HEAD(, rctl_rule_link)	r_rule_links;
+	uint64_t		r_runtime;
+	struct timeval		r_time;
 };
 
 SYSCTL_DECL(_kern_racct);
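
As a complementary sketch, the new r_runtime and r_time fields in struct racct act as the "previous sample" against which racct_updatepcpu_racct_locked() computes its deltas, so the decaying average converges toward the true load over successive racctd passes (the daemon sleeps with pause("-", hz) between passes). The program below is user-space only; pcpu_state and updatepcpu() are hypothetical stand-ins for the kernel structures, using the same assumed constants as the earlier sketch.

#include <stdint.h>
#include <stdio.h>

#define FSCALE             2048 /* assumed: 1 << FSHIFT */
#define RACCT_DECAY_FACTOR 614  /* assumed: ~0.3 * FSCALE */

/* Hypothetical stand-in for the bookkeeping fields added to struct racct. */
struct pcpu_state {
	uint64_t r_runtime;	/* CPU time (us) recorded at last update */
	uint64_t r_time;	/* wallclock (us) recorded at last update */
	uint64_t pctcpu;	/* RACCT_PCTCPU, percent * 10^6 */
};

static void
updatepcpu(struct pcpu_state *st, uint64_t total_runtime, uint64_t now)
{
	uint64_t elapsed = now - st->r_time;
	uint64_t runtime = total_runtime - st->r_runtime;
	uint64_t newpcpu = runtime * 100 * 1000000 / elapsed;

	/* Decaying average of CPU time over wallclock time. */
	st->pctcpu = ((FSCALE - RACCT_DECAY_FACTOR) * newpcpu +
	    RACCT_DECAY_FACTOR * st->pctcpu) / FSCALE;

	/* Record current times for the next pass. */
	st->r_runtime = total_runtime;
	st->r_time = now;
}

int
main(void)
{
	struct pcpu_state st = { 0, 0, 0 };
	uint64_t now = 0, cpu = 0;

	/* Ten one-second passes with a steady 50% CPU consumer. */
	for (int i = 1; i <= 10; i++) {
		now += 1000000;
		cpu += 500000;
		updatepcpu(&st, cpu, now);
		printf("pass %2d: %%cpu = %ju.%06ju\n", i,
		    (uintmax_t)(st.pctcpu / 1000000),
		    (uintmax_t)(st.pctcpu % 1000000));
	}
	return (0);
}

Under a steady 50% consumer the estimate climbs roughly 35%, 45.5%, 48.7%, ... toward 50%, instead of being summed across threads or inflated for short-lived processes as with the previous scheme.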