1 /* 2 * Routines to maintain a decaying average of per-process CPU utilization, in a 3 * way that results in numbers that are (hopefully) similar to those produced 4 * by NetBSD. Once a second, NetBSD performs the following basic computation 5 * for each process: 6 * 7 * avg = ccpu * avg + (1 - ccpu) * (run / hz) 8 * 9 * In this formula, 'avg' is the running average, 'hz' is the number of clock 10 * ticks per second, 'run' is the number of ticks during which the process was 11 * found running in the last second, and 'ccpu' is a decay value chosen such 12 * that only 5% of the original average remains after 60 seconds: e**(-1/20). 13 * 14 * Here, the idea is that we update the average lazily, namely, only when the 15 * process is running when the kernel processes a clock tick - no matter how 16 * long it had not been running before that. The result is that at any given 17 * time, the average may be out of date. For that reason, this code is shared 18 * between the kernel and the MIB service: the latter occasionally obtains the 19 * raw kernel process table, for example because a user runs ps(1), and it then 20 * needs to bring the values up to date. The kernel could do that itself just 21 * before copying out the process table, but the MIB service is equally capable 22 * of doing it post-copy - while also being preemptible during the computation. 23 * There is more to be said about this, but the summary is that it is not clear 24 * which of the two options is better in practice. We simply chose this one. 25 * 26 * In addition, we deliberately delay updating the actual average by one 27 * second, keeping the last second's number of process run ticks in a separate 28 * variable 'last'. This allows us to produce an estimate of short-term 29 * activity of the process as well. We use this to generate a "CPU estimate" 30 * value. 
BSD generates such a value for the purpose of scheduling, but we 31 * have no actual use for that, and generating the value just for userland is 32 * a bit too costly in our case. Our inaccurate value should suffice for most 33 * practical purposes though (e.g., comparisons between active processes). 34 * 35 * Overall, in terms of overhead, our approach should produce the same values 36 * as NetBSD while having only the same overhead as NetBSD in the very worst 37 * case, and much less overhead on average. Even in the worst case, in our 38 * case, the computation is spread out across each second, rather than all done 39 * at once. In terms of implementation, since this code is running in the 40 * kernel, we make use of small tables of precomputed values, and we try to 41 * save on computation as much as possible. We copy much of the NetBSD 42 * approach of avoiding divisions using FSCALE. 43 * 44 * Another difference with NetBSD is that our kernel does not actually call 45 * this function from its clock interrupt handler, but rather when a process 46 * has spent a number of CPU cycles that adds up to one clock tick worth of 47 * execution time. The result is better accuracy (no process can escape 48 * accounting by yielding just before each clock interrupt), but due to the 49 * inaccuracy of converting CPU cycles to clock ticks, a process may end up 50 * using more than 'hz' clock ticks per second. We could correct for this; 51 * however, it has not yet shown to be a problem. 52 * 53 * Zooming out a bit again, the current average is fairly accurate but not 54 * very precise. There are two reasons for this. First, the accounting is in 55 * clock tick fractions, which means that a per-second CPU usage below 1/hz 56 * cannot be measured. Second, the NetBSD FSCALE and ccpu values are such that 57 * (FSCALE - ccpu) equals 100, which means that a per-second CPU usage below 58 * 1/100 cannot be measured either. 
 * Both issues can be resolved by switching
 * to a CPU cycle based accounting approach, which requires 64-bit arithmetic
 * and a MINIX3-specific FSCALE value.  For now, this is just not worth doing.
 *
 * Finally, it should be noted that in terms of overall operating system
 * functionality, the CPU averages feature is entirely optional; as of writing,
 * the produced values are only used in the output of utilities such as ps(1).
 * If computing the CPU average becomes too burdensome in terms of either
 * performance or maintenance, it can simply be removed again.
 *
 * Original author: David van Moolenbroek <david@minix3.org>
 */

#include "sysutil.h"
#include <sys/param.h>

/*
 * Log2 of the number of one-second steps covered by the fine-grained decay
 * table (ccpu_low); the coarse table (ccpu_high) advances in strides of
 * 2**CCPUTAB_SHIFT seconds.
 */
#define CCPUTAB_SHIFT	3			/* 2**3 == 8 */
#define CCPUTAB_MASK	((1 << CCPUTAB_SHIFT) - 1)

/* Convert a floating-point constant to its fixed-point (FSCALE) encoding. */
#define F(n)	((uint32_t)((n) * FSCALE))

/* e**(-1/20*n)*FSCALE for n=1..(2**CCPUTAB_SHIFT-1) */
static const uint32_t ccpu_low[CCPUTAB_MASK] = {
	F(0.951229424501), F(0.904837418036), F(0.860707976425),
	F(0.818730753078), F(0.778800783071), F(0.740818220682),
	F(0.704688089719)
};
/* The one-second decay factor e**(-1/20), in FSCALE units (NetBSD's ccpu). */
#define ccpu		(ccpu_low[0])

/* e**(-1/20*8*n)*FSCALE for n=1.. until the value is zero (for FSCALE=2048) */
static const uint32_t ccpu_high[] = {
	F(0.670320046036), F(0.449328964117), F(0.301194211912),
	F(0.201896517995), F(0.135335283237), F(0.090717953289),
	F(0.060810062625), F(0.040762203978), F(0.027323722447),
	F(0.018315638889), F(0.012277339903), F(0.008229747049),
	F(0.005516564421), F(0.003697863716), F(0.002478752177),
	F(0.001661557273), F(0.001113775148), F(0.000746585808),
	F(0.000500451433)
};

/*
 * Initialize the per-process CPU average structure.  To be called when the
 * process is started, that is, as part of a fork call.
 */
void
cpuavg_init(struct cpuavg * ca)
{

	ca->ca_base = 0;	/* uptime at the start of the current second slot */
	ca->ca_run = 0;		/* FSCALE-scaled run time within the current slot */
	ca->ca_last = 0;	/* FSCALE-scaled run time of the previous slot */
	ca->ca_avg = 0;		/* decaying average, in FSCALE units */
}

/*
 * Return a new CPU usage average value, resulting from decaying the old value
 * by the given number of seconds, using the formula (avg * ccpu**secs).
 * We use two-level lookup tables to limit the computational expense to two
 * multiplications while keeping the tables themselves relatively small.
 *
 * 'avg' is the current average in FSCALE units; 'secs' is the number of whole
 * seconds by which to decay it.  The decayed average is returned.
 */
static uint32_t
cpuavg_decay(uint32_t avg, uint32_t secs)
{
	unsigned int slot;

	/*
	 * The ccpu_high table is set up such that with the default FSCALE, the
	 * values of any array entries beyond the end would be zero.  That is,
	 * the average would be decayed to a value that, if represented in
	 * FSCALE units, would be zero.  Thus, if it has been that long ago
	 * that we updated the average, we can just reset it to zero.
	 */
	if (secs > (__arraycount(ccpu_high) << CCPUTAB_SHIFT))
		return 0;

	/* Coarse step: decay by the largest covered multiple of 8 seconds. */
	if (secs > CCPUTAB_MASK) {
		slot = (secs >> CCPUTAB_SHIFT) - 1;

		avg = (ccpu_high[slot] * avg) >> FSHIFT;	/* decay #3 */

		secs &= CCPUTAB_MASK;
	}

	/* Fine step: decay by the remaining 1..7 seconds, if any. */
	if (secs > 0)
		avg = (ccpu_low[secs - 1] * avg) >> FSHIFT;	/* decay #4 */

	return avg;
}

/*
 * Update the CPU average value, either because the kernel is processing a
 * clock tick, or because the MIB service updates obtained averages.  We
 * perform the decay in at most four computation steps (shown as "decay #n"),
 * and thus, this algorithm is O(1).  'now' is the current system uptime in
 * clock ticks, and 'hz' is the number of clock ticks per second.
 */
static void
cpuavg_update(struct cpuavg * ca, clock_t now, clock_t hz)
{
	clock_t delta;
	uint32_t secs;

	delta = now - ca->ca_base;

	/*
	 * If at least a second elapsed since we last updated the average, we
	 * must do so now.  If not, we need not do anything for now.
	 */
	if (delta < hz)
		return;

	/*
	 * Decay the average by one second, and merge in the run fraction of
	 * the previous second, as though that second only just ended - even
	 * though the real time is at least one whole second ahead.  By doing
	 * so, we roll the statistics time forward by one virtual second.
	 */
	ca->ca_avg = (ccpu * ca->ca_avg) >> FSHIFT;	/* decay #1 */
	ca->ca_avg += (FSCALE - ccpu) * (ca->ca_last / hz) >> FSHIFT;

	ca->ca_last = ca->ca_run;	/* move 'run' into 'last' */
	ca->ca_run = 0;

	ca->ca_base += hz;		/* move forward by a second */
	delta -= hz;

	if (delta < hz)
		return;

	/*
	 * At least a whole second more elapsed since the start of the recorded
	 * second.  That means that our current 'run' counter (now moved into
	 * 'last') is also outdated, and we need to merge it in as well, before
	 * performing the next decay steps.
	 */
	ca->ca_avg = (ccpu * ca->ca_avg) >> FSHIFT;	/* decay #2 */
	ca->ca_avg += (FSCALE - ccpu) * (ca->ca_last / hz) >> FSHIFT;

	ca->ca_last = 0;		/* 'run' is already zero now */

	ca->ca_base += hz;		/* move forward by a second */
	delta -= hz;

	if (delta < hz)
		return;

	/*
	 * If additional whole seconds elapsed since the start of the last
	 * second slot, roll forward in time by that many whole seconds, thus
	 * decaying the value properly while maintaining alignment to whole-
	 * second slots.  The decay takes up to another two computation steps.
	 */
	secs = delta / hz;

	ca->ca_avg = cpuavg_decay(ca->ca_avg, secs);

	ca->ca_base += secs * hz;	/* move forward by whole seconds */
}

/*
 * The clock ticked, and this last clock tick is accounted to the process for
 * which the CPU average statistics are stored in 'ca'.  Update the statistics
 * accordingly, decaying the average as necessary.
The current system uptime 220 * must be given as 'now', and the number of clock ticks per second must be 221 * given as 'hz'. 222 */ 223 void 224 cpuavg_increment(struct cpuavg * ca, clock_t now, clock_t hz) 225 { 226 227 if (ca->ca_base == 0) 228 ca->ca_base = now; 229 else 230 cpuavg_update(ca, now, hz); 231 232 /* 233 * Register that the process was running at this clock tick. We could 234 * avoid one division above by precomputing (FSCALE/hz), but this is 235 * typically not a clean division and would therefore result in (more) 236 * loss of accuracy. 237 */ 238 ca->ca_run += FSCALE; 239 } 240 241 /* 242 * Retrieve the decaying CPU utilization average (as return value), the number 243 * of CPU run ticks in the current second so far (stored in 'cpticks'), and an 244 * opaque CPU utilization estimate (stored in 'estcpu'). The caller must 245 * provide the CPU average structure ('ca_orig'), which will not be modified, 246 * as well as the current uptime in clock ticks ('now') and the number of clock 247 * ticks per second ('hz'). 248 */ 249 uint32_t 250 cpuavg_getstats(const struct cpuavg * ca_orig, uint32_t * cpticks, 251 uint32_t * estcpu, clock_t now, clock_t hz) 252 { 253 struct cpuavg ca; 254 255 ca = *ca_orig; 256 257 /* Update the average as necessary. */ 258 cpuavg_update(&ca, now, hz); 259 260 /* Merge the last second into the average. */ 261 ca.ca_avg = (ccpu * ca.ca_avg) >> FSHIFT; 262 ca.ca_avg += (FSCALE - ccpu) * (ca.ca_last / hz) >> FSHIFT; 263 264 *cpticks = ca.ca_run >> FSHIFT; 265 266 /* 267 * NetBSD's estcpu value determines a scheduling queue, and decays to 268 * 10% in 5*(the current load average) seconds. Our 'estcpu' simply 269 * reports the process's percentage of CPU usage in the last second, 270 * thus yielding a value in the range 0..100 with a decay of 100% after 271 * one second. This should be good enough for most practical purposes. 
272 */ 273 *estcpu = (ca.ca_last / hz * 100) >> FSHIFT; 274 275 return ca.ca_avg; 276 } 277 278 /* 279 * Return the ccpu decay value, in FSCALE units. 280 */ 281 uint32_t 282 cpuavg_getccpu(void) 283 { 284 285 return ccpu; 286 } 287