1 /* MIB service - proc.c - functionality based on service process tables */
2 /* Eventually, the CTL_PROC subtree might end up here as well. */
3
4 #include "mib.h"
5
6 #include <sys/exec.h>
7 #include <minix/sysinfo.h>
8
9 #include <machine/archtypes.h>
10 #include "kernel/proc.h"
11 #include "servers/pm/mproc.h"
12 #include "servers/vfs/const.h"
13 #include "servers/vfs/fproc.h"
14
/* Local aliases for the kernel and PM process table entry types. */
typedef struct proc ixfer_proc_t;
typedef struct mproc ixfer_mproc_t;

/*
 * Local copies of the kernel, PM, and VFS process tables, as last fetched by
 * update_tables().  The kernel table starts with NR_TASKS kernel task slots,
 * so the kernel entry for PM/VFS slot 'n' is found at index 'NR_TASKS + n'.
 */
static ixfer_proc_t proc_tab[NR_TASKS + NR_PROCS];
static ixfer_mproc_t mproc_tab[NR_PROCS];
static struct fproc_light fproc_tab[NR_PROCS];

/*
 * The number of processes added to the current number of processes when doing
 * a size estimation, so that the actual data retrieval does not end up with
 * too little space if new processes have forked between the two calls.  We do
 * a process table update only once per clock tick, which means that typically
 * no update will take place between the user process's size estimation request
 * and its subsequent data retrieval request.  On the other hand, if we do
 * update process tables in between, quite a bit might have changed.
 */
#define EXTRA_PROCS	8

/*
 * PID lookup structure.  hash_tab[] holds, per bucket (PID modulo HASH_SLOTS),
 * the PM slot at the head of a chain; hnext_tab[] holds, per PM slot, the next
 * PM slot in the same chain.  NO_SLOT terminates a chain and doubles as the
 * "not found" result of a lookup.
 */
#define HASH_SLOTS	(NR_PROCS / 4)	/* expected nr. of processes in use */
#define NO_SLOT		(-1)
static int hash_tab[HASH_SLOTS];	/* hash table mapping from PID.. */
static int hnext_tab[NR_PROCS];		/* ..to PM process slot */

static clock_t tabs_updated = 0;	/* when the tables were last updated */
static int tabs_valid = TRUE;		/* FALSE if obtaining tables failed */
40
41 /*
42 * Update the process tables by pulling in new copies from the kernel, PM, and
43 * VFS, but only every so often and only if it has not failed before. Return
44 * TRUE iff the tables are now valid.
45 */
46 static int
update_tables(void)47 update_tables(void)
48 {
49 clock_t now;
50 pid_t pid;
51 int r, kslot, mslot, hslot;
52
53 /*
54 * If retrieving the tables failed at some point, do not keep trying
55 * all the time. Such a failure is very unlikely to be transient.
56 */
57 if (tabs_valid == FALSE)
58 return FALSE;
59
60 /*
61 * Update the tables once per clock tick at most. The update operation
62 * is rather heavy, transferring several hundreds of kilobytes between
63 * servers. Userland should be able to live with information that is
64 * outdated by at most one clock tick.
65 */
66 now = getticks();
67
68 if (tabs_updated != 0 && tabs_updated == now)
69 return TRUE;
70
71 /* Perform an actual update now. */
72 tabs_valid = FALSE;
73
74 /* Retrieve and check the kernel process table. */
75 if ((r = sys_getproctab(proc_tab)) != OK) {
76 printf("MIB: unable to obtain kernel process table (%d)\n", r);
77
78 return FALSE;
79 }
80
81 for (kslot = 0; kslot < NR_TASKS + NR_PROCS; kslot++) {
82 if (proc_tab[kslot].p_magic != PMAGIC) {
83 printf("MIB: kernel process table mismatch\n");
84
85 return FALSE;
86 }
87 }
88
89 /* Retrieve and check the PM process table. */
90 r = getsysinfo(PM_PROC_NR, SI_PROC_TAB, mproc_tab, sizeof(mproc_tab));
91 if (r != OK) {
92 printf("MIB: unable to obtain PM process table (%d)\n", r);
93
94 return FALSE;
95 }
96
97 for (mslot = 0; mslot < NR_PROCS; mslot++) {
98 if (mproc_tab[mslot].mp_magic != MP_MAGIC) {
99 printf("MIB: PM process table mismatch\n");
100
101 return FALSE;
102 }
103 }
104
105 /* Retrieve an extract of the VFS process table. */
106 r = getsysinfo(VFS_PROC_NR, SI_PROCLIGHT_TAB, fproc_tab,
107 sizeof(fproc_tab));
108 if (r != OK) {
109 printf("MIB: unable to obtain VFS process table (%d)\n", r);
110
111 return FALSE;
112 }
113
114 tabs_valid = TRUE;
115 tabs_updated = now;
116
117 /*
118 * Build a hash table mapping from process IDs to slot numbers, for
119 * fast access. TODO: decide if this is better done on demand only.
120 */
121 for (hslot = 0; hslot < HASH_SLOTS; hslot++)
122 hash_tab[hslot] = NO_SLOT;
123
124 for (mslot = 0; mslot < NR_PROCS; mslot++) {
125 if (mproc_tab[mslot].mp_flags & IN_USE) {
126 if ((pid = mproc_tab[mslot].mp_pid) <= 0)
127 continue;
128
129 hslot = mproc_tab[mslot].mp_pid % HASH_SLOTS;
130
131 hnext_tab[mslot] = hash_tab[hslot];
132 hash_tab[hslot] = mslot;
133 }
134 }
135
136 return TRUE;
137 }
138
139 /*
140 * Return the PM slot number for the given PID, or NO_SLOT if the PID is not in
141 * use by a process.
142 */
143 static int
get_mslot(pid_t pid)144 get_mslot(pid_t pid)
145 {
146 int mslot;
147
148 /* PID 0 identifies the kernel; checking this is up to the caller. */
149 if (pid <= 0)
150 return NO_SLOT;
151
152 for (mslot = hash_tab[pid % HASH_SLOTS]; mslot != NO_SLOT;
153 mslot = hnext_tab[mslot])
154 if (mproc_tab[mslot].mp_pid == pid)
155 break;
156
157 return mslot;
158 }
159
160 /*
161 * Store the given number of clock ticks as a timeval structure.
162 */
163 static void
ticks_to_timeval(struct timeval * tv,clock_t ticks)164 ticks_to_timeval(struct timeval * tv, clock_t ticks)
165 {
166 clock_t hz;
167
168 hz = sys_hz();
169
170 tv->tv_sec = ticks / hz;
171 tv->tv_usec = (long)((ticks % hz) * 1000000ULL / hz);
172 }
173
174 /*
175 * Generate a wchan message text for the cases that the process is blocked on
176 * IPC with another process, of which the endpoint is given as 'endpt' here.
177 * The name of the other process is to be stored in 'wmesg', which is a buffer
178 * of size 'wmsz'. The result should be null terminated. If 'ipc' is set, the
179 * process is blocked on a direct IPC call, in which case the name of the other
180 * process is enclosed in parentheses. If 'ipc' is not set, the call is made
181 * indirectly through VFS, and the name of the other process should not be
182 * enclosed in parentheses. If no name can be obtained, we use the endpoint of
183 * the other process instead.
184 */
185 static void
fill_wmesg(char * wmesg,size_t wmsz,endpoint_t endpt,int ipc)186 fill_wmesg(char * wmesg, size_t wmsz, endpoint_t endpt, int ipc)
187 {
188 const char *name;
189 int mslot;
190
191 switch (endpt) {
192 case ANY:
193 name = "any";
194 break;
195 case SELF:
196 name = "self";
197 break;
198 case NONE:
199 name = "none";
200 break;
201 default:
202 mslot = _ENDPOINT_P(endpt);
203 if (mslot >= -NR_TASKS && mslot < NR_PROCS &&
204 (mslot < 0 || (mproc_tab[mslot].mp_flags & IN_USE)))
205 name = proc_tab[NR_TASKS + mslot].p_name;
206 else
207 name = NULL;
208 }
209
210 if (name != NULL)
211 snprintf(wmesg, wmsz, "%s%s%s",
212 ipc ? "(" : "", name, ipc ? ")" : "");
213 else
214 snprintf(wmesg, wmsz, "%s%d%s",
215 ipc ? "(" : "", endpt, ipc ? ")" : "");
216 }
217
218 /*
219 * Return the LWP status of a process, along with additional information in
220 * case the process is sleeping (LSSLEEP): a wchan value and text to indicate
221 * what the process is sleeping on, and possibly a flag field modification to
222 * indicate that the sleep is interruptible.
223 */
224 static int
get_lwp_stat(int mslot,uint64_t * wcptr,char * wmptr,size_t wmsz,int32_t * flag)225 get_lwp_stat(int mslot, uint64_t * wcptr, char * wmptr, size_t wmsz,
226 int32_t * flag)
227 {
228 struct mproc *mp;
229 struct fproc_light *fp;
230 struct proc *kp;
231 const char *wmesg;
232 uint64_t wchan;
233 endpoint_t endpt;
234
235 mp = &mproc_tab[mslot];
236 fp = &fproc_tab[mslot];
237 kp = &proc_tab[NR_TASKS + mslot];
238
239 /*
240 * First cover all the cases that the process is not sleeping. In
241 * those cases, we need not return additional sleep information either.
242 */
243 if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
244 return LSZOMB;
245
246 if (mp->mp_flags & EXITING)
247 return LSDEAD;
248
249 if ((mp->mp_flags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP))
250 return LSSTOP;
251
252 if (proc_is_runnable(kp))
253 return LSRUN;
254
255 /*
256 * The process is sleeping. In that case, we must also figure out why,
257 * and return an appropriate wchan value and human-readable wmesg text.
258 *
259 * The process can be blocked on either a known sleep state in PM or
260 * VFS, or otherwise on IPC communication with another process, or
261 * otherwise on a kernel RTS flag. In each case, decide what to use as
262 * wchan value and wmesg text, and whether the sleep is interruptible.
263 *
264 * The wchan value should be unique for the sleep reason. We use its
265 * lower eight bits to indicate a class:
266 * 0x00 = kernel task
267 * 0x01 = kerel RTS block
268 * 0x02 = PM call
269 * 0x03 = VFS call
270 * 0x04 = MIB call
271 * 0xff = blocked on process
272 * The upper bits are used for class-specific information. The actual
273 * value does not really matter, as long as it is nonzero and there is
274 * no overlap between the different values.
275 */
276 wchan = 0;
277 wmesg = NULL;
278
279 /*
280 * First see if the process is marked as blocked in the tables of PM or
281 * VFS. Such a block reason is always an interruptible sleep. Note
282 * that we do not use the kernel table at all in this case: each of the
283 * three tables is consistent within itself, but not necessarily
284 * consistent with any of the other tables, so we avoid internal
285 * mismatches if we can.
286 */
287 if (mp->mp_flags & WAITING) {
288 wchan = 0x102;
289 wmesg = "wait";
290 } else if (mp->mp_flags & SIGSUSPENDED) {
291 wchan = 0x202;
292 wmesg = "pause";
293 } else if (fp->fpl_blocked_on != FP_BLOCKED_ON_NONE) {
294 wchan = (fp->fpl_blocked_on << 8) | 0x03;
295 switch (fp->fpl_blocked_on) {
296 case FP_BLOCKED_ON_PIPE:
297 wmesg = "pipe";
298 break;
299 case FP_BLOCKED_ON_FLOCK:
300 wmesg = "flock";
301 break;
302 case FP_BLOCKED_ON_POPEN:
303 wmesg = "popen";
304 break;
305 case FP_BLOCKED_ON_SELECT:
306 wmesg = "select";
307 break;
308 case FP_BLOCKED_ON_CDEV:
309 case FP_BLOCKED_ON_SDEV:
310 /*
311 * Add the task (= character or socket driver) endpoint
312 * to the wchan value, and use the driver's process
313 * name, without parentheses, as wmesg text.
314 */
315 wchan |= (uint64_t)fp->fpl_task << 16;
316 fill_wmesg(wmptr, wmsz, fp->fpl_task, FALSE /*ipc*/);
317 break;
318 default:
319 /* A newly added flag we don't yet know about? */
320 wmesg = "???";
321 break;
322 }
323 }
324 if (wchan != 0) {
325 *wcptr = wchan;
326 if (wmesg != NULL) /* NULL means "already set" here */
327 strlcpy(wmptr, wmesg, wmsz);
328 *flag |= L_SINTR;
329 }
330
331 /*
332 * See if the process is blocked on sending or receiving. If not, then
333 * use one of the kernel RTS flags as reason.
334 */
335 endpt = P_BLOCKEDON(kp);
336
337 switch (endpt) {
338 case MIB_PROC_NR:
339 /* This is really just aesthetics. */
340 wchan = 0x04;
341 wmesg = "sysctl";
342 break;
343 case NONE:
344 /*
345 * The process is not running, but also not blocked on IPC with
346 * another process. This means it must be stopped on a kernel
347 * RTS flag.
348 */
349 wchan = ((uint64_t)kp->p_rts_flags << 8) | 0x01;
350 if (RTS_ISSET(kp, RTS_PROC_STOP))
351 wmesg = "kstop";
352 else if (RTS_ISSET(kp, RTS_SIGNALED) ||
353 RTS_ISSET(kp, RTS_SIGNALED))
354 wmesg = "ksignal";
355 else if (RTS_ISSET(kp, RTS_NO_PRIV))
356 wmesg = "knopriv";
357 else if (RTS_ISSET(kp, RTS_PAGEFAULT) ||
358 RTS_ISSET(kp, RTS_VMREQTARGET))
359 wmesg = "fault";
360 else if (RTS_ISSET(kp, RTS_NO_QUANTUM))
361 wmesg = "sched";
362 else
363 wmesg = "kflag";
364 break;
365 case ANY:
366 /*
367 * If the process is blocked receiving from ANY, mark it as
368 * being in an interruptible sleep. This looks nicer, even
369 * though "interruptible" is not applicable to services at all.
370 */
371 *flag |= L_SINTR;
372 break;
373 }
374
375 /*
376 * If at this point wchan is still zero, the process is blocked sending
377 * or receiving. Use a wchan value based on the target endpoint, and
378 * use "(procname)" as wmesg text.
379 */
380 if (wchan == 0) {
381 *wcptr = ((uint64_t)endpt << 8) | 0xff;
382 fill_wmesg(wmptr, wmsz, endpt, TRUE /*ipc*/);
383 } else {
384 *wcptr = wchan;
385 if (wmesg != NULL) /* NULL means "already set" here */
386 strlcpy(wmptr, wmesg, wmsz);
387 }
388
389 return LSSLEEP;
390 }
391
392
393 /*
394 * Fill the part of a LWP structure that is common between kernel tasks and
395 * user processes. Also return a CPU estimate in 'estcpu', because we generate
396 * the value as a side effect here, and the LWP structure has no estcpu field.
397 */
398 static void
fill_lwp_common(struct kinfo_lwp * l,int kslot,uint32_t * estcpu)399 fill_lwp_common(struct kinfo_lwp * l, int kslot, uint32_t * estcpu)
400 {
401 struct proc *kp;
402 struct timeval tv;
403 clock_t uptime;
404 uint32_t hz;
405
406 kp = &proc_tab[kslot];
407
408 uptime = getticks();
409 hz = sys_hz();
410
411 /*
412 * We use the process endpoint as the LWP ID. Not only does this allow
413 * users to obtain process endpoints with "ps -s" (thus replacing the
414 * MINIX3 ps(1)'s "ps -E"), but if we ever do implement kernel threads,
415 * this is probably still going to be accurate.
416 */
417 l->l_lid = kp->p_endpoint;
418
419 /*
420 * The time during which the process has not been swapped in or out is
421 * not applicable for us, and thus, we set it to the time the process
422 * has been running (in seconds). This value is relevant mostly for
423 * ps(1)'s CPU usage correction for processes that have just started.
424 */
425 if (kslot >= NR_TASKS)
426 l->l_swtime = uptime - mproc_tab[kslot - NR_TASKS].mp_started;
427 else
428 l->l_swtime = uptime;
429 l->l_swtime /= hz;
430
431 /*
432 * Sleep (dequeue) times are not maintained for kernel tasks, so
433 * pretend they are never asleep (which is pretty accurate).
434 */
435 if (kslot < NR_TASKS)
436 l->l_slptime = 0;
437 else
438 l->l_slptime = (uptime - kp->p_dequeued) / hz;
439
440 l->l_priority = kp->p_priority;
441 l->l_usrpri = kp->p_priority;
442 l->l_cpuid = kp->p_cpu;
443 ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time);
444 l->l_rtime_sec = tv.tv_sec;
445 l->l_rtime_usec = tv.tv_usec;
446
447 /*
448 * Obtain CPU usage percentages and estimates through library code
449 * shared between the kernel and this service; see its source for
450 * details. We note that the produced estcpu value is rather different
451 * from the one produced by NetBSD, but this should not be a problem.
452 */
453 l->l_pctcpu = cpuavg_getstats(&kp->p_cpuavg, &l->l_cpticks, estcpu,
454 uptime, hz);
455 }
456
457 /*
458 * Fill a LWP structure for a kernel task. Each kernel task has its own LWP,
459 * and all of them have negative PIDs.
460 */
461 static void
fill_lwp_kern(struct kinfo_lwp * l,int kslot)462 fill_lwp_kern(struct kinfo_lwp * l, int kslot)
463 {
464 uint32_t estcpu;
465
466 memset(l, 0, sizeof(*l));
467
468 l->l_flag = L_INMEM | L_SINTR | L_SYSTEM;
469 l->l_stat = LSSLEEP;
470 l->l_pid = kslot - NR_TASKS;
471
472 /*
473 * When showing LWP entries, ps(1) uses the process name rather than
474 * the LWP name. All kernel tasks are therefore shown as "[kernel]"
475 * anyway. We use the wmesg field to show the actual kernel task name.
476 */
477 l->l_wchan = ((uint64_t)(l->l_pid) << 8) | 0x00;
478 strlcpy(l->l_wmesg, proc_tab[kslot].p_name, sizeof(l->l_wmesg));
479 strlcpy(l->l_name, "kernel", sizeof(l->l_name));
480
481 fill_lwp_common(l, kslot, &estcpu);
482 }
483
484 /*
485 * Fill a LWP structure for a user process.
486 */
487 static void
fill_lwp_user(struct kinfo_lwp * l,int mslot)488 fill_lwp_user(struct kinfo_lwp * l, int mslot)
489 {
490 struct mproc *mp;
491 uint32_t estcpu;
492
493 memset(l, 0, sizeof(*l));
494
495 mp = &mproc_tab[mslot];
496
497 l->l_flag = L_INMEM;
498 l->l_stat = get_lwp_stat(mslot, &l->l_wchan, l->l_wmesg,
499 sizeof(l->l_wmesg), &l->l_flag);
500 l->l_pid = mp->mp_pid;
501 strlcpy(l->l_name, mp->mp_name, sizeof(l->l_name));
502
503 fill_lwp_common(l, NR_TASKS + mslot, &estcpu);
504 }
505
506 /*
507 * Implementation of CTL_KERN KERN_LWP.
508 */
509 ssize_t
mib_kern_lwp(struct mib_call * call,struct mib_node * node __unused,struct mib_oldp * oldp,struct mib_newp * newp __unused)510 mib_kern_lwp(struct mib_call * call, struct mib_node * node __unused,
511 struct mib_oldp * oldp, struct mib_newp * newp __unused)
512 {
513 struct kinfo_lwp lwp;
514 struct mproc *mp;
515 size_t copysz;
516 ssize_t off;
517 pid_t pid;
518 int r, elsz, elmax, kslot, mslot, last_mslot;
519
520 if (call->call_namelen != 3)
521 return EINVAL;
522
523 pid = (pid_t)call->call_name[0];
524 elsz = call->call_name[1];
525 elmax = call->call_name[2]; /* redundant with the given oldlen.. */
526
527 if (pid < -1 || elsz <= 0 || elmax < 0)
528 return EINVAL;
529
530 if (!update_tables())
531 return EINVAL;
532
533 off = 0;
534 copysz = MIN((size_t)elsz, sizeof(lwp));
535
536 /*
537 * We model kernel tasks as LWP threads of the kernel (with PID 0).
538 * Modeling the kernel tasks as processes with negative PIDs, like
539 * ProcFS does, conflicts with the KERN_LWP API here: a PID of -1
540 * indicates that the caller wants a full listing of LWPs.
541 */
542 if (pid <= 0) {
543 for (kslot = 0; kslot < NR_TASKS; kslot++) {
544 if (mib_inrange(oldp, off) && elmax > 0) {
545 fill_lwp_kern(&lwp, kslot);
546 if ((r = mib_copyout(oldp, off, &lwp,
547 copysz)) < 0)
548 return r;
549 elmax--;
550 }
551 off += elsz;
552 }
553
554 /* No need to add extra space here: NR_TASKS is static. */
555 if (pid == 0)
556 return off;
557 }
558
559 /*
560 * With PID 0 out of the way: the user requested the LWP for either a
561 * specific user process (pid > 0), or for all processes (pid < 0).
562 */
563 if (pid > 0) {
564 if ((mslot = get_mslot(pid)) == NO_SLOT ||
565 (mproc_tab[mslot].mp_flags & (TRACE_ZOMBIE | ZOMBIE)))
566 return ESRCH;
567 last_mslot = mslot;
568 } else {
569 mslot = 0;
570 last_mslot = NR_PROCS - 1;
571 }
572
573 for (; mslot <= last_mslot; mslot++) {
574 mp = &mproc_tab[mslot];
575
576 if ((mp->mp_flags & (IN_USE | TRACE_ZOMBIE | ZOMBIE)) !=
577 IN_USE)
578 continue;
579
580 if (mib_inrange(oldp, off) && elmax > 0) {
581 fill_lwp_user(&lwp, mslot);
582 if ((r = mib_copyout(oldp, off, &lwp, copysz)) < 0)
583 return r;
584 elmax--;
585 }
586 off += elsz;
587 }
588
589 if (oldp == NULL && pid < 0)
590 off += EXTRA_PROCS * elsz;
591
592 return off;
593 }
594
595
596 /*
597 * Fill the part of a process structure that is common between kernel tasks and
598 * user processes.
599 */
600 static void
fill_proc2_common(struct kinfo_proc2 * p,int kslot)601 fill_proc2_common(struct kinfo_proc2 * p, int kslot)
602 {
603 struct vm_usage_info vui;
604 struct timeval tv;
605 struct proc *kp;
606 struct kinfo_lwp l;
607
608 kp = &proc_tab[kslot];
609
610 /*
611 * Much of the information in the LWP structure also ends up in the
612 * process structure. In order to avoid duplication of some important
613 * code, first generate LWP values and then copy it them into the
614 * process structure.
615 */
616 memset(&l, 0, sizeof(l));
617 fill_lwp_common(&l, kslot, &p->p_estcpu);
618
619 /* Obtain memory usage information from VM. Ignore failures. */
620 memset(&vui, 0, sizeof(vui));
621 (void)vm_info_usage(kp->p_endpoint, &vui);
622
623 ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time);
624 p->p_rtime_sec = l.l_rtime_sec;
625 p->p_rtime_usec = l.l_rtime_usec;
626 p->p_cpticks = l.l_cpticks;
627 p->p_pctcpu = l.l_pctcpu;
628 p->p_swtime = l.l_swtime;
629 p->p_slptime = l.l_slptime;
630 p->p_uticks = kp->p_user_time;
631 p->p_sticks = kp->p_sys_time;
632 /* TODO: p->p_iticks */
633 ticks_to_timeval(&tv, kp->p_user_time);
634 p->p_uutime_sec = tv.tv_sec;
635 p->p_uutime_usec = tv.tv_usec;
636 ticks_to_timeval(&tv, kp->p_sys_time);
637 p->p_ustime_sec = tv.tv_sec;
638 p->p_ustime_usec = tv.tv_usec;
639
640 p->p_priority = l.l_priority;
641 p->p_usrpri = l.l_usrpri;
642
643 p->p_vm_rssize = howmany(vui.vui_total, PAGE_SIZE);
644 p->p_vm_vsize = howmany(vui.vui_virtual, PAGE_SIZE);
645 p->p_vm_msize = howmany(vui.vui_mvirtual, PAGE_SIZE);
646
647 p->p_uru_maxrss = vui.vui_maxrss;
648 p->p_uru_minflt = vui.vui_minflt;
649 p->p_uru_majflt = vui.vui_majflt;
650
651 p->p_cpuid = l.l_cpuid;
652 }
653
654 /*
655 * Fill a process structure for the kernel pseudo-process (with PID 0).
656 */
657 static void
fill_proc2_kern(struct kinfo_proc2 * p)658 fill_proc2_kern(struct kinfo_proc2 * p)
659 {
660
661 memset(p, 0, sizeof(*p));
662
663 p->p_flag = L_INMEM | L_SYSTEM | L_SINTR;
664 p->p_pid = 0;
665 p->p_stat = LSSLEEP;
666 p->p_nice = NZERO;
667
668 /* Use the KERNEL task wchan, for consistency between ps and top. */
669 p->p_wchan = ((uint64_t)KERNEL << 8) | 0x00;
670 strlcpy(p->p_wmesg, "kernel", sizeof(p->p_wmesg));
671
672 strlcpy(p->p_comm, "kernel", sizeof(p->p_comm));
673 p->p_realflag = P_INMEM | P_SYSTEM | P_SINTR;
674 p->p_realstat = SACTIVE;
675 p->p_nlwps = NR_TASKS;
676
677 /*
678 * By using the KERNEL slot here, the kernel process will get a proper
679 * CPU usage average.
680 */
681 fill_proc2_common(p, KERNEL + NR_TASKS);
682 }
683
684 /*
685 * Fill a process structure for a user process.
686 */
687 static void
fill_proc2_user(struct kinfo_proc2 * p,int mslot)688 fill_proc2_user(struct kinfo_proc2 * p, int mslot)
689 {
690 struct mproc *mp;
691 struct fproc_light *fp;
692 time_t boottime;
693 dev_t tty;
694 struct timeval tv;
695 int i, r, kslot, zombie;
696
697 memset(p, 0, sizeof(*p));
698
699 if ((r = getuptime(NULL, NULL, &boottime)) != OK)
700 panic("getuptime failed: %d", r);
701
702 kslot = NR_TASKS + mslot;
703 mp = &mproc_tab[mslot];
704 fp = &fproc_tab[mslot];
705
706 zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
707 tty = (!zombie) ? fp->fpl_tty : NO_DEV;
708
709 p->p_eflag = 0;
710 if (tty != NO_DEV)
711 p->p_eflag |= EPROC_CTTY;
712 if (mp->mp_pid == mp->mp_procgrp) /* TODO: job control support */
713 p->p_eflag |= EPROC_SLEADER;
714
715 p->p_exitsig = SIGCHLD; /* TODO */
716
717 p->p_flag = P_INMEM;
718 if (mp->mp_flags & TAINTED)
719 p->p_flag |= P_SUGID;
720 if (mp->mp_tracer != NO_TRACER)
721 p->p_flag |= P_TRACED;
722 if (tty != NO_DEV)
723 p->p_flag |= P_CONTROLT;
724 p->p_pid = mp->mp_pid;
725 if (mp->mp_parent >= 0 && mp->mp_parent < NR_PROCS)
726 p->p_ppid = mproc_tab[mp->mp_parent].mp_pid;
727 p->p_sid = mp->mp_procgrp; /* TODO: job control supported */
728 p->p__pgid = mp->mp_procgrp;
729 p->p_tpgid = (tty != NO_DEV) ? mp->mp_procgrp : 0;
730 p->p_uid = mp->mp_effuid;
731 p->p_ruid = mp->mp_realuid;
732 p->p_gid = mp->mp_effgid;
733 p->p_rgid = mp->mp_realgid;
734 p->p_ngroups = MIN(mp->mp_ngroups, KI_NGROUPS);
735 for (i = 0; i < p->p_ngroups; i++)
736 p->p_groups[i] = mp->mp_sgroups[i];
737 p->p_tdev = tty;
738 memcpy(&p->p_siglist, &mp->mp_sigpending, sizeof(p->p_siglist));
739 memcpy(&p->p_sigmask, &mp->mp_sigmask, sizeof(p->p_sigmask));
740 memcpy(&p->p_sigcatch, &mp->mp_catch, sizeof(p->p_sigcatch));
741 memcpy(&p->p_sigignore, &mp->mp_ignore, sizeof(p->p_sigignore));
742 p->p_nice = mp->mp_nice + NZERO;
743 strlcpy(p->p_comm, mp->mp_name, sizeof(p->p_comm));
744 p->p_uvalid = 1;
745 ticks_to_timeval(&tv, mp->mp_started);
746 p->p_ustart_sec = boottime + tv.tv_sec;
747 p->p_ustart_usec = tv.tv_usec;
748 /* TODO: other rusage fields */
749 ticks_to_timeval(&tv, mp->mp_child_utime + mp->mp_child_stime);
750 p->p_uctime_sec = tv.tv_sec;
751 p->p_uctime_usec = tv.tv_usec;
752 p->p_realflag = p->p_flag;
753 p->p_nlwps = (zombie) ? 0 : 1;
754 p->p_svuid = mp->mp_svuid;
755 p->p_svgid = mp->mp_svgid;
756
757 p->p_stat = get_lwp_stat(mslot, &p->p_wchan, p->p_wmesg,
758 sizeof(p->p_wmesg), &p->p_flag);
759
760 switch (p->p_stat) {
761 case LSRUN:
762 p->p_realstat = SACTIVE;
763 p->p_nrlwps = 1;
764 break;
765 case LSSLEEP:
766 p->p_realstat = SACTIVE;
767 if (p->p_flag & L_SINTR)
768 p->p_realflag |= P_SINTR;
769 break;
770 case LSSTOP:
771 p->p_realstat = SSTOP;
772 break;
773 case LSZOMB:
774 p->p_realstat = SZOMB;
775 break;
776 case LSDEAD:
777 p->p_stat = LSZOMB; /* ps(1) STAT does not know LSDEAD */
778 p->p_realstat = SDEAD;
779 break;
780 default:
781 assert(0);
782 }
783
784 if (!zombie)
785 fill_proc2_common(p, kslot);
786 }
787
788 /*
789 * Implementation of CTL_KERN KERN_PROC2.
790 */
791 ssize_t
mib_kern_proc2(struct mib_call * call,struct mib_node * node __unused,struct mib_oldp * oldp,struct mib_newp * newp __unused)792 mib_kern_proc2(struct mib_call * call, struct mib_node * node __unused,
793 struct mib_oldp * oldp, struct mib_newp * newp __unused)
794 {
795 struct kinfo_proc2 proc2;
796 struct mproc *mp;
797 size_t copysz;
798 ssize_t off;
799 dev_t tty;
800 int r, req, arg, elsz, elmax, kmatch, zombie, mslot;
801
802 if (call->call_namelen != 4)
803 return EINVAL;
804
805 req = call->call_name[0];
806 arg = call->call_name[1];
807 elsz = call->call_name[2];
808 elmax = call->call_name[3]; /* redundant with the given oldlen.. */
809
810 /*
811 * The kernel is special, in that it does not have a slot in the PM or
812 * VFS tables. As such, it is dealt with separately. While checking
813 * arguments, we might as well check whether the kernel is matched.
814 */
815 switch (req) {
816 case KERN_PROC_ALL:
817 kmatch = TRUE;
818 break;
819 case KERN_PROC_PID:
820 case KERN_PROC_SESSION:
821 case KERN_PROC_PGRP:
822 case KERN_PROC_UID:
823 case KERN_PROC_RUID:
824 case KERN_PROC_GID:
825 case KERN_PROC_RGID:
826 kmatch = (arg == 0);
827 break;
828 case KERN_PROC_TTY:
829 kmatch = ((dev_t)arg == KERN_PROC_TTY_NODEV);
830 break;
831 default:
832 return EINVAL;
833 }
834
835 if (elsz <= 0 || elmax < 0)
836 return EINVAL;
837
838 if (!update_tables())
839 return EINVAL;
840
841 off = 0;
842 copysz = MIN((size_t)elsz, sizeof(proc2));
843
844 if (kmatch) {
845 if (mib_inrange(oldp, off) && elmax > 0) {
846 fill_proc2_kern(&proc2);
847 if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0)
848 return r;
849 elmax--;
850 }
851 off += elsz;
852 }
853
854 for (mslot = 0; mslot < NR_PROCS; mslot++) {
855 mp = &mproc_tab[mslot];
856
857 if (!(mp->mp_flags & IN_USE))
858 continue;
859
860 switch (req) {
861 case KERN_PROC_PID:
862 if ((pid_t)arg != mp->mp_pid)
863 continue;
864 break;
865 case KERN_PROC_SESSION: /* TODO: job control support */
866 case KERN_PROC_PGRP:
867 if ((pid_t)arg != mp->mp_procgrp)
868 continue;
869 break;
870 case KERN_PROC_TTY:
871 if ((dev_t)arg == KERN_PROC_TTY_REVOKE)
872 continue; /* TODO: revoke(2) support */
873 /* Do not access the fproc_tab slot of zombies. */
874 zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
875 tty = (zombie) ? fproc_tab[mslot].fpl_tty : NO_DEV;
876 if ((dev_t)arg == KERN_PROC_TTY_NODEV) {
877 if (tty != NO_DEV)
878 continue;
879 } else if ((dev_t)arg == NO_DEV || (dev_t)arg != tty)
880 continue;
881 break;
882 case KERN_PROC_UID:
883 if ((uid_t)arg != mp->mp_effuid)
884 continue;
885 break;
886 case KERN_PROC_RUID:
887 if ((uid_t)arg != mp->mp_realuid)
888 continue;
889 break;
890 case KERN_PROC_GID:
891 if ((gid_t)arg != mp->mp_effgid)
892 continue;
893 break;
894 case KERN_PROC_RGID:
895 if ((gid_t)arg != mp->mp_realgid)
896 continue;
897 break;
898 }
899
900 if (mib_inrange(oldp, off) && elmax > 0) {
901 fill_proc2_user(&proc2, mslot);
902 if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0)
903 return r;
904 elmax--;
905 }
906 off += elsz;
907 }
908
909 if (oldp == NULL && req != KERN_PROC_PID)
910 off += EXTRA_PROCS * elsz;
911
912 return off;
913 }
914
915 /*
916 * Implementation of CTL_KERN KERN_PROC_ARGS.
917 */
918 ssize_t
mib_kern_proc_args(struct mib_call * call,struct mib_node * node __unused,struct mib_oldp * oldp,struct mib_newp * newp __unused)919 mib_kern_proc_args(struct mib_call * call, struct mib_node * node __unused,
920 struct mib_oldp * oldp, struct mib_newp * newp __unused)
921 {
922 char vbuf[PAGE_SIZE], sbuf[PAGE_SIZE], obuf[PAGE_SIZE];
923 struct ps_strings pss;
924 struct mproc *mp;
925 char *buf, *p, *q, *pptr;
926 vir_bytes vaddr, vpage, spage, paddr, ppage;
927 size_t max, off, olen, oleft, oldlen, bytes, pleft;
928 unsigned int copybudget;
929 pid_t pid;
930 int req, mslot, count, aborted, ended;
931 ssize_t r;
932
933 if (call->call_namelen != 2)
934 return EINVAL;
935
936 pid = call->call_name[0];
937 req = call->call_name[1];
938
939 switch (req) {
940 case KERN_PROC_ARGV:
941 case KERN_PROC_ENV:
942 case KERN_PROC_NARGV:
943 case KERN_PROC_NENV:
944 break;
945 default:
946 return EOPNOTSUPP;
947 }
948
949 if (!update_tables())
950 return EINVAL;
951
952 if ((mslot = get_mslot(pid)) == NO_SLOT)
953 return ESRCH;
954 mp = &mproc_tab[mslot];
955 if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
956 return ESRCH;
957
958 /* We can return the count field size without copying in any data. */
959 if (oldp == NULL && (req == KERN_PROC_NARGV || req == KERN_PROC_NENV))
960 return sizeof(count);
961
962 if (sys_datacopy(mp->mp_endpoint,
963 mp->mp_frame_addr + mp->mp_frame_len - sizeof(pss),
964 SELF, (vir_bytes)&pss, sizeof(pss)) != OK)
965 return EINVAL;
966
967 /*
968 * Determine the upper size limit of the requested data. Not only may
969 * the size never exceed ARG_MAX, it may also not exceed the frame
970 * length as given in its original exec call. In fact, the frame
971 * length should be substantially larger: all strings for both the
972 * arguments and the environment are in there, along with other stuff,
973 * and there must be no overlap between strings. It is possible that
974 * the application called setproctitle(3), in which case the ps_strings
975 * pointers refer to data outside the frame altogether. However, this
976 * data should not exceed 2048 bytes, and we cover this by rounding up
977 * the frame length to a multiple of the page size. Anyhow, NetBSD
978 * blindly returns ARG_MAX when asked for a size estimate, so with this
979 * maximum we are already quite a bit more accurate.
980 */
981 max = roundup(MIN(mp->mp_frame_len, ARG_MAX), PAGE_SIZE);
982
983 switch (req) {
984 case KERN_PROC_NARGV:
985 count = pss.ps_nargvstr;
986 return mib_copyout(oldp, 0, &count, sizeof(count));
987 case KERN_PROC_NENV:
988 count = pss.ps_nenvstr;
989 return mib_copyout(oldp, 0, &count, sizeof(count));
990 case KERN_PROC_ARGV:
991 if (oldp == NULL)
992 return max;
993 vaddr = (vir_bytes)pss.ps_argvstr;
994 count = pss.ps_nargvstr;
995 break;
996 case KERN_PROC_ENV:
997 if (oldp == NULL)
998 return max;
999 vaddr = (vir_bytes)pss.ps_envstr;
1000 count = pss.ps_nenvstr;
1001 break;
1002 }
1003
1004 /*
1005 * Go through the strings. Copy in entire, machine-aligned pages at
1006 * once, in the hope that all data is stored consecutively, which it
1007 * should be: we expect that the vector is followed by the strings, and
1008 * that the strings are stored in order of vector reference. We keep
1009 * up to two pages with copied-in data: one for the vector, and
1010 * optionally one for string data. In addition, we keep one page with
1011 * data to be copied out, so that we do not cause a lot of copy
1012 * overhead for short strings.
1013 *
1014 * We stop whenever any of the following conditions are met:
1015 * - copying in data from the target process fails for any reason;
1016 * - we have processed the last index ('count') into the vector;
1017 * - the current vector element is a NULL pointer;
1018 * - the requested number of output bytes ('oldlen') has been reached;
1019 * - the maximum number of output bytes ('max') has been reached;
1020 * - the number of page copy-ins exceeds an estimated threshold;
1021 * - copying out data fails for any reason (we then return the error).
1022 *
1023 * We limit the number of page copy-ins because otherwise a rogue
1024 * process could create an argument vector consisting of only two-byte
1025 * strings that all span two pages, causing us to copy up to 1GB of
1026 * data with the current ARG_MAX value of 256K. No reasonable vector
1027 * should cause more than (ARG_MAX / PAGE_SIZE) page copies for
1028 * strings; we are nice enough to allow twice that. Vector copies do
1029 * not count, as they are linear anyway.
1030 *
1031 * Unlike every other sysctl(2) call, we are supposed to truncate the
1032 * resulting size (the returned 'oldlen') to the requested size (the
1033 * given 'oldlen') *and* return the resulting size, rather than ENOMEM
1034 * and the real size. Unfortunately, libkvm actually relies on this.
1035 *
1036 * Generally speaking, upon failure we just return a truncated result.
1037 * In case of truncation, the data we copy out need not be null
1038 * terminated. It is up to userland to process the data correctly.
1039 */
1040 if (trunc_page(vaddr) == 0 || vaddr % sizeof(char *) != 0)
1041 return 0;
1042
1043 off = 0;
1044 olen = 0;
1045 aborted = FALSE;
1046
1047 oldlen = mib_getoldlen(oldp);
1048 if (oldlen > max)
1049 oldlen = max;
1050
1051 copybudget = (ARG_MAX / PAGE_SIZE) * 2;
1052
1053 vpage = 0;
1054 spage = 0;
1055
1056 while (count > 0 && off + olen < oldlen && !aborted) {
1057 /*
1058 * Start by fetching the page containing the current vector
1059 * element, if needed. We could limit the fetch to the vector
1060 * size, but our hope is that for the simple cases, the strings
1061 * are on the remainder of the same page, so we save a copy
1062 * call. TODO: since the strings should follow the vector, we
1063 * could start the copy at the base of the vector.
1064 */
1065 if (trunc_page(vaddr) != vpage) {
1066 vpage = trunc_page(vaddr);
1067 if (sys_datacopy(mp->mp_endpoint, vpage, SELF,
1068 (vir_bytes)vbuf, PAGE_SIZE) != OK)
1069 break;
1070 }
1071
1072 /* Get the current vector element, pointing to a string. */
1073 memcpy(&pptr, &vbuf[vaddr - vpage], sizeof(pptr));
1074 paddr = (vir_bytes)pptr;
1075 ppage = trunc_page(paddr);
1076 if (ppage == 0)
1077 break;
1078
1079 /* Fetch the string itself, one page at a time at most. */
1080 do {
1081 /*
1082 * See if the string pointer falls inside either the
1083 * vector page or the previously fetched string page
1084 * (if any). If not, fetch a string page.
1085 */
1086 if (ppage == vpage) {
1087 buf = vbuf;
1088 } else if (ppage == spage) {
1089 buf = sbuf;
1090 } else {
1091 if (--copybudget == 0) {
1092 aborted = TRUE;
1093 break;
1094 }
1095 spage = ppage;
1096 if (sys_datacopy(mp->mp_endpoint, spage, SELF,
1097 (vir_bytes)sbuf, PAGE_SIZE) != OK) {
1098 aborted = TRUE;
1099 break;
1100 }
1101 buf = sbuf;
1102 }
1103
1104 /*
1105 * We now have a string fragment in a buffer. See if
1106 * the string is null terminated. If not, all the data
1107 * up to the buffer end is part of the string, and the
1108 * string continues on the next page.
1109 */
1110 p = &buf[paddr - ppage];
1111 pleft = PAGE_SIZE - (paddr - ppage);
1112 assert(pleft > 0);
1113
1114 if ((q = memchr(p, '\0', pleft)) != NULL) {
1115 bytes = (size_t)(q - p + 1);
1116 assert(bytes <= pleft);
1117 ended = TRUE;
1118 } else {
1119 bytes = pleft;
1120 ended = FALSE;
1121 }
1122
1123 /* Limit the result to the requested length. */
1124 if (off + olen + bytes > oldlen)
1125 bytes = oldlen - off - olen;
1126
1127 /*
1128 * Add 'bytes' bytes from string pointer 'p' to the
1129 * output buffer, copying out its contents to userland
1130 * if it has filled up.
1131 */
1132 if (olen + bytes > sizeof(obuf)) {
1133 oleft = sizeof(obuf) - olen;
1134 memcpy(&obuf[olen], p, oleft);
1135
1136 if ((r = mib_copyout(oldp, off, obuf,
1137 sizeof(obuf))) < 0)
1138 return r;
1139 off += sizeof(obuf);
1140 olen = 0;
1141
1142 p += oleft;
1143 bytes -= oleft;
1144 }
1145 if (bytes > 0) {
1146 memcpy(&obuf[olen], p, bytes);
1147 olen += bytes;
1148 }
1149
1150 /*
1151 * Continue as long as we have not yet found the string
1152 * end, and we have not yet filled the output buffer.
1153 */
1154 paddr += pleft;
1155 assert(trunc_page(paddr) == paddr);
1156 ppage = paddr;
1157 } while (!ended && off + olen < oldlen);
1158
1159 vaddr += sizeof(char *);
1160 count--;
1161 }
1162
1163 /* Copy out any remainder of the output buffer. */
1164 if (olen > 0) {
1165 if ((r = mib_copyout(oldp, off, obuf, olen)) < 0)
1166 return r;
1167 off += olen;
1168 }
1169
1170 assert(off <= oldlen);
1171 return off;
1172 }
1173
1174 /*
1175 * Implementation of CTL_MINIX MINIX_PROC PROC_LIST.
1176 */
1177 ssize_t
mib_minix_proc_list(struct mib_call * call __unused,struct mib_node * node __unused,struct mib_oldp * oldp,struct mib_newp * newp __unused)1178 mib_minix_proc_list(struct mib_call * call __unused,
1179 struct mib_node * node __unused, struct mib_oldp * oldp,
1180 struct mib_newp * newp __unused)
1181 {
1182 struct minix_proc_list mpl[NR_PROCS];
1183 struct minix_proc_list *mplp;
1184 struct mproc *mp;
1185 unsigned int mslot;
1186
1187 if (oldp == NULL)
1188 return sizeof(mpl);
1189
1190 if (!update_tables())
1191 return EINVAL;
1192
1193 memset(&mpl, 0, sizeof(mpl));
1194
1195 mplp = mpl;
1196 mp = mproc_tab;
1197
1198 for (mslot = 0; mslot < NR_PROCS; mslot++, mplp++, mp++) {
1199 if (!(mp->mp_flags & IN_USE) || mp->mp_pid <= 0)
1200 continue;
1201
1202 mplp->mpl_flags = MPLF_IN_USE;
1203 if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
1204 mplp->mpl_flags |= MPLF_ZOMBIE;
1205 mplp->mpl_pid = mp->mp_pid;
1206 mplp->mpl_uid = mp->mp_effuid;
1207 mplp->mpl_gid = mp->mp_effgid;
1208 }
1209
1210 return mib_copyout(oldp, 0, &mpl, sizeof(mpl));
1211 }
1212
1213 /*
1214 * Implementation of CTL_MINIX MINIX_PROC PROC_DATA.
1215 */
1216 ssize_t
mib_minix_proc_data(struct mib_call * call,struct mib_node * node __unused,struct mib_oldp * oldp,struct mib_newp * newp __unused)1217 mib_minix_proc_data(struct mib_call * call, struct mib_node * node __unused,
1218 struct mib_oldp * oldp, struct mib_newp * newp __unused)
1219 {
1220 struct minix_proc_data mpd;
1221 struct proc *kp;
1222 int kslot, mslot = 0;
1223 unsigned int mflags;
1224 pid_t pid;
1225
1226 /*
1227 * It is currently only possible to retrieve the process data for a
1228 * particular PID, which must be given as the last name component.
1229 */
1230 if (call->call_namelen != 1)
1231 return EINVAL;
1232
1233 pid = (pid_t)call->call_name[0];
1234
1235 if (!update_tables())
1236 return EINVAL;
1237
1238 /*
1239 * Unlike the CTL_KERN nodes, we use the ProcFS semantics here: if the
1240 * given PID is negative, it is a kernel task; otherwise, it identifies
1241 * a user process. A request for PID 0 will result in ESRCH.
1242 */
1243 if (pid < 0) {
1244 if (pid < -NR_TASKS)
1245 return ESRCH;
1246
1247 kslot = pid + NR_TASKS;
1248 assert(kslot < NR_TASKS);
1249 } else {
1250 if ((mslot = get_mslot(pid)) == NO_SLOT)
1251 return ESRCH;
1252
1253 kslot = NR_TASKS + mslot;
1254 }
1255
1256 if (oldp == NULL)
1257 return sizeof(mpd);
1258
1259 kp = &proc_tab[kslot];
1260
1261 mflags = (pid > 0) ? mproc_tab[mslot].mp_flags : 0;
1262
1263 memset(&mpd, 0, sizeof(mpd));
1264 mpd.mpd_endpoint = kp->p_endpoint;
1265 if (mflags & PRIV_PROC)
1266 mpd.mpd_flags |= MPDF_SYSTEM;
1267 if (mflags & (TRACE_ZOMBIE | ZOMBIE))
1268 mpd.mpd_flags |= MPDF_ZOMBIE;
1269 else if ((mflags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP))
1270 mpd.mpd_flags |= MPDF_STOPPED;
1271 else if (proc_is_runnable(kp))
1272 mpd.mpd_flags |= MPDF_RUNNABLE;
1273 mpd.mpd_blocked_on = P_BLOCKEDON(kp);
1274 mpd.mpd_priority = kp->p_priority;
1275 mpd.mpd_user_time = kp->p_user_time;
1276 mpd.mpd_sys_time = kp->p_sys_time;
1277 mpd.mpd_cycles = kp->p_cycles;
1278 mpd.mpd_kipc_cycles = kp->p_kipc_cycles;
1279 mpd.mpd_kcall_cycles = kp->p_kcall_cycles;
1280 if (kslot >= NR_TASKS) {
1281 mpd.mpd_nice = mproc_tab[mslot].mp_nice;
1282 strlcpy(mpd.mpd_name, mproc_tab[mslot].mp_name,
1283 sizeof(mpd.mpd_name));
1284 } else
1285 strlcpy(mpd.mpd_name, kp->p_name, sizeof(mpd.mpd_name));
1286
1287 return mib_copyout(oldp, 0, &mpd, sizeof(mpd));
1288 }
1289