/* MIB service - proc.c - functionality based on service process tables */
/* Eventually, the CTL_PROC subtree might end up here as well. */

#include "mib.h"

#include <sys/exec.h>
#include <minix/sysinfo.h>

#include <machine/archtypes.h>
#include "kernel/proc.h"
#include "servers/pm/mproc.h"
#include "servers/vfs/const.h"
#include "servers/vfs/fproc.h"

/*
 * NOTE(review): these typedefs presumably mark the table structures as safe
 * for direct copy-in from the kernel and PM -- confirm against the ixfer
 * conventions used elsewhere in the tree.
 */
typedef struct proc ixfer_proc_t;
typedef struct mproc ixfer_mproc_t;

/* Local snapshots of the kernel, PM, and VFS process tables, respectively. */
static ixfer_proc_t proc_tab[NR_TASKS + NR_PROCS];
static ixfer_mproc_t mproc_tab[NR_PROCS];
static struct fproc_light fproc_tab[NR_PROCS];

/*
 * The number of processes added to the current number of processes when doing
 * a size estimation, so that the actual data retrieval does not end up with
 * too little space if new processes have forked between the two calls.  We do
 * a process table update only once per clock tick, which means that typically
 * no update will take place between the user process's size estimation request
 * and its subsequent data retrieval request.  On the other hand, if we do
 * update process tables in between, quite a bit might have changed.
 */
#define EXTRA_PROCS	8

#define HASH_SLOTS	(NR_PROCS / 4)	/* expected nr. of processes in use */
#define NO_SLOT		(-1)
static int hash_tab[HASH_SLOTS];	/* hash table mapping from PID.. */
static int hnext_tab[NR_PROCS];		/* ..to PM process slot */

static clock_t tabs_updated = 0;	/* when the tables were last updated */
static int tabs_valid = TRUE;		/* FALSE if obtaining tables failed */

/*
 * Update the process tables by pulling in new copies from the kernel, PM, and
 * VFS, but only every so often and only if it has not failed before.  Return
 * TRUE iff the tables are now valid.
 */
static int
update_tables(void)
{
	clock_t now;
	pid_t pid;
	int r, kslot, mslot, hslot;

	/*
	 * If retrieving the tables failed at some point, do not keep trying
	 * all the time.  Such a failure is very unlikely to be transient.
	 */
	if (tabs_valid == FALSE)
		return FALSE;

	/*
	 * Update the tables once per clock tick at most.  The update operation
	 * is rather heavy, transferring several hundreds of kilobytes between
	 * servers.  Userland should be able to live with information that is
	 * outdated by at most one clock tick.
	 */
	now = getticks();

	if (tabs_updated != 0 && tabs_updated == now)
		return TRUE;

	/*
	 * Perform an actual update now.  Mark the tables invalid first, so
	 * that a failure below leaves us in the permanent-failure state.
	 */
	tabs_valid = FALSE;

	/* Retrieve and check the kernel process table. */
	if ((r = sys_getproctab(proc_tab)) != OK) {
		printf("MIB: unable to obtain kernel process table (%d)\n", r);

		return FALSE;
	}

	/* The magic check guards against struct layout mismatches. */
	for (kslot = 0; kslot < NR_TASKS + NR_PROCS; kslot++) {
		if (proc_tab[kslot].p_magic != PMAGIC) {
			printf("MIB: kernel process table mismatch\n");

			return FALSE;
		}
	}

	/* Retrieve and check the PM process table. */
	r = getsysinfo(PM_PROC_NR, SI_PROC_TAB, mproc_tab, sizeof(mproc_tab));
	if (r != OK) {
		printf("MIB: unable to obtain PM process table (%d)\n", r);

		return FALSE;
	}

	for (mslot = 0; mslot < NR_PROCS; mslot++) {
		if (mproc_tab[mslot].mp_magic != MP_MAGIC) {
			printf("MIB: PM process table mismatch\n");

			return FALSE;
		}
	}

	/* Retrieve an extract of the VFS process table. */
	r = getsysinfo(VFS_PROC_NR, SI_PROCLIGHT_TAB, fproc_tab,
	    sizeof(fproc_tab));
	if (r != OK) {
		printf("MIB: unable to obtain VFS process table (%d)\n", r);

		return FALSE;
	}

	tabs_valid = TRUE;
	tabs_updated = now;

	/*
	 * Build a hash table mapping from process IDs to slot numbers, for
	 * fast access.  Collisions are chained through hnext_tab, with each
	 * in-use slot prepended to its hash bucket's chain.  TODO: decide if
	 * this is better done on demand only.
	 */
	for (hslot = 0; hslot < HASH_SLOTS; hslot++)
		hash_tab[hslot] = NO_SLOT;

	for (mslot = 0; mslot < NR_PROCS; mslot++) {
		if (mproc_tab[mslot].mp_flags & IN_USE) {
			/* Skip entries without a valid PID (see get_mslot). */
			if ((pid = mproc_tab[mslot].mp_pid) <= 0)
				continue;

			hslot = mproc_tab[mslot].mp_pid % HASH_SLOTS;

			hnext_tab[mslot] = hash_tab[hslot];
			hash_tab[hslot] = mslot;
		}
	}

	return TRUE;
}

/*
 * Return the PM slot number for the given PID, or NO_SLOT if the PID is not in
 * use by a process.
 */
static int
get_mslot(pid_t pid)
{
	int mslot;

	/* PID 0 identifies the kernel; checking this is up to the caller. */
	if (pid <= 0)
		return NO_SLOT;

	/* Walk the hash chain built by update_tables(). */
	for (mslot = hash_tab[pid % HASH_SLOTS]; mslot != NO_SLOT;
	    mslot = hnext_tab[mslot])
		if (mproc_tab[mslot].mp_pid == pid)
			break;

	return mslot;
}

/*
 * Store the given number of clock ticks as a timeval structure.
 */
static void
ticks_to_timeval(struct timeval * tv, clock_t ticks)
{
	clock_t hz;

	hz = sys_hz();

	tv->tv_sec = ticks / hz;
	/* The 1000000ULL multiplication avoids overflow of the remainder. */
	tv->tv_usec = (long)((ticks % hz) * 1000000ULL / hz);
}

/*
 * Generate a wchan message text for the cases that the process is blocked on
 * IPC with another process, of which the endpoint is given as 'endpt' here.
 * The name of the other process is to be stored in 'wmesg', which is a buffer
 * of size 'wmsz'.  The result should be null terminated.  If 'ipc' is set, the
 * process is blocked on a direct IPC call, in which case the name of the other
 * process is enclosed in parentheses.  If 'ipc' is not set, the call is made
 * indirectly through VFS, and the name of the other process should not be
 * enclosed in parentheses.  If no name can be obtained, we use the endpoint of
 * the other process instead.
184 */ 185 static void 186 fill_wmesg(char * wmesg, size_t wmsz, endpoint_t endpt, int ipc) 187 { 188 const char *name; 189 int mslot; 190 191 switch (endpt) { 192 case ANY: 193 name = "any"; 194 break; 195 case SELF: 196 name = "self"; 197 break; 198 case NONE: 199 name = "none"; 200 break; 201 default: 202 mslot = _ENDPOINT_P(endpt); 203 if (mslot >= -NR_TASKS && mslot < NR_PROCS && 204 (mslot < 0 || (mproc_tab[mslot].mp_flags & IN_USE))) 205 name = proc_tab[NR_TASKS + mslot].p_name; 206 else 207 name = NULL; 208 } 209 210 if (name != NULL) 211 snprintf(wmesg, wmsz, "%s%s%s", 212 ipc ? "(" : "", name, ipc ? ")" : ""); 213 else 214 snprintf(wmesg, wmsz, "%s%d%s", 215 ipc ? "(" : "", endpt, ipc ? ")" : ""); 216 } 217 218 /* 219 * Return the LWP status of a process, along with additional information in 220 * case the process is sleeping (LSSLEEP): a wchan value and text to indicate 221 * what the process is sleeping on, and possibly a flag field modification to 222 * indicate that the sleep is interruptible. 223 */ 224 static int 225 get_lwp_stat(int mslot, uint64_t * wcptr, char * wmptr, size_t wmsz, 226 int32_t * flag) 227 { 228 struct mproc *mp; 229 struct fproc_light *fp; 230 struct proc *kp; 231 const char *wmesg; 232 uint64_t wchan; 233 endpoint_t endpt; 234 235 mp = &mproc_tab[mslot]; 236 fp = &fproc_tab[mslot]; 237 kp = &proc_tab[NR_TASKS + mslot]; 238 239 /* 240 * First cover all the cases that the process is not sleeping. In 241 * those cases, we need not return additional sleep information either. 242 */ 243 if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE)) 244 return LSZOMB; 245 246 if (mp->mp_flags & EXITING) 247 return LSDEAD; 248 249 if ((mp->mp_flags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP)) 250 return LSSTOP; 251 252 if (proc_is_runnable(kp)) 253 return LSRUN; 254 255 /* 256 * The process is sleeping. In that case, we must also figure out why, 257 * and return an appropriate wchan value and human-readable wmesg text. 
258 * 259 * The process can be blocked on either a known sleep state in PM or 260 * VFS, or otherwise on IPC communication with another process, or 261 * otherwise on a kernel RTS flag. In each case, decide what to use as 262 * wchan value and wmesg text, and whether the sleep is interruptible. 263 * 264 * The wchan value should be unique for the sleep reason. We use its 265 * lower eight bits to indicate a class: 266 * 0x00 = kernel task 267 * 0x01 = kerel RTS block 268 * 0x02 = PM call 269 * 0x03 = VFS call 270 * 0x04 = MIB call 271 * 0xff = blocked on process 272 * The upper bits are used for class-specific information. The actual 273 * value does not really matter, as long as it is nonzero and there is 274 * no overlap between the different values. 275 */ 276 wchan = 0; 277 wmesg = NULL; 278 279 /* 280 * First see if the process is marked as blocked in the tables of PM or 281 * VFS. Such a block reason is always an interruptible sleep. Note 282 * that we do not use the kernel table at all in this case: each of the 283 * three tables is consistent within itself, but not necessarily 284 * consistent with any of the other tables, so we avoid internal 285 * mismatches if we can. 286 */ 287 if (mp->mp_flags & WAITING) { 288 wchan = 0x102; 289 wmesg = "wait"; 290 } else if (mp->mp_flags & SIGSUSPENDED) { 291 wchan = 0x202; 292 wmesg = "pause"; 293 } else if (fp->fpl_blocked_on != FP_BLOCKED_ON_NONE) { 294 wchan = (fp->fpl_blocked_on << 8) | 0x03; 295 switch (fp->fpl_blocked_on) { 296 case FP_BLOCKED_ON_PIPE: 297 wmesg = "pipe"; 298 break; 299 case FP_BLOCKED_ON_FLOCK: 300 wmesg = "flock"; 301 break; 302 case FP_BLOCKED_ON_POPEN: 303 wmesg = "popen"; 304 break; 305 case FP_BLOCKED_ON_SELECT: 306 wmesg = "select"; 307 break; 308 case FP_BLOCKED_ON_CDEV: 309 case FP_BLOCKED_ON_SDEV: 310 /* 311 * Add the task (= character or socket driver) endpoint 312 * to the wchan value, and use the driver's process 313 * name, without parentheses, as wmesg text. 
314 */ 315 wchan |= (uint64_t)fp->fpl_task << 16; 316 fill_wmesg(wmptr, wmsz, fp->fpl_task, FALSE /*ipc*/); 317 break; 318 default: 319 /* A newly added flag we don't yet know about? */ 320 wmesg = "???"; 321 break; 322 } 323 } 324 if (wchan != 0) { 325 *wcptr = wchan; 326 if (wmesg != NULL) /* NULL means "already set" here */ 327 strlcpy(wmptr, wmesg, wmsz); 328 *flag |= L_SINTR; 329 } 330 331 /* 332 * See if the process is blocked on sending or receiving. If not, then 333 * use one of the kernel RTS flags as reason. 334 */ 335 endpt = P_BLOCKEDON(kp); 336 337 switch (endpt) { 338 case MIB_PROC_NR: 339 /* This is really just aesthetics. */ 340 wchan = 0x04; 341 wmesg = "sysctl"; 342 break; 343 case NONE: 344 /* 345 * The process is not running, but also not blocked on IPC with 346 * another process. This means it must be stopped on a kernel 347 * RTS flag. 348 */ 349 wchan = ((uint64_t)kp->p_rts_flags << 8) | 0x01; 350 if (RTS_ISSET(kp, RTS_PROC_STOP)) 351 wmesg = "kstop"; 352 else if (RTS_ISSET(kp, RTS_SIGNALED) || 353 RTS_ISSET(kp, RTS_SIGNALED)) 354 wmesg = "ksignal"; 355 else if (RTS_ISSET(kp, RTS_NO_PRIV)) 356 wmesg = "knopriv"; 357 else if (RTS_ISSET(kp, RTS_PAGEFAULT) || 358 RTS_ISSET(kp, RTS_VMREQTARGET)) 359 wmesg = "fault"; 360 else if (RTS_ISSET(kp, RTS_NO_QUANTUM)) 361 wmesg = "sched"; 362 else 363 wmesg = "kflag"; 364 break; 365 case ANY: 366 /* 367 * If the process is blocked receiving from ANY, mark it as 368 * being in an interruptible sleep. This looks nicer, even 369 * though "interruptible" is not applicable to services at all. 370 */ 371 *flag |= L_SINTR; 372 break; 373 } 374 375 /* 376 * If at this point wchan is still zero, the process is blocked sending 377 * or receiving. Use a wchan value based on the target endpoint, and 378 * use "(procname)" as wmesg text. 
379 */ 380 if (wchan == 0) { 381 *wcptr = ((uint64_t)endpt << 8) | 0xff; 382 fill_wmesg(wmptr, wmsz, endpt, TRUE /*ipc*/); 383 } else { 384 *wcptr = wchan; 385 if (wmesg != NULL) /* NULL means "already set" here */ 386 strlcpy(wmptr, wmesg, wmsz); 387 } 388 389 return LSSLEEP; 390 } 391 392 393 /* 394 * Fill the part of a LWP structure that is common between kernel tasks and 395 * user processes. Also return a CPU estimate in 'estcpu', because we generate 396 * the value as a side effect here, and the LWP structure has no estcpu field. 397 */ 398 static void 399 fill_lwp_common(struct kinfo_lwp * l, int kslot, uint32_t * estcpu) 400 { 401 struct proc *kp; 402 struct timeval tv; 403 clock_t uptime; 404 uint32_t hz; 405 406 kp = &proc_tab[kslot]; 407 408 uptime = getticks(); 409 hz = sys_hz(); 410 411 /* 412 * We use the process endpoint as the LWP ID. Not only does this allow 413 * users to obtain process endpoints with "ps -s" (thus replacing the 414 * MINIX3 ps(1)'s "ps -E"), but if we ever do implement kernel threads, 415 * this is probably still going to be accurate. 416 */ 417 l->l_lid = kp->p_endpoint; 418 419 /* 420 * The time during which the process has not been swapped in or out is 421 * not applicable for us, and thus, we set it to the time the process 422 * has been running (in seconds). This value is relevant mostly for 423 * ps(1)'s CPU usage correction for processes that have just started. 424 */ 425 if (kslot >= NR_TASKS) 426 l->l_swtime = uptime - mproc_tab[kslot - NR_TASKS].mp_started; 427 else 428 l->l_swtime = uptime; 429 l->l_swtime /= hz; 430 431 /* 432 * Sleep (dequeue) times are not maintained for kernel tasks, so 433 * pretend they are never asleep (which is pretty accurate). 
434 */ 435 if (kslot < NR_TASKS) 436 l->l_slptime = 0; 437 else 438 l->l_slptime = (uptime - kp->p_dequeued) / hz; 439 440 l->l_priority = kp->p_priority; 441 l->l_usrpri = kp->p_priority; 442 l->l_cpuid = kp->p_cpu; 443 ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time); 444 l->l_rtime_sec = tv.tv_sec; 445 l->l_rtime_usec = tv.tv_usec; 446 447 /* 448 * Obtain CPU usage percentages and estimates through library code 449 * shared between the kernel and this service; see its source for 450 * details. We note that the produced estcpu value is rather different 451 * from the one produced by NetBSD, but this should not be a problem. 452 */ 453 l->l_pctcpu = cpuavg_getstats(&kp->p_cpuavg, &l->l_cpticks, estcpu, 454 uptime, hz); 455 } 456 457 /* 458 * Fill a LWP structure for a kernel task. Each kernel task has its own LWP, 459 * and all of them have negative PIDs. 460 */ 461 static void 462 fill_lwp_kern(struct kinfo_lwp * l, int kslot) 463 { 464 uint32_t estcpu; 465 466 memset(l, 0, sizeof(*l)); 467 468 l->l_flag = L_INMEM | L_SINTR | L_SYSTEM; 469 l->l_stat = LSSLEEP; 470 l->l_pid = kslot - NR_TASKS; 471 472 /* 473 * When showing LWP entries, ps(1) uses the process name rather than 474 * the LWP name. All kernel tasks are therefore shown as "[kernel]" 475 * anyway. We use the wmesg field to show the actual kernel task name. 476 */ 477 l->l_wchan = ((uint64_t)(l->l_pid) << 8) | 0x00; 478 strlcpy(l->l_wmesg, proc_tab[kslot].p_name, sizeof(l->l_wmesg)); 479 strlcpy(l->l_name, "kernel", sizeof(l->l_name)); 480 481 fill_lwp_common(l, kslot, &estcpu); 482 } 483 484 /* 485 * Fill a LWP structure for a user process. 
486 */ 487 static void 488 fill_lwp_user(struct kinfo_lwp * l, int mslot) 489 { 490 struct mproc *mp; 491 uint32_t estcpu; 492 493 memset(l, 0, sizeof(*l)); 494 495 mp = &mproc_tab[mslot]; 496 497 l->l_flag = L_INMEM; 498 l->l_stat = get_lwp_stat(mslot, &l->l_wchan, l->l_wmesg, 499 sizeof(l->l_wmesg), &l->l_flag); 500 l->l_pid = mp->mp_pid; 501 strlcpy(l->l_name, mp->mp_name, sizeof(l->l_name)); 502 503 fill_lwp_common(l, NR_TASKS + mslot, &estcpu); 504 } 505 506 /* 507 * Implementation of CTL_KERN KERN_LWP. 508 */ 509 ssize_t 510 mib_kern_lwp(struct mib_call * call, struct mib_node * node __unused, 511 struct mib_oldp * oldp, struct mib_newp * newp __unused) 512 { 513 struct kinfo_lwp lwp; 514 struct mproc *mp; 515 size_t copysz; 516 ssize_t off; 517 pid_t pid; 518 int r, elsz, elmax, kslot, mslot, last_mslot; 519 520 if (call->call_namelen != 3) 521 return EINVAL; 522 523 pid = (pid_t)call->call_name[0]; 524 elsz = call->call_name[1]; 525 elmax = call->call_name[2]; /* redundant with the given oldlen.. */ 526 527 if (pid < -1 || elsz <= 0 || elmax < 0) 528 return EINVAL; 529 530 if (!update_tables()) 531 return EINVAL; 532 533 off = 0; 534 copysz = MIN((size_t)elsz, sizeof(lwp)); 535 536 /* 537 * We model kernel tasks as LWP threads of the kernel (with PID 0). 538 * Modeling the kernel tasks as processes with negative PIDs, like 539 * ProcFS does, conflicts with the KERN_LWP API here: a PID of -1 540 * indicates that the caller wants a full listing of LWPs. 541 */ 542 if (pid <= 0) { 543 for (kslot = 0; kslot < NR_TASKS; kslot++) { 544 if (mib_inrange(oldp, off) && elmax > 0) { 545 fill_lwp_kern(&lwp, kslot); 546 if ((r = mib_copyout(oldp, off, &lwp, 547 copysz)) < 0) 548 return r; 549 elmax--; 550 } 551 off += elsz; 552 } 553 554 /* No need to add extra space here: NR_TASKS is static. 
*/ 555 if (pid == 0) 556 return off; 557 } 558 559 /* 560 * With PID 0 out of the way: the user requested the LWP for either a 561 * specific user process (pid > 0), or for all processes (pid < 0). 562 */ 563 if (pid > 0) { 564 if ((mslot = get_mslot(pid)) == NO_SLOT || 565 (mproc_tab[mslot].mp_flags & (TRACE_ZOMBIE | ZOMBIE))) 566 return ESRCH; 567 last_mslot = mslot; 568 } else { 569 mslot = 0; 570 last_mslot = NR_PROCS - 1; 571 } 572 573 for (; mslot <= last_mslot; mslot++) { 574 mp = &mproc_tab[mslot]; 575 576 if ((mp->mp_flags & (IN_USE | TRACE_ZOMBIE | ZOMBIE)) != 577 IN_USE) 578 continue; 579 580 if (mib_inrange(oldp, off) && elmax > 0) { 581 fill_lwp_user(&lwp, mslot); 582 if ((r = mib_copyout(oldp, off, &lwp, copysz)) < 0) 583 return r; 584 elmax--; 585 } 586 off += elsz; 587 } 588 589 if (oldp == NULL && pid < 0) 590 off += EXTRA_PROCS * elsz; 591 592 return off; 593 } 594 595 596 /* 597 * Fill the part of a process structure that is common between kernel tasks and 598 * user processes. 599 */ 600 static void 601 fill_proc2_common(struct kinfo_proc2 * p, int kslot) 602 { 603 struct vm_usage_info vui; 604 struct timeval tv; 605 struct proc *kp; 606 struct kinfo_lwp l; 607 608 kp = &proc_tab[kslot]; 609 610 /* 611 * Much of the information in the LWP structure also ends up in the 612 * process structure. In order to avoid duplication of some important 613 * code, first generate LWP values and then copy it them into the 614 * process structure. 615 */ 616 memset(&l, 0, sizeof(l)); 617 fill_lwp_common(&l, kslot, &p->p_estcpu); 618 619 /* Obtain memory usage information from VM. Ignore failures. 
*/ 620 memset(&vui, 0, sizeof(vui)); 621 (void)vm_info_usage(kp->p_endpoint, &vui); 622 623 ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time); 624 p->p_rtime_sec = l.l_rtime_sec; 625 p->p_rtime_usec = l.l_rtime_usec; 626 p->p_cpticks = l.l_cpticks; 627 p->p_pctcpu = l.l_pctcpu; 628 p->p_swtime = l.l_swtime; 629 p->p_slptime = l.l_slptime; 630 p->p_uticks = kp->p_user_time; 631 p->p_sticks = kp->p_sys_time; 632 /* TODO: p->p_iticks */ 633 ticks_to_timeval(&tv, kp->p_user_time); 634 p->p_uutime_sec = tv.tv_sec; 635 p->p_uutime_usec = tv.tv_usec; 636 ticks_to_timeval(&tv, kp->p_sys_time); 637 p->p_ustime_sec = tv.tv_sec; 638 p->p_ustime_usec = tv.tv_usec; 639 640 p->p_priority = l.l_priority; 641 p->p_usrpri = l.l_usrpri; 642 643 p->p_vm_rssize = howmany(vui.vui_total, PAGE_SIZE); 644 p->p_vm_vsize = howmany(vui.vui_virtual, PAGE_SIZE); 645 p->p_vm_msize = howmany(vui.vui_mvirtual, PAGE_SIZE); 646 647 p->p_uru_maxrss = vui.vui_maxrss; 648 p->p_uru_minflt = vui.vui_minflt; 649 p->p_uru_majflt = vui.vui_majflt; 650 651 p->p_cpuid = l.l_cpuid; 652 } 653 654 /* 655 * Fill a process structure for the kernel pseudo-process (with PID 0). 656 */ 657 static void 658 fill_proc2_kern(struct kinfo_proc2 * p) 659 { 660 661 memset(p, 0, sizeof(*p)); 662 663 p->p_flag = L_INMEM | L_SYSTEM | L_SINTR; 664 p->p_pid = 0; 665 p->p_stat = LSSLEEP; 666 p->p_nice = NZERO; 667 668 /* Use the KERNEL task wchan, for consistency between ps and top. */ 669 p->p_wchan = ((uint64_t)KERNEL << 8) | 0x00; 670 strlcpy(p->p_wmesg, "kernel", sizeof(p->p_wmesg)); 671 672 strlcpy(p->p_comm, "kernel", sizeof(p->p_comm)); 673 p->p_realflag = P_INMEM | P_SYSTEM | P_SINTR; 674 p->p_realstat = SACTIVE; 675 p->p_nlwps = NR_TASKS; 676 677 /* 678 * By using the KERNEL slot here, the kernel process will get a proper 679 * CPU usage average. 680 */ 681 fill_proc2_common(p, KERNEL + NR_TASKS); 682 } 683 684 /* 685 * Fill a process structure for a user process. 
 */
static void
fill_proc2_user(struct kinfo_proc2 * p, int mslot)
{
	struct mproc *mp;
	struct fproc_light *fp;
	time_t boottime;
	dev_t tty;
	struct timeval tv;
	int i, r, kslot, zombie;

	memset(p, 0, sizeof(*p));

	if ((r = getuptime(NULL, NULL, &boottime)) != OK)
		panic("getuptime failed: %d", r);

	kslot = NR_TASKS + mslot;
	mp = &mproc_tab[mslot];
	fp = &fproc_tab[mslot];

	/* Do not use the fproc_tab slot of zombies; use NO_DEV instead. */
	zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
	tty = (!zombie) ? fp->fpl_tty : NO_DEV;

	p->p_eflag = 0;
	if (tty != NO_DEV)
		p->p_eflag |= EPROC_CTTY;
	if (mp->mp_pid == mp->mp_procgrp)	/* TODO: job control support */
		p->p_eflag |= EPROC_SLEADER;

	p->p_exitsig = SIGCHLD;	/* TODO */

	p->p_flag = P_INMEM;
	if (mp->mp_flags & TAINTED)
		p->p_flag |= P_SUGID;
	if (mp->mp_tracer != NO_TRACER)
		p->p_flag |= P_TRACED;
	if (tty != NO_DEV)
		p->p_flag |= P_CONTROLT;
	p->p_pid = mp->mp_pid;
	if (mp->mp_parent >= 0 && mp->mp_parent < NR_PROCS)
		p->p_ppid = mproc_tab[mp->mp_parent].mp_pid;
	p->p_sid = mp->mp_procgrp;	/* TODO: job control support */
	p->p__pgid = mp->mp_procgrp;
	p->p_tpgid = (tty != NO_DEV) ? mp->mp_procgrp : 0;
	p->p_uid = mp->mp_effuid;
	p->p_ruid = mp->mp_realuid;
	p->p_gid = mp->mp_effgid;
	p->p_rgid = mp->mp_realgid;
	/* Copy at most as many supplementary groups as the structure holds. */
	p->p_ngroups = MIN(mp->mp_ngroups, KI_NGROUPS);
	for (i = 0; i < p->p_ngroups; i++)
		p->p_groups[i] = mp->mp_sgroups[i];
	p->p_tdev = tty;
	memcpy(&p->p_siglist, &mp->mp_sigpending, sizeof(p->p_siglist));
	memcpy(&p->p_sigmask, &mp->mp_sigmask, sizeof(p->p_sigmask));
	memcpy(&p->p_sigcatch, &mp->mp_catch, sizeof(p->p_sigcatch));
	memcpy(&p->p_sigignore, &mp->mp_ignore, sizeof(p->p_sigignore));
	p->p_nice = mp->mp_nice + NZERO;
	strlcpy(p->p_comm, mp->mp_name, sizeof(p->p_comm));
	p->p_uvalid = 1;
	/* The process start time is relative to boot; make it absolute. */
	ticks_to_timeval(&tv, mp->mp_started);
	p->p_ustart_sec = boottime + tv.tv_sec;
	p->p_ustart_usec = tv.tv_usec;
	/* TODO: other rusage fields */
	ticks_to_timeval(&tv, mp->mp_child_utime + mp->mp_child_stime);
	p->p_uctime_sec = tv.tv_sec;
	p->p_uctime_usec = tv.tv_usec;
	p->p_realflag = p->p_flag;
	p->p_nlwps = (zombie) ? 0 : 1;
	p->p_svuid = mp->mp_svuid;
	p->p_svgid = mp->mp_svgid;

	/* get_lwp_stat() may add L_SINTR to p_flag as a side effect. */
	p->p_stat = get_lwp_stat(mslot, &p->p_wchan, p->p_wmesg,
	    sizeof(p->p_wmesg), &p->p_flag);

	/* Map the LWP state onto a process state and LWP counts. */
	switch (p->p_stat) {
	case LSRUN:
		p->p_realstat = SACTIVE;
		p->p_nrlwps = 1;
		break;
	case LSSLEEP:
		p->p_realstat = SACTIVE;
		if (p->p_flag & L_SINTR)
			p->p_realflag |= P_SINTR;
		break;
	case LSSTOP:
		p->p_realstat = SSTOP;
		break;
	case LSZOMB:
		p->p_realstat = SZOMB;
		break;
	case LSDEAD:
		p->p_stat = LSZOMB;	/* ps(1) STAT does not know LSDEAD */
		p->p_realstat = SDEAD;
		break;
	default:
		assert(0);
	}

	if (!zombie)
		fill_proc2_common(p, kslot);
}

/*
 * Implementation of CTL_KERN KERN_PROC2.
790 */ 791 ssize_t 792 mib_kern_proc2(struct mib_call * call, struct mib_node * node __unused, 793 struct mib_oldp * oldp, struct mib_newp * newp __unused) 794 { 795 struct kinfo_proc2 proc2; 796 struct mproc *mp; 797 size_t copysz; 798 ssize_t off; 799 dev_t tty; 800 int r, req, arg, elsz, elmax, kmatch, zombie, mslot; 801 802 if (call->call_namelen != 4) 803 return EINVAL; 804 805 req = call->call_name[0]; 806 arg = call->call_name[1]; 807 elsz = call->call_name[2]; 808 elmax = call->call_name[3]; /* redundant with the given oldlen.. */ 809 810 /* 811 * The kernel is special, in that it does not have a slot in the PM or 812 * VFS tables. As such, it is dealt with separately. While checking 813 * arguments, we might as well check whether the kernel is matched. 814 */ 815 switch (req) { 816 case KERN_PROC_ALL: 817 kmatch = TRUE; 818 break; 819 case KERN_PROC_PID: 820 case KERN_PROC_SESSION: 821 case KERN_PROC_PGRP: 822 case KERN_PROC_UID: 823 case KERN_PROC_RUID: 824 case KERN_PROC_GID: 825 case KERN_PROC_RGID: 826 kmatch = (arg == 0); 827 break; 828 case KERN_PROC_TTY: 829 kmatch = ((dev_t)arg == KERN_PROC_TTY_NODEV); 830 break; 831 default: 832 return EINVAL; 833 } 834 835 if (elsz <= 0 || elmax < 0) 836 return EINVAL; 837 838 if (!update_tables()) 839 return EINVAL; 840 841 off = 0; 842 copysz = MIN((size_t)elsz, sizeof(proc2)); 843 844 if (kmatch) { 845 if (mib_inrange(oldp, off) && elmax > 0) { 846 fill_proc2_kern(&proc2); 847 if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0) 848 return r; 849 elmax--; 850 } 851 off += elsz; 852 } 853 854 for (mslot = 0; mslot < NR_PROCS; mslot++) { 855 mp = &mproc_tab[mslot]; 856 857 if (!(mp->mp_flags & IN_USE)) 858 continue; 859 860 switch (req) { 861 case KERN_PROC_PID: 862 if ((pid_t)arg != mp->mp_pid) 863 continue; 864 break; 865 case KERN_PROC_SESSION: /* TODO: job control support */ 866 case KERN_PROC_PGRP: 867 if ((pid_t)arg != mp->mp_procgrp) 868 continue; 869 break; 870 case KERN_PROC_TTY: 871 if ((dev_t)arg == 
KERN_PROC_TTY_REVOKE) 872 continue; /* TODO: revoke(2) support */ 873 /* Do not access the fproc_tab slot of zombies. */ 874 zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE)); 875 tty = (zombie) ? fproc_tab[mslot].fpl_tty : NO_DEV; 876 if ((dev_t)arg == KERN_PROC_TTY_NODEV) { 877 if (tty != NO_DEV) 878 continue; 879 } else if ((dev_t)arg == NO_DEV || (dev_t)arg != tty) 880 continue; 881 break; 882 case KERN_PROC_UID: 883 if ((uid_t)arg != mp->mp_effuid) 884 continue; 885 break; 886 case KERN_PROC_RUID: 887 if ((uid_t)arg != mp->mp_realuid) 888 continue; 889 break; 890 case KERN_PROC_GID: 891 if ((gid_t)arg != mp->mp_effgid) 892 continue; 893 break; 894 case KERN_PROC_RGID: 895 if ((gid_t)arg != mp->mp_realgid) 896 continue; 897 break; 898 } 899 900 if (mib_inrange(oldp, off) && elmax > 0) { 901 fill_proc2_user(&proc2, mslot); 902 if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0) 903 return r; 904 elmax--; 905 } 906 off += elsz; 907 } 908 909 if (oldp == NULL && req != KERN_PROC_PID) 910 off += EXTRA_PROCS * elsz; 911 912 return off; 913 } 914 915 /* 916 * Implementation of CTL_KERN KERN_PROC_ARGS. 
 */
ssize_t
mib_kern_proc_args(struct mib_call * call, struct mib_node * node __unused,
	struct mib_oldp * oldp, struct mib_newp * newp __unused)
{
	/* One page each for the vector, string data, and output staging. */
	char vbuf[PAGE_SIZE], sbuf[PAGE_SIZE], obuf[PAGE_SIZE];
	struct ps_strings pss;
	struct mproc *mp;
	char *buf, *p, *q, *pptr;
	vir_bytes vaddr, vpage, spage, paddr, ppage;
	size_t max, off, olen, oleft, oldlen, bytes, pleft;
	unsigned int copybudget;
	pid_t pid;
	int req, mslot, count, aborted, ended;
	ssize_t r;

	if (call->call_namelen != 2)
		return EINVAL;

	pid = call->call_name[0];
	req = call->call_name[1];

	switch (req) {
	case KERN_PROC_ARGV:
	case KERN_PROC_ENV:
	case KERN_PROC_NARGV:
	case KERN_PROC_NENV:
		break;
	default:
		return EOPNOTSUPP;
	}

	if (!update_tables())
		return EINVAL;

	if ((mslot = get_mslot(pid)) == NO_SLOT)
		return ESRCH;
	mp = &mproc_tab[mslot];
	if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
		return ESRCH;

	/* We can return the count field size without copying in any data. */
	if (oldp == NULL && (req == KERN_PROC_NARGV || req == KERN_PROC_NENV))
		return sizeof(count);

	/*
	 * The ps_strings structure is located at the very end of the process
	 * stack frame; copy it in from the target process.
	 */
	if (sys_datacopy(mp->mp_endpoint,
	    mp->mp_frame_addr + mp->mp_frame_len - sizeof(pss),
	    SELF, (vir_bytes)&pss, sizeof(pss)) != OK)
		return EINVAL;

	/*
	 * Determine the upper size limit of the requested data.  Not only may
	 * the size never exceed ARG_MAX, it may also not exceed the frame
	 * length as given in its original exec call.  In fact, the frame
	 * length should be substantially larger: all strings for both the
	 * arguments and the environment are in there, along with other stuff,
	 * and there must be no overlap between strings.  It is possible that
	 * the application called setproctitle(3), in which case the ps_strings
	 * pointers refer to data outside the frame altogether.  However, this
	 * data should not exceed 2048 bytes, and we cover this by rounding up
	 * the frame length to a multiple of the page size.  Anyhow, NetBSD
	 * blindly returns ARG_MAX when asked for a size estimate, so with this
	 * maximum we are already quite a bit more accurate.
	 */
	max = roundup(MIN(mp->mp_frame_len, ARG_MAX), PAGE_SIZE);

	switch (req) {
	case KERN_PROC_NARGV:
		count = pss.ps_nargvstr;
		return mib_copyout(oldp, 0, &count, sizeof(count));
	case KERN_PROC_NENV:
		count = pss.ps_nenvstr;
		return mib_copyout(oldp, 0, &count, sizeof(count));
	case KERN_PROC_ARGV:
		if (oldp == NULL)
			return max;
		vaddr = (vir_bytes)pss.ps_argvstr;
		count = pss.ps_nargvstr;
		break;
	case KERN_PROC_ENV:
		if (oldp == NULL)
			return max;
		vaddr = (vir_bytes)pss.ps_envstr;
		count = pss.ps_nenvstr;
		break;
	}

	/*
	 * Go through the strings.  Copy in entire, machine-aligned pages at
	 * once, in the hope that all data is stored consecutively, which it
	 * should be: we expect that the vector is followed by the strings, and
	 * that the strings are stored in order of vector reference.  We keep
	 * up to two pages with copied-in data: one for the vector, and
	 * optionally one for string data.  In addition, we keep one page with
	 * data to be copied out, so that we do not cause a lot of copy
	 * overhead for short strings.
	 *
	 * We stop whenever any of the following conditions are met:
	 * - copying in data from the target process fails for any reason;
	 * - we have processed the last index ('count') into the vector;
	 * - the current vector element is a NULL pointer;
	 * - the requested number of output bytes ('oldlen') has been reached;
	 * - the maximum number of output bytes ('max') has been reached;
	 * - the number of page copy-ins exceeds an estimated threshold;
	 * - copying out data fails for any reason (we then return the error).
	 *
	 * We limit the number of page copy-ins because otherwise a rogue
	 * process could create an argument vector consisting of only two-byte
	 * strings that all span two pages, causing us to copy up to 1GB of
	 * data with the current ARG_MAX value of 256K.  No reasonable vector
	 * should cause more than (ARG_MAX / PAGE_SIZE) page copies for
	 * strings; we are nice enough to allow twice that.  Vector copies do
	 * not count, as they are linear anyway.
	 *
	 * Unlike every other sysctl(2) call, we are supposed to truncate the
	 * resulting size (the returned 'oldlen') to the requested size (the
	 * given 'oldlen') *and* return the resulting size, rather than ENOMEM
	 * and the real size.  Unfortunately, libkvm actually relies on this.
	 *
	 * Generally speaking, upon failure we just return a truncated result.
	 * In case of truncation, the data we copy out need not be null
	 * terminated.  It is up to userland to process the data correctly.
	 */
	if (trunc_page(vaddr) == 0 || vaddr % sizeof(char *) != 0)
		return 0;

	off = 0;
	olen = 0;
	aborted = FALSE;

	oldlen = mib_getoldlen(oldp);
	if (oldlen > max)
		oldlen = max;

	copybudget = (ARG_MAX / PAGE_SIZE) * 2;

	vpage = 0;
	spage = 0;

	while (count > 0 && off + olen < oldlen && !aborted) {
		/*
		 * Start by fetching the page containing the current vector
		 * element, if needed.  We could limit the fetch to the vector
		 * size, but our hope is that for the simple cases, the strings
		 * are on the remainder of the same page, so we save a copy
		 * call.  TODO: since the strings should follow the vector, we
		 * could start the copy at the base of the vector.
		 */
		if (trunc_page(vaddr) != vpage) {
			vpage = trunc_page(vaddr);
			if (sys_datacopy(mp->mp_endpoint, vpage, SELF,
			    (vir_bytes)vbuf, PAGE_SIZE) != OK)
				break;
		}

		/* Get the current vector element, pointing to a string. */
		memcpy(&pptr, &vbuf[vaddr - vpage], sizeof(pptr));
		paddr = (vir_bytes)pptr;
		ppage = trunc_page(paddr);
		if (ppage == 0)
			break;	/* NULL pointer: end of the vector */

		/* Fetch the string itself, one page at a time at most. */
		do {
			/*
			 * See if the string pointer falls inside either the
			 * vector page or the previously fetched string page
			 * (if any).  If not, fetch a string page.
			 */
			if (ppage == vpage) {
				buf = vbuf;
			} else if (ppage == spage) {
				buf = sbuf;
			} else {
				if (--copybudget == 0) {
					aborted = TRUE;
					break;
				}
				spage = ppage;
				if (sys_datacopy(mp->mp_endpoint, spage, SELF,
				    (vir_bytes)sbuf, PAGE_SIZE) != OK) {
					aborted = TRUE;
					break;
				}
				buf = sbuf;
			}

			/*
			 * We now have a string fragment in a buffer.  See if
			 * the string is null terminated.  If not, all the data
			 * up to the buffer end is part of the string, and the
			 * string continues on the next page.
			 */
			p = &buf[paddr - ppage];
			pleft = PAGE_SIZE - (paddr - ppage);
			assert(pleft > 0);

			if ((q = memchr(p, '\0', pleft)) != NULL) {
				bytes = (size_t)(q - p + 1);
				assert(bytes <= pleft);
				ended = TRUE;
			} else {
				bytes = pleft;
				ended = FALSE;
			}

			/* Limit the result to the requested length. */
			if (off + olen + bytes > oldlen)
				bytes = oldlen - off - olen;

			/*
			 * Add 'bytes' bytes from string pointer 'p' to the
			 * output buffer, copying out its contents to userland
			 * if it has filled up.
			 */
			if (olen + bytes > sizeof(obuf)) {
				oleft = sizeof(obuf) - olen;
				memcpy(&obuf[olen], p, oleft);

				if ((r = mib_copyout(oldp, off, obuf,
				    sizeof(obuf))) < 0)
					return r;
				off += sizeof(obuf);
				olen = 0;

				p += oleft;
				bytes -= oleft;
			}
			if (bytes > 0) {
				memcpy(&obuf[olen], p, bytes);
				olen += bytes;
			}

			/*
			 * Continue as long as we have not yet found the string
			 * end, and we have not yet filled the output buffer.
			 */
			paddr += pleft;
			assert(trunc_page(paddr) == paddr);
			ppage = paddr;
		} while (!ended && off + olen < oldlen);

		vaddr += sizeof(char *);
		count--;
	}

	/* Copy out any remainder of the output buffer. */
	if (olen > 0) {
		if ((r = mib_copyout(oldp, off, obuf, olen)) < 0)
			return r;
		off += olen;
	}

	assert(off <= oldlen);
	return off;
}

/*
 * Implementation of CTL_MINIX MINIX_PROC PROC_LIST
1176 */ 1177 ssize_t 1178 mib_minix_proc_list(struct mib_call * call __unused, 1179 struct mib_node * node __unused, struct mib_oldp * oldp, 1180 struct mib_newp * newp __unused) 1181 { 1182 struct minix_proc_list mpl[NR_PROCS]; 1183 struct minix_proc_list *mplp; 1184 struct mproc *mp; 1185 unsigned int mslot; 1186 1187 if (oldp == NULL) 1188 return sizeof(mpl); 1189 1190 if (!update_tables()) 1191 return EINVAL; 1192 1193 memset(&mpl, 0, sizeof(mpl)); 1194 1195 mplp = mpl; 1196 mp = mproc_tab; 1197 1198 for (mslot = 0; mslot < NR_PROCS; mslot++, mplp++, mp++) { 1199 if (!(mp->mp_flags & IN_USE) || mp->mp_pid <= 0) 1200 continue; 1201 1202 mplp->mpl_flags = MPLF_IN_USE; 1203 if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE)) 1204 mplp->mpl_flags |= MPLF_ZOMBIE; 1205 mplp->mpl_pid = mp->mp_pid; 1206 mplp->mpl_uid = mp->mp_effuid; 1207 mplp->mpl_gid = mp->mp_effgid; 1208 } 1209 1210 return mib_copyout(oldp, 0, &mpl, sizeof(mpl)); 1211 } 1212 1213 /* 1214 * Implementation of CTL_MINIX MINIX_PROC PROC_DATA. 1215 */ 1216 ssize_t 1217 mib_minix_proc_data(struct mib_call * call, struct mib_node * node __unused, 1218 struct mib_oldp * oldp, struct mib_newp * newp __unused) 1219 { 1220 struct minix_proc_data mpd; 1221 struct proc *kp; 1222 int kslot, mslot = 0; 1223 unsigned int mflags; 1224 pid_t pid; 1225 1226 /* 1227 * It is currently only possible to retrieve the process data for a 1228 * particular PID, which must be given as the last name component. 1229 */ 1230 if (call->call_namelen != 1) 1231 return EINVAL; 1232 1233 pid = (pid_t)call->call_name[0]; 1234 1235 if (!update_tables()) 1236 return EINVAL; 1237 1238 /* 1239 * Unlike the CTL_KERN nodes, we use the ProcFS semantics here: if the 1240 * given PID is negative, it is a kernel task; otherwise, it identifies 1241 * a user process. A request for PID 0 will result in ESRCH. 
1242 */ 1243 if (pid < 0) { 1244 if (pid < -NR_TASKS) 1245 return ESRCH; 1246 1247 kslot = pid + NR_TASKS; 1248 assert(kslot < NR_TASKS); 1249 } else { 1250 if ((mslot = get_mslot(pid)) == NO_SLOT) 1251 return ESRCH; 1252 1253 kslot = NR_TASKS + mslot; 1254 } 1255 1256 if (oldp == NULL) 1257 return sizeof(mpd); 1258 1259 kp = &proc_tab[kslot]; 1260 1261 mflags = (pid > 0) ? mproc_tab[mslot].mp_flags : 0; 1262 1263 memset(&mpd, 0, sizeof(mpd)); 1264 mpd.mpd_endpoint = kp->p_endpoint; 1265 if (mflags & PRIV_PROC) 1266 mpd.mpd_flags |= MPDF_SYSTEM; 1267 if (mflags & (TRACE_ZOMBIE | ZOMBIE)) 1268 mpd.mpd_flags |= MPDF_ZOMBIE; 1269 else if ((mflags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP)) 1270 mpd.mpd_flags |= MPDF_STOPPED; 1271 else if (proc_is_runnable(kp)) 1272 mpd.mpd_flags |= MPDF_RUNNABLE; 1273 mpd.mpd_blocked_on = P_BLOCKEDON(kp); 1274 mpd.mpd_priority = kp->p_priority; 1275 mpd.mpd_user_time = kp->p_user_time; 1276 mpd.mpd_sys_time = kp->p_sys_time; 1277 mpd.mpd_cycles = kp->p_cycles; 1278 mpd.mpd_kipc_cycles = kp->p_kipc_cycles; 1279 mpd.mpd_kcall_cycles = kp->p_kcall_cycles; 1280 if (kslot >= NR_TASKS) { 1281 mpd.mpd_nice = mproc_tab[mslot].mp_nice; 1282 strlcpy(mpd.mpd_name, mproc_tab[mslot].mp_name, 1283 sizeof(mpd.mpd_name)); 1284 } else 1285 strlcpy(mpd.mpd_name, kp->p_name, sizeof(mpd.mpd_name)); 1286 1287 return mib_copyout(oldp, 0, &mpd, sizeof(mpd)); 1288 } 1289