/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.14 2003/06/26 04:15:10 silby Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/jail.h>

#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/vmmeter.h>
#include <sys/refcount.h>
#include <sys/thread2.h>
#include <sys/signal2.h>
#include <sys/spinlock2.h>

#include <sys/dsched.h>

static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
static MALLOC_DEFINE(M_REAPER, "reaper", "process reapers");

/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 */
struct forklist {
        forklist_fn function;
        TAILQ_ENTRY(forklist) next;
};

TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

static struct lwp *lwp_fork(struct lwp *, struct proc *, int flags);

int forksleep;          /* place for fork1() to sleep on */
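
/*
 * Callouts registered on fork_list (via at_fork(), below) are run near the
 * end of fork1(), after both processes have been set up but before the
 * child is made runnable, as (*ep->function)(p1, p2, flags).
 */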

/*
 * Red-Black tree support for LWPs
 */

static int
rb_lwp_compare(struct lwp *lp1, struct lwp *lp2)
{
        if (lp1->lwp_tid < lp2->lwp_tid)
                return(-1);
        if (lp1->lwp_tid > lp2->lwp_tid)
                return(1);
        return(0);
}

RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid);

/*
 * fork() system call
 */
int
sys_fork(struct fork_args *uap)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p2;
        int error;

        error = fork1(lp, RFFDG | RFPROC | RFPGLOCK, &p2);
        if (error == 0) {
                PHOLD(p2);
                start_forked_proc(lp, p2);
                uap->sysmsg_fds[0] = p2->p_pid;
                uap->sysmsg_fds[1] = 0;
                PRELE(p2);
        }
        return error;
}

/*
 * vfork() system call
 */
int
sys_vfork(struct vfork_args *uap)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p2;
        int error;

        error = fork1(lp, RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK, &p2);
        if (error == 0) {
                PHOLD(p2);
                start_forked_proc(lp, p2);
                uap->sysmsg_fds[0] = p2->p_pid;
                uap->sysmsg_fds[1] = 0;
                PRELE(p2);
        }
        return error;
}

/*
 * Handle rforks.  An rfork may (1) operate on the current process without
 * creating a new process, (2) create a new process that shares the current
 * process's vmspace, signals, and/or descriptors, or (3) create a new
 * process that does not share these things (normal fork).
 *
 * Note that we only call start_forked_proc() if a new process is actually
 * created.
 *
 * rfork { int flags }
 */
int
sys_rfork(struct rfork_args *uap)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p2;
        int error;

        if ((uap->flags & RFKERNELONLY) != 0)
                return (EINVAL);

        error = fork1(lp, uap->flags | RFPGLOCK, &p2);
        if (error == 0) {
                if (p2) {
                        PHOLD(p2);
                        start_forked_proc(lp, p2);
                        uap->sysmsg_fds[0] = p2->p_pid;
                        uap->sysmsg_fds[1] = 0;
                        PRELE(p2);
                } else {
                        uap->sysmsg_fds[0] = 0;
                        uap->sysmsg_fds[1] = 0;
                }
        }
        return error;
}
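
/*
 * A rough map of the rfork() flags as fork1() below interprets them:
 *
 *	RFPROC		create a new process; without it fork1() only
 *			adjusts the calling process (RFFDG unshares the
 *			descriptor table, RFCFDG replaces it with a
 *			fresh one).
 *	RFFDG/RFCFDG	copy / start with a clean descriptor table.
 *			Combining the two returns EINVAL.
 *	RFMEM		share the parent's vmspace (vfork-style).
 *	RFPPWAIT	parent sleeps until the child releases its
 *			resources (vfork semantics, P_PPWAIT).
 *	RFSIGSHARE	share signal actions with the parent.
 *	RFTHREAD	link the child into the parent's peer list and
 *			share the descriptor-table leader.
 *	RFNOWAIT	reparent the child to the reaper (or init).
 *	RFPGLOCK	interlock against process group signal delivery.
 *
 * For example, fork() above is fork1(lp, RFFDG | RFPROC | RFPGLOCK, ...)
 * and vfork() adds RFPPWAIT | RFMEM.
 */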

/*
 * Low level thread create used by pthreads.
 */
int
sys_lwp_create(struct lwp_create_args *uap)
{
        struct proc *p = curproc;
        struct lwp *lp;
        struct lwp_params params;
        int error;

        error = copyin(uap->params, &params, sizeof(params));
        if (error)
                goto fail2;

        lwkt_gettoken(&p->p_token);
        plimit_lwp_fork(p);     /* force exclusive access */
        lp = lwp_fork(curthread->td_lwp, p, RFPROC);
        error = cpu_prepare_lwp(lp, &params);
        if (error)
                goto fail;
        if (params.lwp_tid1 != NULL &&
            (error = copyout(&lp->lwp_tid, params.lwp_tid1, sizeof(lp->lwp_tid))))
                goto fail;
        if (params.lwp_tid2 != NULL &&
            (error = copyout(&lp->lwp_tid, params.lwp_tid2, sizeof(lp->lwp_tid))))
                goto fail;

        /*
         * Now schedule the new lwp.
         */
        p->p_usched->resetpriority(lp);
        crit_enter();
        lp->lwp_stat = LSRUN;
        p->p_usched->setrunqueue(lp);
        crit_exit();
        lwkt_reltoken(&p->p_token);

        return (0);

fail:
        lwp_rb_tree_RB_REMOVE(&p->p_lwp_tree, lp);
        --p->p_nthreads;
        /* lwp_dispose expects an exited lwp, and a held proc */
        atomic_set_int(&lp->lwp_mpflags, LWP_MP_WEXIT);
        lp->lwp_thread->td_flags |= TDF_EXITING;
        lwkt_remove_tdallq(lp->lwp_thread);
        PHOLD(p);
        biosched_done(lp->lwp_thread);
        dsched_exit_thread(lp->lwp_thread);
        lwp_dispose(lp);
        lwkt_reltoken(&p->p_token);
fail2:
        return (error);
}

int     nprocs = 1;             /* process 0 */

int
fork1(struct lwp *lp1, int flags, struct proc **procp)
{
        struct proc *p1 = lp1->lwp_proc;
        struct proc *p2;
        struct proc *pptr;
        struct pgrp *p1grp;
        struct pgrp *plkgrp;
        struct sysreaper *reap;
        uid_t uid;
        int ok, error;
        static int curfail = 0;
        static struct timeval lastfail;
        struct forklist *ep;
        struct filedesc_to_leader *fdtol;

        if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
                return (EINVAL);

        lwkt_gettoken(&p1->p_token);
        plkgrp = NULL;
        p2 = NULL;

        /*
         * Here we don't create a new process, but we divorce
         * certain parts of a process from itself.
         */
        if ((flags & RFPROC) == 0) {
                /*
                 * This kind of stunt does not work anymore if
                 * there are native threads (lwps) running
                 */
                if (p1->p_nthreads != 1) {
                        error = EINVAL;
                        goto done;
                }

                vm_fork(p1, 0, flags);

                /*
                 * Close all file descriptors.
                 */
                if (flags & RFCFDG) {
                        struct filedesc *fdtmp;
                        fdtmp = fdinit(p1);
                        fdfree(p1, fdtmp);
                }

                /*
                 * Unshare file descriptors (from parent).
                 */
                if (flags & RFFDG) {
                        if (p1->p_fd->fd_refcnt > 1) {
                                struct filedesc *newfd;
                                error = fdcopy(p1, &newfd);
                                if (error != 0) {
                                        error = ENOMEM;
                                        goto done;
                                }
                                fdfree(p1, newfd);
                        }
                }
                *procp = NULL;
                error = 0;
                goto done;
        }

        /*
         * Interlock against process group signal delivery.  If signals
         * are pending after the interlock is obtained we have to restart
         * the system call to process the signals.  If we don't the child
         * can miss a pgsignal (such as ^C) sent during the fork.
         *
         * We can't use CURSIG() here because it will process any STOPs
         * and cause the process group lock to be held indefinitely.  If
         * a STOP occurs, the fork will be restarted after the CONT.
         */
        p1grp = p1->p_pgrp;
        if ((flags & RFPGLOCK) && (plkgrp = p1->p_pgrp) != NULL) {
                pgref(plkgrp);
                lockmgr(&plkgrp->pg_lock, LK_SHARED);
                if (CURSIG_NOBLOCK(lp1)) {
                        error = ERESTART;
                        goto done;
                }
        }

        /*
         * Although process entries are dynamically created, we still keep
         * a global limit on the maximum number we will create.  Don't allow
         * a nonprivileged user to use the last ten processes; don't let root
         * exceed the limit.  The variable nprocs is the current number of
         * processes, maxproc is the limit.
         */
        uid = lp1->lwp_thread->td_ucred->cr_ruid;
        if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
                if (ppsratecheck(&lastfail, &curfail, 1))
                        kprintf("maxproc limit exceeded by uid %d, please "
                                "see tuning(7) and login.conf(5).\n", uid);
                tsleep(&forksleep, 0, "fork", hz / 2);
                error = EAGAIN;
                goto done;
        }

        /*
         * Increment the nprocs resource before blocking can occur.  There
         * are hard-limits as to the number of processes that can run.
         */
        atomic_add_int(&nprocs, 1);

        /*
         * Increment the count of procs running with this uid.  Don't allow
         * a nonprivileged user to exceed their current limit.
         */
        ok = chgproccnt(lp1->lwp_thread->td_ucred->cr_ruidinfo, 1,
                (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
        if (!ok) {
                /*
                 * Back out the process count
                 */
                atomic_add_int(&nprocs, -1);
                if (ppsratecheck(&lastfail, &curfail, 1))
                        kprintf("maxproc limit exceeded by uid %d, please "
                                "see tuning(7) and login.conf(5).\n", uid);
                tsleep(&forksleep, 0, "fork", hz / 2);
                error = EAGAIN;
                goto done;
        }

        /*
         * Allocate a new process, don't get fancy: zero the structure.
         */
        p2 = kmalloc(sizeof(struct proc), M_PROC, M_WAITOK|M_ZERO);

        /*
         * Core initialization.  SIDL is a safety state that protects the
         * partially initialized process once it starts getting hooked
         * into system structures and becomes addressable.
         *
         * We must be sure to acquire p2->p_token as well, we must hold it
         * once the process is on the allproc list to avoid things such
         * as competing modifications to p_flags.
         */
        mycpu->gd_forkid += ncpus;
        p2->p_forkid = mycpu->gd_forkid + mycpu->gd_cpuid;
        p2->p_lasttid = -1;     /* first tid will be 0 */
        p2->p_stat = SIDL;

        /*
         * NOTE: Process 0 will not have a reaper, but process 1 (init) and
         *       all other processes always will.
         */
        if ((reap = p1->p_reaper) != NULL) {
                reaper_hold(reap);
                p2->p_reaper = reap;
        } else {
                p2->p_reaper = NULL;
        }

        RB_INIT(&p2->p_lwp_tree);
        spin_init(&p2->p_spin, "procfork1");
        lwkt_token_init(&p2->p_token, "proc");
        lwkt_gettoken(&p2->p_token);

        /*
         * Setup linkage for kernel based threading XXX lwp.  Also add the
         * process to the allproclist.
         *
         * The process structure is addressable after this point.
         */
        if (flags & RFTHREAD) {
                p2->p_peers = p1->p_peers;
                p1->p_peers = p2;
                p2->p_leader = p1->p_leader;
        } else {
                p2->p_leader = p2;
        }
        proc_add_allproc(p2);

        /*
         * Initialize the section which is copied verbatim from the parent.
         */
        bcopy(&p1->p_startcopy, &p2->p_startcopy,
              ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

        /*
         * Duplicate sub-structures as needed.  Increase reference counts
         * on shared objects.
         *
         * NOTE: because we are now on the allproc list it is possible for
         *       other consumers to gain temporary references to p2
         *       (p2->p_lock can change).
         */
        if (p1->p_flags & P_PROFIL)
                startprofclock(p2);
        p2->p_ucred = crhold(lp1->lwp_thread->td_ucred);

        if (jailed(p2->p_ucred))
                p2->p_flags |= P_JAILED;

        if (p2->p_args)
                refcount_acquire(&p2->p_args->ar_ref);

        p2->p_usched = p1->p_usched;
        /* XXX: verify copy of the secondary iosched stuff */
        dsched_new_proc(p2);

        if (flags & RFSIGSHARE) {
                p2->p_sigacts = p1->p_sigacts;
                refcount_acquire(&p2->p_sigacts->ps_refcnt);
        } else {
                p2->p_sigacts = kmalloc(sizeof(*p2->p_sigacts),
                                        M_SUBPROC, M_WAITOK);
                bcopy(p1->p_sigacts, p2->p_sigacts, sizeof(*p2->p_sigacts));
                refcount_init(&p2->p_sigacts->ps_refcnt, 1);
        }
        if (flags & RFLINUXTHPN)
                p2->p_sigparent = SIGUSR1;
        else
                p2->p_sigparent = SIGCHLD;

        /* bump references to the text vnode (for procfs) */
        p2->p_textvp = p1->p_textvp;
        if (p2->p_textvp)
                vref(p2->p_textvp);

        /* copy namecache handle to the text file */
        if (p1->p_textnch.mount)
                cache_copy(&p1->p_textnch, &p2->p_textnch);

        /*
         * Handle file descriptors
         */
        if (flags & RFCFDG) {
                p2->p_fd = fdinit(p1);
                fdtol = NULL;
        } else if (flags & RFFDG) {
                error = fdcopy(p1, &p2->p_fd);
                if (error != 0) {
                        error = ENOMEM;
                        goto done;
                }
                fdtol = NULL;
        } else {
                p2->p_fd = fdshare(p1);
                if (p1->p_fdtol == NULL) {
                        p1->p_fdtol = filedesc_to_leader_alloc(NULL,
                                                               p1->p_leader);
                }
                if ((flags & RFTHREAD) != 0) {
                        /*
                         * Shared file descriptor table and
                         * shared process leaders.
                         */
                        fdtol = p1->p_fdtol;
                        fdtol->fdl_refcount++;
                } else {
                        /*
                         * Shared file descriptor table, and
                         * different process leaders
                         */
                        fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p2);
                }
        }
        p2->p_fdtol = fdtol;
        p2->p_limit = plimit_fork(p1);

        /*
         * Preserve some more flags in subprocess.  P_PROFIL has already
         * been preserved.
         */
        p2->p_flags |= p1->p_flags & P_SUGID;
        if (p1->p_session->s_ttyvp != NULL && (p1->p_flags & P_CONTROLT))
                p2->p_flags |= P_CONTROLT;
        if (flags & RFPPWAIT) {
                p2->p_flags |= P_PPWAIT;
                if (p1->p_upmap)
                        p1->p_upmap->invfork = 1;
        }

        /*
         * Inherit the virtual kernel structure (allows a virtual kernel
         * to fork to simulate multiple cpus).
         */
        if (p1->p_vkernel)
                vkernel_inherit(p1, p2);

        /*
         * Once we are on a pglist we may receive signals.  XXX we might
         * race a ^C being sent to the process group by not receiving it
         * at all prior to this line.
         */
        pgref(p1grp);
        lwkt_gettoken(&p1grp->pg_token);
        LIST_INSERT_AFTER(p1, p2, p_pglist);
        lwkt_reltoken(&p1grp->pg_token);

        /*
         * Attach the new process to its parent.
         *
         * If RFNOWAIT is set, the newly created process becomes a child
         * of the reaper (typically init).  This effectively disassociates
         * the child from the parent.
         *
         * Temporarily hold pptr for the RFNOWAIT case to avoid ripouts.
         */
        if (flags & RFNOWAIT) {
                pptr = reaper_get(reap);
                if (pptr == NULL) {
                        pptr = initproc;
                        PHOLD(pptr);
                }
        } else {
                pptr = p1;
        }
        p2->p_pptr = pptr;
        LIST_INIT(&p2->p_children);

        lwkt_gettoken(&pptr->p_token);
        LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
        lwkt_reltoken(&pptr->p_token);

        if (flags & RFNOWAIT)
                PRELE(pptr);

        varsymset_init(&p2->p_varsymset, &p1->p_varsymset);
        callout_init_mp(&p2->p_ithandle);

#ifdef KTRACE
        /*
         * Copy traceflag and tracefile if enabled.  If not inherited,
         * these were zeroed above but we still could have a trace race
         * so make sure p2's p_tracenode is NULL.
         */
        if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracenode == NULL) {
                p2->p_traceflag = p1->p_traceflag;
                p2->p_tracenode = ktrinherit(p1->p_tracenode);
        }
#endif

        /*
         * This begins the section where we must prevent the parent
         * from being swapped.
         *
         * Gets PRELE'd in the caller in start_forked_proc().
         */
        PHOLD(p1);

        vm_fork(p1, p2, flags);

        /*
         * Create the first lwp associated with the new proc.
         * It will return via a different execution path later, directly
         * into userland, after it was put on the runq by
         * start_forked_proc().
         */
        lwp_fork(lp1, p2, flags);

        if (flags == (RFFDG | RFPROC | RFPGLOCK)) {
                mycpu->gd_cnt.v_forks++;
                mycpu->gd_cnt.v_forkpages += p2->p_vmspace->vm_dsize +
                                             p2->p_vmspace->vm_ssize;
        } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK)) {
                mycpu->gd_cnt.v_vforks++;
                mycpu->gd_cnt.v_vforkpages += p2->p_vmspace->vm_dsize +
                                              p2->p_vmspace->vm_ssize;
        } else if (p1 == &proc0) {
                mycpu->gd_cnt.v_kthreads++;
                mycpu->gd_cnt.v_kthreadpages += p2->p_vmspace->vm_dsize +
                                                p2->p_vmspace->vm_ssize;
        } else {
                mycpu->gd_cnt.v_rforks++;
                mycpu->gd_cnt.v_rforkpages += p2->p_vmspace->vm_dsize +
                                              p2->p_vmspace->vm_ssize;
        }

        /*
         * Both processes are set up, now check if any loadable modules want
         * to adjust anything.
         *   What if they have an error? XXX
         */
        TAILQ_FOREACH(ep, &fork_list, next) {
                (*ep->function)(p1, p2, flags);
        }

        /*
         * Set the start time.  Note that the process is not runnable.  The
         * caller is responsible for making it runnable.
         */
        microtime(&p2->p_start);
        p2->p_acflag = AFORK;

        /*
         * tell any interested parties about the new process
         */
        KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);

        /*
         * Return child proc pointer to parent.
         */
        *procp = p2;
        error = 0;
done:
        if (p2)
                lwkt_reltoken(&p2->p_token);
        lwkt_reltoken(&p1->p_token);
        if (plkgrp) {
                lockmgr(&plkgrp->pg_lock, LK_RELEASE);
                pgrel(plkgrp);
        }
        return (error);
}
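
/*
 * The child returned by fork1() is left in SIDL and is not runnable.
 * Callers that want it to run follow the pattern used by sys_fork()
 * above: PHOLD(p2), start_forked_proc(lp, p2), then PRELE(p2).
 */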
static struct lwp *
lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
{
        globaldata_t gd = mycpu;
        struct lwp *lp;
        struct thread *td;

        lp = kmalloc(sizeof(struct lwp), M_LWP, M_WAITOK|M_ZERO);

        lp->lwp_proc = destproc;
        lp->lwp_vmspace = destproc->p_vmspace;
        lp->lwp_stat = LSRUN;
        bcopy(&origlp->lwp_startcopy, &lp->lwp_startcopy,
              (unsigned) ((caddr_t)&lp->lwp_endcopy -
                          (caddr_t)&lp->lwp_startcopy));
        lp->lwp_flags |= origlp->lwp_flags & LWP_ALTSTACK;

        /*
         * Set cpbase to the last timeout that occurred (not the upcoming
         * timeout).
         *
         * A critical section is required since a timer IPI can update
         * scheduler specific data.
         */
        crit_enter();
        lp->lwp_cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
        destproc->p_usched->heuristic_forking(origlp, lp);
        crit_exit();
        CPUMASK_ANDMASK(lp->lwp_cpumask, usched_mastermask);
        lwkt_token_init(&lp->lwp_token, "lwp_token");
        spin_init(&lp->lwp_spin, "lwptoken");

        /*
         * Assign the thread to the current cpu to begin with so we
         * can manipulate it.
         */
        td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, gd->gd_cpuid, 0);
        lp->lwp_thread = td;
        td->td_ucred = crhold(destproc->p_ucred);
        td->td_proc = destproc;
        td->td_lwp = lp;
        td->td_switch = cpu_heavy_switch;
#ifdef NO_LWKT_SPLIT_USERPRI
        lwkt_setpri(td, TDPRI_USER_NORM);
#else
        lwkt_setpri(td, TDPRI_KERN_USER);
#endif
        lwkt_set_comm(td, "%s", destproc->p_comm);

        /*
         * cpu_fork will copy and update the pcb, set up the kernel stack,
         * and make the child ready to run.
         */
        cpu_fork(origlp, lp, flags);
        kqueue_init(&lp->lwp_kqueue, destproc->p_fd);

        /*
         * Assign a TID to the lp.  Loop until the insert succeeds (returns
         * NULL).
         */
        lp->lwp_tid = destproc->p_lasttid;
        do {
                if (++lp->lwp_tid < 0)
                        lp->lwp_tid = 1;
        } while (lwp_rb_tree_RB_INSERT(&destproc->p_lwp_tree, lp) != NULL);
        destproc->p_lasttid = lp->lwp_tid;
        destproc->p_nthreads++;

        /*
         * This flag is set and never cleared.  It means that the process
         * was threaded at some point.  Used to improve exit performance.
         */
        destproc->p_flags |= P_MAYBETHREADED;

        return (lp);
}

/*
 * The next two functions are general routines to handle adding/deleting
 * items on the fork callout list.
 *
 * at_fork():
 * Take the arguments given and put them onto the fork callout list,
 * however first make sure that it's not already there.
 * Returns 0 on success or a standard error number.
 */
int
at_fork(forklist_fn function)
{
        struct forklist *ep;

#ifdef INVARIANTS
        /* let the programmer know if he's been stupid */
        if (rm_at_fork(function)) {
                kprintf("WARNING: fork callout entry (%p) already present\n",
                        function);
        }
#endif
        ep = kmalloc(sizeof(*ep), M_ATFORK, M_WAITOK|M_ZERO);
        ep->function = function;
        TAILQ_INSERT_TAIL(&fork_list, ep, next);
        return (0);
}

/*
 * Scan the fork callout list for the given item and remove it.
 * Returns the number of items removed (0 or 1).
 */
int
rm_at_fork(forklist_fn function)
{
        struct forklist *ep;

        TAILQ_FOREACH(ep, &fork_list, next) {
                if (ep->function == function) {
                        TAILQ_REMOVE(&fork_list, ep, next);
                        kfree(ep, M_ATFORK);
                        return(1);
                }
        }
        return (0);
}
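
/*
 * Example of a hypothetical consumer, assuming the forklist_fn typedef
 * takes (parent, child, flags) as the call in fork1() above suggests:
 *
 *	static void
 *	mymod_fork_hook(struct proc *p1, struct proc *p2, int flags)
 *	{
 *		// runs in fork1() after p2 is set up, before it is runnable
 *	}
 *
 *	at_fork(mymod_fork_hook);	// on module load
 *	rm_at_fork(mymod_fork_hook);	// on module unload
 */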

/*
 * Add a forked process to the run queue after any remaining setup, such
 * as setting the fork handler, has been completed.
 *
 * p2 is held by the caller.
 */
void
start_forked_proc(struct lwp *lp1, struct proc *p2)
{
        struct lwp *lp2 = ONLY_LWP_IN_PROC(p2);
        int pflags;

        /*
         * Move from SIDL to RUN queue, and activate the process's thread.
         * Activation of the thread effectively makes the process "a"
         * current process, so we do not setrunqueue().
         *
         * YYY setrunqueue works here but we should clean up the trampoline
         * code so we just schedule the LWKT thread and let the trampoline
         * deal with the userland scheduler on return to userland.
         */
        KASSERT(p2->p_stat == SIDL,
            ("cannot start forked process, bad status: %p", p2));
        p2->p_usched->resetpriority(lp2);
        crit_enter();
        p2->p_stat = SACTIVE;
        lp2->lwp_stat = LSRUN;
        p2->p_usched->setrunqueue(lp2);
        crit_exit();

        /*
         * The parent can now be swapped again.
         */
        PRELE(lp1->lwp_proc);

        /*
         * Preserve synchronization semantics of vfork.  P_PPWAIT is set in
         * the child until it has retired the parent's resources.  The parent
         * must wait for the flag to be cleared by the child.
         *
         * Interlock the flag/tsleep with atomic ops to avoid unnecessary
         * p_token conflicts.
         *
         * XXX Is this use of an atomic op on a field that is not normally
         *     manipulated with atomic ops ok?
         */
        while ((pflags = p2->p_flags) & P_PPWAIT) {
                cpu_ccfence();
                tsleep_interlock(lp1->lwp_proc, 0);
                if (atomic_cmpset_int(&p2->p_flags, pflags, pflags))
                        tsleep(lp1->lwp_proc, PINTERLOCKED, "ppwait", 0);
        }
}

/*
 * procctl (idtype_t idtype, id_t id, int cmd, void *arg)
 */
int
sys_procctl(struct procctl_args *uap)
{
        struct proc *p = curproc;
        struct proc *p2;
        struct sysreaper *reap;
        union reaper_info udata;
        int error;

        if (uap->idtype != P_PID || uap->id != (id_t)p->p_pid)
                return EINVAL;

        switch(uap->cmd) {
        case PROC_REAP_ACQUIRE:
                lwkt_gettoken(&p->p_token);
                reap = kmalloc(sizeof(*reap), M_REAPER, M_WAITOK|M_ZERO);
                if (p->p_reaper == NULL || p->p_reaper->p != p) {
                        reaper_init(p, reap);
                        error = 0;
                } else {
                        kfree(reap, M_REAPER);
                        error = EALREADY;
                }
                lwkt_reltoken(&p->p_token);
                break;
        case PROC_REAP_RELEASE:
                lwkt_gettoken(&p->p_token);
release_again:
                reap = p->p_reaper;
                KKASSERT(reap != NULL);
                if (reap->p == p) {
                        reaper_hold(reap);      /* in case of thread race */
                        lockmgr(&reap->lock, LK_EXCLUSIVE);
                        if (reap->p != p) {
                                lockmgr(&reap->lock, LK_RELEASE);
                                reaper_drop(reap);
                                goto release_again;
                        }
                        reap->p = NULL;
                        p->p_reaper = reap->parent;
                        if (p->p_reaper)
                                reaper_hold(p->p_reaper);
                        lockmgr(&reap->lock, LK_RELEASE);
                        reaper_drop(reap);      /* our ref */
                        reaper_drop(reap);      /* old p_reaper ref */
                        error = 0;
                } else {
                        error = ENOTCONN;
                }
                lwkt_reltoken(&p->p_token);
                break;
        case PROC_REAP_STATUS:
                bzero(&udata, sizeof(udata));
                lwkt_gettoken_shared(&p->p_token);
                if ((reap = p->p_reaper) != NULL && reap->p == p) {
                        udata.status.flags = reap->flags;
                        udata.status.refs = reap->refs - 1; /* minus ours */
                }
                p2 = LIST_FIRST(&p->p_children);
                udata.status.pid_head = p2 ? p2->p_pid : -1;
                lwkt_reltoken(&p->p_token);

                if (uap->data) {
                        error = copyout(&udata, uap->data,
                                        sizeof(udata.status));
                } else {
                        error = 0;
                }
                break;
        default:
                error = EINVAL;
                break;
        }
        return error;
}
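
/*
 * Illustrative userland usage (a sketch; see procctl(2) for the
 * authoritative interface).  A process can only operate on itself:
 *
 *	union reaper_info info;
 *
 *	procctl(P_PID, getpid(), PROC_REAP_ACQUIRE, NULL);
 *	procctl(P_PID, getpid(), PROC_REAP_STATUS, &info);
 *	procctl(P_PID, getpid(), PROC_REAP_RELEASE, NULL);
 */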

/*
 * Bump ref on reaper, preventing destruction
 */
void
reaper_hold(struct sysreaper *reap)
{
        KKASSERT(reap->refs > 0);
        refcount_acquire(&reap->refs);
}

/*
 * Drop ref on reaper, destroy the structure on the 1->0
 * transition and loop on the parent.
 */
void
reaper_drop(struct sysreaper *next)
{
        struct sysreaper *reap;

        while ((reap = next) != NULL) {
                if (refcount_release(&reap->refs)) {
                        next = reap->parent;
                        KKASSERT(reap->p == NULL);
                        reap->parent = NULL;
                        kfree(reap, M_REAPER);
                } else {
                        next = NULL;
                }
        }
}

/*
 * Initialize a static or newly allocated reaper structure
 */
void
reaper_init(struct proc *p, struct sysreaper *reap)
{
        reap->parent = p->p_reaper;
        reap->p = p;
        if (p == initproc) {
                reap->flags = REAPER_STAT_OWNED | REAPER_STAT_REALINIT;
                reap->refs = 2;
        } else {
                reap->flags = REAPER_STAT_OWNED;
                reap->refs = 1;
        }
        lockinit(&reap->lock, "subrp", 0, 0);
        cpu_sfence();
        p->p_reaper = reap;
}

/*
 * Called with p->p_token held during exit.
 *
 * This is a bit simpler than RELEASE because there are no threads remaining
 * to race.  We only release if we own the reaper, the exit code will handle
 * the final p_reaper release.
 */
struct sysreaper *
reaper_exit(struct proc *p)
{
        struct sysreaper *reap;

        /*
         * Release acquired reaper
         */
        if ((reap = p->p_reaper) != NULL && reap->p == p) {
                lockmgr(&reap->lock, LK_EXCLUSIVE);
                p->p_reaper = reap->parent;
                if (p->p_reaper)
                        reaper_hold(p->p_reaper);
                reap->p = NULL;
                lockmgr(&reap->lock, LK_RELEASE);
                reaper_drop(reap);
        }

        /*
         * Return and clear reaper (caller is holding p_token for us)
         * (reap->p does not equal p).  Caller must drop it.
         */
        if ((reap = p->p_reaper) != NULL) {
                p->p_reaper = NULL;
        }
        return reap;
}

/*
 * Return a held (PHOLD) process representing the reaper for process (p).
 * NULL should not normally be returned.  Caller should PRELE() the returned
 * reaper process when finished.
 *
 * Remove dead internal nodes while we are at it.
 *
 * Process (p)'s token must be held on call.
 * The returned process's token is NOT acquired by this routine.
 */
struct proc *
reaper_get(struct sysreaper *reap)
{
        struct sysreaper *next;
        struct proc *reproc;

        if (reap == NULL)
                return NULL;

        /*
         * Extra hold for loop
         */
        reaper_hold(reap);

        while (reap) {
                lockmgr(&reap->lock, LK_SHARED);
                if (reap->p) {
                        /*
                         * Probable reaper
                         */
                        if (reap->p) {
                                reproc = reap->p;
                                PHOLD(reproc);
                                lockmgr(&reap->lock, LK_RELEASE);
                                reaper_drop(reap);
                                return reproc;
                        }

                        /*
                         * Raced, try again
                         */
                        lockmgr(&reap->lock, LK_RELEASE);
                        continue;
                }

                /*
                 * Traverse upwards in the reaper topology, destroy
                 * dead internal nodes when possible.
                 *
                 * NOTE: Our ref on next means that a dead node should
                 *       have 2 (ours and reap->parent's).
                 */
                next = reap->parent;
                while (next) {
                        reaper_hold(next);
                        if (next->refs == 2 && next->p == NULL) {
                                lockmgr(&reap->lock, LK_RELEASE);
                                lockmgr(&reap->lock, LK_EXCLUSIVE);
                                if (next->refs == 2 &&
                                    reap->parent == next &&
                                    next->p == NULL) {
                                        /*
                                         * reap->parent inherits ref from next.
                                         */
                                        reap->parent = next->parent;
                                        next->parent = NULL;
                                        reaper_drop(next);      /* ours */
                                        reaper_drop(next);      /* old parent */
                                        next = reap->parent;
                                        continue;       /* possible chain */
                                }
                        }
                        break;
                }
                lockmgr(&reap->lock, LK_RELEASE);
                reaper_drop(reap);
                reap = next;
        }
        return NULL;
}