1 /* 2 * Copyright (c) 1994 Jan-Simon Pendry 3 * Copyright (c) 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Jan-Simon Pendry. 8 * 9 * %sccs.include.redist.c% 10 * 11 * @(#)union_subr.c 8.3 (Berkeley) 02/10/94 12 */ 13 14 #include <sys/param.h> 15 #include <sys/systm.h> 16 #include <sys/time.h> 17 #include <sys/kernel.h> 18 #include <sys/vnode.h> 19 #include <sys/namei.h> 20 #include <sys/malloc.h> 21 #include <sys/file.h> 22 #include <sys/filedesc.h> 23 #include <sys/queue.h> 24 #include <miscfs/union/union.h> 25 26 #ifdef DIAGNOSTIC 27 #include <sys/proc.h> 28 #endif 29 30 /* must be power of two, otherwise change UNION_HASH() */ 31 #define NHASH 32 32 33 /* unsigned int ... */ 34 #define UNION_HASH(u, l) \ 35 (((((unsigned long) (u)) + ((unsigned long) l)) >> 8) & (NHASH-1)) 36 37 static LIST_HEAD(unhead, union_node) unhead[NHASH]; 38 static int unvplock[NHASH]; 39 40 int 41 union_init() 42 { 43 int i; 44 45 for (i = 0; i < NHASH; i++) 46 LIST_INIT(&unhead[i]); 47 bzero((caddr_t) unvplock, sizeof(unvplock)); 48 } 49 50 static int 51 union_list_lock(ix) 52 int ix; 53 { 54 55 if (unvplock[ix] & UN_LOCKED) { 56 unvplock[ix] |= UN_WANT; 57 sleep((caddr_t) &unvplock[ix], PINOD); 58 return (1); 59 } 60 61 unvplock[ix] |= UN_LOCKED; 62 63 return (0); 64 } 65 66 static void 67 union_list_unlock(ix) 68 int ix; 69 { 70 71 unvplock[ix] &= ~UN_LOCKED; 72 73 if (unvplock[ix] & UN_WANT) { 74 unvplock[ix] &= ~UN_WANT; 75 wakeup((caddr_t) &unvplock[ix]); 76 } 77 } 78 79 void 80 union_updatevp(un, uppervp, lowervp) 81 struct union_node *un; 82 struct vnode *uppervp; 83 struct vnode *lowervp; 84 { 85 int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); 86 int nhash = UNION_HASH(uppervp, lowervp); 87 88 if (ohash != nhash) { 89 /* 90 * Ensure locking is ordered from lower to higher 91 * to avoid deadlocks. 92 */ 93 if (nhash < ohash) { 94 int t = ohash; 95 ohash = nhash; 96 nhash = t; 97 } 98 99 while (union_list_lock(ohash)) 100 continue; 101 102 while (union_list_lock(nhash)) 103 continue; 104 105 LIST_REMOVE(un, un_cache); 106 union_list_unlock(ohash); 107 } else { 108 while (union_list_lock(nhash)) 109 continue; 110 } 111 112 if (un->un_lowervp != lowervp) { 113 if (un->un_lowervp) { 114 vrele(un->un_lowervp); 115 if (un->un_path) { 116 free(un->un_path, M_TEMP); 117 un->un_path = 0; 118 } 119 if (un->un_dirvp) { 120 vrele(un->un_dirvp); 121 un->un_dirvp = NULLVP; 122 } 123 } 124 un->un_lowervp = lowervp; 125 } 126 127 if (un->un_uppervp != uppervp) { 128 if (un->un_uppervp) 129 vrele(un->un_uppervp); 130 131 un->un_uppervp = uppervp; 132 } 133 134 if (ohash != nhash) 135 LIST_INSERT_HEAD(&unhead[nhash], un, un_cache); 136 137 union_list_unlock(nhash); 138 } 139 140 void 141 union_newlower(un, lowervp) 142 struct union_node *un; 143 struct vnode *lowervp; 144 { 145 146 union_updatevp(un, un->un_uppervp, lowervp); 147 } 148 149 void 150 union_newupper(un, uppervp) 151 struct union_node *un; 152 struct vnode *uppervp; 153 { 154 155 union_updatevp(un, uppervp, un->un_lowervp); 156 } 157 158 /* 159 * allocate a union_node/vnode pair. the vnode is 160 * referenced and locked. the new vnode is returned 161 * via (vpp). (mp) is the mountpoint of the union filesystem, 162 * (dvp) is the parent directory where the upper layer object 163 * should exist (but doesn't) and (cnp) is the componentname 164 * information which is partially copied to allow the upper 165 * layer object to be created at a later time. (uppervp) 166 * and (lowervp) reference the upper and lower layer objects 167 * being mapped. either, but not both, can be nil. 168 * if supplied, (uppervp) is locked. 169 * the reference is either maintained in the new union_node 170 * object which is allocated, or they are vrele'd. 171 * 172 * all union_nodes are maintained on a singly-linked 173 * list. new nodes are only allocated when they cannot 174 * be found on this list. entries on the list are 175 * removed when the vfs reclaim entry is called. 176 * 177 * a single lock is kept for the entire list. this is 178 * needed because the getnewvnode() function can block 179 * waiting for a vnode to become free, in which case there 180 * may be more than one process trying to get the same 181 * vnode. this lock is only taken if we are going to 182 * call getnewvnode, since the kernel itself is single-threaded. 183 * 184 * if an entry is found on the list, then call vget() to 185 * take a reference. this is done because there may be 186 * zero references to it and so it needs to removed from 187 * the vnode free list. 188 */ 189 int 190 union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp) 191 struct vnode **vpp; 192 struct mount *mp; 193 struct vnode *undvp; 194 struct vnode *dvp; /* may be null */ 195 struct componentname *cnp; /* may be null */ 196 struct vnode *uppervp; /* may be null */ 197 struct vnode *lowervp; /* may be null */ 198 { 199 int error; 200 struct union_node *un; 201 struct union_node **pp; 202 struct vnode *xlowervp = NULLVP; 203 int hash; 204 int try; 205 206 if (uppervp == NULLVP && lowervp == NULLVP) 207 panic("union: unidentifiable allocation"); 208 209 if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { 210 xlowervp = lowervp; 211 lowervp = NULLVP; 212 } 213 214 loop: 215 for (try = 0; try < 3; try++) { 216 switch (try) { 217 case 0: 218 if (lowervp == NULLVP) 219 continue; 220 hash = UNION_HASH(uppervp, lowervp); 221 break; 222 223 case 1: 224 if (uppervp == NULLVP) 225 continue; 226 hash = UNION_HASH(uppervp, NULLVP); 227 break; 228 229 case 2: 230 if (lowervp == NULLVP) 231 continue; 232 hash = UNION_HASH(NULLVP, lowervp); 233 break; 234 } 235 236 while (union_list_lock(hash)) 237 continue; 238 239 for (un = unhead[hash].lh_first; un != 0; 240 un = un->un_cache.le_next) { 241 if ((un->un_lowervp == lowervp || 242 un->un_lowervp == NULLVP) && 243 (un->un_uppervp == uppervp || 244 un->un_uppervp == NULLVP) && 245 (UNIONTOV(un)->v_mount == mp)) { 246 if (vget(UNIONTOV(un), 0)) { 247 union_list_unlock(hash); 248 goto loop; 249 } 250 break; 251 } 252 } 253 254 union_list_unlock(hash); 255 256 if (un) 257 break; 258 } 259 260 if (un) { 261 /* 262 * Obtain a lock on the union_node. 263 * uppervp is locked, though un->un_uppervp 264 * may not be. this doesn't break the locking 265 * hierarchy since in the case that un->un_uppervp 266 * is not yet locked it will be vrele'd and replaced 267 * with uppervp. 268 */ 269 270 if ((dvp != NULLVP) && (uppervp == dvp)) { 271 /* 272 * Access ``.'', so (un) will already 273 * be locked. Since this process has 274 * the lock on (uppervp) no other 275 * process can hold the lock on (un). 276 */ 277 #ifdef DIAGNOSTIC 278 if ((un->un_flags & UN_LOCKED) == 0) 279 panic("union: . not locked"); 280 else if (curproc && un->un_pid != curproc->p_pid && 281 un->un_pid > -1 && curproc->p_pid > -1) 282 panic("union: allocvp not lock owner"); 283 #endif 284 } else { 285 if (un->un_flags & UN_LOCKED) { 286 vrele(UNIONTOV(un)); 287 un->un_flags |= UN_WANT; 288 sleep((caddr_t) &un->un_flags, PINOD); 289 goto loop; 290 } 291 un->un_flags |= UN_LOCKED; 292 293 #ifdef DIAGNOSTIC 294 if (curproc) 295 un->un_pid = curproc->p_pid; 296 else 297 un->un_pid = -1; 298 #endif 299 } 300 301 /* 302 * At this point, the union_node is locked, 303 * un->un_uppervp may not be locked, and uppervp 304 * is locked or nil. 305 */ 306 307 /* 308 * Save information about the upper layer. 309 */ 310 if (uppervp != un->un_uppervp) { 311 union_newupper(un, uppervp); 312 } else if (uppervp) { 313 vrele(uppervp); 314 } 315 316 if (un->un_uppervp) { 317 un->un_flags |= UN_ULOCK; 318 un->un_flags &= ~UN_KLOCK; 319 } 320 321 /* 322 * Save information about the lower layer. 323 * This needs to keep track of pathname 324 * and directory information which union_vn_create 325 * might need. 326 */ 327 if (lowervp != un->un_lowervp) { 328 union_newlower(un, lowervp); 329 if (cnp && (lowervp != NULLVP) && 330 (lowervp->v_type == VREG)) { 331 un->un_hash = cnp->cn_hash; 332 un->un_path = malloc(cnp->cn_namelen+1, 333 M_TEMP, M_WAITOK); 334 bcopy(cnp->cn_nameptr, un->un_path, 335 cnp->cn_namelen); 336 un->un_path[cnp->cn_namelen] = '\0'; 337 VREF(dvp); 338 un->un_dirvp = dvp; 339 } 340 } else if (lowervp) { 341 vrele(lowervp); 342 } 343 *vpp = UNIONTOV(un); 344 return (0); 345 } 346 347 /* 348 * otherwise lock the vp list while we call getnewvnode 349 * since that can block. 350 */ 351 hash = UNION_HASH(uppervp, lowervp); 352 353 if (union_list_lock(hash)) 354 goto loop; 355 356 error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp); 357 if (error) { 358 if (uppervp) { 359 if (dvp == uppervp) 360 vrele(uppervp); 361 else 362 vput(uppervp); 363 } 364 if (lowervp) 365 vrele(lowervp); 366 367 goto out; 368 } 369 370 MALLOC((*vpp)->v_data, void *, sizeof(struct union_node), 371 M_TEMP, M_WAITOK); 372 373 if (uppervp) 374 (*vpp)->v_type = uppervp->v_type; 375 else 376 (*vpp)->v_type = lowervp->v_type; 377 un = VTOUNION(*vpp); 378 un->un_vnode = *vpp; 379 un->un_uppervp = uppervp; 380 un->un_lowervp = lowervp; 381 un->un_openl = 0; 382 un->un_flags = UN_LOCKED; 383 if (un->un_uppervp) 384 un->un_flags |= UN_ULOCK; 385 #ifdef DIAGNOSTIC 386 if (curproc) 387 un->un_pid = curproc->p_pid; 388 else 389 un->un_pid = -1; 390 #endif 391 if (cnp && (lowervp != NULLVP) && (lowervp->v_type == VREG)) { 392 un->un_hash = cnp->cn_hash; 393 un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); 394 bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); 395 un->un_path[cnp->cn_namelen] = '\0'; 396 VREF(dvp); 397 un->un_dirvp = dvp; 398 } else { 399 un->un_hash = 0; 400 un->un_path = 0; 401 un->un_dirvp = 0; 402 } 403 404 LIST_INSERT_HEAD(&unhead[hash], un, un_cache); 405 406 if (xlowervp) 407 vrele(xlowervp); 408 409 out: 410 union_list_unlock(hash); 411 412 return (error); 413 } 414 415 int 416 union_freevp(vp) 417 struct vnode *vp; 418 { 419 struct union_node *un = VTOUNION(vp); 420 421 LIST_REMOVE(un, un_cache); 422 423 if (un->un_uppervp) 424 vrele(un->un_uppervp); 425 if (un->un_lowervp) 426 vrele(un->un_lowervp); 427 if (un->un_dirvp) 428 vrele(un->un_dirvp); 429 if (un->un_path) 430 free(un->un_path, M_TEMP); 431 432 FREE(vp->v_data, M_TEMP); 433 vp->v_data = 0; 434 435 return (0); 436 } 437 438 /* 439 * copyfile. copy the vnode (fvp) to the vnode (tvp) 440 * using a sequence of reads and writes. both (fvp) 441 * and (tvp) are locked on entry and exit. 442 */ 443 int 444 union_copyfile(p, cred, fvp, tvp) 445 struct proc *p; 446 struct ucred *cred; 447 struct vnode *fvp; 448 struct vnode *tvp; 449 { 450 char *buf; 451 struct uio uio; 452 struct iovec iov; 453 int error = 0; 454 455 /* 456 * strategy: 457 * allocate a buffer of size MAXBSIZE. 458 * loop doing reads and writes, keeping track 459 * of the current uio offset. 460 * give up at the first sign of trouble. 461 */ 462 463 uio.uio_procp = p; 464 uio.uio_segflg = UIO_SYSSPACE; 465 uio.uio_offset = 0; 466 467 VOP_UNLOCK(fvp); /* XXX */ 468 LEASE_CHECK(fvp, p, cred, LEASE_READ); 469 VOP_LOCK(fvp); /* XXX */ 470 VOP_UNLOCK(tvp); /* XXX */ 471 LEASE_CHECK(tvp, p, cred, LEASE_WRITE); 472 VOP_LOCK(tvp); /* XXX */ 473 474 buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); 475 476 /* ugly loop follows... */ 477 do { 478 off_t offset = uio.uio_offset; 479 480 uio.uio_iov = &iov; 481 uio.uio_iovcnt = 1; 482 iov.iov_base = buf; 483 iov.iov_len = MAXBSIZE; 484 uio.uio_resid = iov.iov_len; 485 uio.uio_rw = UIO_READ; 486 error = VOP_READ(fvp, &uio, 0, cred); 487 488 if (error == 0) { 489 uio.uio_iov = &iov; 490 uio.uio_iovcnt = 1; 491 iov.iov_base = buf; 492 iov.iov_len = MAXBSIZE - uio.uio_resid; 493 uio.uio_offset = offset; 494 uio.uio_rw = UIO_WRITE; 495 uio.uio_resid = iov.iov_len; 496 497 if (uio.uio_resid == 0) 498 break; 499 500 do { 501 error = VOP_WRITE(tvp, &uio, 0, cred); 502 } while ((uio.uio_resid > 0) && (error == 0)); 503 } 504 505 } while (error == 0); 506 507 free(buf, M_TEMP); 508 return (error); 509 } 510 511 /* 512 * Create a shadow directory in the upper layer. 513 * The new vnode is returned locked. 514 * 515 * (um) points to the union mount structure for access to the 516 * the mounting process's credentials. 517 * (dvp) is the directory in which to create the shadow directory. 518 * it is unlocked on entry and exit. 519 * (cnp) is the componentname to be created. 520 * (vpp) is the returned newly created shadow directory, which 521 * is returned locked. 522 */ 523 int 524 union_mkshadow(um, dvp, cnp, vpp) 525 struct union_mount *um; 526 struct vnode *dvp; 527 struct componentname *cnp; 528 struct vnode **vpp; 529 { 530 int error; 531 struct vattr va; 532 struct proc *p = cnp->cn_proc; 533 struct componentname cn; 534 535 /* 536 * policy: when creating the shadow directory in the 537 * upper layer, create it owned by the user who did 538 * the mount, group from parent directory, and mode 539 * 777 modified by umask (ie mostly identical to the 540 * mkdir syscall). (jsp, kb) 541 */ 542 543 /* 544 * A new componentname structure must be faked up because 545 * there is no way to know where the upper level cnp came 546 * from or what it is being used for. This must duplicate 547 * some of the work done by NDINIT, some of the work done 548 * by namei, some of the work done by lookup and some of 549 * the work done by VOP_LOOKUP when given a CREATE flag. 550 * Conclusion: Horrible. 551 * 552 * The pathname buffer will be FREEed by VOP_MKDIR. 553 */ 554 cn.cn_pnbuf = malloc(cnp->cn_namelen+1, M_NAMEI, M_WAITOK); 555 bcopy(cnp->cn_nameptr, cn.cn_pnbuf, cnp->cn_namelen); 556 cn.cn_pnbuf[cnp->cn_namelen] = '\0'; 557 558 cn.cn_nameiop = CREATE; 559 cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN); 560 cn.cn_proc = cnp->cn_proc; 561 cn.cn_cred = um->um_cred; 562 cn.cn_nameptr = cn.cn_pnbuf; 563 cn.cn_namelen = cnp->cn_namelen; 564 cn.cn_hash = cnp->cn_hash; 565 cn.cn_consume = cnp->cn_consume; 566 567 VREF(dvp); 568 if (error = relookup(dvp, vpp, &cn)) 569 return (error); 570 vrele(dvp); 571 572 if (*vpp) { 573 VOP_ABORTOP(dvp, &cn); 574 VOP_UNLOCK(dvp); 575 vrele(*vpp); 576 *vpp = NULLVP; 577 return (EEXIST); 578 } 579 580 VATTR_NULL(&va); 581 va.va_type = VDIR; 582 va.va_mode = um->um_cmode; 583 584 /* LEASE_CHECK: dvp is locked */ 585 LEASE_CHECK(dvp, p, p->p_ucred, LEASE_WRITE); 586 587 error = VOP_MKDIR(dvp, vpp, &cn, &va); 588 return (error); 589 } 590 591 /* 592 * union_vn_create: creates and opens a new shadow file 593 * on the upper union layer. this function is similar 594 * in spirit to calling vn_open but it avoids calling namei(). 595 * the problem with calling namei is that a) it locks too many 596 * things, and b) it doesn't start at the "right" directory, 597 * whereas relookup is told where to start. 598 */ 599 int 600 union_vn_create(vpp, un, p) 601 struct vnode **vpp; 602 struct union_node *un; 603 struct proc *p; 604 { 605 struct vnode *vp; 606 struct ucred *cred = p->p_ucred; 607 struct vattr vat; 608 struct vattr *vap = &vat; 609 int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); 610 int error; 611 int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; 612 char *cp; 613 struct componentname cn; 614 615 *vpp = NULLVP; 616 617 /* 618 * Build a new componentname structure (for the same 619 * reasons outlines in union_mkshadow). 620 * The difference here is that the file is owned by 621 * the current user, rather than by the person who 622 * did the mount, since the current user needs to be 623 * able to write the file (that's why it is being 624 * copied in the first place). 625 */ 626 cn.cn_namelen = strlen(un->un_path); 627 cn.cn_pnbuf = (caddr_t) malloc(cn.cn_namelen, M_NAMEI, M_WAITOK); 628 bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); 629 cn.cn_nameiop = CREATE; 630 cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN); 631 cn.cn_proc = p; 632 cn.cn_cred = p->p_ucred; 633 cn.cn_nameptr = cn.cn_pnbuf; 634 cn.cn_hash = un->un_hash; 635 cn.cn_consume = 0; 636 637 VREF(un->un_dirvp); 638 if (error = relookup(un->un_dirvp, &vp, &cn)) 639 return (error); 640 vrele(un->un_dirvp); 641 642 if (vp) { 643 VOP_ABORTOP(un->un_dirvp, &cn); 644 if (un->un_dirvp == vp) 645 vrele(un->un_dirvp); 646 else 647 vput(un->un_dirvp); 648 vrele(vp); 649 return (EEXIST); 650 } 651 652 /* 653 * Good - there was no race to create the file 654 * so go ahead and create it. The permissions 655 * on the file will be 0666 modified by the 656 * current user's umask. Access to the file, while 657 * it is unioned, will require access to the top *and* 658 * bottom files. Access when not unioned will simply 659 * require access to the top-level file. 660 * TODO: confirm choice of access permissions. 661 */ 662 VATTR_NULL(vap); 663 vap->va_type = VREG; 664 vap->va_mode = cmode; 665 LEASE_CHECK(un->un_dirvp, p, cred, LEASE_WRITE); 666 if (error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap)) 667 return (error); 668 669 if (error = VOP_OPEN(vp, fmode, cred, p)) { 670 vput(vp); 671 return (error); 672 } 673 674 vp->v_writecount++; 675 *vpp = vp; 676 return (0); 677 } 678 679 int 680 union_vn_close(vp, fmode, cred, p) 681 struct vnode *vp; 682 int fmode; 683 struct ucred *cred; 684 struct proc *p; 685 { 686 if (fmode & FWRITE) 687 --vp->v_writecount; 688 return (VOP_CLOSE(vp, fmode)); 689 } 690 691 void 692 union_removed_upper(un) 693 struct union_node *un; 694 { 695 if (un->un_flags & UN_ULOCK) { 696 un->un_flags &= ~UN_ULOCK; 697 VOP_UNLOCK(un->un_uppervp); 698 } 699 700 union_newupper(un, NULLVP); 701 } 702 703 struct vnode * 704 union_lowervp(vp) 705 struct vnode *vp; 706 { 707 struct union_node *un = VTOUNION(vp); 708 709 if (un->un_lowervp && (vp->v_type == un->un_lowervp->v_type)) { 710 if (vget(un->un_lowervp, 0)) 711 return (NULLVP); 712 } 713 714 return (un->un_lowervp); 715 } 716