1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
27
28 /*
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
32 *
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
36 */
37
38 /*
39 * Directory manipulation routines.
40 *
41 * When manipulating directories, the i_rwlock provides serialization
42 * since directories cannot be mmapped. The i_contents lock is redundant.
43 */
44
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/signal.h>
50 #include <sys/cred.h>
51 #include <sys/proc.h>
52 #include <sys/disp.h>
53 #include <sys/user.h>
54 #include <sys/vfs.h>
55 #include <sys/vnode.h>
56 #include <sys/stat.h>
57 #include <sys/mode.h>
58 #include <sys/buf.h>
59 #include <sys/uio.h>
60 #include <sys/dnlc.h>
61 #include <sys/fs/ufs_inode.h>
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/mount.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_trans.h>
66 #include <sys/fs/ufs_panic.h>
67 #include <sys/fs/ufs_quota.h>
68 #include <sys/errno.h>
69 #include <sys/debug.h>
70 #include <vm/seg.h>
71 #include <sys/sysmacros.h>
72 #include <sys/cmn_err.h>
73 #include <sys/cpuvar.h>
74 #include <sys/unistd.h>
75 #include <sys/policy.h>
76
77 /*
78 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
79 */
80 #if !ISP2(DIRBLKSIZ)
81 #error "DIRBLKSIZ not a power of 2"
82 #endif
83
84 /*
85 * A virgin directory.
86 */
87 static struct dirtemplate mastertemplate = {
88 0, 12, 1, ".",
89 0, DIRBLKSIZ - 12, 2, ".."
90 };
91
/*
 * LDIRSIZ(len): number of bytes a directory record needs for a name of
 * `len' characters -- the fixed part of struct direct (everything but the
 * MAXNAMLEN + 1 byte name array) plus the name and its terminating NUL,
 * rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len): the inverse view -- the longest name that fits in
 * a record of `len' bytes once the fixed part and the NUL are subtracted.
 *
 * Both macros fully parenthesize their argument so that expressions such
 * as a conditional can be passed safely.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) & ~3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
96
/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle: the inumber occupies the low 32 bits and the offset
 * the high 32 bits.  Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines.  So we are quite safe with this limit.
 *
 * The macros below pack and unpack the handle.  Each expansion is fully
 * parenthesized, and INO_OFF_TO_H() truncates the inumber to 32 bits so
 * that a wider (or sign-extended) argument cannot spill into the offset
 * half of the handle.
 */
#define	H_TO_INO(h)	((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h)	((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | (uint32_t)(ino)))
108
/*
 * A typical on-disk directory entry averages about 16 bytes, hence
 * AV_DIRECT_SHIFT is log2(16).  The shift is only used to approximate the
 * number of entries in a directory from its size; dnlc_dir_start() needs
 * that estimate and immediately returns an error when it falls outside
 * its acceptable range of files per directory.
 */
#define	AV_DIRECT_SHIFT	4

/*
 * dnlc directory caching is requested once the directory size (i_size)
 * reaches the ufs_min_dir_cache tunable.  Caching has been found to be
 * profitable beyond roughly 1024 file names.
 */
int ufs_min_dir_cache = 1024 * (1 << AV_DIRECT_SHIFT);
124
125 /* The time point the dnlc directory caching was disabled */
126 static hrtime_t ufs_dc_disable_at;
127 /* directory caching disable duration */
128 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
129
/*
 * dirchk: when non-zero, directory entries get the full (slow) validity
 * check during a scan.  It defaults to on only in DEBUG builds.
 */
#if defined(DEBUG)
int dirchk = 1;
#else
int dirchk = 0;
#endif

/* When non-zero, failed lookups are entered in the dnlc as negative hits. */
int ufs_negative_cache = 1;

/* NOTE(review): presumably counts retries in ufs_dirremove() -- confirm. */
uint64_t ufs_dirremove_retry_cnt = 0;
137
/*
 * Forward declarations of file-local helpers, in alphabetical order.
 * These are old-style declarations: the parameter lists are deliberately
 * left unspecified here.
 */
static void dirbad();
static int dirbadname();
static int dirmangled();
static int dirprepareentry();
static int ufs_diraddentry();
static int ufs_dirclrdotdot();
static int ufs_dirempty();
static int ufs_dirfixdotdot();
static int ufs_dirmakedirect();
static int ufs_dirpurgedotdot();
static int ufs_dirrename();
static int ufs_dirscan();
150
151 /*
152 * Check accessibility of directory against inquired mode and type.
153 * Execute access is required to search the directory.
154 * Access for write is interpreted as allowing
155 * deletion of files in the directory.
156 * Note, the reader i_contents lock will be acquired in
157 * ufs_iaccess().
158 */
159 int
ufs_diraccess(struct inode * ip,int mode,struct cred * cr)160 ufs_diraccess(struct inode *ip, int mode, struct cred *cr)
161 {
162 if (((ip->i_mode & IFMT) != IFDIR) &&
163 ((ip->i_mode & IFMT) != IFATTRDIR))
164 return (ENOTDIR);
165
166 return (ufs_iaccess(ip, mode, cr, 1));
167 }
168
169 /*
170 * Look for a given name in a directory. On successful return, *ipp
171 * will point to the VN_HELD inode.
172 * The caller is responsible for checking accessibility upfront
173 * via ufs_diraccess().
174 */
175 int
ufs_dirlook(struct inode * dp,char * namep,struct inode ** ipp,struct cred * cr,int skipdnlc,int skipcaching)176 ufs_dirlook(
177 struct inode *dp,
178 char *namep,
179 struct inode **ipp,
180 struct cred *cr,
181 int skipdnlc, /* skip the 1st level dnlc */
182 int skipcaching) /* force directory caching off */
183 {
184 uint64_t handle;
185 struct fbuf *fbp; /* a buffer of directory entries */
186 struct direct *ep; /* the current directory entry */
187 struct vnode *vp;
188 struct vnode *dvp; /* directory vnode ptr */
189 struct ulockfs *ulp;
190 dcanchor_t *dcap;
191 off_t endsearch; /* offset to end directory search */
192 off_t offset;
193 off_t start_off; /* starting offset from middle search */
194 off_t last_offset; /* last offset */
195 int entryoffsetinblock; /* offset of ep in addr's buffer */
196 int numdirpasses; /* strategy for directory search */
197 int namlen; /* length of name */
198 int err;
199 int doingchk;
200 int i;
201 int caching;
202 int indeadlock;
203 ino_t ep_ino; /* entry i number */
204 ino_t chkino;
205 ushort_t ep_reclen; /* direct local d_reclen */
206
207 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
208
209 if (dp->i_ufsvfs)
210 ulp = &dp->i_ufsvfs->vfs_ulockfs;
211
212 /*
213 * Check the directory name lookup cache, first for individual files
214 * then for complete directories.
215 */
216 dvp = ITOV(dp);
217 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
218 /* vp is already held from dnlc_lookup */
219 if (vp == DNLC_NO_VNODE) {
220 VN_RELE(vp);
221 return (ENOENT);
222 }
223 *ipp = VTOI(vp);
224 return (0);
225 }
226
227 dcap = &dp->i_danchor;
228
229 /*
230 * Grab the reader lock on the directory data before checking
231 * the dnlc to avoid a race with ufs_dirremove() & friends.
232 *
233 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
234 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
235 * possible, retries the operation.
236 */
237 ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache);
238 if (indeadlock)
239 return (EAGAIN);
240
241 switch (dnlc_dir_lookup(dcap, namep, &handle)) {
242 case DFOUND:
243 ep_ino = (ino_t)H_TO_INO(handle);
244 if (dp->i_number == ep_ino) {
245 VN_HOLD(dvp); /* want ourself, "." */
246 *ipp = dp;
247 rw_exit(&dp->i_rwlock);
248 return (0);
249 }
250 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
251 uint64_t handle2;
252 /*
253 * release the lock on the dir we are searching
254 * to avoid a deadlock when grabbing the
255 * i_contents lock in ufs_iget_alloced().
256 */
257 rw_exit(&dp->i_rwlock);
258 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
259 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
260 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
261 /*
262 * must recheck as we dropped dp->i_rwlock
263 */
264 ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent);
265 if (indeadlock) {
266 if (!err)
267 VN_RELE(ITOV(*ipp));
268 return (EAGAIN);
269 }
270 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
271 == DFOUND) && (handle == handle2)) {
272 dnlc_update(dvp, namep, ITOV(*ipp));
273 rw_exit(&dp->i_rwlock);
274 return (0);
275 }
276 /* check failed, read the actual directory */
277 if (!err) {
278 VN_RELE(ITOV(*ipp));
279 }
280 goto restart;
281 }
282 /* usual case of not "." nor ".." */
283 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
284 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
285 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
286 if (err) {
287 rw_exit(&dp->i_rwlock);
288 return (err);
289 }
290 dnlc_update(dvp, namep, ITOV(*ipp));
291 rw_exit(&dp->i_rwlock);
292 return (0);
293 case DNOENT:
294 if (ufs_negative_cache && (dp->i_nlink > 0)) {
295 dnlc_enter(dvp, namep, DNLC_NO_VNODE);
296 }
297 rw_exit(&dp->i_rwlock);
298 return (ENOENT);
299 default:
300 break;
301 }
302 restart:
303
304 fbp = NULL;
305 doingchk = 0;
306 chkino = 0;
307 caching = 0;
308
309 /*
310 * Attempt to cache any directories greater than the tunable
311 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM),
312 * disable caching for this directory and record the system time.
313 * Any attempt after the disable time has expired will enable
314 * the caching again.
315 */
316 if (!skipcaching && (dp->i_size >= ufs_min_dir_cache)) {
317 /*
318 * if the directory caching disable time has expired
319 * enable the caching again.
320 */
321 if (dp->i_cachedir == CD_DISABLED_NOMEM &&
322 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
323 ufs_dc_disable_at = 0;
324 dp->i_cachedir = CD_ENABLED;
325 }
326 if (dp->i_cachedir == CD_ENABLED) {
327 switch (dnlc_dir_start(dcap, dp->i_size >>
328 AV_DIRECT_SHIFT)) {
329 case DNOMEM:
330 dp->i_cachedir = CD_DISABLED_NOMEM;
331 ufs_dc_disable_at = gethrtime();
332 break;
333 case DTOOBIG:
334 dp->i_cachedir = CD_DISABLED_TOOBIG;
335 break;
336 case DOK:
337 caching = 1;
338 break;
339 default:
340 break;
341 }
342 }
343 }
344 /*
345 * If caching we don't stop when the file has been
346 * found, but need to know later, so clear *ipp now
347 */
348 *ipp = NULL;
349
350 recheck:
351 if (caching) {
352 offset = 0;
353 entryoffsetinblock = 0;
354 numdirpasses = 1;
355 } else {
356 /*
357 * Take care to look at dp->i_diroff only once, as it
358 * may be changing due to other threads/cpus.
359 */
360 offset = dp->i_diroff;
361 if (offset > dp->i_size) {
362 offset = 0;
363 }
364 if (offset == 0) {
365 entryoffsetinblock = 0;
366 numdirpasses = 1;
367 } else {
368 start_off = offset;
369
370 entryoffsetinblock = blkoff(dp->i_fs, offset);
371 if (entryoffsetinblock != 0) {
372 err = blkatoff(dp, offset, (char **)0, &fbp);
373 if (err)
374 goto bad;
375 }
376 numdirpasses = 2;
377 }
378 }
379 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
380 namlen = strlen(namep);
381 last_offset = 0;
382
383 searchloop:
384 while (offset < endsearch) {
385 /*
386 * If offset is on a block boundary,
387 * read the next directory block.
388 * Release previous if it exists.
389 */
390 if (blkoff(dp->i_fs, offset) == 0) {
391 if (fbp != NULL) {
392 fbrelse(fbp, S_OTHER);
393 }
394 err = blkatoff(dp, offset, (char **)0, &fbp);
395 if (err)
396 goto bad;
397 entryoffsetinblock = 0;
398 }
399
400 /*
401 * If the offset to the next entry is invalid or if the
402 * next entry is a zero length record or if the record
403 * length is invalid, then skip to the next directory
404 * block. Complete validation checks are done if the
405 * record length is invalid.
406 *
407 * Full validation checks are slow so they are disabled
408 * by default. Complete checks can be run by patching
409 * "dirchk" to be true.
410 *
411 * We have to check the validity of entryoffsetinblock
412 * here because it can be set to i_diroff above.
413 */
414 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
415 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
416 (dirchk || (ep->d_reclen & 0x3)) &&
417 dirmangled(dp, ep, entryoffsetinblock, offset)) {
418 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
419 offset += i;
420 entryoffsetinblock += i;
421 if (caching) {
422 dnlc_dir_purge(dcap);
423 caching = 0;
424 }
425 continue;
426 }
427
428 ep_reclen = ep->d_reclen;
429
430 /*
431 * Add named entries and free space into the directory cache
432 */
433 if (caching) {
434 ushort_t extra;
435 off_t off2;
436
437 if (ep->d_ino == 0) {
438 extra = ep_reclen;
439 if (offset & (DIRBLKSIZ - 1)) {
440 dnlc_dir_purge(dcap);
441 dp->i_cachedir = CD_DISABLED;
442 caching = 0;
443 }
444 } else {
445 /*
446 * entries hold the previous offset except the
447 * 1st which holds the offset + 1
448 */
449 if (offset & (DIRBLKSIZ - 1)) {
450 off2 = last_offset;
451 } else {
452 off2 = offset + 1;
453 }
454 caching = (dnlc_dir_add_entry(dcap, ep->d_name,
455 INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
456 extra = ep_reclen - DIRSIZ(ep);
457 }
458 if (caching && (extra >= LDIRSIZ(1))) {
459 caching = (dnlc_dir_add_space(dcap, extra,
460 (uint64_t)offset) == DOK);
461 }
462 }
463
464 /*
465 * Check for a name match.
466 * We have the parent inode read locked with i_rwlock.
467 */
468 if (ep->d_ino && ep->d_namlen == namlen &&
469 *namep == *ep->d_name && /* fast chk 1st chr */
470 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
471
472 /*
473 * We have to release the fbp early here to avoid
474 * a possible deadlock situation where we have the
475 * fbp and want the directory inode and someone doing
476 * a ufs_direnter_* has the directory inode and wants
477 * the fbp. XXX - is this still needed?
478 */
479 ep_ino = (ino_t)ep->d_ino;
480 ASSERT(fbp != NULL);
481 fbrelse(fbp, S_OTHER);
482 fbp = NULL;
483
484 /*
485 * Atomic update (read lock held)
486 */
487 dp->i_diroff = offset;
488
489 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
490 struct timeval32 omtime;
491
492 if (caching) {
493 dnlc_dir_purge(dcap);
494 caching = 0;
495 }
496 if (doingchk) {
497 /*
498 * if the inumber didn't change
499 * continue with already found inode.
500 */
501 if (ep_ino == chkino)
502 goto checkok;
503 else {
504 VN_RELE(ITOV(*ipp));
505 /* *ipp is nulled at restart */
506 goto restart;
507 }
508 }
509 /*
510 * release the lock on the dir we are searching
511 * to avoid a deadlock when grabbing the
512 * i_contents lock in ufs_iget_alloced().
513 */
514 omtime = dp->i_mtime;
515 rw_exit(&dp->i_rwlock);
516 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
517 RW_READER);
518 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
519 cr);
520 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
521 ufs_tryirwlock(&dp->i_rwlock, RW_READER,
522 retry_disk);
523 if (indeadlock) {
524 if (!err)
525 VN_RELE(ITOV(*ipp));
526 return (EAGAIN);
527 }
528 if (err)
529 goto bad;
530 /*
531 * Since we released the lock on the directory,
532 * we must check that the same inode is still
533 * the ".." entry for this directory.
534 */
535 /*CSTYLED*/
536 if (timercmp(&omtime, &dp->i_mtime, !=)) {
537 /*
538 * Modification time changed on the
539 * directory, we must go check if
540 * the inumber changed for ".."
541 */
542 doingchk = 1;
543 chkino = ep_ino;
544 entryoffsetinblock = 0;
545 if (caching) {
546 /*
547 * Forget directory caching
548 * for this rare case
549 */
550 dnlc_dir_purge(dcap);
551 caching = 0;
552 }
553 goto recheck;
554 }
555 } else if (dp->i_number == ep_ino) {
556 VN_HOLD(dvp); /* want ourself, "." */
557 *ipp = dp;
558 if (caching) {
559 dnlc_dir_purge(dcap);
560 caching = 0;
561 }
562 } else {
563 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
564 RW_READER);
565 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
566 cr);
567 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
568 if (err)
569 goto bad;
570 }
571 checkok:
572 ASSERT(*ipp);
573 dnlc_update(dvp, namep, ITOV(*ipp));
574 /*
575 * If we are not caching then just return the entry
576 * otherwise complete loading up the cache
577 */
578 if (!caching) {
579 rw_exit(&dp->i_rwlock);
580 return (0);
581 }
582 err = blkatoff(dp, offset, (char **)0, &fbp);
583 if (err)
584 goto bad;
585 }
586 last_offset = offset;
587 offset += ep_reclen;
588 entryoffsetinblock += ep_reclen;
589 }
590 /*
591 * If we started in the middle of the directory and failed
592 * to find our target, we must check the beginning as well.
593 */
594 if (numdirpasses == 2) {
595 numdirpasses--;
596 offset = 0;
597 endsearch = start_off;
598 goto searchloop;
599 }
600
601 /*
602 * If whole directory caching is on (or was originally on) then
603 * the entry may have been found.
604 */
605 if (*ipp == NULL) {
606 err = ENOENT;
607 if (ufs_negative_cache && (dp->i_nlink > 0)) {
608 dnlc_enter(dvp, namep, DNLC_NO_VNODE);
609 }
610 }
611 if (caching) {
612 dnlc_dir_complete(dcap);
613 caching = 0;
614 }
615
616 bad:
617 if (err && *ipp) {
618 /*
619 * err and *ipp can both be set if we were attempting to
620 * cache the directory, and we found the entry, then later
621 * while trying to complete the directory cache encountered
622 * a error (eg reading a directory sector).
623 */
624 VN_RELE(ITOV(*ipp));
625 *ipp = NULL;
626 }
627
628 if (fbp)
629 fbrelse(fbp, S_OTHER);
630 rw_exit(&dp->i_rwlock);
631 if (caching)
632 dnlc_dir_purge(dcap);
633 return (err);
634 }
635
636 /*
637 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
638 */
639 int
ufs_direnter_cm(struct inode * tdp,char * namep,enum de_op op,struct vattr * vap,struct inode ** ipp,struct cred * cr,int flags)640 ufs_direnter_cm(
641 struct inode *tdp, /* target directory to make entry in */
642 char *namep, /* name of entry */
643 enum de_op op, /* entry operation */
644 struct vattr *vap, /* attributes if new inode needed */
645 struct inode **ipp, /* return entered inode here */
646 struct cred *cr, /* user credentials */
647 int flags) /* no entry exists */
648 {
649 struct inode *tip; /* inode of (existing) target file */
650 char *s;
651 struct ufs_slot slot; /* slot info to pass around */
652 int namlen; /* length of name */
653 int err; /* error number */
654 struct inode *nip; /* new inode */
655 int do_rele_nip = 0; /* release nip */
656 int noentry = flags & ~IQUIET;
657 int quiet = flags & IQUIET; /* Suppress out of inodes message */
658 int indeadlock;
659 struct ulockfs *ulp;
660
661 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
662
663 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
664 ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
665 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
666 (vap->va_type == VFIFO))))
667 return (EINVAL);
668
669 /* don't allow '/' characters in pathname component */
670 for (s = namep, namlen = 0; *s; s++, namlen++)
671 if (*s == '/')
672 return (EACCES);
673 ASSERT(namlen);
674
675 /*
676 * Check accessibility of target directory.
677 */
678 if (err = ufs_diraccess(tdp, IEXEC, cr))
679 return (err);
680
681 /*
682 * If name is "." or ".." then if this is a create look it up
683 * and return EEXIST.
684 */
685 if (namep[0] == '.' &&
686 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
687 /*
688 * ufs_dirlook will acquire the i_rwlock
689 */
690 if (tdp->i_ufsvfs)
691 ulp = &tdp->i_ufsvfs->vfs_ulockfs;
692 rw_exit(&tdp->i_rwlock);
693 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0, 0)) {
694 if (err == EAGAIN)
695 return (err);
696
697 /*
698 * ufs_tryirwlock uses rw_tryenter and checks for
699 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
700 * If deadlock possible, retries the operation.
701 */
702 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err);
703 if (indeadlock)
704 return (EAGAIN);
705
706 return (err);
707 }
708 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry);
709 if (indeadlock) {
710 VN_RELE(ITOV(*ipp));
711 return (EAGAIN);
712 }
713 return (EEXIST);
714 }
715
716 /*
717 * If target directory has not been removed, then we can consider
718 * allowing file to be created.
719 */
720 if (tdp->i_nlink <= 0) {
721 return (ENOENT);
722 }
723
724 /*
725 * Search for the entry. Return VN_HELD tip if found.
726 */
727 tip = NULL;
728 slot.fbp = NULL;
729 slot.status = NONE;
730 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
731 rw_enter(&tdp->i_contents, RW_WRITER);
732 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
733 if (err)
734 goto out;
735 if (tip) {
736 ASSERT(!noentry);
737 *ipp = tip;
738 err = EEXIST;
739 } else {
740 /*
741 * The entry does not exist. Check write permission in
742 * directory to see if entry can be created.
743 */
744 if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
745 goto out;
746 /*
747 * Make new inode and directory entry.
748 */
749 tdp->i_flag |= quiet;
750 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
751 if (nip != NULL)
752 do_rele_nip = 1;
753 goto out;
754 }
755 if (err = ufs_diraddentry(tdp, namep, op,
756 namlen, &slot, nip, NULL, cr)) {
757 /*
758 * Unmake the inode we just made.
759 */
760 rw_enter(&nip->i_contents, RW_WRITER);
761 if (((nip->i_mode & IFMT) == IFDIR) ||
762 ((nip->i_mode & IFMT) == IFATTRDIR)) {
763 tdp->i_nlink--;
764 ufs_setreclaim(tdp);
765 tdp->i_flag |= ICHG;
766 tdp->i_seq++;
767 TRANS_INODE(tdp->i_ufsvfs, tdp);
768 ITIMES_NOLOCK(tdp);
769 }
770 nip->i_nlink = 0;
771 ufs_setreclaim(nip);
772 TRANS_INODE(nip->i_ufsvfs, nip);
773 nip->i_flag |= ICHG;
774 nip->i_seq++;
775 ITIMES_NOLOCK(nip);
776 rw_exit(&nip->i_contents);
777 do_rele_nip = 1;
778 } else {
779 *ipp = nip;
780 }
781 }
782
783 out:
784 if (slot.fbp)
785 fbrelse(slot.fbp, S_OTHER);
786
787 tdp->i_flag &= ~quiet;
788 rw_exit(&tdp->i_contents);
789
790 /*
791 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
792 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
793 */
794 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
795
796 if (do_rele_nip) {
797 VN_RELE(ITOV(nip));
798 }
799
800 return (err);
801 }
802
803 /*
804 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
805 */
806 int
ufs_direnter_lr(struct inode * tdp,char * namep,enum de_op op,struct inode * sdp,struct inode * sip,struct cred * cr)807 ufs_direnter_lr(
808 struct inode *tdp, /* target directory to make entry in */
809 char *namep, /* name of entry */
810 enum de_op op, /* entry operation */
811 struct inode *sdp, /* source inode parent if rename */
812 struct inode *sip, /* source inode */
813 struct cred *cr) /* user credentials */
814 {
815 struct inode *tip; /* inode of (existing) target file */
816 char *s;
817 struct ufs_slot slot; /* slot info to pass around */
818 int namlen; /* length of name */
819 int err; /* error number */
820
821 /* don't allow '/' characters in pathname component */
822 for (s = namep, namlen = 0; *s; s++, namlen++)
823 if (*s == '/')
824 return (EACCES);
825 ASSERT(namlen);
826 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
827
828 /*
829 * If name is "." or ".." then if this is a create look it up
830 * and return EEXIST. Rename or link TO "." or ".." is forbidden.
831 */
832 if (namep[0] == '.' &&
833 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
834 if (op == DE_RENAME) {
835 return (EINVAL); /* *SIGH* should be ENOTEMPTY */
836 }
837 return (EEXIST);
838 }
839 /*
840 * For link and rename lock the source entry and check the link count
841 * to see if it has been removed while it was unlocked. If not, we
842 * increment the link count and force the inode to disk to make sure
843 * that it is there before any directory entry that points to it.
844 *
845 * In the case of a symbolic link, we are dealing with a new inode
846 * which does not yet have any links. We've created it with a link
847 * count of 1, and we don't want to increment it since this will be
848 * its first link.
849 *
850 * We are about to push the inode to disk. We make sure
851 * that the inode's data blocks are flushed first so the
852 * inode and it's data blocks are always in sync. This
853 * adds some robustness in in the event of a power failure
854 * or panic where sync fails. If we panic before the
855 * inode is updated, then the inode still refers to the
856 * old data blocks (or none for a new file). If we panic
857 * after the inode is updated, then the inode refers to
858 * the new data blocks.
859 *
860 * We do this before grabbing the i_contents lock because
861 * ufs_syncip() will want that lock. We could do the data
862 * syncing after the removal checks, but upon return from
863 * the data sync we would have to repeat the removal
864 * checks.
865 */
866 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
867 return (err);
868 }
869
870 rw_enter(&sip->i_contents, RW_WRITER);
871 if (sip->i_nlink <= 0) {
872 rw_exit(&sip->i_contents);
873 return (ENOENT);
874 }
875 if (sip->i_nlink == MAXLINK) {
876 rw_exit(&sip->i_contents);
877 return (EMLINK);
878 }
879
880 /*
881 * Sync the indirect blocks associated with the file
882 * for the same reasons as described above. Since this
883 * call wants the i_contents lock held for it we can do
884 * this here with no extra work.
885 */
886 if (err = ufs_sync_indir(sip)) {
887 rw_exit(&sip->i_contents);
888 return (err);
889 }
890
891 if (op != DE_SYMLINK)
892 sip->i_nlink++;
893 TRANS_INODE(sip->i_ufsvfs, sip);
894 sip->i_flag |= ICHG;
895 sip->i_seq++;
896 ufs_iupdat(sip, I_SYNC);
897 rw_exit(&sip->i_contents);
898
899 /*
900 * If target directory has not been removed, then we can consider
901 * allowing file to be created.
902 */
903 if (tdp->i_nlink <= 0) {
904 err = ENOENT;
905 goto out2;
906 }
907
908 /*
909 * Check accessibility of target directory.
910 */
911 if (err = ufs_diraccess(tdp, IEXEC, cr))
912 goto out2;
913
914 /*
915 * Search for the entry. Return VN_HELD tip if found.
916 */
917 tip = NULL;
918 slot.status = NONE;
919 slot.fbp = NULL;
920 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
921 rw_enter(&tdp->i_contents, RW_WRITER);
922 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
923 if (err)
924 goto out;
925
926 if (tip) {
927 switch (op) {
928 case DE_RENAME:
929 err = ufs_dirrename(sdp, sip, tdp, namep,
930 tip, &slot, cr);
931 break;
932
933 case DE_LINK:
934 case DE_SYMLINK:
935 /*
936 * Can't link to an existing file.
937 */
938 err = EEXIST;
939 break;
940 default:
941 break;
942 }
943 } else {
944 /*
945 * The entry does not exist. Check write permission in
946 * directory to see if entry can be created.
947 */
948 if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
949 goto out;
950 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
951 cr);
952 }
953
954 out:
955 if (slot.fbp)
956 fbrelse(slot.fbp, S_OTHER);
957
958 rw_exit(&tdp->i_contents);
959
960 /*
961 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
962 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
963 */
964 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
965
966 /*
967 * If we renamed a file over the top of an existing file,
968 * or linked a file to an existing file (or tried to),
969 * then release and delete (or just release) the inode.
970 */
971 if (tip)
972 VN_RELE(ITOV(tip));
973
974 out2:
975 if (err) {
976 /*
977 * Undo bumped link count.
978 */
979 if (op != DE_SYMLINK) {
980 rw_enter(&sip->i_contents, RW_WRITER);
981 sip->i_nlink--;
982 ufs_setreclaim(sip);
983 TRANS_INODE(sip->i_ufsvfs, sip);
984 sip->i_flag |= ICHG;
985 sip->i_seq++;
986 ITIMES_NOLOCK(sip);
987 rw_exit(&sip->i_contents);
988 }
989 }
990 return (err);
991 }
992
/*
 * Check for the existence of a name in a directory (unless noentry
 * is set), or else for an empty slot in which an entry may be made.
 * If the requested name is found, then on return *ipp points at the
 * inode and *slotp describes where the entry lives in the directory.
 * If the name is not found, then *ipp will be NULL and *slotp will
 * contain information about a directory slot in which an entry may be
 * made (either an empty slot, or the first position past the end of
 * the directory).
 * The target directory inode (tdp) is supplied write locked (both
 * i_rwlock and i_contents are asserted to be held as writer).
 *
 * This may not be used on "." or "..", but aliases of "." are ok.
 */
1006 int
ufs_dircheckforname(struct inode * tdp,char * namep,int namlen,struct ufs_slot * slotp,struct inode ** ipp,struct cred * cr,int noentry)1007 ufs_dircheckforname(
1008 struct inode *tdp, /* inode of directory being checked */
1009 char *namep, /* name we're checking for */
1010 int namlen, /* length of name, excluding null */
1011 struct ufs_slot *slotp, /* slot structure */
1012 struct inode **ipp, /* return inode if we find one */
1013 struct cred *cr,
1014 int noentry) /* noentry - just look for space */
1015 {
1016 uint64_t handle;
1017 struct fbuf *fbp; /* pointer to directory block */
1018 struct direct *ep; /* directory entry */
1019 struct direct *nep; /* next directory entry */
1020 dcanchor_t *dcap;
1021 vnode_t *dvp; /* directory vnode ptr */
1022 off_t dirsize; /* size of the directory */
1023 off_t offset; /* offset in the directory */
1024 off_t last_offset; /* last offset */
1025 off_t enduseful; /* pointer past last used dir slot */
1026 int entryoffsetinblk; /* offset of ep in fbp's buffer */
1027 int i; /* length of mangled entry */
1028 int needed;
1029 int err;
1030 int first;
1031 int caching;
1032 int stat;
1033 ino_t ep_ino;
1034 slotstat_t initstat = slotp->status;
1035
1036 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1037 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1038 ASSERT(*ipp == NULL);
1039 fbp = NULL;
1040
1041 /*
1042 * First check if there is a complete cache of the directory.
1043 */
1044 dvp = ITOV(tdp);
1045
1046 dcap = &tdp->i_danchor;
1047 if (noentry) {
1048 /*
1049 * We know from the 1st level dnlc cache that the entry
1050 * doesn't exist, so don't bother searching the directory
1051 * cache, but just look for space (possibly in the directory
1052 * cache).
1053 */
1054 stat = DNOENT;
1055 } else {
1056 stat = dnlc_dir_lookup(dcap, namep, &handle);
1057 }
1058 switch (stat) {
1059 case DFOUND:
1060 ep_ino = (ino_t)H_TO_INO(handle);
1061 if (tdp->i_number == ep_ino) {
1062 *ipp = tdp; /* we want ourself, ie "." */
1063 VN_HOLD(dvp);
1064 } else {
1065 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1066 if (err)
1067 return (err);
1068 }
1069 offset = H_TO_OFF(handle);
1070 first = 0;
1071 if (offset & 1) {
1072 /* This is the first entry in the block */
1073 first = 1;
1074 offset -= 1;
1075 ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1076 }
1077 err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1078 if (err) {
1079 VN_RELE(ITOV(*ipp));
1080 *ipp = NULL;
1081 return (err);
1082 }
1083 /*
1084 * Check the validity of the entry.
1085 * If it's bad, then throw away the cache and
1086 * continue without it. The dirmangled() routine
1087 * will then be called upon it.
1088 */
1089 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1090 VN_RELE(ITOV(*ipp));
1091 *ipp = NULL;
1092 dnlc_dir_purge(dcap);
1093 break;
1094 }
1095 /*
1096 * Remember the returned offset is the offset of the
1097 * preceding record (unless this is the 1st record
1098 * in the DIRBLKSIZ sized block (disk sector)), then it's
1099 * offset + 1. Note, no real offsets are on odd boundaries.
1100 */
1101 if (first) {
1102 ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1103 slotp->offset = offset;
1104 slotp->size = 0;
1105 slotp->ep = ep;
1106 } else {
1107 /* get the next entry */
1108 nep = (struct direct *)((char *)ep + ep->d_reclen);
1109 /*
1110 * Check the validity of this entry as well
1111 * If it's bad, then throw away the cache and
1112 * continue without it. The dirmangled() routine
1113 * will then be called upon it.
1114 */
1115 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1116 (nep->d_ino != ep_ino)) {
1117 VN_RELE(ITOV(*ipp));
1118 *ipp = NULL;
1119 dnlc_dir_purge(dcap);
1120 break;
1121 }
1122 slotp->offset = offset + ep->d_reclen;
1123 slotp->size = ep->d_reclen;
1124 slotp->ep = nep;
1125 }
1126 slotp->status = EXIST;
1127 slotp->fbp = fbp;
1128 slotp->endoff = 0;
1129 slotp->cached = 1;
1130 dnlc_update(dvp, namep, ITOV(*ipp));
1131 return (0);
1132 case DNOENT:
1133 /*
1134 * The caller gets to set the initial slot status to
1135 * indicate whether it's interested in getting a
1136 * empty slot. For example, the status can be set
1137 * to FOUND when an entry is being deleted.
1138 */
1139 ASSERT(slotp->fbp == NULL);
1140 if (slotp->status == FOUND) {
1141 return (0);
1142 }
1143 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1144 &handle)) {
1145 case DFOUND:
1146 offset = (off_t)handle;
1147 err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1148 if (err) {
1149 dnlc_dir_purge(dcap);
1150 ASSERT(*ipp == NULL);
1151 return (err);
1152 }
1153 /*
1154 * Check the validity of the entry.
1155 * If it's bad, then throw away the cache and
1156 * continue without it. The dirmangled() routine
1157 * will then be called upon it.
1158 */
1159 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1160 dnlc_dir_purge(dcap);
1161 break;
1162 }
1163 /*
1164 * Remember the returned offset is the offset of the
1165 * containing record.
1166 */
1167 slotp->status = FOUND;
1168 slotp->ep = ep;
1169 slotp->offset = offset;
1170 slotp->fbp = fbp;
1171 slotp->size = ep->d_reclen;
1172 /*
1173 * Set end offset to 0. Truncation is handled
1174 * because the dnlc cache will blow away the
1175 * cached directory when an entry is removed
1176 * that drops the entries left to less than half
			 * the minimum number (dnlc_min_dir_cache).
1178 */
1179 slotp->endoff = 0;
1180 slotp->cached = 1;
1181 return (0);
1182 case DNOENT:
1183 slotp->status = NONE;
1184 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1185 DIRBLKSIZ, u_offset_t);
1186 slotp->size = DIRBLKSIZ;
1187 slotp->endoff = 0;
1188 slotp->cached = 1;
1189 return (0);
1190 default:
1191 break;
1192 }
1193 break;
1194 }
1195 slotp->cached = 0;
1196 caching = 0;
1197 if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1198 /*
1199 * if the directory caching disable time has expired
1200 * enable caching again.
1201 */
1202 if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1203 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1204 ufs_dc_disable_at = 0;
1205 tdp->i_cachedir = CD_ENABLED;
1206 }
1207 /*
1208 * Attempt to cache any directories greater than the tunable
		 * ufs_min_dir_cache. If it fails due to memory shortage
1210 * (DNOMEM), disable caching for this directory and record
1211 * the system time. Any attempt after the disable time has
1212 * expired will enable the caching again.
1213 */
1214 if (tdp->i_cachedir == CD_ENABLED) {
1215 switch (dnlc_dir_start(dcap,
1216 tdp->i_size >> AV_DIRECT_SHIFT)) {
1217 case DNOMEM:
1218 tdp->i_cachedir = CD_DISABLED_NOMEM;
1219 ufs_dc_disable_at = gethrtime();
1220 break;
1221 case DTOOBIG:
1222 tdp->i_cachedir = CD_DISABLED_TOOBIG;
1223 break;
1224 case DOK:
1225 caching = 1;
1226 break;
1227 default:
1228 break;
1229 }
1230 }
1231 }
1232
1233 /*
1234 * No point in using i_diroff since we must search whole directory
1235 */
1236 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
1237 enduseful = 0;
1238 offset = last_offset = 0;
1239 entryoffsetinblk = 0;
1240 needed = (int)LDIRSIZ(namlen);
1241 while (offset < dirsize) {
1242 /*
1243 * If offset is on a block boundary,
1244 * read the next directory block.
1245 * Release previous if it exists.
1246 */
1247 if (blkoff(tdp->i_fs, offset) == 0) {
1248 if (fbp != NULL)
1249 fbrelse(fbp, S_OTHER);
1250
1251 err = blkatoff(tdp, offset, (char **)0, &fbp);
1252 if (err) {
1253 ASSERT(*ipp == NULL);
1254 if (caching) {
1255 dnlc_dir_purge(dcap);
1256 }
1257 return (err);
1258 }
1259 entryoffsetinblk = 0;
1260 }
1261 /*
1262 * If still looking for a slot, and at a DIRBLKSIZ
1263 * boundary, have to start looking for free space
1264 * again.
1265 */
1266 if (slotp->status == NONE &&
1267 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1268 slotp->offset = -1;
1269 }
1270 /*
1271 * If the next entry is a zero length record or if the
1272 * record length is invalid, then skip to the next
1273 * directory block. Complete validation checks are
1274 * done if the record length is invalid.
1275 *
1276 * Full validation checks are slow so they are disabled
1277 * by default. Complete checks can be run by patching
1278 * "dirchk" to be true.
1279 *
1280 * We do not have to check the validity of
1281 * entryoffsetinblk here because it starts out as zero
1282 * and is only incremented by d_reclen values that we
1283 * validate here.
1284 */
1285 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1286 if (ep->d_reclen == 0 ||
1287 (dirchk || (ep->d_reclen & 0x3)) &&
1288 dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1289 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1290 offset += i;
1291 entryoffsetinblk += i;
1292 if (caching) {
1293 dnlc_dir_purge(dcap);
1294 caching = 0;
1295 }
1296 continue;
1297 }
1298
1299 /*
1300 * Add named entries and free space into the directory cache
1301 */
1302 if (caching) {
1303 ushort_t extra;
1304 off_t off2;
1305
1306 if (ep->d_ino == 0) {
1307 extra = ep->d_reclen;
1308 if (offset & (DIRBLKSIZ - 1)) {
1309 dnlc_dir_purge(dcap);
1310 caching = 0;
1311 }
1312 } else {
1313 /*
1314 * entries hold the previous offset if
1315 * not the 1st one
1316 */
1317 if (offset & (DIRBLKSIZ - 1)) {
1318 off2 = last_offset;
1319 } else {
1320 off2 = offset + 1;
1321 }
1322 caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1323 INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1324 extra = ep->d_reclen - DIRSIZ(ep);
1325 }
1326 if (caching && (extra >= LDIRSIZ(1))) {
1327 caching = (dnlc_dir_add_space(dcap, extra,
1328 (uint64_t)offset) == DOK);
1329 }
1330 }
1331
1332 /*
1333 * If an appropriate sized slot has not yet been found,
1334 * check to see if one is available.
1335 */
1336 if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1337 int size = ep->d_reclen;
1338
1339 if (ep->d_ino != 0)
1340 size -= DIRSIZ(ep);
1341 if (size > 0) {
1342 if (size >= needed) {
1343 slotp->offset = offset;
1344 slotp->size = ep->d_reclen;
1345 if (noentry) {
1346 slotp->ep = ep;
1347 slotp->fbp = fbp;
1348 slotp->status = FOUND;
1349 slotp->endoff = 0;
1350 return (0);
1351 }
1352 slotp->status = FOUND;
1353 } else if (slotp->status == NONE) {
1354 if (slotp->offset == -1)
1355 slotp->offset = offset;
1356 }
1357 }
1358 }
1359 /*
1360 * Check for a name match.
1361 */
1362 if (ep->d_ino && ep->d_namlen == namlen &&
1363 *namep == *ep->d_name && /* fast chk 1st char */
1364 bcmp(namep, ep->d_name, namlen) == 0) {
1365
1366 tdp->i_diroff = offset;
1367
1368 if (tdp->i_number == ep->d_ino) {
1369 *ipp = tdp; /* we want ourself, ie "." */
1370 VN_HOLD(dvp);
1371 } else {
1372 err = ufs_iget_alloced(tdp->i_vfs,
1373 (ino_t)ep->d_ino, ipp, cr);
1374 if (err) {
1375 fbrelse(fbp, S_OTHER);
1376 if (caching)
1377 dnlc_dir_purge(dcap);
1378 return (err);
1379 }
1380 }
1381 slotp->status = EXIST;
1382 slotp->offset = offset;
1383 slotp->size = (int)(offset - last_offset);
1384 slotp->fbp = fbp;
1385 slotp->ep = ep;
1386 slotp->endoff = 0;
1387 if (caching)
1388 dnlc_dir_purge(dcap);
1389 return (0);
1390 }
1391 last_offset = offset;
1392 offset += ep->d_reclen;
1393 entryoffsetinblk += ep->d_reclen;
1394 if (ep->d_ino)
1395 enduseful = offset;
1396 }
1397 if (fbp) {
1398 fbrelse(fbp, S_OTHER);
1399 }
1400
1401 if (caching) {
1402 dnlc_dir_complete(dcap);
1403 slotp->cached = 1;
1404 if (slotp->status == FOUND) {
1405 if (initstat == FOUND) {
1406 return (0);
1407 }
1408 (void) dnlc_dir_rem_space_by_handle(dcap,
1409 slotp->offset);
1410 slotp->endoff = 0;
1411 return (0);
1412 }
1413 }
1414
1415 if (slotp->status == NONE) {
1416 /*
1417 * We didn't find a slot; the new directory entry should be put
1418 * at the end of the directory. Return an indication of where
1419 * this is, and set "endoff" to zero; since we're going to have
1420 * to extend the directory, we're certainly not going to
1421 * truncate it.
1422 */
1423 slotp->offset = dirsize;
1424 slotp->size = DIRBLKSIZ;
1425 slotp->endoff = 0;
1426 } else {
1427 /*
1428 * We found a slot, and will return an indication of where that
1429 * slot is, as any new directory entry will be put there.
1430 * Since that slot will become a useful entry, if the last
1431 * useful entry we found was before this one, update the offset
1432 * of the last useful entry.
1433 */
1434 if (enduseful < slotp->offset + slotp->size)
1435 enduseful = slotp->offset + slotp->size;
1436 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1437 }
1438 *ipp = NULL;
1439 return (0);
1440 }
1441
1442 uint64_t ufs_dirrename_retry_cnt;
1443
1444 /*
1445 * Rename the entry in the directory tdp so that it points to
1446 * sip instead of tip.
1447 */
1448 static int
ufs_dirrename(struct inode * sdp,struct inode * sip,struct inode * tdp,char * namep,struct inode * tip,struct ufs_slot * slotp,struct cred * cr)1449 ufs_dirrename(
1450 struct inode *sdp, /* parent directory of source */
1451 struct inode *sip, /* source inode */
1452 struct inode *tdp, /* parent directory of target */
1453 char *namep, /* entry we are trying to change */
1454 struct inode *tip, /* target inode */
1455 struct ufs_slot *slotp, /* slot for entry */
1456 struct cred *cr) /* credentials */
1457 {
1458 vnode_t *tdvp;
1459 off_t offset;
1460 int err;
1461 int doingdirectory;
1462
1463 ASSERT(sdp->i_ufsvfs != NULL);
1464 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1465 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1466 /*
1467 * Short circuit rename of something to itself.
1468 */
1469 if (sip->i_number == tip->i_number) {
1470 return (ESAME); /* special KLUDGE error code */
1471 }
1472
1473 /*
1474 * We're locking 2 peer level locks, so must use tryenter
1475 * on the 2nd to avoid deadlocks that would occur
1476 * if we renamed a->b and b->a concurrently.
1477 */
1478 retry:
1479 rw_enter(&tip->i_contents, RW_WRITER);
1480 if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1481 /*
1482 * drop tip and wait (sleep) until we stand a chance
1483 * of holding sip
1484 */
1485 rw_exit(&tip->i_contents);
1486 rw_enter(&sip->i_contents, RW_READER);
1487 /*
1488 * Reverse the lock grabs in case we have heavy
1489 * contention on the 2nd lock.
1490 */
1491 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1492 ufs_dirrename_retry_cnt++;
1493 rw_exit(&sip->i_contents);
1494 goto retry;
1495 }
1496 }
1497
1498 /*
1499 * Check that everything is on the same filesystem.
1500 */
1501 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1502 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1503 err = EXDEV; /* XXX archaic */
1504 goto out;
1505 }
1506 /*
1507 * Must have write permission to rewrite target entry.
1508 * Perform additional checks for sticky directories.
1509 */
1510 if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 ||
1511 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1512 goto out;
1513
1514 /*
1515 * Ensure source and target are compatible (both directories
1516 * or both not directories). If target is a directory it must
1517 * be empty and have no links to it; in addition it must not
1518 * be a mount point, and both the source and target must be
1519 * writable.
1520 */
1521 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1522 ((sip->i_mode & IFMT) == IFATTRDIR));
1523 if (((tip->i_mode & IFMT) == IFDIR) ||
1524 ((tip->i_mode & IFMT) == IFATTRDIR)) {
1525 if (!doingdirectory) {
1526 err = EISDIR;
1527 goto out;
1528 }
1529 /*
1530 * vn_vfsrlock will prevent mounts from using the directory
1531 * until we are done.
1532 */
1533 if (vn_vfsrlock(ITOV(tip))) {
1534 err = EBUSY;
1535 goto out;
1536 }
1537 if (vn_mountedvfs(ITOV(tip)) != NULL) {
1538 vn_vfsunlock(ITOV(tip));
1539 err = EBUSY;
1540 goto out;
1541 }
1542 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1543 vn_vfsunlock(ITOV(tip));
1544 err = EEXIST; /* SIGH should be ENOTEMPTY */
1545 goto out;
1546 }
1547 } else if (doingdirectory) {
1548 err = ENOTDIR;
1549 goto out;
1550 }
1551
1552 /*
1553 * Rewrite the inode pointer for target name entry
1554 * from the target inode (ip) to the source inode (sip).
1555 * This prevents the target entry from disappearing
1556 * during a crash. Mark the directory inode to reflect the changes.
1557 */
1558 tdvp = ITOV(tdp);
1559 slotp->ep->d_ino = (int32_t)sip->i_number;
1560 dnlc_update(tdvp, namep, ITOV(sip));
1561 if (slotp->size) {
1562 offset = slotp->offset - slotp->size;
1563 } else {
1564 offset = slotp->offset + 1;
1565 }
1566 if (slotp->cached) {
1567 (void) dnlc_dir_update(&tdp->i_danchor, namep,
1568 INO_OFF_TO_H(slotp->ep->d_ino, offset));
1569 }
1570
1571 err = TRANS_DIR(tdp, slotp->offset);
1572 if (err)
1573 fbrelse(slotp->fbp, S_OTHER);
1574 else
1575 err = ufs_fbwrite(slotp->fbp, tdp);
1576
1577 slotp->fbp = NULL;
1578 if (err) {
1579 if (doingdirectory)
1580 vn_vfsunlock(ITOV(tip));
1581 goto out;
1582 }
1583
1584 TRANS_INODE(tdp->i_ufsvfs, tdp);
1585 tdp->i_flag |= IUPD|ICHG;
1586 tdp->i_seq++;
1587 ITIMES_NOLOCK(tdp);
1588
1589 /*
1590 * Decrement the link count of the target inode.
1591 * Fix the ".." entry in sip to point to dp.
1592 * This is done after the new entry is on the disk.
1593 */
1594 tip->i_nlink--;
1595 TRANS_INODE(tip->i_ufsvfs, tip);
1596 tip->i_flag |= ICHG;
1597 tip->i_seq++;
1598 ITIMES_NOLOCK(tip);
1599 if (doingdirectory) {
1600 /*
1601 * The entry for tip no longer exists so I can unlock the
1602 * vfslock.
1603 */
1604 vn_vfsunlock(ITOV(tip));
1605 /*
1606 * Decrement target link count once more if it was a directory.
1607 */
1608 if (--tip->i_nlink != 0) {
1609 err = ufs_fault(ITOV(tip),
1610 "ufs_dirrename: target directory link count != 0 (%s)",
1611 tip->i_fs->fs_fsmnt);
1612 rw_exit(&tip->i_contents);
1613 return (err);
1614 }
1615 TRANS_INODE(tip->i_ufsvfs, tip);
1616 ufs_setreclaim(tip);
1617 /*
1618 * Renaming a directory with the parent different
1619 * requires that ".." be rewritten. The window is
1620 * still there for ".." to be inconsistent, but this
1621 * is unavoidable, and a lot shorter than when it was
1622 * done in a user process. We decrement the link
1623 * count in the new parent as appropriate to reflect
1624 * the just-removed target. If the parent is the
1625 * same, this is appropriate since the original
1626 * directory is going away. If the new parent is
1627 * different, ufs_dirfixdotdot() will bump the link count
1628 * back.
1629 */
1630 tdp->i_nlink--;
1631 ufs_setreclaim(tdp);
1632 TRANS_INODE(tdp->i_ufsvfs, tdp);
1633 tdp->i_flag |= ICHG;
1634 tdp->i_seq++;
1635 ITIMES_NOLOCK(tdp);
1636 if (sdp != tdp) {
1637 rw_exit(&tip->i_contents);
1638 rw_exit(&sip->i_contents);
1639 err = ufs_dirfixdotdot(sip, sdp, tdp);
1640 return (err);
1641 }
1642 } else
1643 ufs_setreclaim(tip);
1644 out:
1645 rw_exit(&tip->i_contents);
1646 rw_exit(&sip->i_contents);
1647 return (err);
1648 }
1649
1650 /*
1651 * Fix the ".." entry of the child directory so that it points
1652 * to the new parent directory instead of the old one. Routine
1653 * assumes that dp is a directory and that all the inodes are on
1654 * the same file system.
1655 */
1656 static int
ufs_dirfixdotdot(struct inode * dp,struct inode * opdp,struct inode * npdp)1657 ufs_dirfixdotdot(
1658 struct inode *dp, /* child directory */
1659 struct inode *opdp, /* old parent directory */
1660 struct inode *npdp) /* new parent directory */
1661 {
1662 struct fbuf *fbp;
1663 struct dirtemplate *dirp;
1664 vnode_t *dvp;
1665 int err;
1666
1667 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1668 ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1669
1670 /*
1671 * We hold the child directory's i_contents lock before calling
1672 * blkatoff so that we honor correct locking protocol which is
1673 * i_contents lock and then page lock. (blkatoff will call
1674 * ufs_getpage where we want the page lock)
1675 * We hold the child directory's i_rwlock before i_contents (as
1676 * per the locking protocol) since we are modifying the ".." entry
1677 * of the child directory.
1678 * We hold the i_rwlock and i_contents lock until we record
1679 * this directory delta to the log (via ufs_trans_dir) and have
1680 * done fbrelse.
1681 */
1682 rw_enter(&dp->i_rwlock, RW_WRITER);
1683 rw_enter(&dp->i_contents, RW_WRITER);
1684 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1685 if (err)
1686 goto bad;
1687
1688 if (dp->i_nlink <= 0 ||
1689 dp->i_size < sizeof (struct dirtemplate)) {
1690 err = ENOENT;
1691 goto bad;
1692 }
1693
1694 if (dirp->dotdot_namlen != 2 ||
1695 dirp->dotdot_name[0] != '.' ||
1696 dirp->dotdot_name[1] != '.') { /* Sanity check. */
1697 dirbad(dp, "mangled .. entry", (off_t)0);
1698 err = ENOTDIR;
1699 goto bad;
1700 }
1701
1702 /*
1703 * Increment the link count in the new parent inode and force it out.
1704 */
1705 if (npdp->i_nlink == MAXLINK) {
1706 err = EMLINK;
1707 goto bad;
1708 }
1709 npdp->i_nlink++;
1710 TRANS_INODE(npdp->i_ufsvfs, npdp);
1711 npdp->i_flag |= ICHG;
1712 npdp->i_seq++;
1713 ufs_iupdat(npdp, I_SYNC);
1714
1715 /*
1716 * Rewrite the child ".." entry and force it out.
1717 */
1718 dvp = ITOV(dp);
1719 dirp->dotdot_ino = (uint32_t)npdp->i_number;
1720 dnlc_update(dvp, "..", ITOV(npdp));
1721 (void) dnlc_dir_update(&dp->i_danchor, "..",
1722 INO_OFF_TO_H(dirp->dotdot_ino, 0));
1723
1724 err = TRANS_DIR(dp, 0);
1725 if (err)
1726 fbrelse(fbp, S_OTHER);
1727 else
1728 err = ufs_fbwrite(fbp, dp);
1729
1730 fbp = NULL;
1731 if (err)
1732 goto bad;
1733
1734 rw_exit(&dp->i_contents);
1735 rw_exit(&dp->i_rwlock);
1736
1737 /*
1738 * Decrement the link count of the old parent inode and force it out.
1739 */
1740 ASSERT(opdp);
1741 rw_enter(&opdp->i_contents, RW_WRITER);
1742 ASSERT(opdp->i_nlink > 0);
1743 opdp->i_nlink--;
1744 ufs_setreclaim(opdp);
1745 TRANS_INODE(opdp->i_ufsvfs, opdp);
1746 opdp->i_flag |= ICHG;
1747 opdp->i_seq++;
1748 ufs_iupdat(opdp, I_SYNC);
1749 rw_exit(&opdp->i_contents);
1750 return (0);
1751
1752 bad:
1753 if (fbp)
1754 fbrelse(fbp, S_OTHER);
1755 rw_exit(&dp->i_contents);
1756 rw_exit(&dp->i_rwlock);
1757 return (err);
1758 }
1759
1760 /*
1761 * Enter the file sip in the directory tdp with name namep.
1762 */
1763 static int
ufs_diraddentry(struct inode * tdp,char * namep,enum de_op op,int namlen,struct ufs_slot * slotp,struct inode * sip,struct inode * sdp,struct cred * cr)1764 ufs_diraddentry(
1765 struct inode *tdp,
1766 char *namep,
1767 enum de_op op,
1768 int namlen,
1769 struct ufs_slot *slotp,
1770 struct inode *sip,
1771 struct inode *sdp,
1772 struct cred *cr)
1773 {
1774 struct direct *ep, *nep;
1775 vnode_t *tdvp;
1776 dcanchor_t *dcap = &tdp->i_danchor;
1777 off_t offset;
1778 int err;
1779 ushort_t extra;
1780
1781 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1782 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1783 /*
1784 * Prepare a new entry. If the caller has not supplied an
1785 * existing inode, make a new one.
1786 */
1787 err = dirprepareentry(tdp, slotp, cr);
1788 if (err) {
1789 if (slotp->fbp) {
1790 fbrelse(slotp->fbp, S_OTHER);
1791 slotp->fbp = NULL;
1792 }
1793 return (err);
1794 }
1795 /*
1796 * Check inode to be linked to see if it is in the
1797 * same filesystem.
1798 */
1799 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1800 err = EXDEV;
1801 goto bad;
1802 }
1803
1804 /*
1805 * If renaming a directory then fix up the ".." entry in the
1806 * directory to point to the new parent.
1807 */
1808 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1809 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1810 err = ufs_dirfixdotdot(sip, sdp, tdp);
1811 if (err)
1812 goto bad;
1813 }
1814
1815 /*
1816 * Fill in entry data.
1817 */
1818 ep = slotp->ep;
1819 ep->d_namlen = (ushort_t)namlen;
1820 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1821 ep->d_ino = (uint32_t)sip->i_number;
1822 tdvp = ITOV(tdp);
1823 dnlc_update(tdvp, namep, ITOV(sip));
1824 /*
1825 * Note the offset supplied for any named entry is
1826 * the offset of the previous one, unless it's the 1st.
1827 * slotp->size is used to pass the length to
1828 * the previous entry.
1829 */
1830 if (slotp->size) {
1831 offset = slotp->offset - slotp->size;
1832 } else {
1833 offset = slotp->offset + 1;
1834 }
1835
1836 if (slotp->cached) {
1837 /*
1838 * Add back any usable unused space to the dnlc directory
1839 * cache.
1840 */
1841 extra = ep->d_reclen - DIRSIZ(ep);
1842 if (extra >= LDIRSIZ(1)) {
1843 (void) dnlc_dir_add_space(dcap, extra,
1844 (uint64_t)slotp->offset);
1845 }
1846
1847 (void) dnlc_dir_add_entry(dcap, namep,
1848 INO_OFF_TO_H(ep->d_ino, offset));
1849
1850 /* adjust the previous offset of the next entry */
1851 nep = (struct direct *)((char *)ep + ep->d_reclen);
1852 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1853 /*
1854 * Not a new block.
1855 *
1856 * Check the validity of the next entry.
1857 * If it's bad, then throw away the cache, and
1858 * continue as before directory caching.
1859 */
1860 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1861 dnlc_dir_update(dcap, nep->d_name,
1862 INO_OFF_TO_H(nep->d_ino, slotp->offset))
1863 == DNOENT) {
1864 dnlc_dir_purge(dcap);
1865 slotp->cached = 0;
1866 }
1867 }
1868 }
1869
1870 /*
1871 * Write out the directory block.
1872 */
1873 err = TRANS_DIR(tdp, slotp->offset);
1874 if (err)
1875 fbrelse(slotp->fbp, S_OTHER);
1876 else
1877 err = ufs_fbwrite(slotp->fbp, tdp);
1878
1879 slotp->fbp = NULL;
1880 /*
1881 * If this is a rename of a directory, then we have already
1882 * fixed the ".." entry to refer to the new parent. If err
1883 * is true at this point, we have failed to update the new
1884 * parent to refer to the renamed directory.
1885 * XXX - we need to unwind the ".." fix.
1886 */
1887 if (err)
1888 return (err);
1889
1890 /*
1891 * Mark the directory inode to reflect the changes.
1892 * Truncate the directory to chop off blocks of empty entries.
1893 */
1894
1895 TRANS_INODE(tdp->i_ufsvfs, tdp);
1896 tdp->i_flag |= IUPD|ICHG;
1897 tdp->i_seq++;
1898 tdp->i_diroff = 0;
1899 ITIMES_NOLOCK(tdp);
1900 /*
1901 * If the directory grew then dirprepareentry() will have
1902 * set IATTCHG in tdp->i_flag, then the directory inode must
1903 * be flushed out. This is because if fsync() is used later
1904 * the directory size must be correct, otherwise a crash would
1905 * cause fsck to move the file to lost+found. Also because later
1906 * a file may be linked in more than one directory, then there
1907 * is no way to flush the original directory. So it must be
1908 * flushed out on creation. See bug 4293809.
1909 */
1910 if (tdp->i_flag & IATTCHG) {
1911 ufs_iupdat(tdp, I_SYNC);
1912 }
1913
1914 if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1915 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1916 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
1917 cr);
1918 }
1919 }
1920
1921
1922 return (0);
1923
1924 bad:
1925 if (slotp->cached) {
1926 dnlc_dir_purge(dcap);
1927 fbrelse(slotp->fbp, S_OTHER);
1928 slotp->cached = 0;
1929 slotp->fbp = NULL;
1930 return (err);
1931 }
1932
1933 /*
1934 * Clear out entry prepared by dirprepareent.
1935 */
1936 slotp->ep->d_ino = 0;
1937 slotp->ep->d_namlen = 0;
1938
1939 /*
1940 * Don't touch err so we don't clobber the real error that got us here.
1941 */
1942 if (TRANS_DIR(tdp, slotp->offset))
1943 fbrelse(slotp->fbp, S_OTHER);
1944 else
1945 (void) ufs_fbwrite(slotp->fbp, tdp);
1946 slotp->fbp = NULL;
1947 return (err);
1948 }
1949
1950 /*
1951 * Prepare a directory slot to receive an entry.
1952 */
1953 static int
dirprepareentry(struct inode * dp,struct ufs_slot * slotp,struct cred * cr)1954 dirprepareentry(
1955 struct inode *dp, /* directory we are working in */
1956 struct ufs_slot *slotp, /* available slot info */
1957 struct cred *cr)
1958 {
1959 struct direct *ep, *nep;
1960 off_t entryend;
1961 int err;
1962 slotstat_t status = slotp->status;
1963 ushort_t dsize;
1964
1965 ASSERT((status == NONE) || (status == FOUND));
1966 ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1967 ASSERT(RW_WRITE_HELD(&dp->i_contents));
1968 /*
1969 * If we didn't find a slot, then indicate that the
1970 * new slot belongs at the end of the directory.
1971 * If we found a slot, then the new entry can be
1972 * put at slotp->offset.
1973 */
1974 entryend = slotp->offset + slotp->size;
1975 if (status == NONE) {
1976 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1977 if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1978 err = ufs_fault(ITOV(dp),
1979 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1980 " > dp->i_fs->fs_fsize: %d (%s)",
1981 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1982 return (err);
1983 }
1984 /*
1985 * Allocate the new block.
1986 */
1987 err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
1988 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1989 if (err) {
1990 return (err);
1991 }
1992 dp->i_size = entryend;
1993 TRANS_INODE(dp->i_ufsvfs, dp);
1994 dp->i_flag |= IUPD|ICHG|IATTCHG;
1995 dp->i_seq++;
1996 ITIMES_NOLOCK(dp);
1997 } else if (entryend > dp->i_size) {
1998 /*
1999 * Adjust directory size, if needed. This should never
2000 * push the size past a new multiple of DIRBLKSIZ.
2001 * This is an artifact of the old (4.2BSD) way of initializing
2002 * directory sizes to be less than DIRBLKSIZ.
2003 */
2004 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
2005 TRANS_INODE(dp->i_ufsvfs, dp);
2006 dp->i_flag |= IUPD|ICHG|IATTCHG;
2007 dp->i_seq++;
2008 ITIMES_NOLOCK(dp);
2009 }
2010
2011 /*
2012 * Get the block containing the space for the new directory entry.
2013 */
2014 if (slotp->fbp == NULL) {
2015 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
2016 &slotp->fbp);
2017 if (err) {
2018 return (err);
2019 }
2020 }
2021 ep = slotp->ep;
2022
2023 switch (status) {
2024 case NONE:
2025 /*
2026 * No space in the directory. slotp->offset will be on a
2027 * directory block boundary and we will write the new entry
2028 * into a fresh block.
2029 */
2030 ep->d_reclen = DIRBLKSIZ;
2031 slotp->size = 0; /* length of previous entry */
2032 break;
2033 case FOUND:
2034 /*
2035 * An entry of the required size has been found. Use it.
2036 */
2037 if (ep->d_ino == 0) {
2038 /* this is the 1st record in a block */
2039 slotp->size = 0; /* length of previous entry */
2040 } else {
2041 dsize = DIRSIZ(ep);
2042 nep = (struct direct *)((char *)ep + dsize);
2043 nep->d_reclen = ep->d_reclen - dsize;
2044 ep->d_reclen = dsize;
2045 slotp->ep = nep;
2046 slotp->offset += dsize;
2047 slotp->size = dsize; /* length of previous entry */
2048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 return (0);
2054 }
2055
2056 /*
2057 * Allocate and initialize a new inode that will go into directory tdp.
2058 * This routine is called from ufs_symlink(), as well as within this file.
2059 */
2060 int
ufs_dirmakeinode(struct inode * tdp,struct inode ** ipp,struct vattr * vap,enum de_op op,struct cred * cr)2061 ufs_dirmakeinode(
2062 struct inode *tdp,
2063 struct inode **ipp,
2064 struct vattr *vap,
2065 enum de_op op,
2066 struct cred *cr)
2067 {
2068 struct inode *ip;
2069 enum vtype type;
2070 int imode; /* mode and format as in inode */
2071 ino_t ipref;
2072 int err;
2073 timestruc_t now;
2074
2075 ASSERT(vap != NULL);
2076 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2077 op == DE_SYMLINK);
2078 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2079 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2080 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2081 /*
2082 * Allocate a new inode.
2083 */
2084 type = vap->va_type;
2085 if (type == VDIR) {
2086 ipref = dirpref(tdp);
2087 } else {
2088 ipref = tdp->i_number;
2089 }
2090 if (op == DE_ATTRDIR)
2091 imode = vap->va_mode;
2092 else
2093 imode = MAKEIMODE(type, vap->va_mode);
2094 *ipp = NULL;
2095 err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2096 if (err)
2097 return (err);
2098
2099 /*
2100 * We don't need to grab vfs_dqrwlock here because it is held
2101 * in ufs_direnter_*() above us.
2102 */
2103 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2104 rw_enter(&ip->i_contents, RW_WRITER);
2105 if (ip->i_dquot != NULL) {
2106 err = ufs_fault(ITOV(ip),
2107 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2108 tdp->i_fs->fs_fsmnt);
2109 rw_exit(&ip->i_contents);
2110 return (err);
2111 }
2112 *ipp = ip;
2113 ip->i_mode = (o_mode_t)imode;
2114 if (type == VBLK || type == VCHR) {
2115 dev_t d = vap->va_rdev;
2116 dev32_t dev32;
2117
2118 /*
2119 * Don't allow a special file to be created with a
2120 * dev_t that cannot be represented by this filesystem
2121 * format on disk.
2122 */
2123 if (!cmpldev(&dev32, d)) {
2124 err = EOVERFLOW;
2125 goto fail;
2126 }
2127
2128 ITOV(ip)->v_rdev = ip->i_rdev = d;
2129
2130 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2131 ip->i_ordev = dev32; /* can't use old format */
2132 } else {
2133 ip->i_ordev = cmpdev(d);
2134 }
2135 }
2136 ITOV(ip)->v_type = type;
2137 ufs_reset_vnode(ip->i_vnode);
2138 if (type == VDIR) {
2139 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2140 } else {
2141 ip->i_nlink = 1;
2142 }
2143
2144 if (op == DE_ATTRDIR) {
2145 ip->i_uid = vap->va_uid;
2146 ip->i_gid = vap->va_gid;
2147 } else
2148 ip->i_uid = crgetuid(cr);
2149 /*
2150 * To determine the group-id of the created file:
2151 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0
2152 * clients are not likely to set the gid), then use it if
2153 * the process is privileged, belongs to the target group,
2154 * or the group is the same as the parent directory.
2155 * 2) If the filesystem was not mounted with the Old-BSD-compatible
2156 * GRPID option, and the directory's set-gid bit is clear,
2157 * then use the process's gid.
2158 * 3) Otherwise, set the group-id to the gid of the parent directory.
2159 */
2160 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2161 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2162 secpolicy_vnode_create_gid(cr) == 0)) {
2163 /*
2164 * XXX - is this only the case when a 4.0 NFS client, or a
2165 * client derived from that code, makes a call over the wire?
2166 */
2167 ip->i_gid = vap->va_gid;
2168 } else
2169 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2170
2171 /*
2172 * For SunOS 5.0->5.4, the lines below read:
2173 *
2174 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2175 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2176 *
2177 * where MAXUID was set to 60002. See notes on this in ufs_inode.c
2178 */
2179 ip->i_suid =
2180 (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
2181 ip->i_sgid =
2182 (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;
2183
2184 /*
2185 * If we're creating a directory, and the parent directory has the
2186 * set-GID bit set, set it on the new directory.
2187 * Otherwise, if the user is neither privileged nor a member of the
2188 * file's new group, clear the file's set-GID bit.
2189 */
2190 if ((tdp->i_mode & ISGID) && (type == VDIR))
2191 ip->i_mode |= ISGID;
2192 else {
2193 if ((ip->i_mode & ISGID) &&
2194 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2195 ip->i_mode &= ~ISGID;
2196 }
2197
2198 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2199 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2200 err = EOVERFLOW;
2201 goto fail;
2202 }
2203
2204 /*
2205 * Extended attribute directories are not subject to quotas.
2206 */
2207 if (op != DE_ATTRDIR)
2208 ip->i_dquot = getinoquota(ip);
2209 else
2210 ip->i_dquot = NULL;
2211
2212 if (op == DE_MKDIR || op == DE_ATTRDIR) {
2213 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2214 if (err)
2215 goto fail;
2216 }
2217
2218 /*
2219 * generate the shadow inode and attach it to the new object
2220 */
2221 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2222 (!tdp->i_shadow && !tdp->i_ufs_acl));
2223 if (tdp->i_shadow && tdp->i_ufs_acl &&
2224 (((tdp->i_mode & IFMT) == IFDIR) ||
2225 ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2226 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2227 if (err) {
2228 if (op == DE_MKDIR) {
2229 /*
2230 * clean up parent directory
2231 *
2232 * tdp->i_contents already locked from
2233 * ufs_direnter_*()
2234 */
2235 tdp->i_nlink--;
2236 TRANS_INODE(tdp->i_ufsvfs, tdp);
2237 tdp->i_flag |= ICHG;
2238 tdp->i_seq++;
2239 ufs_iupdat(tdp, I_SYNC);
2240 }
2241 goto fail;
2242 }
2243 }
2244
2245 /*
2246 * If the passed in attributes contain atime and/or mtime
2247 * settings, then use them instead of using the current
2248 * high resolution time.
2249 */
2250 if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2251 if (vap->va_mask & AT_ATIME) {
2252 ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2253 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2254 ip->i_flag &= ~IACC;
2255 } else
2256 ip->i_flag |= IACC;
2257 if (vap->va_mask & AT_MTIME) {
2258 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2259 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2260 gethrestime(&now);
2261 if (now.tv_sec > TIME32_MAX) {
2262 /*
2263 * In 2038, ctime sticks forever..
2264 */
2265 ip->i_ctime.tv_sec = TIME32_MAX;
2266 ip->i_ctime.tv_usec = 0;
2267 } else {
2268 ip->i_ctime.tv_sec = now.tv_sec;
2269 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2270 }
2271 ip->i_flag &= ~(IUPD|ICHG);
2272 ip->i_flag |= IMODTIME;
2273 } else
2274 ip->i_flag |= IUPD|ICHG;
2275 ip->i_flag |= IMOD;
2276 } else
2277 ip->i_flag |= IACC|IUPD|ICHG;
2278 ip->i_seq++;
2279
2280 /*
2281 * If this is an attribute tag it as one.
2282 */
2283 if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2284 ip->i_cflags |= IXATTR;
2285 }
2286
2287 /*
2288 * push inode before it's name appears in a directory
2289 */
2290 TRANS_INODE(ip->i_ufsvfs, ip);
2291 ufs_iupdat(ip, I_SYNC);
2292 rw_exit(&ip->i_contents);
2293 return (0);
2294
2295 fail:
2296 /* Throw away inode we just allocated. */
2297 ip->i_nlink = 0;
2298 ufs_setreclaim(ip);
2299 TRANS_INODE(ip->i_ufsvfs, ip);
2300 ip->i_flag |= ICHG;
2301 ip->i_seq++;
2302 ITIMES_NOLOCK(ip);
2303 rw_exit(&ip->i_contents);
2304 return (err);
2305 }
2306
2307 /*
2308 * Write a prototype directory into the empty inode ip, whose parent is dp.
2309 */
2310 static int
ufs_dirmakedirect(struct inode * ip,struct inode * dp,int attrdir,struct cred * cr)2311 ufs_dirmakedirect(
2312 struct inode *ip, /* new directory */
2313 struct inode *dp, /* parent directory */
2314 int attrdir,
2315 struct cred *cr)
2316 {
2317 struct dirtemplate *dirp;
2318 struct fbuf *fbp;
2319 int err;
2320
2321 ASSERT(RW_WRITE_HELD(&ip->i_contents));
2322 ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2323 ASSERT(RW_WRITE_HELD(&dp->i_contents));
2324 /*
2325 * Allocate space for the directory we're creating.
2326 */
2327 err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
2328 if (err)
2329 return (err);
2330 if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2331 err = ufs_fault(ITOV(dp),
2332 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2333 DIRBLKSIZ, dp->i_fs->fs_fsize,
2334 dp->i_fs->fs_fsmnt);
2335 return (err);
2336 }
2337 ip->i_size = DIRBLKSIZ;
2338 TRANS_INODE(ip->i_ufsvfs, ip);
2339 ip->i_flag |= IUPD|ICHG|IATTCHG;
2340 ip->i_seq++;
2341 ITIMES_NOLOCK(ip);
2342 /*
2343 * Update the tdp link count and write out the change.
2344 * This reflects the ".." entry we'll soon write.
2345 */
2346 if (dp->i_nlink == MAXLINK)
2347 return (EMLINK);
2348 if (attrdir == 0)
2349 dp->i_nlink++;
2350 TRANS_INODE(dp->i_ufsvfs, dp);
2351 dp->i_flag |= ICHG;
2352 dp->i_seq++;
2353 ufs_iupdat(dp, I_SYNC);
2354 /*
2355 * Initialize directory with "."
2356 * and ".." from static template.
2357 *
2358 * Since the parent directory is locked, we don't have to
2359 * worry about anything changing when we drop the write
2360 * lock on (ip).
2361 *
2362 */
2363 err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
2364 S_READ, &fbp);
2365
2366 if (err) {
2367 goto fail;
2368 }
2369 dirp = (struct dirtemplate *)fbp->fb_addr;
2370 /*
2371 * Now initialize the directory we're creating
2372 * with the "." and ".." entries.
2373 */
2374 *dirp = mastertemplate; /* structure assignment */
2375 dirp->dot_ino = (uint32_t)ip->i_number;
2376 dirp->dotdot_ino = (uint32_t)dp->i_number;
2377
2378 err = TRANS_DIR(ip, 0);
2379 if (err) {
2380 fbrelse(fbp, S_OTHER);
2381 goto fail;
2382 }
2383
2384 err = ufs_fbwrite(fbp, ip);
2385 if (err) {
2386 goto fail;
2387 }
2388
2389 return (0);
2390
2391 fail:
2392 if (attrdir == 0)
2393 dp->i_nlink--;
2394 TRANS_INODE(dp->i_ufsvfs, dp);
2395 dp->i_flag |= ICHG;
2396 dp->i_seq++;
2397 ufs_iupdat(dp, I_SYNC);
2398 return (err);
2399 }
2400
/*
 * Delete the directory entry "namep" from directory dp.
 *
 * Arguments:
 *	dp	directory to remove the entry from; the caller must hold
 *		dp->i_rwlock as writer (asserted below).
 *	namep	name of the entry; must not be empty, "." or "..".
 *	oip	if non-NULL, the entry found must still refer to this inode,
 *		otherwise ENOENT is returned.
 *	cdir	vnode compared against the victim for DR_RMDIR, so that the
 *		current directory is not removed.
 *	op	DR_RMDIR, DR_REMOVE or DR_RENAME; selects the extra checks
 *		that are applied before the entry is removed.
 *	cr	credentials used for the access and policy checks.
 *
 * If the victim is a directory whose i_rwlock cannot be obtained without
 * blocking, every lock taken here *and* dp->i_rwlock are dropped, the
 * thread sleeps for 2 ticks, dp->i_rwlock is taken again and the whole
 * operation is retried.
 *
 * Returns 0 on success, otherwise an errno value:
 *	ENOENT	empty name, no such entry, or entry does not match oip
 *	EINVAL	name is ".", or rmdir of dp itself / of cdir
 *	EEXIST	name is "..", or rmdir of a non-empty directory
 *	EBUSY	vn_vfsrlock() failed, or something is mounted on the victim
 *		(and op is not DR_RENAME)
 *	ENOTDIR	rmdir of something that is not a directory
 *	EPERM	unlink of a directory refused by secpolicy_fs_linkdir()
 * or whatever ufs_diraccess(), ufs_dircheckforname(),
 * ufs_sticky_remove_access(), TRANS_DIR() or ufs_fbwrite() return.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr)
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0) {
		struct fs *fs = dp->i_fs;

		/* Warn, trip the assertion on DEBUG kernels, and refuse. */
		cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove"
		    " nameless file in directory (directory inode %llu)",
		    fs->fs_fsmnt, (u_longlong_t)dp->i_number);
		ASSERT(namlen != 0);

		return (ENOENT);
	}

	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));

retry:
	/*
	 * Check accessibility of directory.
	 */
	if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr))
		return (err);

	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);

	/*
	 * Look the name up.  On success ip is the entry's inode (or NULL
	 * if there is no such entry) and slot describes where it lives.
	 */
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	/* The caller expected a particular inode; make sure it matches. */
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	/*
	 * Remember the file type now; it decides below which locks have
	 * to be released on each exit path.
	 */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfsrlock() prevents races between mount and rmdir.
		 */
		if (vn_vfsrlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE) {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			/*
			 * NOTE(review): released unconditionally; this
			 * presumes v_type == VDIR implies the i_rwlock was
			 * taken in the IFDIR/IFATTRDIR branch above.
			 */
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	/* Mark the on-disk slot as free. */
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		/*
		 * The directory has a dnlc directory cache: keep its name
		 * and free-space records in step with the on-disk change.
		 */
		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			/*
			 * A free previous entry that does not start a
			 * directory block: give up on the cache and skip
			 * the collapse altogether.
			 */
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			/* Free space the previous entry already carried. */
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			/* First entry of a block: it simply becomes free. */
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Log and write the modified directory block.  On a TRANS_DIR()
	 * failure the buffer is released here; otherwise it is handed to
	 * ufs_fbwrite().  Either way slot.fbp must not be touched again.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	/* Push the inodes synchronously only when TRANS_ISTRANS() is false. */
	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/* Undo the vn_vfsrlock() taken for directories. */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip)
		VN_RELE(vp);

	return (err);
}
2757
2758 /*
2759 * Return buffer with contents of block "offset"
2760 * from the beginning of directory "ip". If "res"
2761 * is non-zero, fill it in with a pointer to the
2762 * remaining space in the directory.
2763 *
2764 */
2765
2766 int
blkatoff(struct inode * ip,off_t offset,char ** res,struct fbuf ** fbpp)2767 blkatoff(
2768 struct inode *ip,
2769 off_t offset,
2770 char **res,
2771 struct fbuf **fbpp)
2772 {
2773 struct fs *fs;
2774 struct fbuf *fbp;
2775 daddr_t lbn;
2776 uint_t bsize;
2777 int err;
2778
2779 CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2780 fs = ip->i_fs;
2781 lbn = (daddr_t)lblkno(fs, offset);
2782 bsize = (uint_t)blksize(fs, ip, lbn);
2783 err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2784 bsize, S_READ, &fbp);
2785 if (err) {
2786 *fbpp = (struct fbuf *)NULL;
2787 return (err);
2788 }
2789 if (res)
2790 *res = fbp->fb_addr + blkoff(fs, offset);
2791 *fbpp = fbp;
2792 return (0);
2793 }
2794
2795 /*
2796 * Do consistency checking:
2797 * record length must be multiple of 4
2798 * entry must fit in rest of its DIRBLKSIZ block
2799 * record must be large enough to contain entry
2800 * name is not longer than MAXNAMLEN
2801 * name must be as long as advertised, and null terminated
2802 * NOTE: record length must not be zero (should be checked previously).
2803 * This routine is only called if dirchk is true.
2804 * It would be nice to set the FSBAD flag in the super-block when
2805 * this routine fails so that a fsck is forced on next reboot,
2806 * but locking is a problem.
2807 */
2808 static int
dirmangled(struct inode * dp,struct direct * ep,int entryoffsetinblock,off_t offset)2809 dirmangled(
2810 struct inode *dp,
2811 struct direct *ep,
2812 int entryoffsetinblock,
2813 off_t offset)
2814 {
2815 int i;
2816
2817 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2818 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2819 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2820 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2821 dirbad(dp, "mangled entry", offset);
2822 return (1);
2823 }
2824 return (0);
2825 }
2826
2827 static void
dirbad(struct inode * ip,char * how,off_t offset)2828 dirbad(struct inode *ip, char *how, off_t offset)
2829 {
2830 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2831 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
2832 }
2833
/*
 * Verify that the name at "sp" is exactly "l" characters long.
 *
 * Returns 1 if a null byte occurs within the first "l" characters.
 * Otherwise returns the character that follows them, which is zero
 * (name is good) only when the name is properly null terminated.
 */
static int
dirbadname(char *sp, int l)
{
	/* No null may appear inside the advertised length. */
	for (; l != 0; l--) {
		if (*sp == '\0')
			return (1);
		sp++;
	}

	/* The byte right after the name must be the terminator. */
	return (*sp);
}
2844
2845 /*
2846 * Check if a directory is empty or not.
2847 */
2848 static int
ufs_dirempty(struct inode * ip,ino_t parentino,struct cred * cr)2849 ufs_dirempty(
2850 struct inode *ip,
2851 ino_t parentino,
2852 struct cred *cr)
2853 {
2854 return (ufs_dirscan(ip, parentino, cr, 0));
2855 }
2856
2857 /*
2858 * clear the .. directory entry.
2859 */
2860 static int
ufs_dirpurgedotdot(struct inode * ip,ino_t parentino,struct cred * cr)2861 ufs_dirpurgedotdot(
2862 struct inode *ip,
2863 ino_t parentino,
2864 struct cred *cr)
2865 {
2866 return (ufs_dirscan(ip, parentino, cr, 1));
2867 }
2868
2869 /*
2870 * Scan the directoy. If clr_dotdot is true clear the ..
2871 * directory else check to see if the directory is empty.
2872 *
2873 * clr_dotdot is used as a flag to tell us if we need
2874 * to clear the dotdot entry
2875 *
2876 * N.B.: does not handle corrupted directories.
2877 */
2878 static int
ufs_dirscan(struct inode * ip,ino_t parentino,struct cred * cr,int clr_dotdot)2879 ufs_dirscan(
2880 struct inode *ip,
2881 ino_t parentino,
2882 struct cred *cr,
2883 int clr_dotdot)
2884 {
2885 offset_t off;
2886 struct tmp_dir dbuf, *dp;
2887 int err, count;
2888 int empty = 1; /* Assume it's empty */
2889
2890 dp = &dbuf;
2891 ASSERT(RW_LOCK_HELD(&ip->i_contents));
2892
2893 ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2894 for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2895 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2896 sizeof (struct tmp_dir), off, UIO_SYSSPACE, &count, cr);
2897 /*
2898 * Since we read sizeof (struct tmp_dir), residual must
2899 * be 0 unless we're at end of file.
2900 */
2901 if (err || count != 0 || dp->d_reclen == 0) {
2902 empty = 0;
2903 break;
2904 }
2905 /* skip empty entries */
2906 if (dp->d_ino == 0)
2907 continue;
2908 /* accept only "." and ".." */
2909 if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2910 empty = 0;
2911 break;
2912 }
2913 /*
2914 * At this point d_namlen must be 1 or 2.
2915 * 1 implies ".", 2 implies ".." if second
2916 * char is also "."
2917 */
2918 if (dp->d_namlen == 1)
2919 continue;
2920 if (dp->d_name[1] == '.' &&
2921 (ino_t)dp->d_ino == parentino) {
2922 /*
2923 * If we're doing a purge we need to check for
2924 * the . and .. entries and clear the d_ino for ..
2925 *
2926 * if clr_dotdot is set ufs_dirscan does not
2927 * check for an empty directory.
2928 */
2929 if (clr_dotdot) {
2930 /*
2931 * Have to actually zap the ..
2932 * entry in the directory, as
2933 * otherwise someone might have
2934 * dp as its cwd and try to
2935 * open .., which now points to
2936 * an unallocated inode.
2937 */
2938 empty = ufs_dirclrdotdot(ip, parentino);
2939 break;
2940 } else {
2941 continue;
2942 }
2943 }
2944 empty = 0;
2945 break;
2946 }
2947 return (empty);
2948 }
2949
/*
 * Tunables/counters used by ufs_dircheckpath() when it fails to get a
 * parent directory's i_rwlock with rw_tryenter():
 *   retry_backoff_delay  - value passed to delay() before trying again.
 *   dircheck_retry_cnt   - incremented once for every such retry.
 */
clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * Starting at "target", the ".." entry in the first block of each
 * directory is followed upwards until one of the following happens:
 *	- ".." is source_ino: the source is an ancestor, return EINVAL;
 *	- ".." is UFSROOTINO: the root was reached, return 0;
 *	- the parent inode is sdp: the walk stops with the error value of
 *	  the successful lookup, i.e. 0;
 *	- a directory on the way is malformed: return ENOTDIR;
 *	- blkatoff() or ufs_iget_alloced() fails: return that error;
 *	- a parent's i_rwlock cannot be obtained and a writer is waiting
 *	  for it: return EAGAIN.
 *
 * Arguments:
 *	source_ino	inode number of the directory being moved
 *	target		directory to start from; its i_rwlock is held by
 *			the caller (asserted) and is never released here
 *	sdp		parent directory of the source; its i_rwlock is held
 *			by the caller (asserted)
 *	cr		credentials passed to ufs_iget_alloced()
 *
 * Every intermediate directory is held and read-locked only while it is
 * being examined; both are released before moving on or returning.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	/* Moving a directory into itself is never allowed. */
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	/* The root has no ancestors, so there is nothing to check. */
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs *vfs;

		/* Map the first block of ip, which holds "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting the source; err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Save what is needed from ip before it may be released. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		/* target stays locked and held for the caller. */
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* Nothing is held any more; skip cleanup at out. */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				/*
				 * fbp was released and set to NULL earlier
				 * in this iteration, so this is defensive.
				 */
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/* Only a hold exists on ip here, no lock. */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	/* Drop the buffer still mapped when the loop was left. */
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/* Release the last intermediate directory, but never target. */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}
3102
3103 int
ufs_xattrdirempty(struct inode * ip,ino_t parentino,struct cred * cr)3104 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3105 {
3106 offset_t off;
3107 struct tmp_dir dbuf, *dp;
3108 int err, count;
3109 int empty = 1; /* Assume it's empty */
3110
3111 dp = &dbuf;
3112 ASSERT(RW_LOCK_HELD(&ip->i_contents));
3113
3114 ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3115 for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3116 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3117 sizeof (struct tmp_dir), off, UIO_SYSSPACE, &count, cr);
3118 /*
3119 * Since we read sizeof (struct tmp_dir), residual must
3120 * be 0 unless we're at end of file.
3121 */
3122
3123 if (err || count != 0 || dp->d_reclen == 0) {
3124 empty = 0;
3125 break;
3126 }
3127 /* skip empty entries */
3128 if (dp->d_ino == 0)
3129 continue;
3130 /*
3131 * At this point d_namlen must be 1 or 2.
3132 * 1 implies ".", 2 implies ".." if second
3133 * char is also "."
3134 */
3135
3136 if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3137 (ino_t)dp->d_ino == parentino)
3138 continue;
3139
3140 if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3141 dp->d_name[1] == '.') {
3142 continue;
3143 }
3144 empty = 0;
3145 break;
3146 }
3147 return (empty);
3148 }
3149
3150
/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
 *
 * A new attribute directory (mode IFATTRDIR) is created for tdp via
 * ufs_dirmakeinode(..., DE_ATTRDIR, ...).  It takes its uid and gid from
 * tdp; its permission bits are derived from tdp's mode (see below).
 *
 * Arguments:
 *	tdp	inode that is to own the attribute directory
 *	ipp	on the success path, set to the new attribute directory inode
 *	flags	if non-zero, the new inode's number is recorded in
 *		tdp->i_oeftflag
 *	cr	credentials for the permission check and the allocation
 *
 * Returns 0 on success, EROFS if tdp's vnode is read-only, or the error
 * from ufs_iaccess(), ufs_lockfs_begin() or ufs_dirmakeinode().  *ipp is
 * not written on the failure paths.
 *
 * If the allocation fails with ENOSPC while TRANS_ISTRANS() is true, the
 * pending deletions are drained with ufs_delete_drain_wait() and the
 * creation is attempted one more time.
 */
int
ufs_xattrmkdir(
	struct inode *tdp,
	struct inode **ipp,
	int flags,
	struct cred *cr)
{
	struct inode *ip;
	struct vattr va;
	int err;
	int retry = 1;	/* cleared once the single ENOSPC retry is used */
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp;
	int issync;
	int trans_size;
	int dorwlock;		/* 0 = not yet taken, */
				/* 1 = taken outside the transaction, */
				/* 2 = taken inside the transaction */

	/*
	 * Validate permission to create attribute directory
	 */

	if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) {
		return (err);
	}

	if (vn_is_readonly(ITOV(tdp)))
		return (EROFS);

	/*
	 * No need to re-init err after again:, since it's set before
	 * the next use of it.
	 */
again:
	dorwlock = 0;
	va.va_type = VDIR;
	va.va_uid = tdp->i_uid;
	va.va_gid = tdp->i_gid;

	/*
	 * For a directory the permission bits are copied from tdp.  For
	 * anything else the owner gets rwx, and group / other get r-x
	 * only if they have read permission on tdp.
	 */
	if ((tdp->i_mode & IFMT) == IFDIR) {
		va.va_mode = (o_mode_t)IFATTRDIR;
		va.va_mode |= tdp->i_mode & 0777;
	} else {
		va.va_mode = (o_mode_t)IFATTRDIR|0700;
		if (tdp->i_mode & 0040)
			va.va_mode |= 0750;
		if (tdp->i_mode & 0004)
			va.va_mode |= 0705;
	}
	va.va_mask = AT_TYPE|AT_MODE;

	ufsvfsp = tdp->i_ufsvfs;

	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
	if (err)
		return (err);

	/*
	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
	 * This follows the protocol for read()/write().
	 */
	if (ITOV(tdp)->v_type != VDIR) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 1;
	}

	/*
	 * The transaction is only opened when ufs_lockfs_begin() handed
	 * back a ulockfs; issync is presumably set by the macro.
	 */
	if (ulp) {
		trans_size = (int)TOP_MKDIR_SIZE(tdp);
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
	}

	/*
	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
	 * This follows the protocol established by
	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
	 */
	if (dorwlock == 0) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 2;
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);

	/*
	 * Suppress out of inodes messages if we will retry.
	 */
	if (retry)
		tdp->i_flag |= IQUIET;
	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
	tdp->i_flag &= ~IQUIET;

	if (err)
		goto fail;

	if (flags) {

		/*
		 * Now attach it to src file.
		 */

		tdp->i_oeftflag = ip->i_number;
	}

	/* Tag the new inode and its vnode as an attribute directory. */
	ip->i_cflags |= IXATTR;
	ITOV(ip)->v_flag |= V_XATTRDIR;
	/* Push the owner, then drop its content and quota locks. */
	TRANS_INODE(ufsvfsp, tdp);
	tdp->i_flag |= ICHG | IUPD;
	tdp->i_seq++;
	ufs_iupdat(tdp, I_SYNC);
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	/* Push the new attribute directory inode as well. */
	rw_enter(&ip->i_rwlock, RW_WRITER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	ip->i_flag |= ICHG| IUPD;
	ip->i_seq++;
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_rwlock);
	/*
	 * Release tdp->i_rwlock in the reverse of the order in which it
	 * was taken relative to the transaction (see dorwlock).
	 */
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		/* terr is never modified, so err is left as it is below. */
		int terr = 0;

		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
		if (err == 0)
			err = terr;
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	*ipp = ip;
	return (err);

fail:
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	/*
	 * NOTE(review): ip is not initialized in this function, so this
	 * test relies on ufs_dirmakeinode() always storing a value through
	 * its second argument before failing -- confirm.
	 */
	if (ip != NULL)
		VN_RELE(ITOV(ip));

	/*
	 * No inodes?  See if any are tied up in pending deletions.
	 * This has to be done outside of any of the above, because
	 * the draining operation can't be done from inside a transaction.
	 */
	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto again;
	}

	return (err);
}
3317
3318 /*
3319 * clear the dotdot directory entry.
3320 * Used by ufs_dirscan when clr_dotdot
3321 * flag is set and we're deleting a
3322 * directory.
3323 */
3324 static int
ufs_dirclrdotdot(struct inode * ip,ino_t parentino)3325 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3326 {
3327 struct fbuf *fbp;
3328 struct direct *dotp, *dotdotp;
3329 int err = 0;
3330
3331 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3332 ASSERT(RW_LOCK_HELD(&ip->i_contents));
3333 err = blkatoff(ip, 0, NULL, &fbp);
3334 if (err) {
3335 return (err);
3336 }
3337
3338 dotp = (struct direct *)fbp->fb_addr;
3339 if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3340 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3341 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3342 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3343 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3344
3345 dotp->d_reclen += dotdotp->d_reclen;
3346 if (parentino == dotdotp->d_ino) {
3347 dotdotp->d_ino = 0;
3348 dotdotp->d_namlen = 0;
3349 dotdotp->d_reclen = 0;
3350 }
3351
3352 err = TRANS_DIR(ip, 0);
3353 if (err) {
3354 fbrelse(fbp, S_OTHER);
3355 } else {
3356 err = ufs_fbwrite(fbp, ip);
3357 }
3358 }
3359 } else {
3360 err = -1;
3361 }
3362 return (err);
3363 }
3364