1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* Portions Copyright 2007 Jeremy Teo */
27
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/mntent.h>
36 #include <sys/u8_textprep.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/file.h>
41 #include <sys/kmem.h>
42 #include <sys/errno.h>
43 #include <sys/unistd.h>
44 #include <sys/atomic.h>
45 #include <sys/zfs_dir.h>
46 #include <sys/zfs_acl.h>
47 #include <sys/zfs_ioctl.h>
48 #include <sys/zfs_rlock.h>
49 #include <sys/zfs_fuid.h>
50 #include <sys/fs/zfs.h>
51 #include <sys/kidmap.h>
52 #endif /* _KERNEL */
53
54 #include <sys/dmu.h>
55 #include <sys/refcount.h>
56 #include <sys/stat.h>
57 #include <sys/zap.h>
58 #include <sys/zfs_znode.h>
59
60 #include "zfs_prop.h"
61
#if defined(_KERNEL) && defined(__NetBSD__)
#include <miscfs/specfs/specdev.h>
/*
 * genfs operations vector passed to genfs_node_init() for each ZFS vnode
 * (see zfs_loadvnode()); only the write hook is supplied.
 */
static const struct genfs_ops zfs_genfsops = {
	.gop_write = genfs_compat_gop_write,
};

#endif

/* Vnode operation vectors that are defined outside this file. */
extern int (**zfs_vnodeop_p)(void *);
extern int (**zfs_fifoop_p)(void *);
extern int (**zfs_specop_p)(void *);
73
/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

/* Bump a statistics counter; expands to nothing unless ZNODE_STATS is set. */
#ifdef ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */

/*
 * A pointer counts as valid when neither of its two low bits is set.
 * POINTER_INVALIDATE() marks the pointer stored at *pp as invalid by
 * setting its lowest bit (the other bits are preserved).
 */
#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
90
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
96 #ifdef _KERNEL
/*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.
 */
krwlock_t zfsvfs_lock;

/* kmem cache that backs every znode_t; created in zfs_znode_init(). */
static kmem_cache_t *znode_cache = NULL;
104
105 /*ARGSUSED*/
106 static void
znode_evict_error(dmu_buf_t * dbuf,void * user_ptr)107 znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
108 {
109 /*
110 * We should never drop all dbuf refs without first clearing
111 * the eviction callback.
112 */
113 panic("evicting znode %p\n", user_ptr);
114 }
115
116 /*ARGSUSED*/
117 static int
zfs_znode_cache_constructor(void * buf,void * arg,int kmflags)118 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
119 {
120 znode_t *zp = arg;
121
122 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
123
124 zp->z_vnode = NULL;
125
126 list_link_init(&zp->z_link_node);
127
128 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
129 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
130 rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
131 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
132
133 mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
134 avl_create(&zp->z_range_avl, zfs_range_compare,
135 sizeof (rl_t), offsetof(rl_t, r_node));
136
137 zp->z_dbuf = NULL;
138 zp->z_dirlocks = NULL;
139 zp->z_acl_cached = NULL;
140 return (0);
141 }
142
143 /*ARGSUSED*/
144 static void
zfs_znode_cache_destructor(void * buf,void * arg)145 zfs_znode_cache_destructor(void *buf, void *arg)
146 {
147 znode_t *zp = arg;
148
149 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
150 ASSERT(ZTOV(zp) == NULL);
151
152 ASSERT(!list_link_active(&zp->z_link_node));
153 mutex_destroy(&zp->z_lock);
154 rw_destroy(&zp->z_parent_lock);
155 rw_destroy(&zp->z_name_lock);
156 mutex_destroy(&zp->z_acl_lock);
157 avl_destroy(&zp->z_range_avl);
158 mutex_destroy(&zp->z_range_lock);
159
160 ASSERT(zp->z_dbuf == NULL);
161 ASSERT(zp->z_dirlocks == NULL);
162 ASSERT(zp->z_acl_cached == NULL);
163 }
164
#ifdef ZNODE_STATS
/*
 * Counters for the reasons zfs_znode_move() declined to move a znode.
 * Each one is bumped (via ZNODE_STAT_ADD) on the matching bail-out path.
 */
static struct {
	uint64_t zms_zfsvfs_invalid;	/* z_zfsvfs pointer was not valid */
	uint64_t zms_zfsvfs_recheck1;	/* changed once zfsvfs_lock was held */
	uint64_t zms_zfsvfs_unmounted;	/* file system already unmounted */
	uint64_t zms_zfsvfs_recheck2;	/* changed once z_znodes_lock held */
	uint64_t zms_obj_held;		/* object hold could not be taken */
	uint64_t zms_vnode_locked;	/* vnode v_lock could not be taken */
	uint64_t zms_not_only_dnlc;	/* v_count != 1 or vnode not in DNLC */
} znode_move_stats;
#endif	/* ZNODE_STATS */
176
177 static void
zfs_znode_move_impl(znode_t * ozp,znode_t * nzp)178 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
179 {
180 vnode_t *vp;
181
182 /* Copy fields. */
183 nzp->z_zfsvfs = ozp->z_zfsvfs;
184
185 /* Swap vnodes. */
186 vp = nzp->z_vnode;
187 nzp->z_vnode = ozp->z_vnode;
188 ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
189 ZTOV(ozp)->v_data = ozp;
190 ZTOV(nzp)->v_data = nzp;
191
192 nzp->z_id = ozp->z_id;
193 ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
194 ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
195 nzp->z_unlinked = ozp->z_unlinked;
196 nzp->z_atime_dirty = ozp->z_atime_dirty;
197 nzp->z_zn_prefetch = ozp->z_zn_prefetch;
198 nzp->z_blksz = ozp->z_blksz;
199 nzp->z_seq = ozp->z_seq;
200 nzp->z_mapcnt = ozp->z_mapcnt;
201 nzp->z_last_itx = ozp->z_last_itx;
202 nzp->z_gen = ozp->z_gen;
203 nzp->z_sync_cnt = ozp->z_sync_cnt;
204 nzp->z_phys = ozp->z_phys;
205 nzp->z_dbuf = ozp->z_dbuf;
206
207 /*
208 * Since this is just an idle znode and kmem is already dealing with
209 * memory pressure, release any cached ACL.
210 */
211 if (ozp->z_acl_cached) {
212 zfs_acl_free(ozp->z_acl_cached);
213 ozp->z_acl_cached = NULL;
214 }
215
216 /* Update back pointers. */
217 (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
218 znode_evict_error);
219
220 /*
221 * Invalidate the original znode by clearing fields that provide a
222 * pointer back to the znode. Set the low bit of the vfs pointer to
223 * ensure that zfs_znode_move() recognizes the znode as invalid in any
224 * subsequent callback.
225 */
226 ozp->z_dbuf = NULL;
227 POINTER_INVALIDATE(&ozp->z_zfsvfs);
228 }
229
230 #ifndef __NetBSD__
231 /*ARGSUSED*/
232 static kmem_cbrc_t
zfs_znode_move(void * buf,void * newbuf,size_t size,void * arg)233 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
234 {
235 znode_t *ozp = buf, *nzp = newbuf;
236 zfsvfs_t *zfsvfs;
237 vnode_t *vp;
238
239 /*
240 * The znode is on the file system's list of known znodes if the vfs
241 * pointer is valid. We set the low bit of the vfs pointer when freeing
242 * the znode to invalidate it, and the memory patterns written by kmem
243 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
244 * created znode sets the vfs pointer last of all to indicate that the
245 * znode is known and in a valid state to be moved by this function.
246 */
247 zfsvfs = ozp->z_zfsvfs;
248 if (!POINTER_IS_VALID(zfsvfs)) {
249 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
250 return (KMEM_CBRC_DONT_KNOW);
251 }
252
253 /*
254 * Close a small window in which it's possible that the filesystem could
255 * be unmounted and freed, and zfsvfs, though valid in the previous
256 * statement, could point to unrelated memory by the time we try to
257 * prevent the filesystem from being unmounted.
258 */
259 rw_enter(&zfsvfs_lock, RW_WRITER);
260 if (zfsvfs != ozp->z_zfsvfs) {
261 rw_exit(&zfsvfs_lock);
262 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
263 return (KMEM_CBRC_DONT_KNOW);
264 }
265
266 /*
267 * If the znode is still valid, then so is the file system. We know that
268 * no valid file system can be freed while we hold zfsvfs_lock, so we
269 * can safely ensure that the filesystem is not and will not be
270 * unmounted. The next statement is equivalent to ZFS_ENTER().
271 */
272 rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
273 if (zfsvfs->z_unmounted) {
274 ZFS_EXIT(zfsvfs);
275 rw_exit(&zfsvfs_lock);
276 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
277 return (KMEM_CBRC_DONT_KNOW);
278 }
279 rw_exit(&zfsvfs_lock);
280
281 mutex_enter(&zfsvfs->z_znodes_lock);
282 /*
283 * Recheck the vfs pointer in case the znode was removed just before
284 * acquiring the lock.
285 */
286 if (zfsvfs != ozp->z_zfsvfs) {
287 mutex_exit(&zfsvfs->z_znodes_lock);
288 ZFS_EXIT(zfsvfs);
289 ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
290 return (KMEM_CBRC_DONT_KNOW);
291 }
292
293 /*
294 * At this point we know that as long as we hold z_znodes_lock, the
295 * znode cannot be freed and fields within the znode can be safely
296 * accessed. Now, prevent a race with zfs_zget().
297 */
298 if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
299 mutex_exit(&zfsvfs->z_znodes_lock);
300 ZFS_EXIT(zfsvfs);
301 ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
302 return (KMEM_CBRC_LATER);
303 }
304
305 vp = ZTOV(ozp);
306 if (mutex_tryenter(&vp->v_lock) == 0) {
307 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
308 mutex_exit(&zfsvfs->z_znodes_lock);
309 ZFS_EXIT(zfsvfs);
310 ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
311 return (KMEM_CBRC_LATER);
312 }
313
314 /* Only move znodes that are referenced _only_ by the DNLC. */
315 if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
316 mutex_exit(&vp->v_lock);
317 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
318 mutex_exit(&zfsvfs->z_znodes_lock);
319 ZFS_EXIT(zfsvfs);
320 ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
321 return (KMEM_CBRC_LATER);
322 }
323
324 /*
325 * The znode is known and in a valid state to move. We're holding the
326 * locks needed to execute the critical section.
327 */
328 zfs_znode_move_impl(ozp, nzp);
329 mutex_exit(&vp->v_lock);
330 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
331
332 list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
333 mutex_exit(&zfsvfs->z_znodes_lock);
334 ZFS_EXIT(zfsvfs);
335
336 return (KMEM_CBRC_YES);
337 }
338 #endif /* !__NetBSD__ */
339
340 void
zfs_znode_init(void)341 zfs_znode_init(void)
342 {
343 /*
344 * Initialize zcache
345 */
346 rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
347 ASSERT(znode_cache == NULL);
348 znode_cache = kmem_cache_create("zfs_znode_cache",
349 sizeof (znode_t), 0, zfs_znode_cache_constructor,
350 zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
351 }
352
353 void
zfs_znode_fini(void)354 zfs_znode_fini(void)
355 {
356
357 /*
358 * Cleanup zcache
359 */
360 if (znode_cache)
361 kmem_cache_destroy(znode_cache);
362 znode_cache = NULL;
363 rw_destroy(&zfsvfs_lock);
364 }
365
#ifndef __NetBSD__
/*
 * Vnode operation vectors.  They are filled in by zfs_create_op_tables()
 * and released and reset to NULL by zfs_remove_op_tables().
 */
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
struct vnodeops *zfs_sharevnodeops;
#endif
374
375 void
zfs_remove_op_tables()376 zfs_remove_op_tables()
377 {
378 #ifndef __NetBSD__
379 /*
380 * Remove vfs ops
381 */
382 ASSERT(zfsfstype);
383 (void) vfs_freevfsops_by_type(zfsfstype);
384 zfsfstype = 0;
385
386 /*
387 * Remove vnode ops
388 */
389 if (zfs_dvnodeops)
390 vn_freevnodeops(zfs_dvnodeops);
391 if (zfs_fvnodeops)
392 vn_freevnodeops(zfs_fvnodeops);
393 if (zfs_symvnodeops)
394 vn_freevnodeops(zfs_symvnodeops);
395 if (zfs_xdvnodeops)
396 vn_freevnodeops(zfs_xdvnodeops);
397 if (zfs_evnodeops)
398 vn_freevnodeops(zfs_evnodeops);
399 if (zfs_sharevnodeops)
400 vn_freevnodeops(zfs_sharevnodeops);
401
402 zfs_dvnodeops = NULL;
403 zfs_fvnodeops = NULL;
404 zfs_symvnodeops = NULL;
405 zfs_xdvnodeops = NULL;
406 zfs_evnodeops = NULL;
407 zfs_sharevnodeops = NULL;
408 #endif
409 }
410
#ifndef __NetBSD__
/*
 * Operation templates (defined outside this file) from which
 * zfs_create_op_tables() builds the vnode operation vectors.
 */
extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];
extern const fs_operation_def_t zfs_sharevnodeops_template[];
#endif
419
420 int
zfs_create_op_tables()421 zfs_create_op_tables()
422 {
423 #ifndef __NetBSD__
424 int error;
425
426 /*
427 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
428 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
429 * In this case we just return as the ops vectors are already set up.
430 */
431 if (zfs_dvnodeops)
432 return (0);
433
434 error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
435 &zfs_dvnodeops);
436 if (error)
437 return (error);
438
439 error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
440 &zfs_fvnodeops);
441 if (error)
442 return (error);
443
444 error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
445 &zfs_symvnodeops);
446 if (error)
447 return (error);
448
449 error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
450 &zfs_xdvnodeops);
451 if (error)
452 return (error);
453
454 error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
455 &zfs_evnodeops);
456 if (error)
457 return (error);
458
459 error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
460 &zfs_sharevnodeops);
461
462 return (error);
463 #endif
464 return 0;
465 }
466
467 int
zfs_create_share_dir(zfsvfs_t * zfsvfs,dmu_tx_t * tx)468 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
469 {
470 zfs_acl_ids_t acl_ids;
471 vattr_t vattr;
472 znode_t *sharezp;
473 znode_t *zp;
474 int error;
475
476 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
477 vattr.va_type = VDIR;
478 vattr.va_mode = S_IFDIR|0555;
479 vattr.va_uid = crgetuid(kcred);
480 vattr.va_gid = crgetgid(kcred);
481
482 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
483 sharezp->z_unlinked = 0;
484 sharezp->z_atime_dirty = 0;
485 sharezp->z_zfsvfs = zfsvfs;
486
487 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
488 kcred, NULL, &acl_ids));
489 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
490 &zp, 0, &acl_ids);
491 ASSERT3P(zp, ==, sharezp);
492 #ifndef __NetBSD__
493 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
494 #endif
495 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
496 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
497 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
498 zfsvfs->z_shares_dir = sharezp->z_id;
499
500 zfs_acl_ids_free(&acl_ids);
501 dmu_buf_rele(sharezp->z_dbuf, NULL);
502 sharezp->z_dbuf = NULL;
503 kmem_cache_free(znode_cache, sharezp);
504
505 return (error);
506 }
507
/*
 * Define a couple of values we need available for both 64 and 32 bit
 * environments: the number of bits used for the minor number in the
 * 64-bit device encoding, and the masks applied to the major and minor
 * numbers (see zfs_expldev() and zfs_cmpldev()).  Each is only defined
 * here when it is not already provided.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif
521
522 /*
523 * Create special expldev for ZFS private use.
524 * Can't use standard expldev since it doesn't do
525 * what we want. The standard expldev() takes a
526 * dev32_t in LP64 and expands it to a long dev_t.
527 * We need an interface that takes a dev32_t in ILP32
528 * and expands it to a long dev_t.
529 */
530 static uint64_t
zfs_expldev(dev_t dev)531 zfs_expldev(dev_t dev)
532 {
533 return ((uint64_t)major(dev) << NBITSMINOR64) |
534 (minor_t)minor(dev);
535 }
536
537 /*
538 * Special cmpldev for ZFS private use.
539 * Can't use standard cmpldev since it takes
540 * a long dev_t and compresses it to dev32_t in
541 * LP64. We need to do a compaction of a long dev_t
542 * to a dev32_t in ILP32.
543 */
544 dev_t
zfs_cmpldev(uint64_t dev)545 zfs_cmpldev(uint64_t dev)
546 {
547 minor_t minor = (minor_t)dev & MAXMIN64;
548 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
549
550 return makedev(minor, major);
551 }
552
553 static void
zfs_znode_dmu_init(zfsvfs_t * zfsvfs,znode_t * zp,dmu_buf_t * db)554 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
555 {
556 znode_t *nzp;
557
558 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
559 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
560
561 mutex_enter(&zp->z_lock);
562
563 ASSERT(zp->z_dbuf == NULL);
564 ASSERT(zp->z_acl_cached == NULL);
565 zp->z_dbuf = db;
566 nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
567
568 /*
569 * there should be no
570 * concurrent zgets on this object.
571 */
572 if (nzp != NULL)
573 panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
574
575 /*
576 * Slap on VROOT if we are the root znode
577 */
578 if (zp->z_id == zfsvfs->z_root)
579 ZTOV(zp)->v_flag |= VROOT;
580
581 mutex_exit(&zp->z_lock);
582 vn_exists(ZTOV(zp));
583 }
584
585 void
zfs_znode_dmu_fini(znode_t * zp)586 zfs_znode_dmu_fini(znode_t *zp)
587 {
588 dmu_buf_t *db = zp->z_dbuf;
589 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
590 zp->z_unlinked ||
591 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
592 ASSERT(zp->z_dbuf != NULL);
593 zp->z_dbuf = NULL;
594 VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
595 dmu_buf_rele(db, NULL);
596 }
597
598 /*
599 * Create a new DMU object to hold a zfs znode.
600 *
601 * IN: dzp - parent directory for new znode
602 * vap - file attributes for new znode
603 * tx - dmu transaction id for zap operations
604 * cr - credentials of caller
605 * flag - flags:
606 * IS_ROOT_NODE - new object will be root
607 * IS_XATTR - new object is an attribute
608 * bonuslen - length of bonus buffer
609 * setaclp - File/Dir initial ACL
610 * fuidp - Tracks fuid allocation.
611 *
612 * OUT: zpp - allocated znode
613 *
614 */
615 void
zfs_mknode(znode_t * dzp,vattr_t * vap,dmu_tx_t * tx,cred_t * cr,uint_t flag,znode_t ** zpp,int bonuslen,zfs_acl_ids_t * acl_ids)616 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
617 uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
618 {
619 dmu_buf_t *db;
620 znode_phys_t *pzp;
621 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
622 timestruc_t now;
623 uint64_t gen, obj;
624 int err;
625
626 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
627
628 if (zfsvfs->z_replay) {
629 obj = vap->va_nodeid;
630 now = vap->va_ctime; /* see zfs_replay_create() */
631 gen = vap->va_nblocks; /* ditto */
632 } else {
633 obj = 0;
634 gethrestime(&now);
635 gen = dmu_tx_get_txg(tx);
636 }
637
638 /*
639 * Create a new DMU object.
640 */
641 /*
642 * There's currently no mechanism for pre-reading the blocks that will
643 * be to needed allocate a new object, so we accept the small chance
644 * that there will be an i/o error and we will fail one of the
645 * assertions below.
646 */
647 if (vap->va_type == VDIR) {
648 if (zfsvfs->z_replay) {
649 err = zap_create_claim_norm(zfsvfs->z_os, obj,
650 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
651 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
652 ASSERT3U(err, ==, 0);
653 } else {
654 obj = zap_create_norm(zfsvfs->z_os,
655 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
656 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
657 }
658 } else {
659 if (zfsvfs->z_replay) {
660 err = dmu_object_claim(zfsvfs->z_os, obj,
661 DMU_OT_PLAIN_FILE_CONTENTS, 0,
662 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
663 ASSERT3U(err, ==, 0);
664 } else {
665 obj = dmu_object_alloc(zfsvfs->z_os,
666 DMU_OT_PLAIN_FILE_CONTENTS, 0,
667 DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
668 }
669 }
670
671 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
672 VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
673 dmu_buf_will_dirty(db, tx);
674
675 /*
676 * Initialize the znode physical data to zero.
677 */
678 ASSERT(db->db_size >= sizeof (znode_phys_t));
679 bzero(db->db_data, db->db_size);
680 pzp = db->db_data;
681
682 /*
683 * If this is the root, fix up the half-initialized parent pointer
684 * to reference the just-allocated physical data area.
685 */
686 if (flag & IS_ROOT_NODE) {
687 dzp->z_dbuf = db;
688 dzp->z_phys = pzp;
689 dzp->z_id = obj;
690 }
691
692 /*
693 * If parent is an xattr, so am I.
694 */
695 if (dzp->z_phys->zp_flags & ZFS_XATTR)
696 flag |= IS_XATTR;
697
698 if (vap->va_type == VBLK || vap->va_type == VCHR) {
699 pzp->zp_rdev = zfs_expldev(vap->va_rdev);
700 }
701
702 if (zfsvfs->z_use_fuids)
703 pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
704
705 if (vap->va_type == VDIR) {
706 pzp->zp_size = 2; /* contents ("." and "..") */
707 pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
708 }
709
710 pzp->zp_parent = dzp->z_id;
711 if (flag & IS_XATTR)
712 pzp->zp_flags |= ZFS_XATTR;
713
714 pzp->zp_gen = gen;
715
716 ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
717 ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
718
719 if (vap->va_mask & AT_ATIME) {
720 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
721 } else {
722 ZFS_TIME_ENCODE(&now, pzp->zp_atime);
723 }
724
725 if (vap->va_mask & AT_MTIME) {
726 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
727 } else {
728 ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
729 }
730 pzp->zp_uid = acl_ids->z_fuid;
731 pzp->zp_gid = acl_ids->z_fgid;
732 pzp->zp_mode = acl_ids->z_mode;
733 if (!(flag & IS_ROOT_NODE)) {
734 struct vnode *vp;
735
736 err = vcache_get(zfsvfs->z_vfs, &obj, sizeof(obj), &vp);
737 ASSERT3U(err, ==, 0);
738 *zpp = VTOZ(vp);
739 dmu_buf_rele(db, NULL);
740 } else {
741 /*
742 * If we are creating the root node, the "parent" we
743 * passed in is the znode for the root.
744 */
745 *zpp = dzp;
746 }
747 VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
748 if (vap->va_mask & AT_XVATTR)
749 zfs_xvattr_set(*zpp, (xvattr_t *)vap);
750
751 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
752 }
753
754 void
zfs_xvattr_set(znode_t * zp,xvattr_t * xvap)755 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
756 {
757 xoptattr_t *xoap;
758
759 xoap = xva_getxoptattr(xvap);
760 ASSERT(xoap);
761
762 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
763 ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
764 XVA_SET_RTN(xvap, XAT_CREATETIME);
765 }
766 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
767 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
768 XVA_SET_RTN(xvap, XAT_READONLY);
769 }
770 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
771 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
772 XVA_SET_RTN(xvap, XAT_HIDDEN);
773 }
774 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
775 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
776 XVA_SET_RTN(xvap, XAT_SYSTEM);
777 }
778 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
779 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
780 XVA_SET_RTN(xvap, XAT_ARCHIVE);
781 }
782 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
783 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
784 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
785 }
786 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
787 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
788 XVA_SET_RTN(xvap, XAT_NOUNLINK);
789 }
790 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
791 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
792 XVA_SET_RTN(xvap, XAT_APPENDONLY);
793 }
794 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
795 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
796 XVA_SET_RTN(xvap, XAT_NODUMP);
797 }
798 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
799 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
800 XVA_SET_RTN(xvap, XAT_OPAQUE);
801 }
802 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
803 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
804 xoap->xoa_av_quarantined);
805 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
806 }
807 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
808 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
809 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
810 }
811 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
812 (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
813 sizeof (xoap->xoa_av_scanstamp));
814 zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
815 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
816 }
817 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
818 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse);
819 XVA_SET_RTN(xvap, XAT_REPARSE);
820 }
821 }
822
823 int
zfs_loadvnode(struct mount * mp,struct vnode * vp,const void * key,size_t key_len,const void ** new_key)824 zfs_loadvnode(struct mount *mp, struct vnode *vp,
825 const void *key, size_t key_len, const void **new_key)
826 {
827 uint64_t obj_num;
828 zfsvfs_t *zfsvfs;
829 dmu_object_info_t doi;
830 dmu_buf_t *db;
831 znode_t *zp;
832 int err;
833
834 KASSERT(key_len == sizeof(obj_num));
835 memcpy(&obj_num, key, key_len);
836
837 zfsvfs = mp->mnt_data;
838
839 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
840 if (err) {
841 return err;
842 }
843
844 dmu_object_info_from_db(db, &doi);
845 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
846 doi.doi_bonus_size < sizeof (znode_phys_t)) {
847 dmu_buf_rele(db, NULL);
848 return EINVAL;
849 }
850
851 KASSERT(dmu_buf_get_user(db) == NULL);
852
853 /*
854 * There is a small window where zfs_vget() could
855 * find this object while a file create is still in
856 * progress. Since a gen number can never be zero
857 * we will check that to determine if its an allocated
858 * file.
859 */
860
861 if (((znode_phys_t *)db->db_data)->zp_gen == 0) {
862 dmu_buf_rele(db, NULL);
863 return ENOENT;
864 }
865
866 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
867
868 ASSERT(zp->z_dirlocks == NULL);
869 ASSERT(zp->z_dbuf == NULL);
870 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
871
872 /*
873 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
874 * the zfs_znode_move() callback.
875 */
876 zp->z_phys = NULL;
877 zp->z_unlinked = 0;
878 zp->z_atime_dirty = 0;
879 zp->z_mapcnt = 0;
880 zp->z_last_itx = 0;
881 zp->z_id = db->db_object;
882 zp->z_blksz = doi.doi_data_block_size;
883 zp->z_seq = 0x7A4653;
884 zp->z_sync_cnt = 0;
885 zp->z_vnode = vp;
886
887 zfs_znode_dmu_init(zfsvfs, zp, db);
888
889 zp->z_gen = zp->z_phys->zp_gen;
890
891 vp->v_op = zfs_vnodeop_p;
892 vp->v_tag = VT_ZFS;
893 vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
894 vp->v_data = zp;
895 genfs_node_init(vp, &zfs_genfsops);
896 switch (vp->v_type) {
897 case VDIR:
898 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
899 break;
900 case VBLK:
901 case VCHR:
902 /* XXX NetBSD vp->v_op = zfs_specop_p; */
903 spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
904 break;
905 case VFIFO:
906 /* XXX NetBSD vp->v_op = zfs_fifoop_p; */
907 break;
908 }
909
910 dprintf("zfs_loadvnode znode %p -- vnode %p\n", zp, vp);
911 dprintf("zfs_loadvnode z_id %ld\n", zp->z_id);
912
913 uvm_vnp_setsize(vp, zp->z_phys->zp_size);
914
915 mutex_enter(&zfsvfs->z_znodes_lock);
916 list_insert_tail(&zfsvfs->z_all_znodes, zp);
917 membar_producer();
918 /*
919 * Everything else must be valid before assigning z_zfsvfs makes the
920 * znode eligible for zfs_znode_move().
921 */
922 zp->z_zfsvfs = zfsvfs;
923 mutex_exit(&zfsvfs->z_znodes_lock);
924
925 VFS_HOLD(zfsvfs->z_vfs);
926
927 *new_key = &zp->z_id;
928
929 return 0;
930 }
931
932 int
zfs_zget(zfsvfs_t * zfsvfs,uint64_t obj_num,znode_t ** zpp)933 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
934 {
935 struct vnode *vp;
936 int error;
937
938 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
939
940 error = vcache_get(zfsvfs->z_vfs, &obj_num, sizeof(obj_num), &vp);
941 if (error == 0 && VTOZ(vp)->z_unlinked) {
942 vrele(vp);
943 error = ENOENT;
944 }
945 if (error)
946 *zpp = NULL;
947 else
948 *zpp = VTOZ(vp);
949
950 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
951
952 return error;
953 }
954
955 int
zfs_rezget(znode_t * zp)956 zfs_rezget(znode_t *zp)
957 {
958 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
959 dmu_object_info_t doi;
960 dmu_buf_t *db;
961 uint64_t obj_num = zp->z_id;
962 int err;
963
964 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
965
966 err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
967 if (err) {
968 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
969 return (err);
970 }
971
972 dmu_object_info_from_db(db, &doi);
973 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
974 doi.doi_bonus_size < sizeof (znode_phys_t)) {
975 dmu_buf_rele(db, NULL);
976 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
977 return (EINVAL);
978 }
979
980 if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
981 dmu_buf_rele(db, NULL);
982 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
983 return (EIO);
984 }
985
986 mutex_enter(&zp->z_acl_lock);
987 if (zp->z_acl_cached) {
988 zfs_acl_free(zp->z_acl_cached);
989 zp->z_acl_cached = NULL;
990 }
991 mutex_exit(&zp->z_acl_lock);
992
993 zfs_znode_dmu_init(zfsvfs, zp, db);
994 zp->z_unlinked = (zp->z_phys->zp_links == 0);
995 zp->z_blksz = doi.doi_data_block_size;
996
997 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
998
999 return (0);
1000 }
1001
1002 void
zfs_znode_delete(znode_t * zp,dmu_tx_t * tx)1003 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1004 {
1005 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1006 objset_t *os = zfsvfs->z_os;
1007 uint64_t obj = zp->z_id;
1008 uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
1009
1010 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1011 if (acl_obj)
1012 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1013 VERIFY(0 == dmu_object_free(os, obj, tx));
1014 zfs_znode_dmu_fini(zp);
1015 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1016 zfs_znode_free(zp);
1017 }
1018
1019 void
zfs_zinactive(znode_t * zp)1020 zfs_zinactive(znode_t *zp)
1021 {
1022 vnode_t *vp = ZTOV(zp);
1023 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1024 uint64_t z_id = zp->z_id;
1025
1026 ASSERT(zp->z_dbuf && zp->z_phys);
1027
1028 /*
1029 * Don't allow a zfs_zget() while were trying to release this znode
1030 */
1031 ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1032
1033 mutex_enter(&zp->z_lock);
1034 /*
1035 * If this was the last reference to a file with no links,
1036 * remove the file from the file system.
1037 */
1038 if (zp->z_unlinked) {
1039 mutex_exit(&zp->z_lock);
1040 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1041 zfs_rmnode(zp);
1042 return;
1043 }
1044
1045 mutex_exit(&zp->z_lock);
1046 zfs_znode_dmu_fini(zp);
1047 ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1048 zfs_znode_free(zp);
1049 }
1050
1051 void
zfs_znode_free(znode_t * zp)1052 zfs_znode_free(znode_t *zp)
1053 {
1054 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1055 struct vnode *vp;
1056
1057 mutex_enter(&zp->z_lock);
1058 vp = ZTOV(zp);
1059 if (vp != NULL) {
1060 vcache_remove(vp->v_mount, &zp->z_id, sizeof(zp->z_id));
1061 genfs_node_destroy(vp);
1062 /*
1063 * To interlock with zfs_sync().
1064 */
1065 mutex_enter(vp->v_interlock);
1066 vp->v_data = NULL;
1067 mutex_exit(vp->v_interlock);
1068 }
1069 mutex_exit(&zp->z_lock);
1070
1071 dprintf("destroying znode %p\n", zp);
1072 //cpu_Debugger();
1073 mutex_enter(&zfsvfs->z_znodes_lock);
1074 POINTER_INVALIDATE(&zp->z_zfsvfs);
1075 list_remove(&zfsvfs->z_all_znodes, zp);
1076 mutex_exit(&zfsvfs->z_znodes_lock);
1077
1078 if (zp->z_acl_cached) {
1079 zfs_acl_free(zp->z_acl_cached);
1080 zp->z_acl_cached = NULL;
1081 }
1082
1083 kmem_cache_free(znode_cache, zp);
1084
1085 VFS_RELE(zfsvfs->z_vfs);
1086 }
1087
1088 void
zfs_time_stamper_locked(znode_t * zp,uint_t flag,dmu_tx_t * tx)1089 zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1090 {
1091 timestruc_t now;
1092
1093 ASSERT(MUTEX_HELD(&zp->z_lock));
1094
1095 gethrestime(&now);
1096
1097 if (tx) {
1098 dmu_buf_will_dirty(zp->z_dbuf, tx);
1099 zp->z_atime_dirty = 0;
1100 zp->z_seq++;
1101 } else {
1102 zp->z_atime_dirty = 1;
1103 }
1104
1105 if (flag & AT_ATIME)
1106 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1107
1108 if (flag & AT_MTIME) {
1109 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1110 if (zp->z_zfsvfs->z_use_fuids)
1111 zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1112 }
1113
1114 if (flag & AT_CTIME) {
1115 ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1116 if (zp->z_zfsvfs->z_use_fuids)
1117 zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1118 }
1119 }
1120
1121 /*
1122 * Update the requested znode timestamps with the current time.
1123 * If we are in a transaction, then go ahead and mark the znode
1124 * dirty in the transaction so the timestamps will go to disk.
1125 * Otherwise, we will get pushed next time the znode is updated
1126 * in a transaction, or when this znode eventually goes inactive.
1127 *
1128 * Why is this OK?
1129 * 1 - Only the ACCESS time is ever updated outside of a transaction.
1130 * 2 - Multiple consecutive updates will be collapsed into a single
1131 * znode update by the transaction grouping semantics of the DMU.
1132 */
1133 void
zfs_time_stamper(znode_t * zp,uint_t flag,dmu_tx_t * tx)1134 zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1135 {
1136 mutex_enter(&zp->z_lock);
1137 zfs_time_stamper_locked(zp, flag, tx);
1138 mutex_exit(&zp->z_lock);
1139 }
1140
1141 /*
1142 * Grow the block size for a file.
1143 *
1144 * IN: zp - znode of file to free data in.
1145 * size - requested block size
1146 * tx - open transaction.
1147 *
1148 * NOTE: this function assumes that the znode is write locked.
1149 */
1150 void
zfs_grow_blocksize(znode_t * zp,uint64_t size,dmu_tx_t * tx)1151 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1152 {
1153 int error;
1154 u_longlong_t dummy;
1155
1156 if (size <= zp->z_blksz)
1157 return;
1158 /*
1159 * If the file size is already greater than the current blocksize,
1160 * we will not grow. If there is more than one block in a file,
1161 * the blocksize cannot change.
1162 */
1163 if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1164 return;
1165
1166 error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1167 size, 0, tx);
1168 if (error == ENOTSUP)
1169 return;
1170 ASSERT3U(error, ==, 0);
1171
1172 /* What blocksize did we actually get? */
1173 dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1174 }
1175
1176 /*
1177 * Increase the file length
1178 *
1179 * IN: zp - znode of file to free data in.
1180 * end - new end-of-file
1181 *
1182 * RETURN: 0 if success
1183 * error code if failure
1184 */
1185 static int
zfs_extend(znode_t * zp,uint64_t end)1186 zfs_extend(znode_t *zp, uint64_t end)
1187 {
1188 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1189 dmu_tx_t *tx;
1190 rl_t *rl;
1191 uint64_t newblksz;
1192 int error;
1193
1194 /*
1195 * We will change zp_size, lock the whole file.
1196 */
1197 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1198
1199 /*
1200 * Nothing to do if file already at desired length.
1201 */
1202 if (end <= zp->z_phys->zp_size) {
1203 zfs_range_unlock(rl);
1204 return (0);
1205 }
1206 top:
1207 tx = dmu_tx_create(zfsvfs->z_os);
1208 dmu_tx_hold_bonus(tx, zp->z_id);
1209 if (end > zp->z_blksz &&
1210 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1211 /*
1212 * We are growing the file past the current block size.
1213 */
1214 if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1215 ASSERT(!ISP2(zp->z_blksz));
1216 newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1217 } else {
1218 newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1219 }
1220 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1221 } else {
1222 newblksz = 0;
1223 }
1224
1225 error = dmu_tx_assign(tx, TXG_NOWAIT);
1226 if (error) {
1227 if (error == ERESTART) {
1228 dmu_tx_wait(tx);
1229 dmu_tx_abort(tx);
1230 goto top;
1231 }
1232 dmu_tx_abort(tx);
1233 zfs_range_unlock(rl);
1234 return (error);
1235 }
1236 dmu_buf_will_dirty(zp->z_dbuf, tx);
1237
1238 if (newblksz)
1239 zfs_grow_blocksize(zp, newblksz, tx);
1240
1241 zp->z_phys->zp_size = end;
1242
1243 zfs_range_unlock(rl);
1244
1245 dmu_tx_commit(tx);
1246
1247 uvm_vnp_setsize(ZTOV(zp), end);
1248
1249 return (0);
1250 }
1251
1252 /*
1253 * Free space in a file.
1254 *
1255 * IN: zp - znode of file to free data in.
1256 * off - start of section to free.
1257 * len - length of section to free.
1258 *
1259 * RETURN: 0 if success
1260 * error code if failure
1261 */
1262 static int
zfs_free_range(znode_t * zp,uint64_t off,uint64_t len)1263 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1264 {
1265 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1266 rl_t *rl;
1267 int error;
1268
1269 /*
1270 * Lock the range being freed.
1271 */
1272 rl = zfs_range_lock(zp, off, len, RL_WRITER);
1273
1274 /*
1275 * Nothing to do if file already at desired length.
1276 */
1277 if (off >= zp->z_phys->zp_size) {
1278 zfs_range_unlock(rl);
1279 return (0);
1280 }
1281
1282 if (off + len > zp->z_phys->zp_size)
1283 len = zp->z_phys->zp_size - off;
1284
1285 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1286
1287 if (error == 0) {
1288 /*
1289 * In NetBSD we cannot free block in the middle of a file,
1290 * but only at the end of a file.
1291 */
1292 uvm_vnp_setsize(ZTOV(zp), off);
1293 }
1294
1295 zfs_range_unlock(rl);
1296
1297 return (error);
1298 }
1299
1300 /*
1301 * Truncate a file
1302 *
1303 * IN: zp - znode of file to free data in.
1304 * end - new end-of-file.
1305 *
1306 * RETURN: 0 if success
1307 * error code if failure
1308 */
1309 static int
zfs_trunc(znode_t * zp,uint64_t end)1310 zfs_trunc(znode_t *zp, uint64_t end)
1311 {
1312 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1313 vnode_t *vp = ZTOV(zp);
1314 dmu_tx_t *tx;
1315 rl_t *rl;
1316 int error;
1317
1318 /*
1319 * We will change zp_size, lock the whole file.
1320 */
1321 rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1322
1323 /*
1324 * Nothing to do if file already at desired length.
1325 */
1326 if (end >= zp->z_phys->zp_size) {
1327 zfs_range_unlock(rl);
1328 return (0);
1329 }
1330
1331 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
1332 if (error) {
1333 zfs_range_unlock(rl);
1334 return (error);
1335 }
1336 top:
1337 tx = dmu_tx_create(zfsvfs->z_os);
1338 dmu_tx_hold_bonus(tx, zp->z_id);
1339 error = dmu_tx_assign(tx, TXG_NOWAIT);
1340 if (error) {
1341 if (error == ERESTART) {
1342 dmu_tx_wait(tx);
1343 dmu_tx_abort(tx);
1344 goto top;
1345 }
1346 dmu_tx_abort(tx);
1347 zfs_range_unlock(rl);
1348 return (error);
1349 }
1350 dmu_buf_will_dirty(zp->z_dbuf, tx);
1351
1352 zp->z_phys->zp_size = end;
1353
1354 dmu_tx_commit(tx);
1355
1356 zfs_range_unlock(rl);
1357
1358 /*
1359 * Clear any mapped pages in the truncated region. This has to
1360 * happen outside of the transaction to avoid the possibility of
1361 * a deadlock with someone trying to push a page that we are
1362 * about to invalidate.
1363 */
1364
1365 uvm_vnp_setsize(vp, end);
1366
1367 return (0);
1368 }
1369
1370 /*
1371 * Free space in a file
1372 *
1373 * IN: zp - znode of file to free data in.
1374 * off - start of range
1375 * len - end of range (0 => EOF)
1376 * flag - current file open mode flags.
1377 * log - TRUE if this action should be logged
1378 *
1379 * RETURN: 0 if success
1380 * error code if failure
1381 */
1382 int
zfs_freesp(znode_t * zp,uint64_t off,uint64_t len,int flag,boolean_t log)1383 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1384 {
1385 vnode_t *vp = ZTOV(zp);
1386 dmu_tx_t *tx;
1387 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1388 zilog_t *zilog = zfsvfs->z_log;
1389 int error;
1390
1391 if (off > zp->z_phys->zp_size) {
1392 error = zfs_extend(zp, off+len);
1393 if (error == 0 && log)
1394 goto log;
1395 else
1396 return (error);
1397 }
1398
1399 if (len == 0) {
1400 error = zfs_trunc(zp, off);
1401 } else {
1402 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1403 off + len > zp->z_phys->zp_size)
1404 error = zfs_extend(zp, off+len);
1405 }
1406 if (error || !log)
1407 return (error);
1408 log:
1409 tx = dmu_tx_create(zfsvfs->z_os);
1410 dmu_tx_hold_bonus(tx, zp->z_id);
1411 error = dmu_tx_assign(tx, TXG_NOWAIT);
1412 if (error) {
1413 if (error == ERESTART) {
1414 dmu_tx_wait(tx);
1415 dmu_tx_abort(tx);
1416 goto log;
1417 }
1418 dmu_tx_abort(tx);
1419 return (error);
1420 }
1421
1422 zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1423 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1424
1425 dmu_tx_commit(tx);
1426 return (0);
1427 }
1428
1429 void
zfs_create_fs(objset_t * os,cred_t * cr,nvlist_t * zplprops,dmu_tx_t * tx)1430 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1431 {
1432 zfsvfs_t zfsvfs;
1433 uint64_t moid, obj, version;
1434 uint64_t sense = ZFS_CASE_SENSITIVE;
1435 uint64_t norm = 0;
1436 nvpair_t *elem;
1437 int error;
1438 int i;
1439 znode_t *rootzp = NULL;
1440 vattr_t vattr;
1441 znode_t *zp;
1442 zfs_acl_ids_t acl_ids;
1443
1444 /*
1445 * First attempt to create master node.
1446 */
1447 /*
1448 * In an empty objset, there are no blocks to read and thus
1449 * there can be no i/o errors (which we assert below).
1450 */
1451 moid = MASTER_NODE_OBJ;
1452 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1453 DMU_OT_NONE, 0, tx);
1454 ASSERT(error == 0);
1455
1456 /*
1457 * Set starting attributes.
1458 */
1459 if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
1460 version = ZPL_VERSION;
1461 else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
1462 version = ZPL_VERSION_USERSPACE - 1;
1463 else
1464 version = ZPL_VERSION_FUID - 1;
1465 elem = NULL;
1466 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1467 /* For the moment we expect all zpl props to be uint64_ts */
1468 uint64_t val;
1469 char *name;
1470
1471 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1472 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1473 name = nvpair_name(elem);
1474 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1475 if (val < version)
1476 version = val;
1477 } else {
1478 error = zap_update(os, moid, name, 8, 1, &val, tx);
1479 }
1480 ASSERT(error == 0);
1481 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1482 norm = val;
1483 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1484 sense = val;
1485 }
1486 ASSERT(version != 0);
1487 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1488
1489 /*
1490 * Create a delete queue.
1491 */
1492 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1493
1494 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1495 ASSERT(error == 0);
1496
1497 /*
1498 * Create root znode. Create minimal znode/vnode/zfsvfs
1499 * to allow zfs_mknode to work.
1500 */
1501 vattr_null(&vattr);
1502 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1503 vattr.va_type = VDIR;
1504 vattr.va_mode = S_IFDIR|0755;
1505 vattr.va_uid = crgetuid(cr);
1506 vattr.va_gid = crgetgid(cr);
1507
1508 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1509 rootzp->z_unlinked = 0;
1510 rootzp->z_atime_dirty = 0;
1511
1512 bzero(&zfsvfs, sizeof (zfsvfs_t));
1513
1514 zfsvfs.z_os = os;
1515 zfsvfs.z_parent = &zfsvfs;
1516 zfsvfs.z_version = version;
1517 zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1518 zfsvfs.z_norm = norm;
1519 /*
1520 * Fold case on file systems that are always or sometimes case
1521 * insensitive.
1522 */
1523 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1524 zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
1525
1526 mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1527 list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1528 offsetof(znode_t, z_link_node));
1529
1530 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1531 mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1532
1533 ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1534 rootzp->z_zfsvfs = &zfsvfs;
1535 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1536 cr, NULL, &acl_ids));
1537 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
1538 ASSERT3P(zp, ==, rootzp);
1539 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1540 ASSERT(error == 0);
1541 zfs_acl_ids_free(&acl_ids);
1542 POINTER_INVALIDATE(&rootzp->z_zfsvfs);
1543
1544 dmu_buf_rele(rootzp->z_dbuf, NULL);
1545 rootzp->z_dbuf = NULL;
1546 kmem_cache_free(znode_cache, rootzp);
1547
1548 /*
1549 * Create shares directory
1550 */
1551
1552 error = zfs_create_share_dir(&zfsvfs, tx);
1553
1554 ASSERT(error == 0);
1555
1556 mutex_destroy(&zfsvfs.z_znodes_lock);
1557 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1558 mutex_destroy(&zfsvfs.z_hold_mtx[i]);
1559 }
1560
1561 #endif /* _KERNEL */
1562 /*
1563 * Given an object number, return its parent object number and whether
1564 * or not the object is an extended attribute directory.
1565 */
1566 static int
zfs_obj_to_pobj(objset_t * osp,uint64_t obj,uint64_t * pobjp,int * is_xattrdir)1567 zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1568 {
1569 dmu_buf_t *db;
1570 dmu_object_info_t doi;
1571 znode_phys_t *zp;
1572 int error;
1573
1574 if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1575 return (error);
1576
1577 dmu_object_info_from_db(db, &doi);
1578 if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1579 doi.doi_bonus_size < sizeof (znode_phys_t)) {
1580 dmu_buf_rele(db, FTAG);
1581 return (EINVAL);
1582 }
1583
1584 zp = db->db_data;
1585 *pobjp = zp->zp_parent;
1586 *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1587 S_ISDIR(zp->zp_mode);
1588 dmu_buf_rele(db, FTAG);
1589
1590 return (0);
1591 }
1592
1593 int
zfs_obj_to_path(objset_t * osp,uint64_t obj,char * buf,int len)1594 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1595 {
1596 char *path = buf + len - 1;
1597 int error;
1598
1599 *path = '\0';
1600
1601 for (;;) {
1602 uint64_t pobj;
1603 char component[MAXNAMELEN + 2];
1604 size_t complen;
1605 int is_xattrdir;
1606
1607 if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1608 &is_xattrdir)) != 0)
1609 break;
1610
1611 if (pobj == obj) {
1612 if (path[0] != '/')
1613 *--path = '/';
1614 break;
1615 }
1616
1617 component[0] = '/';
1618 if (is_xattrdir) {
1619 (void) snprintf(component + 1, sizeof(component) - 1,
1620 "<xattrdir>");
1621 } else {
1622 error = zap_value_search(osp, pobj, obj,
1623 ZFS_DIRENT_OBJ(-1ULL), component + 1);
1624 if (error != 0)
1625 break;
1626 }
1627
1628 complen = strlen(component);
1629 path -= complen;
1630 ASSERT(path >= buf);
1631 bcopy(component, path, complen);
1632 obj = pobj;
1633 }
1634
1635 if (error == 0)
1636 (void) memmove(buf, path, buf + len - path);
1637 return (error);
1638 }
1639