1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24  */
25 
26 /* Portions Copyright 2007 Jeremy Teo */
27 
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/sysmacros.h>
33 #include <sys/mntent.h>
34 #include <sys/u8_textprep.h>
35 #include <sys/dsl_dataset.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/kmem.h>
40 #include <sys/errno.h>
41 #include <sys/atomic.h>
42 #include <sys/zfs_dir.h>
43 #include <sys/zfs_acl.h>
44 #include <sys/zfs_ioctl.h>
45 #include <sys/zfs_rlock.h>
46 #include <sys/zfs_fuid.h>
47 #include <sys/zfs_vnops.h>
48 #include <sys/zfs_ctldir.h>
49 #include <sys/dnode.h>
50 #include <sys/fs/zfs.h>
51 #include <sys/zpl.h>
52 #endif /* _KERNEL */
53 
54 #include <sys/dmu.h>
55 #include <sys/dmu_objset.h>
56 #include <sys/dmu_tx.h>
57 #include <sys/zfs_refcount.h>
58 #include <sys/stat.h>
59 #include <sys/zap.h>
60 #include <sys/zfs_znode.h>
61 #include <sys/sa.h>
62 #include <sys/zfs_sa.h>
63 #include <sys/zfs_stat.h>
64 
65 #include "zfs_prop.h"
66 #include "zfs_comutil.h"
67 
68 /*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
71  * (such as VFS logic) that will not compile easily in userland.
72  */
73 #ifdef _KERNEL
74 
75 static kmem_cache_t *znode_cache = NULL;
76 static kmem_cache_t *znode_hold_cache = NULL;
77 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
78 
79 /*
80  * This is used by the test suite so that it can delay znodes from being
81  * freed in order to inspect the unlinked set.
82  */
83 static int zfs_unlink_suspend_progress = 0;
84 
85 /*
86  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
87  * z_rangelock. It will modify the offset and length of the lock to reflect
88  * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
 * called with the zfs_rangelock_t's rl_lock held, which avoids races.
90  */
91 static void
92 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
93 {
94 	znode_t *zp = arg;
95 
96 	/*
97 	 * If in append mode, convert to writer and lock starting at the
98 	 * current end of file.
99 	 */
100 	if (new->lr_type == RL_APPEND) {
101 		new->lr_offset = zp->z_size;
102 		new->lr_type = RL_WRITER;
103 	}
104 
105 	/*
106 	 * If we need to grow the block size then lock the whole file range.
107 	 */
108 	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
109 	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
110 	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
111 		new->lr_offset = 0;
112 		new->lr_length = UINT64_MAX;
113 	}
114 }
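
/*
 * For example (illustrative values only): an RL_APPEND request of length
 * 4096 on a file whose z_size is 5000 becomes an RL_WRITER lock covering
 * [5000, 9096).  If the resulting end of file exceeded z_blksz on a
 * dataset still allowed to grow its block size, the callback would
 * instead widen the lock to the entire file so the block size can be
 * changed safely.
 */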
115 
116 static int
117 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
118 {
119 	(void) arg, (void) kmflags;
120 	znode_t *zp = buf;
121 
122 	inode_init_once(ZTOI(zp));
123 	list_link_init(&zp->z_link_node);
124 
125 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
126 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
127 	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
128 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
129 	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
130 
131 	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
132 
133 	zp->z_dirlocks = NULL;
134 	zp->z_acl_cached = NULL;
135 	zp->z_xattr_cached = NULL;
136 	zp->z_xattr_parent = 0;
137 	zp->z_sync_writes_cnt = 0;
138 	zp->z_async_writes_cnt = 0;
139 
140 	return (0);
141 }
142 
143 static void
144 zfs_znode_cache_destructor(void *buf, void *arg)
145 {
146 	(void) arg;
147 	znode_t *zp = buf;
148 
149 	ASSERT(!list_link_active(&zp->z_link_node));
150 	mutex_destroy(&zp->z_lock);
151 	rw_destroy(&zp->z_parent_lock);
152 	rw_destroy(&zp->z_name_lock);
153 	mutex_destroy(&zp->z_acl_lock);
154 	rw_destroy(&zp->z_xattr_lock);
155 	zfs_rangelock_fini(&zp->z_rangelock);
156 
157 	ASSERT3P(zp->z_dirlocks, ==, NULL);
158 	ASSERT3P(zp->z_acl_cached, ==, NULL);
159 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
160 
161 	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
162 	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
163 }
164 
165 static int
166 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
167 {
168 	(void) arg, (void) kmflags;
169 	znode_hold_t *zh = buf;
170 
171 	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
172 	zh->zh_refcount = 0;
173 
174 	return (0);
175 }
176 
177 static void
178 zfs_znode_hold_cache_destructor(void *buf, void *arg)
179 {
180 	(void) arg;
181 	znode_hold_t *zh = buf;
182 
183 	mutex_destroy(&zh->zh_lock);
184 }
185 
186 void
187 zfs_znode_init(void)
188 {
189 	/*
	 * Initialize the znode cache.  The KMC_SLAB hint is used so the
	 * cache is backed by kmalloc() on the Linux slab, which is required
	 * for wait_on_bit() operations on the embedded inode to work
	 * properly.
193 	 */
194 	ASSERT(znode_cache == NULL);
195 	znode_cache = kmem_cache_create("zfs_znode_cache",
196 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
197 	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
198 
199 	ASSERT(znode_hold_cache == NULL);
200 	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
201 	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
202 	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
203 }
204 
205 void
206 zfs_znode_fini(void)
207 {
208 	/*
	 * Clean up the znode and znode hold caches.
210 	 */
211 	if (znode_cache)
212 		kmem_cache_destroy(znode_cache);
213 	znode_cache = NULL;
214 
215 	if (znode_hold_cache)
216 		kmem_cache_destroy(znode_hold_cache);
217 	znode_hold_cache = NULL;
218 }
219 
220 /*
221  * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
222  * serialize access to a znode and its SA buffer while the object is being
223  * created or destroyed.  This kind of locking would normally reside in the
224  * znode itself but in this case that's impossible because the znode and SA
225  * buffer may not yet exist.  Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
227  *
228  * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held.  In
230  * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
231  * released, removed from the AVL tree and destroyed if there are no waiters.
232  *
233  * This scheme has two important properties:
234  *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks.  This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks mutex
 *    which just happens to have hashed to the same index.
239  *
240  * 2) All locks used to serialize access to an object are per-object and never
241  *    shared.  This minimizes lock contention without creating a large number
242  *    of dedicated locks.
243  *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and are very short-lived this cost is minimal.
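 *
 * A minimal usage sketch (obj stands for any object number whose znode
 * is being created or destroyed):
 *
 *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	... create or destroy the object's znode and SA handle ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 *
 * See zfs_mknode(), zfs_zget(), and zfs_znode_delete() below for actual
 * callers.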
247  */
248 int
249 zfs_znode_hold_compare(const void *a, const void *b)
250 {
251 	const znode_hold_t *zh_a = (const znode_hold_t *)a;
252 	const znode_hold_t *zh_b = (const znode_hold_t *)b;
253 
254 	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
255 }
256 
257 static boolean_t __maybe_unused
258 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
259 {
260 	znode_hold_t *zh, search;
261 	int i = ZFS_OBJ_HASH(zfsvfs, obj);
262 	boolean_t held;
263 
264 	search.zh_obj = obj;
265 
266 	mutex_enter(&zfsvfs->z_hold_locks[i]);
267 	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
268 	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
269 	mutex_exit(&zfsvfs->z_hold_locks[i]);
270 
271 	return (held);
272 }
273 
274 znode_hold_t *
275 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
276 {
277 	znode_hold_t *zh, *zh_new, search;
278 	int i = ZFS_OBJ_HASH(zfsvfs, obj);
279 	boolean_t found = B_FALSE;
280 
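	/* Allocate before taking z_hold_locks; see property (1) above. */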
281 	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
282 	search.zh_obj = obj;
283 
284 	mutex_enter(&zfsvfs->z_hold_locks[i]);
285 	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
286 	if (likely(zh == NULL)) {
287 		zh = zh_new;
288 		zh->zh_obj = obj;
289 		avl_add(&zfsvfs->z_hold_trees[i], zh);
290 	} else {
291 		ASSERT3U(zh->zh_obj, ==, obj);
292 		found = B_TRUE;
293 	}
294 	zh->zh_refcount++;
295 	ASSERT3S(zh->zh_refcount, >, 0);
296 	mutex_exit(&zfsvfs->z_hold_locks[i]);
297 
298 	if (found == B_TRUE)
299 		kmem_cache_free(znode_hold_cache, zh_new);
300 
301 	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
302 	mutex_enter(&zh->zh_lock);
303 
304 	return (zh);
305 }
306 
307 void
308 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
309 {
310 	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
311 	boolean_t remove = B_FALSE;
312 
313 	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
314 	mutex_exit(&zh->zh_lock);
315 
316 	mutex_enter(&zfsvfs->z_hold_locks[i]);
317 	ASSERT3S(zh->zh_refcount, >, 0);
318 	if (--zh->zh_refcount == 0) {
319 		avl_remove(&zfsvfs->z_hold_trees[i], zh);
320 		remove = B_TRUE;
321 	}
322 	mutex_exit(&zfsvfs->z_hold_locks[i]);
323 
324 	if (remove == B_TRUE)
325 		kmem_cache_free(znode_hold_cache, zh);
326 }
327 
328 dev_t
329 zfs_cmpldev(uint64_t dev)
330 {
331 	return (dev);
332 }
333 
334 static void
335 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
336     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
337 {
338 	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
339 
340 	mutex_enter(&zp->z_lock);
341 
342 	ASSERT(zp->z_sa_hdl == NULL);
343 	ASSERT(zp->z_acl_cached == NULL);
344 	if (sa_hdl == NULL) {
345 		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
346 		    SA_HDL_SHARED, &zp->z_sa_hdl));
347 	} else {
348 		zp->z_sa_hdl = sa_hdl;
349 		sa_set_userp(sa_hdl, zp);
350 	}
351 
352 	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
353 
354 	mutex_exit(&zp->z_lock);
355 }
356 
357 void
358 zfs_znode_dmu_fini(znode_t *zp)
359 {
360 	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
361 	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
362 
363 	sa_handle_destroy(zp->z_sa_hdl);
364 	zp->z_sa_hdl = NULL;
365 }
366 
367 /*
368  * Called by new_inode() to allocate a new inode.
369  */
370 int
371 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
372 {
373 	znode_t *zp;
374 
375 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
376 	*ip = ZTOI(zp);
377 
378 	return (0);
379 }
380 
381 /*
382  * Called in multiple places when an inode should be destroyed.
383  */
384 void
385 zfs_inode_destroy(struct inode *ip)
386 {
387 	znode_t *zp = ITOZ(ip);
388 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
389 
390 	mutex_enter(&zfsvfs->z_znodes_lock);
391 	if (list_link_active(&zp->z_link_node)) {
392 		list_remove(&zfsvfs->z_all_znodes, zp);
393 	}
394 	mutex_exit(&zfsvfs->z_znodes_lock);
395 
396 	if (zp->z_acl_cached) {
397 		zfs_acl_free(zp->z_acl_cached);
398 		zp->z_acl_cached = NULL;
399 	}
400 
401 	if (zp->z_xattr_cached) {
402 		nvlist_free(zp->z_xattr_cached);
403 		zp->z_xattr_cached = NULL;
404 	}
405 
406 	kmem_cache_free(znode_cache, zp);
407 }
408 
409 static void
410 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
411 {
412 	uint64_t rdev = 0;
413 
414 	switch (ip->i_mode & S_IFMT) {
415 	case S_IFREG:
416 		ip->i_op = &zpl_inode_operations;
417 #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
418 		ip->i_fop = &zpl_file_operations.kabi_fops;
419 #else
420 		ip->i_fop = &zpl_file_operations;
421 #endif
422 		ip->i_mapping->a_ops = &zpl_address_space_operations;
423 		break;
424 
425 	case S_IFDIR:
426 #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
427 		ip->i_flags |= S_IOPS_WRAPPER;
428 		ip->i_op = &zpl_dir_inode_operations.ops;
429 #else
430 		ip->i_op = &zpl_dir_inode_operations;
431 #endif
432 		ip->i_fop = &zpl_dir_file_operations;
433 		ITOZ(ip)->z_zn_prefetch = B_TRUE;
434 		break;
435 
436 	case S_IFLNK:
437 		ip->i_op = &zpl_symlink_inode_operations;
438 		break;
439 
440 	/*
	 * rdev is stored in the SA only for device files.
442 	 */
443 	case S_IFCHR:
444 	case S_IFBLK:
445 		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
446 		    sizeof (rdev));
447 		zfs_fallthrough;
448 	case S_IFIFO:
449 	case S_IFSOCK:
450 		init_special_inode(ip, ip->i_mode, rdev);
451 		ip->i_op = &zpl_special_inode_operations;
452 		break;
453 
454 	default:
455 		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
456 		    (u_longlong_t)ip->i_ino, ip->i_mode);
457 
458 		/* Assume the inode is a file and attempt to continue */
459 		ip->i_mode = S_IFREG | 0644;
460 		ip->i_op = &zpl_inode_operations;
461 #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
462 		ip->i_fop = &zpl_file_operations.kabi_fops;
463 #else
464 		ip->i_fop = &zpl_file_operations;
465 #endif
466 		ip->i_mapping->a_ops = &zpl_address_space_operations;
467 		break;
468 	}
469 }
470 
471 static void
472 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
473 {
474 	/*
475 	 * Linux and Solaris have different sets of file attributes, so we
476 	 * restrict this conversion to the intersection of the two.
477 	 */
478 #ifdef HAVE_INODE_SET_FLAGS
479 	unsigned int flags = 0;
480 	if (zp->z_pflags & ZFS_IMMUTABLE)
481 		flags |= S_IMMUTABLE;
482 	if (zp->z_pflags & ZFS_APPENDONLY)
483 		flags |= S_APPEND;
484 
485 	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
486 #else
487 	if (zp->z_pflags & ZFS_IMMUTABLE)
488 		ip->i_flags |= S_IMMUTABLE;
489 	else
490 		ip->i_flags &= ~S_IMMUTABLE;
491 
492 	if (zp->z_pflags & ZFS_APPENDONLY)
493 		ip->i_flags |= S_APPEND;
494 	else
495 		ip->i_flags &= ~S_APPEND;
496 #endif
497 }
498 
499 /*
500  * Update the embedded inode given the znode.
501  */
502 void
503 zfs_znode_update_vfs(znode_t *zp)
504 {
505 	struct inode	*ip;
506 	uint32_t	blksize;
507 	u_longlong_t	i_blocks;
508 
509 	ASSERT(zp != NULL);
510 	ip = ZTOI(zp);
511 
512 	/* Skip .zfs control nodes which do not exist on disk. */
513 	if (zfsctl_is_node(ip))
514 		return;
515 
516 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
517 
518 	spin_lock(&ip->i_lock);
519 	ip->i_mode = zp->z_mode;
520 	ip->i_blocks = i_blocks;
521 	i_size_write(ip, zp->z_size);
522 	spin_unlock(&ip->i_lock);
523 }
524 
525 
526 /*
 * Construct and initialize a znode+inode pair.
 *
 * This does not call dmu_set_user(); that is left to the
 * caller, in case the caller does not want to return the
 * znode.
532  */
533 static znode_t *
534 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
535     dmu_object_type_t obj_type, sa_handle_t *hdl)
536 {
537 	znode_t	*zp;
538 	struct inode *ip;
539 	uint64_t mode;
540 	uint64_t parent;
541 	uint64_t tmp_gen;
542 	uint64_t links;
543 	uint64_t z_uid, z_gid;
544 	uint64_t atime[2], mtime[2], ctime[2], btime[2];
545 	inode_timespec_t tmp_ctime;
546 	uint64_t projid = ZFS_DEFAULT_PROJID;
547 	sa_bulk_attr_t bulk[12];
548 	int count = 0;
549 
550 	ASSERT(zfsvfs != NULL);
551 
552 	ip = new_inode(zfsvfs->z_sb);
553 	if (ip == NULL)
554 		return (NULL);
555 
556 	zp = ITOZ(ip);
557 	ASSERT(zp->z_dirlocks == NULL);
558 	ASSERT3P(zp->z_acl_cached, ==, NULL);
559 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
560 	zp->z_unlinked = B_FALSE;
561 	zp->z_atime_dirty = B_FALSE;
562 #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
563 	zp->z_is_mapped = B_FALSE;
564 #endif
565 	zp->z_is_ctldir = B_FALSE;
566 	zp->z_suspended = B_FALSE;
567 	zp->z_sa_hdl = NULL;
568 	zp->z_mapcnt = 0;
569 	zp->z_id = db->db_object;
570 	zp->z_blksz = blksz;
571 	zp->z_seq = 0x7A4653;
572 	zp->z_sync_cnt = 0;
573 	zp->z_sync_writes_cnt = 0;
574 	zp->z_async_writes_cnt = 0;
575 
576 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
577 
578 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
579 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
580 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
581 	    &zp->z_size, 8);
582 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
583 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
584 	    &zp->z_pflags, 8);
585 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
586 	    &parent, 8);
587 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
588 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
589 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
590 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
591 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
592 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
593 
594 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
595 	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
596 	    (zp->z_pflags & ZFS_PROJID) &&
597 	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
598 		if (hdl == NULL)
599 			sa_handle_destroy(zp->z_sa_hdl);
600 		zp->z_sa_hdl = NULL;
601 		goto error;
602 	}
603 
604 	zp->z_projid = projid;
605 	zp->z_mode = ip->i_mode = mode;
606 	ip->i_generation = (uint32_t)tmp_gen;
607 	ip->i_blkbits = SPA_MINBLOCKSHIFT;
608 	set_nlink(ip, (uint32_t)links);
609 	zfs_uid_write(ip, z_uid);
610 	zfs_gid_write(ip, z_gid);
611 	zfs_set_inode_flags(zp, ip);
612 
613 	/* Cache the xattr parent id */
614 	if (zp->z_pflags & ZFS_XATTR)
615 		zp->z_xattr_parent = parent;
616 
617 	ZFS_TIME_DECODE(&ip->i_atime, atime);
618 	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
619 	ZFS_TIME_DECODE(&tmp_ctime, ctime);
620 	zpl_inode_set_ctime_to_ts(ip, tmp_ctime);
621 	ZFS_TIME_DECODE(&zp->z_btime, btime);
622 
623 	ip->i_ino = zp->z_id;
624 	zfs_znode_update_vfs(zp);
625 	zfs_inode_set_ops(zfsvfs, ip);
626 
627 	/*
628 	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block.  This should never
630 	 * happen because the inode numbers map 1:1 with the object numbers.
631 	 *
	 * The exception is rolling back a mounted file system, either
633 	 * from the zfs rollback or zfs recv command.
634 	 *
635 	 * Active inodes are unhashed during the rollback, but since zrele
636 	 * can happen asynchronously, we can't guarantee they've been
637 	 * unhashed.  This can cause hash collisions in unlinked drain
638 	 * processing so do not hash unlinked znodes.
639 	 */
640 	if (links > 0)
641 		VERIFY3S(insert_inode_locked(ip), ==, 0);
642 
643 	mutex_enter(&zfsvfs->z_znodes_lock);
644 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
645 	mutex_exit(&zfsvfs->z_znodes_lock);
646 
647 	if (links > 0)
648 		unlock_new_inode(ip);
649 	return (zp);
650 
651 error:
652 	iput(ip);
653 	return (NULL);
654 }
655 
656 /*
657  * Safely mark an inode dirty.  Inodes which are part of a read-only
658  * file system or snapshot may not be dirtied.
659  */
660 void
661 zfs_mark_inode_dirty(struct inode *ip)
662 {
663 	zfsvfs_t *zfsvfs = ITOZSB(ip);
664 
665 	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
666 		return;
667 
668 	mark_inode_dirty(ip);
669 }
670 
671 static uint64_t empty_xattr;
672 static uint64_t pad[4];
673 static zfs_acl_phys_t acl_phys;
674 /*
675  * Create a new DMU object to hold a zfs znode.
676  *
677  *	IN:	dzp	- parent directory for new znode
678  *		vap	- file attributes for new znode
 *		tx	- open dmu transaction handle
680  *		cr	- credentials of caller
681  *		flag	- flags:
682  *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is an O_TMPFILE
684  *			  IS_XATTR	- new object is an attribute
685  *		acl_ids	- ACL related attributes
686  *
687  *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
688  *
689  */
690 void
691 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
692     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
693 {
694 	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
695 	uint64_t	mode, size, links, parent, pflags;
696 	uint64_t	projid = ZFS_DEFAULT_PROJID;
697 	uint64_t	rdev = 0;
698 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
699 	dmu_buf_t	*db;
700 	inode_timespec_t now;
701 	uint64_t	gen, obj;
702 	int		bonuslen;
703 	int		dnodesize;
704 	sa_handle_t	*sa_hdl;
705 	dmu_object_type_t obj_type;
706 	sa_bulk_attr_t	*sa_attrs;
707 	int		cnt = 0;
708 	zfs_acl_locator_cb_t locate = { 0 };
709 	znode_hold_t	*zh;
710 
711 	if (zfsvfs->z_replay) {
712 		obj = vap->va_nodeid;
713 		now = vap->va_ctime;		/* see zfs_replay_create() */
714 		gen = vap->va_nblocks;		/* ditto */
715 		dnodesize = vap->va_fsid;	/* ditto */
716 	} else {
717 		obj = 0;
718 		gethrestime(&now);
719 		gen = dmu_tx_get_txg(tx);
720 		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
721 	}
722 
723 	if (dnodesize == 0)
724 		dnodesize = DNODE_MIN_SIZE;
725 
726 	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
727 
728 	bonuslen = (obj_type == DMU_OT_SA) ?
729 	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
730 
	/*
	 * Create a new DMU object.
	 *
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
740 	if (S_ISDIR(vap->va_mode)) {
741 		if (zfsvfs->z_replay) {
742 			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
743 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
744 			    obj_type, bonuslen, dnodesize, tx));
745 		} else {
746 			obj = zap_create_norm_dnsize(zfsvfs->z_os,
747 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
748 			    obj_type, bonuslen, dnodesize, tx);
749 		}
750 	} else {
751 		if (zfsvfs->z_replay) {
752 			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
753 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
754 			    obj_type, bonuslen, dnodesize, tx));
755 		} else {
756 			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
757 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
758 			    obj_type, bonuslen, dnodesize, tx);
759 		}
760 	}
761 
762 	zh = zfs_znode_hold_enter(zfsvfs, obj);
763 	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
764 
765 	/*
766 	 * If this is the root, fix up the half-initialized parent pointer
767 	 * to reference the just-allocated physical data area.
768 	 */
769 	if (flag & IS_ROOT_NODE) {
770 		dzp->z_id = obj;
771 	}
772 
773 	/*
774 	 * If parent is an xattr, so am I.
775 	 */
776 	if (dzp->z_pflags & ZFS_XATTR) {
777 		flag |= IS_XATTR;
778 	}
779 
780 	if (zfsvfs->z_use_fuids)
781 		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
782 	else
783 		pflags = 0;
784 
785 	if (S_ISDIR(vap->va_mode)) {
786 		size = 2;		/* contents ("." and "..") */
787 		links = 2;
788 	} else {
789 		size = 0;
790 		links = (flag & IS_TMPFILE) ? 0 : 1;
791 	}
792 
793 	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
794 		rdev = vap->va_rdev;
795 
796 	parent = dzp->z_id;
797 	mode = acl_ids->z_mode;
798 	if (flag & IS_XATTR)
799 		pflags |= ZFS_XATTR;
800 
801 	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
802 		/*
803 		 * With ZFS_PROJID flag, we can easily know whether there is
804 		 * project ID stored on disk or not. See zfs_space_delta_cb().
805 		 */
806 		if (obj_type != DMU_OT_ZNODE &&
807 		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
808 			pflags |= ZFS_PROJID;
809 
810 		/*
811 		 * Inherit project ID from parent if required.
812 		 */
813 		projid = zfs_inherit_projid(dzp);
814 		if (dzp->z_pflags & ZFS_PROJINHERIT)
815 			pflags |= ZFS_PROJINHERIT;
816 	}
817 
818 	/*
	 * Whether ZFS_NO_EXECS_DENIED applies will be determined when
	 * zfs_mode_compute() is called.
820 	 */
821 	pflags |= acl_ids->z_aclp->z_hints &
822 	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
823 	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
824 
825 	ZFS_TIME_ENCODE(&now, crtime);
826 	ZFS_TIME_ENCODE(&now, ctime);
827 
828 	if (vap->va_mask & ATTR_ATIME) {
829 		ZFS_TIME_ENCODE(&vap->va_atime, atime);
830 	} else {
831 		ZFS_TIME_ENCODE(&now, atime);
832 	}
833 
834 	if (vap->va_mask & ATTR_MTIME) {
835 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
836 	} else {
837 		ZFS_TIME_ENCODE(&now, mtime);
838 	}
839 
840 	/* Now add in all of the "SA" attributes */
841 	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
842 	    &sa_hdl));
843 
844 	/*
	 * Set up the array of attributes to be replaced/set on the new file.
	 *
	 * The order for DMU_OT_ZNODE is critical since it needs to be
	 * constructed in the old znode_phys_t format.  Don't change this
	 * ordering.
849 	 */
850 	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
851 
852 	if (obj_type == DMU_OT_ZNODE) {
853 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
854 		    NULL, &atime, 16);
855 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
856 		    NULL, &mtime, 16);
857 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
858 		    NULL, &ctime, 16);
859 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
860 		    NULL, &crtime, 16);
861 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
862 		    NULL, &gen, 8);
863 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
864 		    NULL, &mode, 8);
865 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
866 		    NULL, &size, 8);
867 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
868 		    NULL, &parent, 8);
869 	} else {
870 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
871 		    NULL, &mode, 8);
872 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
873 		    NULL, &size, 8);
874 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
875 		    NULL, &gen, 8);
876 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
877 		    NULL, &acl_ids->z_fuid, 8);
878 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
879 		    NULL, &acl_ids->z_fgid, 8);
880 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
881 		    NULL, &parent, 8);
882 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
883 		    NULL, &pflags, 8);
884 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
885 		    NULL, &atime, 16);
886 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
887 		    NULL, &mtime, 16);
888 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
889 		    NULL, &ctime, 16);
890 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
891 		    NULL, &crtime, 16);
892 	}
893 
894 	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
895 
896 	if (obj_type == DMU_OT_ZNODE) {
897 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
898 		    &empty_xattr, 8);
899 	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
900 	    pflags & ZFS_PROJID) {
901 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
902 		    NULL, &projid, 8);
903 	}
904 	if (obj_type == DMU_OT_ZNODE ||
905 	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
906 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
907 		    NULL, &rdev, 8);
908 	}
909 	if (obj_type == DMU_OT_ZNODE) {
910 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
911 		    NULL, &pflags, 8);
912 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
913 		    &acl_ids->z_fuid, 8);
914 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
915 		    &acl_ids->z_fgid, 8);
916 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
917 		    sizeof (uint64_t) * 4);
918 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
919 		    &acl_phys, sizeof (zfs_acl_phys_t));
920 	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
921 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
922 		    &acl_ids->z_aclp->z_acl_count, 8);
923 		locate.cb_aclp = acl_ids->z_aclp;
924 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
925 		    zfs_acl_data_locator, &locate,
926 		    acl_ids->z_aclp->z_acl_bytes);
927 		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
928 		    acl_ids->z_fuid, acl_ids->z_fgid);
929 	}
930 
931 	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
932 
933 	if (!(flag & IS_ROOT_NODE)) {
934 		/*
935 		 * The call to zfs_znode_alloc() may fail if memory is low
936 		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security().  Since
		 * the existing code is written such that zfs_mknode() cannot
		 * fail, retry until sufficient memory has been reclaimed.
940 		 */
941 		do {
942 			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
943 		} while (*zpp == NULL);
944 
945 		VERIFY(*zpp != NULL);
946 		VERIFY(dzp != NULL);
947 	} else {
948 		/*
949 		 * If we are creating the root node, the "parent" we
950 		 * passed in is the znode for the root.
951 		 */
952 		*zpp = dzp;
953 
954 		(*zpp)->z_sa_hdl = sa_hdl;
955 	}
956 
957 	(*zpp)->z_pflags = pflags;
958 	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
959 	(*zpp)->z_dnodesize = dnodesize;
960 	(*zpp)->z_projid = projid;
961 
962 	if (obj_type == DMU_OT_ZNODE ||
963 	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
964 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
965 	}
966 	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
967 	zfs_znode_hold_exit(zfsvfs, zh);
968 }
969 
970 /*
971  * Update in-core attributes.  It is assumed the caller will be doing an
972  * sa_bulk_update to push the changes out.
973  */
974 void
975 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
976 {
977 	xoptattr_t *xoap;
978 	boolean_t update_inode = B_FALSE;
979 
980 	xoap = xva_getxoptattr(xvap);
981 	ASSERT(xoap);
982 
983 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
984 		uint64_t times[2];
985 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
986 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
987 		    &times, sizeof (times), tx);
988 		XVA_SET_RTN(xvap, XAT_CREATETIME);
989 	}
990 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
991 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
992 		    zp->z_pflags, tx);
993 		XVA_SET_RTN(xvap, XAT_READONLY);
994 	}
995 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
996 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
997 		    zp->z_pflags, tx);
998 		XVA_SET_RTN(xvap, XAT_HIDDEN);
999 	}
1000 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1001 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1002 		    zp->z_pflags, tx);
1003 		XVA_SET_RTN(xvap, XAT_SYSTEM);
1004 	}
1005 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1006 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1007 		    zp->z_pflags, tx);
1008 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
1009 	}
1010 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1011 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1012 		    zp->z_pflags, tx);
1013 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1014 
1015 		update_inode = B_TRUE;
1016 	}
1017 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1018 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1019 		    zp->z_pflags, tx);
1020 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
1021 	}
1022 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1023 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1024 		    zp->z_pflags, tx);
1025 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
1026 
1027 		update_inode = B_TRUE;
1028 	}
1029 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1030 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1031 		    zp->z_pflags, tx);
1032 		XVA_SET_RTN(xvap, XAT_NODUMP);
1033 	}
1034 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1035 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1036 		    zp->z_pflags, tx);
1037 		XVA_SET_RTN(xvap, XAT_OPAQUE);
1038 	}
1039 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1040 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1041 		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
1042 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1043 	}
1044 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1045 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1046 		    zp->z_pflags, tx);
1047 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1048 	}
1049 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1050 		zfs_sa_set_scanstamp(zp, xvap, tx);
1051 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1052 	}
1053 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1054 		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1055 		    zp->z_pflags, tx);
1056 		XVA_SET_RTN(xvap, XAT_REPARSE);
1057 	}
1058 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1059 		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1060 		    zp->z_pflags, tx);
1061 		XVA_SET_RTN(xvap, XAT_OFFLINE);
1062 	}
1063 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1064 		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1065 		    zp->z_pflags, tx);
1066 		XVA_SET_RTN(xvap, XAT_SPARSE);
1067 	}
1068 	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1069 		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1070 		    zp->z_pflags, tx);
1071 		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1072 	}
1073 
1074 	if (update_inode)
1075 		zfs_set_inode_flags(zp, ZTOI(zp));
1076 }
1077 
1078 int
1079 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1080 {
1081 	dmu_object_info_t doi;
1082 	dmu_buf_t	*db;
1083 	znode_t		*zp;
1084 	znode_hold_t	*zh;
1085 	int err;
1086 	sa_handle_t	*hdl;
1087 
1088 	*zpp = NULL;
1089 
1090 again:
1091 	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1092 
1093 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1094 	if (err) {
1095 		zfs_znode_hold_exit(zfsvfs, zh);
1096 		return (err);
1097 	}
1098 
1099 	dmu_object_info_from_db(db, &doi);
1100 	if (doi.doi_bonus_type != DMU_OT_SA &&
1101 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1102 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1103 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1104 		sa_buf_rele(db, NULL);
1105 		zfs_znode_hold_exit(zfsvfs, zh);
1106 		return (SET_ERROR(EINVAL));
1107 	}
1108 
1109 	hdl = dmu_buf_get_user(db);
1110 	if (hdl != NULL) {
1111 		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we should never
		 * find an SA handle that doesn't know about the znode.
1118 		 */
1119 
1120 		ASSERT3P(zp, !=, NULL);
1121 
1122 		mutex_enter(&zp->z_lock);
1123 		ASSERT3U(zp->z_id, ==, obj_num);
1124 		/*
1125 		 * If zp->z_unlinked is set, the znode is already marked
1126 		 * for deletion and should not be discovered. Check this
1127 		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
1128 		 *
1129 		 * If igrab() returns NULL the VFS has independently
1130 		 * determined the inode should be evicted and has
1131 		 * called iput_final() to start the eviction process.
1132 		 * The SA handle is still valid but because the VFS
1133 		 * requires that the eviction succeed we must drop
1134 		 * our locks and references to allow the eviction to
1135 		 * complete.  The zfs_zget() may then be retried.
1136 		 *
1137 		 * This unlikely case could be optimized by registering
1138 		 * a sops->drop_inode() callback.  The callback would
1139 		 * need to detect the active SA hold thereby informing
1140 		 * the VFS that this inode should not be evicted.
1141 		 */
1142 		if (igrab(ZTOI(zp)) == NULL) {
1143 			if (zp->z_unlinked)
1144 				err = SET_ERROR(ENOENT);
1145 			else
1146 				err = SET_ERROR(EAGAIN);
1147 		} else {
1148 			*zpp = zp;
1149 			err = 0;
1150 		}
1151 
1152 		mutex_exit(&zp->z_lock);
1153 		sa_buf_rele(db, NULL);
1154 		zfs_znode_hold_exit(zfsvfs, zh);
1155 
1156 		if (err == EAGAIN) {
1157 			/* inode might need this to finish evict */
1158 			cond_resched();
1159 			goto again;
1160 		}
1161 		return (err);
1162 	}
1163 
1164 	/*
	 * Not found; create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
1173 	 */
1174 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1175 	    doi.doi_bonus_type, NULL);
1176 	if (zp == NULL) {
1177 		err = SET_ERROR(ENOENT);
1178 	} else {
1179 		*zpp = zp;
1180 	}
1181 	zfs_znode_hold_exit(zfsvfs, zh);
1182 	return (err);
1183 }
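
/*
 * A sketch of the expected zfs_zget() caller pattern (illustrative only,
 * error handling elided):
 *
 *	znode_t *zp;
 *	int err = zfs_zget(zfsvfs, obj_num, &zp);
 *	if (err == 0) {
 *		... use zp ...
 *		zrele(zp);	(drops the reference taken via igrab())
 *	}
 */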
1184 
1185 int
1186 zfs_rezget(znode_t *zp)
1187 {
1188 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1189 	dmu_object_info_t doi;
1190 	dmu_buf_t *db;
1191 	uint64_t obj_num = zp->z_id;
1192 	uint64_t mode;
1193 	uint64_t links;
1194 	sa_bulk_attr_t bulk[11];
1195 	int err;
1196 	int count = 0;
1197 	uint64_t gen;
1198 	uint64_t z_uid, z_gid;
1199 	uint64_t atime[2], mtime[2], ctime[2], btime[2];
1200 	inode_timespec_t tmp_ctime;
1201 	uint64_t projid = ZFS_DEFAULT_PROJID;
1202 	znode_hold_t *zh;
1203 
1204 	/*
	 * Skip ctldir znodes, otherwise they will always get invalidated.
	 * This causes odd behaviour for mounted snapdirs: on Linux >= 3.18,
	 * d_invalidate will detach the mountpoint and prevent anyone from
	 * automounting it again as long as someone is still using the
	 * detached mount.
1210 	 */
1211 	if (zp->z_is_ctldir)
1212 		return (0);
1213 
1214 	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1215 
1216 	mutex_enter(&zp->z_acl_lock);
1217 	if (zp->z_acl_cached) {
1218 		zfs_acl_free(zp->z_acl_cached);
1219 		zp->z_acl_cached = NULL;
1220 	}
1221 	mutex_exit(&zp->z_acl_lock);
1222 
1223 	rw_enter(&zp->z_xattr_lock, RW_WRITER);
1224 	if (zp->z_xattr_cached) {
1225 		nvlist_free(zp->z_xattr_cached);
1226 		zp->z_xattr_cached = NULL;
1227 	}
1228 	rw_exit(&zp->z_xattr_lock);
1229 
1230 	ASSERT(zp->z_sa_hdl == NULL);
1231 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1232 	if (err) {
1233 		zfs_znode_hold_exit(zfsvfs, zh);
1234 		return (err);
1235 	}
1236 
1237 	dmu_object_info_from_db(db, &doi);
1238 	if (doi.doi_bonus_type != DMU_OT_SA &&
1239 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1240 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1241 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1242 		sa_buf_rele(db, NULL);
1243 		zfs_znode_hold_exit(zfsvfs, zh);
1244 		return (SET_ERROR(EINVAL));
1245 	}
1246 
1247 	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1248 
1249 	/* reload cached values */
1250 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1251 	    &gen, sizeof (gen));
1252 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1253 	    &zp->z_size, sizeof (zp->z_size));
1254 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1255 	    &links, sizeof (links));
1256 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1257 	    &zp->z_pflags, sizeof (zp->z_pflags));
1258 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1259 	    &z_uid, sizeof (z_uid));
1260 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1261 	    &z_gid, sizeof (z_gid));
1262 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1263 	    &mode, sizeof (mode));
1264 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1265 	    &atime, 16);
1266 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1267 	    &mtime, 16);
1268 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1269 	    &ctime, 16);
1270 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
1271 
1272 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1273 		zfs_znode_dmu_fini(zp);
1274 		zfs_znode_hold_exit(zfsvfs, zh);
1275 		return (SET_ERROR(EIO));
1276 	}
1277 
1278 	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1279 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1280 		    &projid, 8);
1281 		if (err != 0 && err != ENOENT) {
1282 			zfs_znode_dmu_fini(zp);
1283 			zfs_znode_hold_exit(zfsvfs, zh);
1284 			return (SET_ERROR(err));
1285 		}
1286 	}
1287 
1288 	zp->z_projid = projid;
1289 	zp->z_mode = ZTOI(zp)->i_mode = mode;
1290 	zfs_uid_write(ZTOI(zp), z_uid);
1291 	zfs_gid_write(ZTOI(zp), z_gid);
1292 
1293 	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1294 	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1295 	ZFS_TIME_DECODE(&tmp_ctime, ctime);
1296 	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
1297 	ZFS_TIME_DECODE(&zp->z_btime, btime);
1298 
1299 	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
1300 		zfs_znode_dmu_fini(zp);
1301 		zfs_znode_hold_exit(zfsvfs, zh);
1302 		return (SET_ERROR(EIO));
1303 	}
1304 
1305 	set_nlink(ZTOI(zp), (uint32_t)links);
1306 	zfs_set_inode_flags(zp, ZTOI(zp));
1307 
1308 	zp->z_blksz = doi.doi_data_block_size;
1309 	zp->z_atime_dirty = B_FALSE;
1310 	zfs_znode_update_vfs(zp);
1311 
1312 	/*
1313 	 * If the file has zero links, then it has been unlinked on the send
1314 	 * side and it must be in the received unlinked set.
1315 	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1316 	 * stale data and to prevent automatic removal of the file in
1317 	 * zfs_zinactive().  The file will be removed either when it is removed
1318 	 * on the send side and the next incremental stream is received or
1319 	 * when the unlinked set gets processed.
1320 	 */
1321 	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1322 	if (zp->z_unlinked)
1323 		zfs_znode_dmu_fini(zp);
1324 
1325 	zfs_znode_hold_exit(zfsvfs, zh);
1326 
1327 	return (0);
1328 }
1329 
1330 void
1331 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1332 {
1333 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1334 	objset_t *os = zfsvfs->z_os;
1335 	uint64_t obj = zp->z_id;
1336 	uint64_t acl_obj = zfs_external_acl(zp);
1337 	znode_hold_t *zh;
1338 
1339 	zh = zfs_znode_hold_enter(zfsvfs, obj);
1340 	if (acl_obj) {
1341 		VERIFY(!zp->z_is_sa);
1342 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1343 	}
1344 	VERIFY(0 == dmu_object_free(os, obj, tx));
1345 	zfs_znode_dmu_fini(zp);
1346 	zfs_znode_hold_exit(zfsvfs, zh);
1347 }
1348 
1349 void
1350 zfs_zinactive(znode_t *zp)
1351 {
1352 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1353 	uint64_t z_id = zp->z_id;
1354 	znode_hold_t *zh;
1355 
1356 	ASSERT(zp->z_sa_hdl);
1357 
1358 	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
1360 	 */
1361 	zh = zfs_znode_hold_enter(zfsvfs, z_id);
1362 
1363 	mutex_enter(&zp->z_lock);
1364 
1365 	/*
1366 	 * If this was the last reference to a file with no links, remove
1367 	 * the file from the file system unless the file system is mounted
1368 	 * read-only.  That can happen, for example, if the file system was
1369 	 * originally read-write, the file was opened, then unlinked and
1370 	 * the file system was made read-only before the file was finally
1371 	 * closed.  The file will remain in the unlinked set.
1372 	 */
1373 	if (zp->z_unlinked) {
1374 		ASSERT(!zfsvfs->z_issnap);
1375 		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
1376 			mutex_exit(&zp->z_lock);
1377 			zfs_znode_hold_exit(zfsvfs, zh);
1378 			zfs_rmnode(zp);
1379 			return;
1380 		}
1381 	}
1382 
1383 	mutex_exit(&zp->z_lock);
1384 	zfs_znode_dmu_fini(zp);
1385 
1386 	zfs_znode_hold_exit(zfsvfs, zh);
1387 }
1388 
1389 #if defined(HAVE_INODE_TIMESPEC64_TIMES)
1390 #define	zfs_compare_timespec timespec64_compare
1391 #else
1392 #define	zfs_compare_timespec timespec_compare
1393 #endif
1394 
1395 /*
1396  * Determine whether the znode's atime must be updated.  The logic mostly
1397  * duplicates the Linux kernel's relatime_need_update() functionality.
1398  * This function is only called if the underlying filesystem actually has
1399  * atime updates enabled.
1400  */
1401 boolean_t
1402 zfs_relatime_need_update(const struct inode *ip)
1403 {
1404 	inode_timespec_t now, tmp_ctime;
1405 
1406 	gethrestime(&now);
1407 	/*
1408 	 * In relatime mode, only update the atime if the previous atime
1409 	 * is earlier than either the ctime or mtime or if at least a day
1410 	 * has passed since the last update of atime.
1411 	 */
1412 	if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
1413 		return (B_TRUE);
1414 
1415 	tmp_ctime = zpl_inode_get_ctime(ip);
1416 	if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0)
1417 		return (B_TRUE);
1418 
1419 	if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
1420 		return (B_TRUE);
1421 
1422 	return (B_FALSE);
1423 }
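
/*
 * Worked example (illustrative): if the atime is two hours old but the
 * mtime was just updated, the mtime comparison above returns true and
 * the atime is refreshed.  If atime, mtime, and ctime are all 25 hours
 * old, neither timestamp comparison fires, but the 24-hour rule does.
 */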
1424 
1425 /*
1426  * Prepare to update znode time stamps.
1427  *
1428  *	IN:	zp	- znode requiring timestamp update
1429  *		flag	- ATTR_MTIME, ATTR_CTIME flags
1430  *
1431  *	OUT:	zp	- z_seq
1432  *		mtime	- new mtime
1433  *		ctime	- new ctime
1434  *
1435  *	Note: We don't update atime here, because we rely on Linux VFS to do
1436  *	atime updating.
1437  */
1438 void
1439 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1440     uint64_t ctime[2])
1441 {
1442 	inode_timespec_t now, tmp_ctime;
1443 
1444 	gethrestime(&now);
1445 
1446 	zp->z_seq++;
1447 
1448 	if (flag & ATTR_MTIME) {
1449 		ZFS_TIME_ENCODE(&now, mtime);
1450 		ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
1451 		if (ZTOZSB(zp)->z_use_fuids) {
1452 			zp->z_pflags |= (ZFS_ARCHIVE |
1453 			    ZFS_AV_MODIFIED);
1454 		}
1455 	}
1456 
1457 	if (flag & ATTR_CTIME) {
1458 		ZFS_TIME_ENCODE(&now, ctime);
1459 		ZFS_TIME_DECODE(&tmp_ctime, ctime);
1460 		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
1461 		if (ZTOZSB(zp)->z_use_fuids)
1462 			zp->z_pflags |= ZFS_ARCHIVE;
1463 	}
1464 }
1465 
1466 /*
1467  * Grow the block size for a file.
1468  *
1469  *	IN:	zp	- znode of file to free data in.
1470  *		size	- requested block size
1471  *		tx	- open transaction.
1472  *
1473  * NOTE: this function assumes that the znode is write locked.
1474  */
1475 void
1476 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1477 {
1478 	int		error;
1479 	u_longlong_t	dummy;
1480 
1481 	if (size <= zp->z_blksz)
1482 		return;
1483 	/*
1484 	 * If the file size is already greater than the current blocksize,
1485 	 * we will not grow.  If there is more than one block in a file,
1486 	 * the blocksize cannot change.
1487 	 */
1488 	if (zp->z_blksz && zp->z_size > zp->z_blksz)
1489 		return;
1490 
1491 	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
1492 	    size, 0, tx);
1493 
1494 	if (error == ENOTSUP)
1495 		return;
1496 	ASSERT0(error);
1497 
1498 	/* What blocksize did we actually get? */
1499 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1500 }
1501 
1502 /*
1503  * Increase the file length
1504  *
1505  *	IN:	zp	- znode of file to free data in.
1506  *		end	- new end-of-file
1507  *
1508  *	RETURN:	0 on success, error code on failure
1509  */
1510 static int
1511 zfs_extend(znode_t *zp, uint64_t end)
1512 {
1513 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1514 	dmu_tx_t *tx;
1515 	zfs_locked_range_t *lr;
1516 	uint64_t newblksz;
1517 	int error;
1518 
1519 	/*
	 * We will change z_size, so lock the whole file.
1521 	 */
1522 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1523 
1524 	/*
1525 	 * Nothing to do if file already at desired length.
1526 	 */
1527 	if (end <= zp->z_size) {
1528 		zfs_rangelock_exit(lr);
1529 		return (0);
1530 	}
1531 	tx = dmu_tx_create(zfsvfs->z_os);
1532 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1533 	zfs_sa_upgrade_txholds(tx, zp);
1534 	if (end > zp->z_blksz &&
1535 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1536 		/*
1537 		 * We are growing the file past the current block size.
1538 		 */
1539 		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
1540 			/*
1541 			 * File's blocksize is already larger than the
1542 			 * "recordsize" property.  Only let it grow to
1543 			 * the next power of 2.
1544 			 */
1545 			ASSERT(!ISP2(zp->z_blksz));
1546 			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1547 		} else {
1548 			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
1549 		}
1550 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1551 	} else {
1552 		newblksz = 0;
1553 	}
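
	/*
	 * Illustrative example: with z_blksz = 4K (a power of two below the
	 * dataset's default 128K recordsize) and end = 200000, the branch
	 * above picks newblksz = MIN(200000, 131072) = 128K.
	 */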
1554 
1555 	error = dmu_tx_assign(tx, TXG_WAIT);
1556 	if (error) {
1557 		dmu_tx_abort(tx);
1558 		zfs_rangelock_exit(lr);
1559 		return (error);
1560 	}
1561 
1562 	if (newblksz)
1563 		zfs_grow_blocksize(zp, newblksz, tx);
1564 
1565 	zp->z_size = end;
1566 
1567 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
1568 	    &zp->z_size, sizeof (zp->z_size), tx));
1569 
1570 	zfs_rangelock_exit(lr);
1571 
1572 	dmu_tx_commit(tx);
1573 
1574 	return (0);
1575 }
1576 
1577 /*
1578  * zfs_zero_partial_page - Modeled after update_pages() but
1579  * with different arguments and semantics for use by zfs_freesp().
1580  *
1581  * Zeroes a piece of a single page cache entry for zp at offset
1582  * start and length len.
1583  *
1584  * Caller must acquire a range lock on the file for the region
1585  * being zeroed in order that the ARC and page cache stay in sync.
1586  */
1587 static void
1588 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1589 {
1590 	struct address_space *mp = ZTOI(zp)->i_mapping;
1591 	struct page *pp;
1592 	int64_t	off;
1593 	void *pb;
1594 
1595 	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
1596 
1597 	off = start & (PAGE_SIZE - 1);
1598 	start &= PAGE_MASK;
1599 
1600 	pp = find_lock_page(mp, start >> PAGE_SHIFT);
1601 	if (pp) {
1602 		if (mapping_writably_mapped(mp))
1603 			flush_dcache_page(pp);
1604 
1605 		pb = kmap(pp);
1606 		memset(pb + off, 0, len);
1607 		kunmap(pp);
1608 
1609 		if (mapping_writably_mapped(mp))
1610 			flush_dcache_page(pp);
1611 
1612 		mark_page_accessed(pp);
1613 		SetPageUptodate(pp);
1614 		ClearPageError(pp);
1615 		unlock_page(pp);
1616 		put_page(pp);
1617 	}
1618 }
1619 
1620 /*
1621  * Free space in a file.
1622  *
1623  *	IN:	zp	- znode of file to free data in.
1624  *		off	- start of section to free.
1625  *		len	- length of section to free.
1626  *
1627  *	RETURN:	0 on success, error code on failure
1628  */
1629 static int
1630 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1631 {
1632 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1633 	zfs_locked_range_t *lr;
1634 	int error;
1635 
1636 	/*
1637 	 * Lock the range being freed.
1638 	 */
1639 	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
1640 
1641 	/*
1642 	 * Nothing to do if file already at desired length.
1643 	 */
1644 	if (off >= zp->z_size) {
1645 		zfs_rangelock_exit(lr);
1646 		return (0);
1647 	}
1648 
1649 	if (off + len > zp->z_size)
1650 		len = zp->z_size - off;
1651 
1652 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1653 
1654 	/*
1655 	 * Zero partial page cache entries.  This must be done under a
1656 	 * range lock in order to keep the ARC and page cache in sync.
1657 	 */
1658 	if (zn_has_cached_data(zp, off, off + len - 1)) {
1659 		loff_t first_page, last_page, page_len;
1660 		loff_t first_page_offset, last_page_offset;
1661 
1662 		/* first possible full page in hole */
1663 		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
1664 		/* last page of hole */
1665 		last_page = (off + len) >> PAGE_SHIFT;
1666 
1667 		/* offset of first_page */
1668 		first_page_offset = first_page << PAGE_SHIFT;
1669 		/* offset of last_page */
1670 		last_page_offset = last_page << PAGE_SHIFT;
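
		/*
		 * Worked example (illustrative, PAGE_SIZE = 4096): punching
		 * off = 1000, len = 10000 yields first_page = 1, last_page =
		 * 2, first_page_offset = 4096 and last_page_offset = 8192.
		 * Bytes 4096-8191 are truncated below as whole pages; bytes
		 * 1000-4095 and 8192-10999 are zeroed as partial pages.
		 */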
1671 
1672 		/* truncate whole pages */
1673 		if (last_page_offset > first_page_offset) {
1674 			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1675 			    first_page_offset, last_page_offset - 1);
1676 		}
1677 
1678 		/* truncate sub-page ranges */
1679 		if (first_page > last_page) {
1680 			/* entire punched area within a single page */
1681 			zfs_zero_partial_page(zp, off, len);
1682 		} else {
1683 			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
1685 			if (page_len > 0)
1686 				zfs_zero_partial_page(zp, off, page_len);
1687 
1688 			/* end of punched area at the beginning of a page */
1689 			page_len = off + len - last_page_offset;
1690 			if (page_len > 0)
1691 				zfs_zero_partial_page(zp, last_page_offset,
1692 				    page_len);
1693 		}
1694 	}
1695 	zfs_rangelock_exit(lr);
1696 
1697 	return (error);
1698 }
1699 
1700 /*
1701  * Truncate a file
1702  *
1703  *	IN:	zp	- znode of file to free data in.
1704  *		end	- new end-of-file.
1705  *
1706  *	RETURN:	0 on success, error code on failure
1707  */
1708 static int
1709 zfs_trunc(znode_t *zp, uint64_t end)
1710 {
1711 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1712 	dmu_tx_t *tx;
1713 	zfs_locked_range_t *lr;
1714 	int error;
1715 	sa_bulk_attr_t bulk[2];
1716 	int count = 0;
1717 
1718 	/*
1719 	 * We will change zp_size, lock the whole file.
	 * We will change z_size, so lock the whole file.
1721 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1722 
1723 	/*
1724 	 * Nothing to do if file already at desired length.
1725 	 */
1726 	if (end >= zp->z_size) {
1727 		zfs_rangelock_exit(lr);
1728 		return (0);
1729 	}
1730 
1731 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1732 	    DMU_OBJECT_END);
1733 	if (error) {
1734 		zfs_rangelock_exit(lr);
1735 		return (error);
1736 	}
1737 	tx = dmu_tx_create(zfsvfs->z_os);
1738 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1739 	zfs_sa_upgrade_txholds(tx, zp);
1740 	dmu_tx_mark_netfree(tx);
1741 	error = dmu_tx_assign(tx, TXG_WAIT);
1742 	if (error) {
1743 		dmu_tx_abort(tx);
1744 		zfs_rangelock_exit(lr);
1745 		return (error);
1746 	}
1747 
1748 	zp->z_size = end;
1749 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1750 	    NULL, &zp->z_size, sizeof (zp->z_size));
1751 
1752 	if (end == 0) {
1753 		zp->z_pflags &= ~ZFS_SPARSE;
1754 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1755 		    NULL, &zp->z_pflags, 8);
1756 	}
1757 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1758 
1759 	dmu_tx_commit(tx);
1760 	zfs_rangelock_exit(lr);
1761 
1762 	return (0);
1763 }
1764 
1765 /*
1766  * Free space in a file
1767  *
1768  *	IN:	zp	- znode of file to free data in.
1769  *		off	- start of range
 *		len	- length of range (0 => truncate the file at off)
1771  *		flag	- current file open mode flags.
1772  *		log	- TRUE if this action should be logged
1773  *
1774  *	RETURN:	0 on success, error code on failure
1775  */
1776 int
1777 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1778 {
1779 	dmu_tx_t *tx;
1780 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1781 	zilog_t *zilog = zfsvfs->z_log;
1782 	uint64_t mode;
1783 	uint64_t mtime[2], ctime[2];
1784 	sa_bulk_attr_t bulk[3];
1785 	int count = 0;
1786 	int error;
1787 
1788 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1789 	    sizeof (mode))) != 0)
1790 		return (error);
1791 
1792 	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
1794 		if (error == 0 && log)
1795 			goto log;
1796 		goto out;
1797 	}
1798 
1799 	if (len == 0) {
1800 		error = zfs_trunc(zp, off);
1801 	} else {
1802 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1803 		    off + len > zp->z_size)
1804 			error = zfs_extend(zp, off+len);
1805 	}
1806 	if (error || !log)
1807 		goto out;
1808 log:
1809 	tx = dmu_tx_create(zfsvfs->z_os);
1810 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1811 	zfs_sa_upgrade_txholds(tx, zp);
1812 	error = dmu_tx_assign(tx, TXG_WAIT);
1813 	if (error) {
1814 		dmu_tx_abort(tx);
1815 		goto out;
1816 	}
1817 
1818 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1819 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1820 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1821 	    NULL, &zp->z_pflags, 8);
1822 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1823 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1824 	ASSERT(error == 0);
1825 
1826 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1827 
1828 	dmu_tx_commit(tx);
1829 
1830 	zfs_znode_update_vfs(zp);
1831 	error = 0;
1832 
1833 out:
1834 	/*
1835 	 * Truncate the page cache - for file truncate operations, use
1836 	 * the purpose-built API for truncations.  For punching operations,
1837 	 * the truncation is handled under a range lock in zfs_free_range.
1838 	 */
1839 	if (len == 0)
1840 		truncate_setsize(ZTOI(zp), off);
1841 	return (error);
1842 }
1843 
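/*
 * Create the initial on-disk layout for a new ZPL dataset: the master
 * node, the SA attribute registry (for pool versions that support it),
 * the unlinked ("delete queue") set, and the root directory znode.
 */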
1844 void
1845 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1846 {
1847 	struct super_block *sb;
1848 	zfsvfs_t	*zfsvfs;
1849 	uint64_t	moid, obj, sa_obj, version;
1850 	uint64_t	sense = ZFS_CASE_SENSITIVE;
1851 	uint64_t	norm = 0;
1852 	nvpair_t	*elem;
1853 	int		size;
1854 	int		error;
1855 	int		i;
1856 	znode_t		*rootzp = NULL;
1857 	vattr_t		vattr;
1858 	znode_t		*zp;
1859 	zfs_acl_ids_t	acl_ids;
1860 
1861 	/*
1862 	 * First attempt to create master node.
1863 	 */
1864 	/*
1865 	 * In an empty objset, there are no blocks to read and thus
1866 	 * there can be no i/o errors (which we assert below).
1867 	 */
1868 	moid = MASTER_NODE_OBJ;
1869 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1870 	    DMU_OT_NONE, 0, tx);
1871 	ASSERT(error == 0);
1872 
1873 	/*
1874 	 * Set starting attributes.
1875 	 */
1876 	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1877 	elem = NULL;
1878 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1879 		/* For the moment we expect all zpl props to be uint64_ts */
1880 		uint64_t val;
1881 		const char *name;
1882 
1883 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1884 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
1885 		name = nvpair_name(elem);
1886 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1887 			if (val < version)
1888 				version = val;
1889 		} else {
1890 			error = zap_update(os, moid, name, 8, 1, &val, tx);
1891 		}
1892 		ASSERT(error == 0);
1893 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1894 			norm = val;
1895 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1896 			sense = val;
1897 	}
1898 	ASSERT(version != 0);
1899 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1900 	ASSERT(error == 0);
1901 
1902 	/*
1903 	 * Create zap object used for SA attribute registration
1904 	 */
1905 
1906 	if (version >= ZPL_VERSION_SA) {
1907 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1908 		    DMU_OT_NONE, 0, tx);
1909 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1910 		ASSERT(error == 0);
1911 	} else {
1912 		sa_obj = 0;
1913 	}
1914 	/*
1915 	 * Create a delete queue.
1916 	 */
1917 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1918 
1919 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1920 	ASSERT(error == 0);
1921 
1922 	/*
1923 	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
1924 	 * to allow zfs_mknode to work.
1925 	 */
1926 	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1927 	vattr.va_mode = S_IFDIR|0755;
1928 	vattr.va_uid = crgetuid(cr);
1929 	vattr.va_gid = crgetgid(cr);
1930 
1931 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1932 	rootzp->z_unlinked = B_FALSE;
1933 	rootzp->z_atime_dirty = B_FALSE;
1934 	rootzp->z_is_sa = USE_SA(version, os);
1935 	rootzp->z_pflags = 0;
1936 
1937 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1938 	zfsvfs->z_os = os;
1939 	zfsvfs->z_parent = zfsvfs;
1940 	zfsvfs->z_version = version;
1941 	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1942 	zfsvfs->z_use_sa = USE_SA(version, os);
1943 	zfsvfs->z_norm = norm;
1944 
1945 	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
1946 	sb->s_fs_info = zfsvfs;
1947 
1948 	ZTOI(rootzp)->i_sb = sb;
1949 
1950 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1951 	    &zfsvfs->z_attr_table);
1952 
1953 	ASSERT(error == 0);
1954 
1955 	/*
1956 	 * Fold case on file systems that are always or sometimes case
1957 	 * insensitive.
1958 	 */
1959 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1960 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1961 
1962 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1963 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1964 	    offsetof(znode_t, z_link_node));
1965 
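	/*
	 * Set up the per-object hold arrays normally created at mount
	 * time, which zfs_mknode() uses to serialize znode creation by
	 * object number.
	 */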
1966 	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
1967 	zfsvfs->z_hold_size = size;
1968 	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1969 	    KM_SLEEP);
1970 	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1971 	for (i = 0; i != size; i++) {
1972 		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
1973 		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
1974 		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
1975 	}
1976 
	VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids, zfs_init_idmap));
1979 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1980 	ASSERT3P(zp, ==, rootzp);
1981 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1982 	ASSERT(error == 0);
1983 	zfs_acl_ids_free(&acl_ids);
1984 
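	/*
	 * Tear down the temporary root znode by hand: drop its inode
	 * reference count, destroy its SA handle, and return it to the
	 * znode cache.
	 */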
1985 	atomic_set(&ZTOI(rootzp)->i_count, 0);
1986 	sa_handle_destroy(rootzp->z_sa_hdl);
1987 	kmem_cache_free(znode_cache, rootzp);
1988 
1989 	for (i = 0; i != size; i++) {
1990 		avl_destroy(&zfsvfs->z_hold_trees[i]);
1991 		mutex_destroy(&zfsvfs->z_hold_locks[i]);
1992 	}
1993 
1994 	mutex_destroy(&zfsvfs->z_znodes_lock);
1995 
1996 	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1997 	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
1998 	kmem_free(sb, sizeof (struct super_block));
1999 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
2000 }
2001 #endif /* _KERNEL */
2002 
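/*
 * Locate the SA master node (absent on pre-SA pool versions, in which
 * case sa_obj stays 0) and set up the ZPL attribute table for this
 * objset.
 */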
2003 static int
2004 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
2005 {
2006 	uint64_t sa_obj = 0;
2007 	int error;
2008 
2009 	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
2010 	if (error != 0 && error != ENOENT)
2011 		return (error);
2012 
2013 	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
2014 	return (error);
2015 }
2016 
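/*
 * Hold the object's bonus buffer and obtain a private SA handle on it,
 * after verifying that the bonus actually holds znode data (either a
 * DMU_OT_SA bonus or a full-sized legacy znode_phys_t).
 */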
2017 static int
2018 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
2019     dmu_buf_t **db, const void *tag)
2020 {
2021 	dmu_object_info_t doi;
2022 	int error;
2023 
2024 	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
2025 		return (error);
2026 
2027 	dmu_object_info_from_db(*db, &doi);
2028 	if ((doi.doi_bonus_type != DMU_OT_SA &&
2029 	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
2030 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
2031 	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
2032 		sa_buf_rele(*db, tag);
2033 		return (SET_ERROR(ENOTSUP));
2034 	}
2035 
2036 	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2037 	if (error != 0) {
2038 		sa_buf_rele(*db, tag);
2039 		return (error);
2040 	}
2041 
2042 	return (0);
2043 }
2044 
2045 static void
2046 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag)
2047 {
2048 	sa_handle_destroy(hdl);
2049 	sa_buf_rele(db, tag);
2050 }
2051 
2052 /*
2053  * Given an object number, return its parent object number and whether
2054  * or not the object is an extended attribute directory.
2055  */
2056 static int
2057 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2058     uint64_t *pobjp, int *is_xattrdir)
2059 {
2060 	uint64_t parent;
2061 	uint64_t pflags;
2062 	uint64_t mode;
2063 	uint64_t parent_mode;
2064 	sa_bulk_attr_t bulk[3];
2065 	sa_handle_t *sa_hdl;
2066 	dmu_buf_t *sa_db;
2067 	int count = 0;
2068 	int error;
2069 
2070 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2071 	    &parent, sizeof (parent));
2072 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2073 	    &pflags, sizeof (pflags));
2074 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2075 	    &mode, sizeof (mode));
2076 
2077 	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2078 		return (error);
2079 
2080 	/*
2081 	 * When a link is removed its parent pointer is not changed and will
2082 	 * be invalid.  There are two cases where a link is removed but the
2083 	 * file stays around, when it goes to the delete queue and when there
2084 	 * are additional links.
2085 	 */
2086 	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2087 	if (error != 0)
2088 		return (error);
2089 
	error = sa_lookup(sa_hdl, sa_table[ZPL_MODE], &parent_mode,
	    sizeof (parent_mode));
2091 	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2092 	if (error != 0)
2093 		return (error);
2094 
2095 	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2096 
2097 	/*
2098 	 * Extended attributes can be applied to files, directories, etc.
2099 	 * Otherwise the parent must be a directory.
2100 	 */
2101 	if (!*is_xattrdir && !S_ISDIR(parent_mode))
2102 		return (SET_ERROR(EINVAL));
2103 
2104 	*pobjp = parent;
2105 
2106 	return (0);
2107 }
2108 
2109 /*
2110  * Given an object number, return some zpl level statistics
2111  */
2112 static int
2113 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2114     zfs_stat_t *sb)
2115 {
2116 	sa_bulk_attr_t bulk[4];
2117 	int count = 0;
2118 
2119 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2120 	    &sb->zs_mode, sizeof (sb->zs_mode));
2121 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2122 	    &sb->zs_gen, sizeof (sb->zs_gen));
2123 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2124 	    &sb->zs_links, sizeof (sb->zs_links));
2125 	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2126 	    &sb->zs_ctime, sizeof (sb->zs_ctime));
2127 
2128 	return (sa_bulk_lookup(hdl, bulk, count));
2129 }
2130 
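/*
 * Reconstruct the path of an object by walking its parent pointers up
 * to the root.  The path is assembled backwards from the end of 'buf',
 * prepending one "/component" at a time until an object that is its
 * own parent (the root) is reached.
 */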
2131 static int
2132 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2133     sa_attr_type_t *sa_table, char *buf, int len)
2134 {
2135 	sa_handle_t *sa_hdl;
2136 	sa_handle_t *prevhdl = NULL;
2137 	dmu_buf_t *prevdb = NULL;
2138 	dmu_buf_t *sa_db = NULL;
2139 	char *path = buf + len - 1;
2140 	int error;
2141 
2142 	*path = '\0';
2143 	sa_hdl = hdl;
2144 
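	/*
	 * An object on the unlinked (delete) set no longer has a valid
	 * path; report it as stale rather than walking a dangling
	 * parent pointer.
	 */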
2145 	uint64_t deleteq_obj;
2146 	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2147 	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2148 	error = zap_lookup_int(osp, deleteq_obj, obj);
2149 	if (error == 0) {
		return (SET_ERROR(ESTALE));
2151 	} else if (error != ENOENT) {
2152 		return (error);
2153 	}
2154 
2155 	for (;;) {
2156 		uint64_t pobj = 0;
2157 		char component[MAXNAMELEN + 2];
2158 		size_t complen;
2159 		int is_xattrdir = 0;
2160 
2161 		if (prevdb) {
2162 			ASSERT(prevhdl != NULL);
2163 			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2164 		}
2165 
2166 		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2167 		    &is_xattrdir)) != 0)
2168 			break;
2169 
2170 		if (pobj == obj) {
2171 			if (path[0] != '/')
2172 				*--path = '/';
2173 			break;
2174 		}
2175 
2176 		component[0] = '/';
2177 		if (is_xattrdir) {
2178 			strcpy(component + 1, "<xattrdir>");
2179 		} else {
2180 			error = zap_value_search(osp, pobj, obj,
2181 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
2182 			if (error != 0)
2183 				break;
2184 		}
2185 
2186 		complen = strlen(component);
2187 		path -= complen;
2188 		ASSERT(path >= buf);
2189 		memcpy(path, component, complen);
2190 		obj = pobj;
2191 
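		/*
		 * Remember the current handle so it can be released at
		 * the top of the next iteration, or restored if grabbing
		 * the parent's handle fails below.
		 */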
2192 		if (sa_hdl != hdl) {
2193 			prevhdl = sa_hdl;
2194 			prevdb = sa_db;
2195 		}
2196 		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2197 		if (error != 0) {
2198 			sa_hdl = prevhdl;
2199 			sa_db = prevdb;
2200 			break;
2201 		}
2202 	}
2203 
2204 	if (sa_hdl != NULL && sa_hdl != hdl) {
2205 		ASSERT(sa_db != NULL);
2206 		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2207 	}
2208 
2209 	if (error == 0)
2210 		(void) memmove(buf, path, buf + len - path);
2211 
2212 	return (error);
2213 }
2214 
2215 int
2216 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2217 {
2218 	sa_attr_type_t *sa_table;
2219 	sa_handle_t *hdl;
2220 	dmu_buf_t *db;
2221 	int error;
2222 
2223 	error = zfs_sa_setup(osp, &sa_table);
2224 	if (error != 0)
2225 		return (error);
2226 
2227 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2228 	if (error != 0)
2229 		return (error);
2230 
2231 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2232 
2233 	zfs_release_sa_handle(hdl, db, FTAG);
2234 	return (error);
2235 }
2236 
2237 int
2238 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2239     char *buf, int len)
2240 {
2241 	char *path = buf + len - 1;
2242 	sa_attr_type_t *sa_table;
2243 	sa_handle_t *hdl;
2244 	dmu_buf_t *db;
2245 	int error;
2246 
2247 	*path = '\0';
2248 
2249 	error = zfs_sa_setup(osp, &sa_table);
2250 	if (error != 0)
2251 		return (error);
2252 
2253 	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2254 	if (error != 0)
2255 		return (error);
2256 
2257 	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2258 	if (error != 0) {
2259 		zfs_release_sa_handle(hdl, db, FTAG);
2260 		return (error);
2261 	}
2262 
2263 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2264 
2265 	zfs_release_sa_handle(hdl, db, FTAG);
2266 	return (error);
2267 }
2268 
2269 /*
2270  * Read a property stored within the master node.
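 * If 'os' is NULL, or the property has never been set, the property's
 * default value is returned (when one is defined).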
2271  */
2272 int
2273 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2274 {
2275 	uint64_t *cached_copy = NULL;
2276 
2277 	/*
2278 	 * Figure out where in the objset_t the cached copy would live, if it
2279 	 * is available for the requested property.
2280 	 */
2281 	if (os != NULL) {
2282 		switch (prop) {
2283 		case ZFS_PROP_VERSION:
2284 			cached_copy = &os->os_version;
2285 			break;
2286 		case ZFS_PROP_NORMALIZE:
2287 			cached_copy = &os->os_normalization;
2288 			break;
2289 		case ZFS_PROP_UTF8ONLY:
2290 			cached_copy = &os->os_utf8only;
2291 			break;
2292 		case ZFS_PROP_CASE:
2293 			cached_copy = &os->os_casesensitivity;
2294 			break;
2295 		default:
2296 			break;
2297 		}
2298 	}
2299 	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2300 		*value = *cached_copy;
2301 		return (0);
2302 	}
2303 
2304 	/*
2305 	 * If the property wasn't cached, look up the file system's value for
2306 	 * the property. For the version property, we look up a slightly
2307 	 * different string.
2308 	 */
2309 	const char *pname;
2310 	int error = ENOENT;
2311 	if (prop == ZFS_PROP_VERSION)
2312 		pname = ZPL_VERSION_STR;
2313 	else
2314 		pname = zfs_prop_to_name(prop);
2315 
2316 	if (os != NULL) {
2317 		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2318 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2319 	}
2320 
2321 	if (error == ENOENT) {
2322 		/* No value set, use the default value */
2323 		switch (prop) {
2324 		case ZFS_PROP_VERSION:
2325 			*value = ZPL_VERSION;
2326 			break;
2327 		case ZFS_PROP_NORMALIZE:
2328 		case ZFS_PROP_UTF8ONLY:
2329 			*value = 0;
2330 			break;
2331 		case ZFS_PROP_CASE:
2332 			*value = ZFS_CASE_SENSITIVE;
2333 			break;
2334 		case ZFS_PROP_ACLTYPE:
2335 			*value = ZFS_ACLTYPE_OFF;
2336 			break;
2337 		default:
2338 			return (error);
2339 		}
2340 		error = 0;
2341 	}
2342 
2343 	/*
2344 	 * If one of the methods for getting the property value above worked,
2345 	 * copy it into the objset_t's cache.
2346 	 */
2347 	if (error == 0 && cached_copy != NULL) {
2348 		*cached_copy = *value;
2349 	}
2350 
2351 	return (error);
2352 }
2353 
2354 #if defined(_KERNEL)
2355 EXPORT_SYMBOL(zfs_create_fs);
2356 EXPORT_SYMBOL(zfs_obj_to_path);
2357 
2358 /* CSTYLED */
2359 module_param(zfs_object_mutex_size, uint, 0644);
2360 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
2361 module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
    "(debug - leaks space into the unlinked set)");
2364 #endif
2365