xref: /dragonfly/sys/vfs/hammer2/hammer2_vnops.c (revision 666e46d7)
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
39 #include <sys/buf.h>
40 #include <sys/proc.h>
41 #include <sys/namei.h>
42 #include <sys/mount.h>
43 #include <sys/vnode.h>
44 #include <sys/mountctl.h>
45 #include <sys/dirent.h>
46 #include <sys/uio.h>
47 
48 #include "hammer2.h"
49 
50 #define ZFOFFSET	(-2LL)
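/*
 * ZFOFFSET is a sentinel bio2 offset meaning "zero-fill": the strategy
 * code satisfies such a read by zeroing the buffer instead of issuing
 * device I/O.  It complements NOOFFSET, which in this file means the
 * logical->physical translation is unknown or the data is
 * inode-embedded.
 */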
51 
52 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
53 				int seqcount);
54 static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag,
55 			      int seqcount);
56 static hammer2_off_t hammer2_assign_physical(hammer2_inode_t *ip,
57 				hammer2_key_t lbase, int lblksize, int *errorp);
58 static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
59 static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);
60 
61 static __inline
62 void
63 hammer2_knote(struct vnode *vp, int flags)
64 {
65 	if (flags)
66 		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
67 }
68 
69 /*
70  * Last reference to a vnode is going away but it is still cached.
71  */
72 static
73 int
74 hammer2_vop_inactive(struct vop_inactive_args *ap)
75 {
76 	struct vnode *vp;
77 	struct hammer2_inode *ip;
78 #if 0
79 	struct hammer2_mount *hmp;
80 #endif
81 
82 	vp = ap->a_vp;
83 	ip = VTOI(vp);
84 
85 	/*
86 	 * Degenerate case
87 	 */
88 	if (ip == NULL) {
89 		vrecycle(vp);
90 		return (0);
91 	}
92 
93 	/*
94 	 * Detect updates to the embedded data which may be synchronized by
95 	 * the strategy code.  Simply mark the inode modified so it gets
96 	 * picked up by our normal flush.
97 	 */
98 	if (ip->chain.flags & HAMMER2_CHAIN_DIRTYEMBED) {
99 		hammer2_inode_lock_ex(ip);
100 		atomic_clear_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
101 		hammer2_chain_modify(ip->hmp, &ip->chain, 0);
102 		hammer2_inode_unlock_ex(ip);
103 	}
104 
105 	/*
106 	 * Check for deleted inodes and recycle immediately.
107 	 */
108 	if (ip->chain.flags & HAMMER2_CHAIN_DELETED) {
109 		vrecycle(vp);
110 	}
111 	return (0);
112 }
113 
114 /*
115  * Reclaim a vnode so that it can be reused; after the inode is
116  * disassociated, the filesystem must manage it alone.
117  */
118 static
119 int
120 hammer2_vop_reclaim(struct vop_reclaim_args *ap)
121 {
122 	struct hammer2_inode *ip;
123 	struct hammer2_mount *hmp;
124 	struct vnode *vp;
125 
126 	vp = ap->a_vp;
127 	ip = VTOI(vp);
128 	if (ip == NULL)
129 		return(0);
130 	hmp = ip->hmp;
131 
132 	/*
133 	 * Set SUBMODIFIED so we can detect and propagate the DESTROYED
134 	 * bit in the flush code.
135 	 */
136 	hammer2_inode_lock_ex(ip);
137 	vp->v_data = NULL;
138 	ip->vp = NULL;
139 	if (ip->chain.flags & HAMMER2_CHAIN_DELETED) {
140 		atomic_set_int(&ip->chain.flags, HAMMER2_CHAIN_DESTROYED |
141 						 HAMMER2_CHAIN_SUBMODIFIED);
142 	}
143 	hammer2_chain_flush(hmp, &ip->chain, 0);
144 	hammer2_inode_unlock_ex(ip);
145 	hammer2_chain_drop(hmp, &ip->chain);	/* vp ref */
146 
147 	/*
148 	 * XXX handle background sync when ip dirty, kernel will no longer
149 	 * notify us regarding this inode because there is no longer a
150 	 * vnode attached to it.
151 	 */
152 
153 	return (0);
154 }
155 
156 static
157 int
158 hammer2_vop_fsync(struct vop_fsync_args *ap)
159 {
160 	struct hammer2_inode *ip;
161 	struct hammer2_mount *hmp;
162 	struct vnode *vp;
163 
164 	vp = ap->a_vp;
165 	ip = VTOI(vp);
166 	hmp = ip->hmp;
167 
168 	hammer2_inode_lock_ex(ip);
169 	vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
170 
171 	/*
172 	 * Detect updates to the embedded data which may be synchronized by
173 	 * the strategy code.  Simply mark the inode modified so it gets
174 	 * picked up by our normal flush.
175 	 */
176 	if (ip->chain.flags & HAMMER2_CHAIN_DIRTYEMBED) {
177 		atomic_clear_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
178 		hammer2_chain_modify(hmp, &ip->chain, 0);
179 	}
180 
181 	/*
182 	 * Calling chain_flush here creates a lot of duplicative
183 	 * COW operations due to non-optimal vnode ordering.
184 	 *
185 	 * Only do it for an actual fsync() syscall.  The other forms
186 	 * which call this function will eventually call chain_flush
187 	 * on the volume root as a catch-all, which is far more optimal.
188 	 */
189 	if (ap->a_flags & VOP_FSYNC_SYSCALL)
190 		hammer2_chain_flush(hmp, &ip->chain, 0);
191 	hammer2_inode_unlock_ex(ip);
192 	return (0);
193 }
194 
195 static
196 int
197 hammer2_vop_access(struct vop_access_args *ap)
198 {
199 	hammer2_inode_t *ip = VTOI(ap->a_vp);
200 	uid_t uid;
201 	gid_t gid;
202 	int error;
203 
204 	uid = hammer2_to_unix_xid(&ip->ip_data.uid);
205 	gid = hammer2_to_unix_xid(&ip->ip_data.gid);
206 
207 	error = vop_helper_access(ap, uid, gid, ip->ip_data.mode,
208 				  ip->ip_data.uflags);
209 	return (error);
210 }
211 
212 static
213 int
214 hammer2_vop_getattr(struct vop_getattr_args *ap)
215 {
216 	hammer2_pfsmount_t *pmp;
217 	hammer2_inode_t *ip;
218 	struct vnode *vp;
219 	struct vattr *vap;
220 
221 	vp = ap->a_vp;
222 	vap = ap->a_vap;
223 
224 	ip = VTOI(vp);
225 	pmp = ip->pmp;
226 
227 	hammer2_inode_lock_sh(ip);
228 
229 	vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
230 	vap->va_fileid = ip->ip_data.inum;
231 	vap->va_mode = ip->ip_data.mode;
232 	vap->va_nlink = ip->ip_data.nlinks;
233 	vap->va_uid = hammer2_to_unix_xid(&ip->ip_data.uid);
234 	vap->va_gid = hammer2_to_unix_xid(&ip->ip_data.gid);
235 	vap->va_rmajor = 0;
236 	vap->va_rminor = 0;
237 	vap->va_size = ip->ip_data.size;
238 	vap->va_blocksize = HAMMER2_PBUFSIZE;
239 	vap->va_flags = ip->ip_data.uflags;
240 	hammer2_time_to_timespec(ip->ip_data.ctime, &vap->va_ctime);
241 	hammer2_time_to_timespec(ip->ip_data.mtime, &vap->va_mtime);
242 	hammer2_time_to_timespec(ip->ip_data.mtime, &vap->va_atime);
243 	vap->va_gen = 1;
244 	vap->va_bytes = vap->va_size;	/* XXX */
245 	vap->va_type = hammer2_get_vtype(ip);
246 	vap->va_filerev = 0;
247 	vap->va_uid_uuid = ip->ip_data.uid;
248 	vap->va_gid_uuid = ip->ip_data.gid;
249 	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
250 			  VA_FSID_UUID_VALID;
251 
252 	hammer2_inode_unlock_sh(ip);
253 
254 	return (0);
255 }
256 
257 static
258 int
259 hammer2_vop_setattr(struct vop_setattr_args *ap)
260 {
261 	hammer2_mount_t *hmp;
262 	hammer2_inode_t *ip;
263 	struct vnode *vp;
264 	struct vattr *vap;
265 	int error;
266 	int kflags = 0;
267 	int domtime = 0;
268 	uint64_t ctime;
269 
270 	vp = ap->a_vp;
271 	vap = ap->a_vap;
272 	hammer2_update_time(&ctime);
273 
274 	ip = VTOI(vp);
275 	hmp = ip->hmp;
276 
277 	if (hmp->ronly)
278 		return(EROFS);
279 
280 	hammer2_inode_lock_ex(ip);
281 	error = 0;
282 
283 	if (vap->va_flags != VNOVAL) {
284 		u_int32_t flags;
285 
286 		flags = ip->ip_data.uflags;
287 		error = vop_helper_setattr_flags(&flags, vap->va_flags,
288 					 hammer2_to_unix_xid(&ip->ip_data.uid),
289 					 ap->a_cred);
290 		if (error == 0) {
291 			if (ip->ip_data.uflags != flags) {
292 				hammer2_chain_modify(hmp, &ip->chain, 0);
293 				ip->ip_data.uflags = flags;
294 				ip->ip_data.ctime = ctime;
295 				kflags |= NOTE_ATTRIB;
296 			}
297 			if (ip->ip_data.uflags & (IMMUTABLE | APPEND)) {
298 				error = 0;
299 				goto done;
300 			}
301 		}
302 		goto done;
303 	}
304 	if (ip->ip_data.uflags & (IMMUTABLE | APPEND)) {
305 		error = EPERM;
306 		goto done;
307 	}
308 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
309 		mode_t cur_mode = ip->ip_data.mode;
310 		uid_t cur_uid = hammer2_to_unix_xid(&ip->ip_data.uid);
311 		gid_t cur_gid = hammer2_to_unix_xid(&ip->ip_data.gid);
312 		uuid_t uuid_uid;
313 		uuid_t uuid_gid;
314 
315 		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
316 					 ap->a_cred,
317 					 &cur_uid, &cur_gid, &cur_mode);
318 		if (error == 0) {
319 			hammer2_guid_to_uuid(&uuid_uid, cur_uid);
320 			hammer2_guid_to_uuid(&uuid_gid, cur_gid);
321 			if (bcmp(&uuid_uid, &ip->ip_data.uid,
322 				 sizeof(uuid_uid)) ||
323 			    bcmp(&uuid_gid, &ip->ip_data.gid,
324 				 sizeof(uuid_gid)) ||
325 			    ip->ip_data.mode != cur_mode
326 			) {
327 				hammer2_chain_modify(hmp, &ip->chain, 0);
328 				ip->ip_data.uid = uuid_uid;
329 				ip->ip_data.gid = uuid_gid;
330 				ip->ip_data.mode = cur_mode;
331 				ip->ip_data.ctime = ctime;
332 			}
333 			kflags |= NOTE_ATTRIB;
334 		}
335 	}
336 
337 	/*
338 	 * Resize the file
339 	 */
340 	if (vap->va_size != VNOVAL && ip->ip_data.size != vap->va_size) {
341 		switch(vp->v_type) {
342 		case VREG:
343 			if (vap->va_size == ip->ip_data.size)
344 				break;
345 			if (vap->va_size < ip->ip_data.size) {
346 				hammer2_truncate_file(ip, vap->va_size);
347 			} else {
348 				hammer2_extend_file(ip, vap->va_size);
349 			}
350 			domtime = 1;
351 			break;
352 		default:
353 			error = EINVAL;
354 			goto done;
355 		}
356 	}
357 #if 0
358 	/* atime not supported */
359 	if (vap->va_atime.tv_sec != VNOVAL) {
360 		hammer2_chain_modify(hmp, &ip->chain, 0);
361 		ip->ip_data.atime = hammer2_timespec_to_time(&vap->va_atime);
362 		kflags |= NOTE_ATTRIB;
363 	}
364 #endif
365 	if (vap->va_mtime.tv_sec != VNOVAL) {
366 		hammer2_chain_modify(hmp, &ip->chain, 0);
367 		ip->ip_data.mtime = hammer2_timespec_to_time(&vap->va_mtime);
368 		kflags |= NOTE_ATTRIB;
369 	}
370 	if (vap->va_mode != (mode_t)VNOVAL) {
371 		mode_t cur_mode = ip->ip_data.mode;
372 		uid_t cur_uid = hammer2_to_unix_xid(&ip->ip_data.uid);
373 		gid_t cur_gid = hammer2_to_unix_xid(&ip->ip_data.gid);
374 
375 		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
376 					 cur_uid, cur_gid, &cur_mode);
377 		if (error == 0 && ip->ip_data.mode != cur_mode) {
378 			ip->ip_data.mode = cur_mode;
379 			ip->ip_data.ctime = ctime;
380 			kflags |= NOTE_ATTRIB;
381 		}
382 	}
383 done:
384 	hammer2_inode_unlock_ex(ip);
385 	return (error);
386 }
387 
388 static
389 int
390 hammer2_vop_readdir(struct vop_readdir_args *ap)
391 {
392 	hammer2_mount_t *hmp;
393 	hammer2_inode_t *ip;
394 	hammer2_inode_t *xip;
395 	hammer2_chain_t *parent;
396 	hammer2_chain_t *chain;
397 	hammer2_key_t lkey;
398 	struct uio *uio;
399 	off_t *cookies;
400 	off_t saveoff;
401 	int cookie_index;
402 	int ncookies;
403 	int error;
404 	int dtype;
405 	int r;
406 
407 	ip = VTOI(ap->a_vp);
408 	hmp = ip->hmp;
409 	uio = ap->a_uio;
410 	saveoff = uio->uio_offset;
411 
412 	/*
413 	 * Set up directory entry cookies if requested
414 	 */
415 	if (ap->a_ncookies) {
416 		ncookies = uio->uio_resid / 16 + 1;
417 		if (ncookies > 1024)
418 			ncookies = 1024;
419 		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
420 	} else {
421 		ncookies = -1;
422 		cookies = NULL;
423 	}
424 	cookie_index = 0;
425 
426 	/*
427 	 * Handle artificial entries.  To ensure that only positive 64-bit
428 	 * quantities are returned to userland we always strip off bit 63.
429 	 * The hash code is designed such that codes 0x0000-0x7FFF are not
430 	 * used, allowing us to use these codes for artificial entries.
431 	 *
432 	 * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
433 	 * allow '..' to cross the mount point into (e.g.) the super-root.
434 	 */
435 	error = 0;
436 	chain = (void *)(intptr_t)-1;	/* non-NULL for early goto done case */
437 
438 	if (saveoff == 0) {
439 		r = vop_write_dirent(&error, uio,
440 				     ip->ip_data.inum &
441 					HAMMER2_DIRHASH_USERMSK,
442 				     DT_DIR, 1, ".");
443 		if (r)
444 			goto done;
445 		if (cookies)
446 			cookies[cookie_index] = saveoff;
447 		++saveoff;
448 		++cookie_index;
449 		if (cookie_index == ncookies)
450 			goto done;
451 	}
452 	if (saveoff == 1) {
453 		if (ip->pip == NULL || ip == ip->pmp->iroot)
454 			xip = ip;
455 		else
456 			xip = ip->pip;
457 
458 		r = vop_write_dirent(&error, uio,
459 				     xip->ip_data.inum &
460 				      HAMMER2_DIRHASH_USERMSK,
461 				     DT_DIR, 2, "..");
462 		if (r)
463 			goto done;
464 		if (cookies)
465 			cookies[cookie_index] = saveoff;
466 		++saveoff;
467 		++cookie_index;
468 		if (cookie_index == ncookies)
469 			goto done;
470 	}
471 
472 	lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;
473 
474 	parent = &ip->chain;
475 	error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS |
476 						HAMMER2_RESOLVE_SHARED);
477 	if (error) {
478 		hammer2_chain_unlock(hmp, parent);
479 		goto done;
480 	}
481 	chain = hammer2_chain_lookup(hmp, &parent, lkey, lkey,
482 				     HAMMER2_LOOKUP_SHARED);
483 	if (chain == NULL) {
484 		chain = hammer2_chain_lookup(hmp, &parent,
485 					     lkey, (hammer2_key_t)-1,
486 					     HAMMER2_LOOKUP_SHARED);
487 	}
488 	while (chain) {
489 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
490 			dtype = hammer2_get_dtype(chain->u.ip);
491 			saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
492 			r = vop_write_dirent(&error, uio,
493 					     chain->u.ip->ip_data.inum &
494 					      HAMMER2_DIRHASH_USERMSK,
495 					     dtype, chain->u.ip->ip_data.name_len,
496 					     chain->u.ip->ip_data.filename);
497 			if (r)
498 				break;
499 			if (cookies)
500 				cookies[cookie_index] = saveoff;
501 			++cookie_index;
502 		} else {
503 			/* XXX chain error */
504 			kprintf("bad chain type readdir %d\n",
505 				chain->bref.type);
506 		}
507 
508 		/*
509 		 * Keys may not be returned in order so once we have a
510 		 * placemarker (chain) the scan must allow the full range
511 		 * or some entries will be missed.
512 		 */
513 		chain = hammer2_chain_next(hmp, &parent, chain,
514 					   HAMMER2_DIRHASH_VISIBLE,
515 					   (hammer2_key_t)-1,
516 					   HAMMER2_LOOKUP_SHARED);
517 		if (chain) {
518 			saveoff = (chain->bref.key &
519 				   HAMMER2_DIRHASH_USERMSK) + 1;
520 		} else {
521 			saveoff = (hammer2_key_t)-1;
522 		}
523 		if (cookie_index == ncookies)
524 			break;
525 	}
526 	if (chain)
527 		hammer2_chain_unlock(hmp, chain);
528 	hammer2_chain_unlock(hmp, parent);
529 done:
530 	if (ap->a_eofflag)
531 		*ap->a_eofflag = (chain == NULL);
532 	uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
533 	if (error && cookie_index == 0) {
534 		if (cookies) {
535 			kfree(cookies, M_TEMP);
536 			*ap->a_ncookies = 0;
537 			*ap->a_cookies = NULL;
538 		}
539 	} else {
540 		if (cookies) {
541 			*ap->a_ncookies = cookie_index;
542 			*ap->a_cookies = cookies;
543 		}
544 	}
545 	return (error);
546 }
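
#if 0
/*
 * Illustrative sketch (hypothetical helper, not part of the build) of
 * the cookie handling above: directory keys carry the
 * HAMMER2_DIRHASH_VISIBLE bit (bit 63) internally, and any offset
 * handed back to userland is masked with HAMMER2_DIRHASH_USERMSK so it
 * stays a positive 64-bit quantity.
 */
static __inline off_t
hammer2_dirent_cookie(hammer2_key_t key)
{
	return ((off_t)(key & HAMMER2_DIRHASH_USERMSK));
}
#endif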
547 
548 /*
549  * hammer2_vop_readlink { vp, uio, cred }
550  */
551 static
552 int
553 hammer2_vop_readlink(struct vop_readlink_args *ap)
554 {
555 	struct vnode *vp;
556 	hammer2_mount_t *hmp;
557 	hammer2_inode_t *ip;
558 	int error;
559 
560 	vp = ap->a_vp;
561 	if (vp->v_type != VLNK)
562 		return (EINVAL);
563 	ip = VTOI(vp);
564 	hmp = ip->hmp;
565 
566 	error = hammer2_read_file(ip, ap->a_uio, 0);
567 	return (error);
568 }
569 
570 static
571 int
572 hammer2_vop_read(struct vop_read_args *ap)
573 {
574 	struct vnode *vp;
575 	hammer2_mount_t *hmp;
576 	hammer2_inode_t *ip;
577 	struct uio *uio;
578 	int error;
579 	int seqcount;
580 	int bigread;
581 
582 	/*
583 	 * Read operations supported on this vnode?
584 	 */
585 	vp = ap->a_vp;
586 	if (vp->v_type != VREG)
587 		return (EINVAL);
588 
589 	/*
590 	 * Misc
591 	 */
592 	ip = VTOI(vp);
593 	hmp = ip->hmp;
594 	uio = ap->a_uio;
595 	error = 0;
596 
597 	seqcount = ap->a_ioflag >> 16;
598 	bigread = (uio->uio_resid > 100 * 1024 * 1024);
599 
600 	error = hammer2_read_file(ip, uio, seqcount);
601 	return (error);
602 }
603 
604 static
605 int
606 hammer2_vop_write(struct vop_write_args *ap)
607 {
608 	thread_t td;
609 	struct vnode *vp;
610 	hammer2_mount_t *hmp;
611 	hammer2_inode_t *ip;
612 	struct uio *uio;
613 	int error;
614 	int seqcount;
615 	int bigwrite;
616 
617 	/*
618 	 * Write operations supported on this vnode?
619 	 */
620 	vp = ap->a_vp;
621 	if (vp->v_type != VREG)
622 		return (EINVAL);
623 
624 	/*
625 	 * Misc
626 	 */
627 	ip = VTOI(vp);
628 	hmp = ip->hmp;
629 	uio = ap->a_uio;
630 	error = 0;
631 	if (hmp->ronly)
632 		return (EROFS);
633 
634 	seqcount = ap->a_ioflag >> 16;
635 	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
636 
637 	/*
638 	 * Check resource limit
639 	 */
640 	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
641 	    uio->uio_offset + uio->uio_resid >
642 	     td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
643 		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
644 		return (EFBIG);
645 	}
648 
649 	/*
650 	 * ip must be locked if extending the file.
651 	 * ip must be locked to avoid racing a truncation.
652 	 *
653 	 * ip must be marked modified, particularly because the write
654 	 * might wind up being copied into the embedded data area.
655 	 */
656 	hammer2_inode_lock_ex(ip);
657 	error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
658 	hammer2_inode_unlock_ex(ip);
659 	return (error);
660 }
661 
662 /*
663  * Perform read operations on a file or symlink given an UNLOCKED
664  * inode and uio.
665  */
666 static
667 int
668 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
669 {
670 	struct buf *bp;
671 	int error;
672 
673 	error = 0;
674 
675 	/*
676 	 * UIO read loop
677 	 */
678 	while (uio->uio_resid > 0 && uio->uio_offset < ip->ip_data.size) {
679 		hammer2_key_t lbase;
680 		hammer2_key_t leof;
681 		int lblksize;
682 		int loff;
683 		int n;
684 
685 		lblksize = hammer2_calc_logical(ip, uio->uio_offset,
686 						&lbase, &leof);
687 
688 		error = cluster_read(ip->vp, leof, lbase, lblksize,
689 				     uio->uio_resid, seqcount * BKVASIZE,
690 				     &bp);
691 
692 		if (error)
693 			break;
694 		loff = (int)(uio->uio_offset - lbase);
695 		n = lblksize - loff;
696 		if (n > uio->uio_resid)
697 			n = uio->uio_resid;
698 		if (n > ip->ip_data.size - uio->uio_offset)
699 			n = (int)(ip->ip_data.size - uio->uio_offset);
700 		bp->b_flags |= B_AGE;
701 		uiomove((char *)bp->b_data + loff, n, uio);
702 		bqrelse(bp);
703 	}
704 	return (error);
705 }
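
#if 0
/*
 * Worked example for the read loop above, assuming a 64KB
 * HAMMER2_PBUFSIZE logical block: a read at uio_offset 70000 yields
 * lbase 65536 and loff 4464, so at most lblksize - loff = 61072 bytes
 * are copied from this buffer; n is further clipped by uio_resid and
 * by the file EOF before uiomove().
 */
#endif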
706 
707 /*
708  * Called with a locked (ip) to do the underlying write to a file or
709  * to build the symlink target.
710  */
711 static
712 int
713 hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
714 		   int ioflag, int seqcount)
715 {
716 	hammer2_key_t old_eof;
717 	struct buf *bp;
718 	int kflags;
719 	int error;
720 	int modified = 0;
721 
722 	/*
723 	 * Setup if append
724 	 */
725 	if (ioflag & IO_APPEND)
726 		uio->uio_offset = ip->ip_data.size;
727 	kflags = 0;
728 	error = 0;
729 
730 	/*
731 	 * Extend the file if necessary.  If the write fails at some point
732 	 * we will truncate it back down to cover as much as we were able
733 	 * to write.
734 	 *
735 	 * Doing this now makes it easier to calculate buffer sizes in
736 	 * the loop.
737 	 */
738 	old_eof = ip->ip_data.size;
739 	if (uio->uio_offset + uio->uio_resid > ip->ip_data.size) {
740 		modified = 1;
741 		hammer2_extend_file(ip, uio->uio_offset + uio->uio_resid);
742 		kflags |= NOTE_EXTEND;
743 	}
744 
745 	/*
746 	 * UIO write loop
747 	 */
748 	while (uio->uio_resid > 0) {
749 		hammer2_key_t lbase;
750 		hammer2_key_t leof;
751 		int trivial;
752 		int lblksize;
753 		int loff;
754 		int n;
755 
756 		/*
757 		 * Don't allow the buffer build to blow out the buffer
758 		 * cache.
759 		 */
760 		if ((ioflag & IO_RECURSE) == 0) {
761 			/*
762 			 * XXX should try to leave this unlocked through
763 			 *	the whole loop
764 			 */
765 			hammer2_chain_unlock(ip->hmp, &ip->chain);
766 			bwillwrite(HAMMER2_PBUFSIZE);
767 			hammer2_chain_lock(ip->hmp, &ip->chain,
768 					   HAMMER2_RESOLVE_ALWAYS);
769 		}
770 
771 		/* XXX bigwrite & signal check test */
772 
773 		/*
774 		 * This nominally tells us how much we can cluster and
775 		 * what the logical buffer size needs to be.  Currently
776 		 * we don't try to cluster the write and just handle one
777 		 * block at a time.
778 		 */
779 		lblksize = hammer2_calc_logical(ip, uio->uio_offset,
780 						&lbase, &leof);
781 		loff = (int)(uio->uio_offset - lbase);
782 
783 		/*
784 		 * Calculate bytes to copy this transfer and whether the
785 		 * copy completely covers the buffer or not.
786 		 */
787 		trivial = 0;
788 		n = lblksize - loff;
789 		if (n > uio->uio_resid) {
790 			n = uio->uio_resid;
791 			if (uio->uio_offset + n == ip->ip_data.size)
792 				trivial = 1;
793 		} else if (loff == 0) {
794 			trivial = 1;
795 		}
796 
797 		/*
798 		 * Get the buffer
799 		 */
800 		if (uio->uio_segflg == UIO_NOCOPY) {
801 			/*
802 			 * Issuing a write with the same data backing the
803 			 * buffer.  Instantiate the buffer to collect the
804 			 * backing vm pages, then read-in any missing bits.
805 			 *
806 			 * This case is used by vop_stdputpages().
807 			 */
808 			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
809 			if ((bp->b_flags & B_CACHE) == 0) {
810 				bqrelse(bp);
811 				error = bread(ip->vp, lbase, lblksize, &bp);
812 			}
813 		} else if (trivial) {
814 			/*
815 			 * Even though we are entirely overwriting the buffer
816 			 * we may still have to zero it out to avoid a
817 			 * mmap/write visibility issue.
818 			 */
819 			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
820 			if ((bp->b_flags & B_CACHE) == 0)
821 				vfs_bio_clrbuf(bp);
822 		} else {
823 			/*
824 			 * Partial overwrite, read in any missing bits then
825 			 * replace the portion being written.
826 			 *
827 			 * (The strategy code will detect zero-fill physical
828 			 * blocks for this case).
829 			 */
830 			error = bread(ip->vp, lbase, lblksize, &bp);
831 			if (error == 0)
832 				bheavy(bp);
833 		}
834 
835 		if (error) {
836 			brelse(bp);
837 			break;
838 		}
839 
840 		/*
841 		 * We have to assign physical storage to the buffer we intend
842 		 * to dirty or write now to avoid deadlocks in the strategy
843 		 * code later.
844 		 *
845 		 * This can return NOOFFSET for inode-embedded data.  The
846 		 * strategy code will take care of it in that case.
847 		 */
848 		bp->b_bio2.bio_offset =
849 			hammer2_assign_physical(ip, lbase, lblksize, &error);
850 		if (error) {
851 			brelse(bp);
852 			break;
853 		}
854 
855 		/*
856 		 * Ok, copy the data in
857 		 */
858 		hammer2_chain_unlock(ip->hmp, &ip->chain);
859 		error = uiomove(bp->b_data + loff, n, uio);
860 		hammer2_chain_lock(ip->hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
861 		kflags |= NOTE_WRITE;
862 		modified = 1;
863 
864 		if (error) {
865 			brelse(bp);
866 			break;
867 		}
868 
869 		/* XXX update ip_data.mtime */
870 
871 		/*
872 		 * Once we dirty a buffer any cached offset becomes invalid.
873 		 *
874 		 * NOTE: For cluster_write() always use the trailing block
875 		 *	 size, which is HAMMER2_PBUFSIZE.  lblksize is the
876 		 *	 eof-straddling blocksize and is incorrect.
877 		 */
878 		bp->b_flags |= B_AGE;
879 		if (ioflag & IO_SYNC) {
880 			bwrite(bp);
881 		} else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
882 			if (bp->b_bcount == HAMMER2_PBUFSIZE)
883 				bp->b_flags |= B_CLUSTEROK;
884 			bdwrite(bp);
885 		} else if (ioflag & IO_ASYNC) {
886 			bawrite(bp);
887 		} else if (hammer2_cluster_enable) {
888 			if (bp->b_bcount == HAMMER2_PBUFSIZE)
889 				bp->b_flags |= B_CLUSTEROK;
890 			cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
891 		} else {
892 			if (bp->b_bcount == HAMMER2_PBUFSIZE)
893 				bp->b_flags |= B_CLUSTEROK;
894 			bdwrite(bp);
895 		}
896 	}
897 
898 	/*
899 	 * Cleanup.  If we extended the file EOF but failed to write it
900 	 * through, the entire write is a failure and we have to back up.
901 	 */
902 	if (error && ip->ip_data.size != old_eof) {
903 		hammer2_truncate_file(ip, old_eof);
904 	} else if (modified) {
905 		hammer2_chain_modify(ip->hmp, &ip->chain, 0);
906 		hammer2_update_time(&ip->ip_data.mtime);
907 	}
908 	hammer2_knote(ip->vp, kflags);
909 	return error;
910 }
911 
912 /*
913  * Assign physical storage to a logical block.
914  *
915  * NOOFFSET is returned if the data is inode-embedded.  In this case the
916  * strategy code will simply bcopy() the data into the inode.
917  *
918  * The inode's delta_dcount is adjusted.
919  */
920 static
921 hammer2_off_t
922 hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
923 			int lblksize, int *errorp)
924 {
925 	hammer2_mount_t *hmp;
926 	hammer2_chain_t *parent;
927 	hammer2_chain_t *chain;
928 	hammer2_off_t pbase;
929 
930 	*errorp = 0;
931 	hmp = ip->hmp;
932 
933 	/*
934 	 * Locate the chain associated with lbase, return a locked chain.
935 	 * However, do not instantiate any data reference (which utilizes a
936 	 * device buffer) because we will be using direct IO via the
937 	 * logical buffer cache buffer.
938 	 */
939 	parent = &ip->chain;
940 	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
941 
942 	chain = hammer2_chain_lookup(hmp, &parent,
943 				     lbase, lbase,
944 				     HAMMER2_LOOKUP_NODATA);
945 
946 	if (chain == NULL) {
947 		/*
948 		 * We found a hole, create a new chain entry.
949 		 *
950 		 * NOTE: DATA chains are created without device backing
951 		 *	 store (nor do we want any).
952 		 */
953 		chain = hammer2_chain_create(hmp, parent, NULL,
954 					     lbase, HAMMER2_PBUFRADIX,
955 					     HAMMER2_BREF_TYPE_DATA,
956 					     lblksize);
957 		pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
958 		ip->delta_dcount += lblksize;
959 	} else {
960 		switch (chain->bref.type) {
961 		case HAMMER2_BREF_TYPE_INODE:
962 			/*
963 			 * The data is embedded in the inode.  The
964 			 * caller is responsible for marking the inode
965 			 * modified and copying the data to the embedded
966 			 * area.
967 			 */
968 			pbase = NOOFFSET;
969 			break;
970 		case HAMMER2_BREF_TYPE_DATA:
971 			if (chain->bytes != lblksize) {
972 				panic("hammer2_assign_physical: "
973 				      "size mismatch %d/%d\n",
974 				      lblksize, chain->bytes);
975 			}
976 			hammer2_chain_modify(hmp, chain,
977 					     HAMMER2_MODIFY_OPTDATA);
978 			pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
979 			break;
980 		default:
981 			panic("hammer2_assign_physical: bad type");
982 			/* NOT REACHED */
983 			pbase = NOOFFSET;
984 			break;
985 		}
986 	}
987 
988 	if (chain)
989 		hammer2_chain_unlock(hmp, chain);
990 	hammer2_chain_unlock(hmp, parent);
991 
992 	return (pbase);
993 }
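
#if 0
/*
 * Minimal caller sketch (mirrors hammer2_write_file above): backing
 * store must be assigned before the buffer is dirtied, and a NOOFFSET
 * return is not an error -- it defers the copy of inode-embedded data
 * to the strategy code.
 */
bp->b_bio2.bio_offset = hammer2_assign_physical(ip, lbase, lblksize,
						&error);
if (error) {
	brelse(bp);
	return (error);
}
#endif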
994 
995 /*
996  * Truncate the size of a file.
997  *
998  * This routine adjusts ip->ip_data.size smaller, destroying any related
999  * data beyond the new EOF and potentially resizing the block straddling
1000  * the EOF.
1001  *
1002  * The inode must be locked.
1003  */
1004 static
1005 void
1006 hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
1007 {
1008 	hammer2_chain_t *parent;
1009 	hammer2_chain_t *chain;
1010 	hammer2_mount_t *hmp = ip->hmp;
1011 	hammer2_key_t lbase;
1012 	hammer2_key_t leof;
1013 	struct buf *bp;
1014 	int loff;
1015 	int error;
1016 	int oblksize;
1017 	int nblksize;
1018 
1019 	hammer2_chain_modify(hmp, &ip->chain, 0);
1020 	bp = NULL;
1021 
1022 	/*
1023 	 * Destroy any logical buffer cache buffers beyond the file EOF.
1024 	 *
1025 	 * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
1026 	 * around with the buffer straddling EOF, because we need to assign
1027 	 * a new physical offset to it.
1028 	 */
1029 	if (ip->vp) {
1030 		nvtruncbuf(ip->vp, nsize,
1031 			   HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
1032 			   1);
1033 	}
1034 
1035 	/*
1036 	 * Setup for lookup/search
1037 	 */
1038 	parent = &ip->chain;
1039 	error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
1040 	if (error) {
1041 		hammer2_chain_unlock(hmp, parent);
1042 		/* XXX error reporting */
1043 		return;
1044 	}
1045 
1046 	/*
1047 	 * Handle the case where a chain/logical-buffer straddles the new
1048 	 * EOF.  We told nvtruncbuf() above not to mess with the logical
1049 	 * buffer straddling the EOF because we need to reassign its storage
1050 	 * and can't let the strategy code do it for us.
1051 	 */
1052 	loff = (int)nsize & HAMMER2_PBUFMASK;
1053 	if (loff && ip->vp) {
1054 		oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1055 		error = bread(ip->vp, lbase, oblksize, &bp);
1056 		KKASSERT(error == 0);
1057 	}
1058 	ip->ip_data.size = nsize;
1059 	nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1060 
1061 	/*
1062 	 * Fixup the chain element.  If we have a logical buffer in-hand
1063 	 * we don't want to create a conflicting device buffer.
1064 	 */
1065 	if (loff && bp) {
1066 		chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
1067 					     HAMMER2_LOOKUP_NODATA);
1068 		if (chain) {
1069 			allocbuf(bp, nblksize);
1070 			switch(chain->bref.type) {
1071 			case HAMMER2_BREF_TYPE_DATA:
1072 				hammer2_chain_resize(ip, chain,
1073 					     hammer2_bytes_to_radix(nblksize),
1074 					     HAMMER2_MODIFY_OPTDATA);
1075 				bzero(bp->b_data + loff, nblksize - loff);
1076 				bp->b_bio2.bio_offset = chain->bref.data_off &
1077 							HAMMER2_OFF_MASK;
1078 				break;
1079 			case HAMMER2_BREF_TYPE_INODE:
1080 				bzero(bp->b_data + loff, nblksize - loff);
1081 				bp->b_bio2.bio_offset = NOOFFSET;
1082 				break;
1083 			default:
1084 				panic("hammer2_truncate_file: bad type");
1085 				break;
1086 			}
1087 			hammer2_chain_unlock(hmp, chain);
1088 			if (bp->b_bcount == HAMMER2_PBUFSIZE)
1089 				bp->b_flags |= B_CLUSTEROK;
1090 			bdwrite(bp);
1091 		} else {
1092 			/*
1093 			 * Destroy clean buffer w/ wrong buffer size.  Retain
1094 			 * backing store.
1095 			 */
1096 			bp->b_flags |= B_RELBUF;
1097 			KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
1098 			KKASSERT((bp->b_flags & B_DIRTY) == 0);
1099 			bqrelse(bp);
1100 		}
1101 	} else if (loff) {
1102 		/*
1103 		 * WARNING: This utilizes a device buffer for the data.
1104 		 *
1105 		 * This case should not occur because file truncations without
1106 		 * a vnode (and hence no logical buffer cache) should always
1107 		 * truncate to 0-length.
1108 		 */
1109 		panic("hammer2_truncate_file: non-zero truncation, no-vnode");
1110 #if 0
1111 		chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase, 0);
1112 		if (chain) {
1113 			switch(chain->bref.type) {
1114 			case HAMMER2_BREF_TYPE_DATA:
1115 				hammer2_chain_resize(ip, chain,
1116 					     hammer2_bytes_to_radix(nblksize),
1117 					     0);
1118 				hammer2_chain_modify(hmp, chain, 0);
1119 				bzero(chain->data->buf + loff, nblksize - loff);
1120 				break;
1121 			case HAMMER2_BREF_TYPE_INODE:
1122 				if (loff < HAMMER2_EMBEDDED_BYTES) {
1123 					hammer2_chain_modify(hmp, chain, 0);
1124 					bzero(chain->data->ipdata.u.data + loff,
1125 					      HAMMER2_EMBEDDED_BYTES - loff);
1126 				}
1127 				break;
1128 			}
1129 			hammer2_chain_unlock(hmp, chain);
1130 		}
1131 #endif
1132 	}
1133 
1134 	/*
1135 	 * Clean up any fragmentary VM pages now that we have properly
1136 	 * resized the straddling buffer.  These pages are no longer
1137 	 * part of the buffer.
1138 	 */
1139 	if (ip->vp) {
1140 		nvtruncbuf(ip->vp, nsize,
1141 			   nblksize, (int)nsize & (nblksize - 1),
1142 			   1);
1143 	}
1144 
1145 	/*
1146 	 * Destroy any physical blocks after the new EOF point.
1147 	 */
1148 	lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
1149 	chain = hammer2_chain_lookup(hmp, &parent,
1150 				     lbase, (hammer2_key_t)-1,
1151 				     HAMMER2_LOOKUP_NODATA);
1152 	while (chain) {
1153 		/*
1154 		 * Degenerate embedded data case, nothing to loop on.
1155 		 */
1156 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1157 			hammer2_chain_unlock(hmp, chain);
1158 			break;
1159 		}
1160 
1161 		/*
1162 		 * Delete physical data blocks past the file EOF.
1163 		 */
1164 		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1165 			ip->delta_dcount -= chain->bytes;
1166 			hammer2_chain_delete(hmp, parent, chain, 0);
1167 		}
1168 		/* XXX check parent if empty indirect block & delete */
1169 		chain = hammer2_chain_next(hmp, &parent, chain,
1170 					   lbase, (hammer2_key_t)-1,
1171 					   HAMMER2_LOOKUP_NODATA);
1172 	}
1173 	hammer2_chain_unlock(hmp, parent);
1174 }
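
#if 0
/*
 * Example of the EOF rounding in the deletion scan above, assuming
 * 64KB logical blocks: truncating to nsize 100000 rounds lbase up to
 * 131072, so the buffer at 65536 (which now straddles the new EOF) is
 * resized in place while all chains keyed at 131072 and beyond are
 * deleted.
 */
#endif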
1175 
1176 /*
1177  * Extend the size of a file.  The inode must be locked.
1178  *
1179  * We may have to resize the block straddling the old EOF.
1180  */
1181 static
1182 void
1183 hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
1184 {
1185 	hammer2_mount_t *hmp;
1186 	hammer2_chain_t *parent;
1187 	hammer2_chain_t *chain;
1188 	struct buf *bp;
1189 	hammer2_key_t osize;
1190 	hammer2_key_t obase;
1191 	hammer2_key_t nbase;
1192 	hammer2_key_t leof;
1193 	int oblksize;
1194 	int nblksize;
1195 	int nradix;
1196 	int error;
1197 
1198 	KKASSERT(ip->vp);
1199 	hmp = ip->hmp;
1200 
1201 	hammer2_chain_modify(hmp, &ip->chain, 0);
1202 
1203 	/*
1204 	 * Nothing to do if the direct-data case is still intact
1205 	 */
1206 	if ((ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1207 	    nsize <= HAMMER2_EMBEDDED_BYTES) {
1208 		ip->ip_data.size = nsize;
1209 		nvextendbuf(ip->vp,
1210 			    ip->ip_data.size, nsize,
1211 			    0, HAMMER2_EMBEDDED_BYTES,
1212 			    0, (int)nsize,
1213 			    1);
1214 		return;
1215 	}
1216 
1217 	/*
1218 	 * Calculate the blocksize at the original EOF and resize the block
1219 	 * if necessary.  Adjust the file size in the inode.
1220 	 */
1221 	osize = ip->ip_data.size;
1222 	oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
1223 	ip->ip_data.size = nsize;
1224 	nblksize = hammer2_calc_logical(ip, nsize, &nbase, &leof);
1225 
1226 	/*
1227 	 * Do all required vnode operations, but do not mess with the
1228 	 * buffer straddling the original EOF.
1229 	 */
1230 	nvextendbuf(ip->vp,
1231 		    ip->ip_data.size, nsize,
1232 		    0, nblksize,
1233 		    0, (int)nsize & HAMMER2_PBUFMASK,
1234 		    1);
1235 
1236 	/*
1237 	 * Early return if we have no more work to do.
1238 	 */
1239 	if (obase == nbase && oblksize == nblksize &&
1240 	    (ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1241 		return;
1242 	}
1243 
1244 	/*
1245 	 * We have work to do, including possibly resizing the buffer
1246 	 * at the previous EOF point and turning off DIRECTDATA mode.
1247 	 */
1248 	bp = NULL;
1249 	if (((int)osize & HAMMER2_PBUFMASK)) {
1250 		error = bread(ip->vp, obase, oblksize, &bp);
1251 		KKASSERT(error == 0);
1252 
1253 		if (obase != nbase) {
1254 			if (oblksize != HAMMER2_PBUFSIZE)
1255 				allocbuf(bp, HAMMER2_PBUFSIZE);
1256 		} else {
1257 			if (oblksize != nblksize)
1258 				allocbuf(bp, nblksize);
1259 		}
1260 	}
1261 
1262 	/*
1263 	 * Disable direct-data mode by loading up a buffer cache buffer
1264 	 * with the data, then converting the inode data area into the
1265 	 * inode indirect block array area.
1266 	 */
1267 	if (ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1268 		ip->ip_data.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1269 		bzero(&ip->ip_data.u.blockset, sizeof(ip->ip_data.u.blockset));
1270 	}
1271 
1272 	/*
1273 	 * Resize the chain element at the old EOF.
1274 	 */
1275 	if (((int)osize & HAMMER2_PBUFMASK)) {
1276 		parent = &ip->chain;
1277 		error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
1278 		KKASSERT(error == 0);
1279 
1280 		nradix = hammer2_bytes_to_radix(nblksize);
1281 
1282 		chain = hammer2_chain_lookup(hmp, &parent,
1283 					     obase, obase,
1284 					     HAMMER2_LOOKUP_NODATA);
1285 		if (chain == NULL) {
1286 			chain = hammer2_chain_create(hmp, parent, NULL,
1287 						     obase, nblksize,
1288 						     HAMMER2_BREF_TYPE_DATA,
1289 						     nblksize);
1290 			ip->delta_dcount += nblksize;
1291 		} else {
1292 			KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1293 			hammer2_chain_resize(ip, chain, nradix,
1294 					     HAMMER2_MODIFY_OPTDATA);
1295 		}
1296 		bp->b_bio2.bio_offset = chain->bref.data_off &
1297 					HAMMER2_OFF_MASK;
1298 		hammer2_chain_unlock(hmp, chain);
1299 		if (bp->b_bcount == HAMMER2_PBUFSIZE)
1300 			bp->b_flags |= B_CLUSTEROK;
1301 		bdwrite(bp);
1302 		hammer2_chain_unlock(hmp, parent);
1303 	}
1304 }
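
#if 0
/*
 * Sketch of the DIRECTDATA cutover above, assuming the usual 512 byte
 * embedded data area (HAMMER2_EMBEDDED_BYTES): extending a 200 byte
 * embedded file to e.g. 4096 bytes loads the old content into a
 * logical buffer, clears HAMMER2_OPFLAG_DIRECTDATA, zeroes the
 * blockset, and assigns a real DATA chain at obase for the buffer.
 */
#endif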
1305 
1306 static
1307 int
1308 hammer2_vop_nresolve(struct vop_nresolve_args *ap)
1309 {
1310 	hammer2_inode_t *dip;
1311 	hammer2_inode_t *ip;
1312 	hammer2_mount_t *hmp;
1313 	hammer2_chain_t *parent;
1314 	hammer2_chain_t *chain;
1315 	struct namecache *ncp;
1316 	const uint8_t *name;
1317 	size_t name_len;
1318 	hammer2_key_t lhc;
1319 	int error = 0;
1320 	struct vnode *vp;
1321 
1322 	dip = VTOI(ap->a_dvp);
1323 	hmp = dip->hmp;
1324 	ncp = ap->a_nch->ncp;
1325 	name = ncp->nc_name;
1326 	name_len = ncp->nc_nlen;
1327 	lhc = hammer2_dirhash(name, name_len);
1328 
1329 	/*
1330 	 * Note: In DragonFly the kernel handles '.' and '..'.
1331 	 */
1332 	parent = &dip->chain;
1333 	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS |
1334 					HAMMER2_RESOLVE_SHARED);
1335 	chain = hammer2_chain_lookup(hmp, &parent,
1336 				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1337 				     HAMMER2_LOOKUP_SHARED);
1338 	while (chain) {
1339 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1340 		    chain->u.ip &&
1341 		    name_len == chain->data->ipdata.name_len &&
1342 		    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
1343 			break;
1344 		}
1345 		chain = hammer2_chain_next(hmp, &parent, chain,
1346 					   lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1347 					   HAMMER2_LOOKUP_SHARED);
1348 	}
1349 	hammer2_chain_unlock(hmp, parent);
1350 
1351 	/*
1352 	 * If the inode represents a forwarding entry for a hardlink we have
1353 	 * to locate the actual inode.  The original ip is saved for possible
1354 	 * deconsolidation.  (ip) will only be set to non-NULL when we have
1355 	 * to locate the real file via a hardlink.  ip will be referenced but
1356 	 * not locked in that situation.  chain is passed in locked and
1357 	 * returned locked.
1358 	 *
1359 	 * XXX what kind of chain lock?
1360 	 */
1361 	ip = NULL;
1362 	if (chain && chain->u.ip->ip_data.type == HAMMER2_OBJTYPE_HARDLINK) {
1363 		error = hammer2_hardlink_find(dip, &chain, &ip);
1364 		if (error) {
1365 			kprintf("hammer2: unable to find hardlink\n");
1366 			if (chain) {
1367 				hammer2_chain_unlock(hmp, chain);
1368 				chain = NULL;
1369 			}
1370 			goto failed;
1371 		}
1372 	}
1373 
1374 	/*
1375 	 * Deconsolidate any hardlink whose nlinks == 1.  Ignore errors.
1376 	 * If an error occurs chain and ip are left alone.
1377 	 *
1378 	 * XXX upgrade shared lock?
1379 	 */
1380 	if (ip && chain && chain->u.ip->ip_data.nlinks == 1 && !hmp->ronly) {
1381 		kprintf("hammer2: need to unconsolidate hardlink for %s\n",
1382 			chain->u.ip->ip_data.filename);
1383 		hammer2_hardlink_deconsolidate(dip, &chain, &ip);
1384 	}
1385 
1386 	/*
1387 	 * Acquire the related vnode
1388 	 */
1389 	if (chain) {
1390 		vp = hammer2_igetv(chain->u.ip, &error);
1391 		if (error == 0) {
1392 			vn_unlock(vp);
1393 			cache_setvp(ap->a_nch, vp);
1394 			vrele(vp);
1395 		}
1396 		hammer2_chain_unlock(hmp, chain);
1397 	} else {
1398 		error = ENOENT;
1399 failed:
1400 		cache_setvp(ap->a_nch, NULL);
1401 	}
1402 	if (ip)
1403 		hammer2_inode_drop(ip);
1404 	return error;
1405 }
1406 
1407 static
1408 int
1409 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1410 {
1411 	hammer2_inode_t *dip;
1412 	hammer2_inode_t *ip;
1413 	hammer2_mount_t *hmp;
1414 	int error;
1415 
1416 	dip = VTOI(ap->a_dvp);
1417 	hmp = dip->hmp;
1418 
1419 	if ((ip = dip->pip) == NULL) {
1420 		*ap->a_vpp = NULL;
1421 		return ENOENT;
1422 	}
1423 	hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
1424 	*ap->a_vpp = hammer2_igetv(ip, &error);
1425 	hammer2_chain_unlock(hmp, &ip->chain);
1426 
1427 	return error;
1428 }
1429 
1430 static
1431 int
1432 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1433 {
1434 	hammer2_mount_t *hmp;
1435 	hammer2_inode_t *dip;
1436 	hammer2_inode_t *nip;
1437 	struct namecache *ncp;
1438 	const uint8_t *name;
1439 	size_t name_len;
1440 	int error;
1441 
1442 	dip = VTOI(ap->a_dvp);
1443 	hmp = dip->hmp;
1444 	if (hmp->ronly)
1445 		return (EROFS);
1446 
1447 	ncp = ap->a_nch->ncp;
1448 	name = ncp->nc_name;
1449 	name_len = ncp->nc_nlen;
1450 
1451 	error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1452 				     name, name_len, &nip);
1453 	if (error) {
1454 		KKASSERT(nip == NULL);
1455 		*ap->a_vpp = NULL;
1456 		return error;
1457 	}
1458 	*ap->a_vpp = hammer2_igetv(nip, &error);
1459 	hammer2_chain_unlock(hmp, &nip->chain);
1460 
1461 	if (error == 0) {
1462 		cache_setunresolved(ap->a_nch);
1463 		cache_setvp(ap->a_nch, *ap->a_vpp);
1464 	}
1465 	return error;
1466 }
1467 
1468 /*
1469  * Return the largest contiguous physical disk range for the logical
1470  * request.
1471  *
1472  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
1473  */
1474 static
1475 int
1476 hammer2_vop_bmap(struct vop_bmap_args *ap)
1477 {
1478 	struct vnode *vp;
1479 	hammer2_mount_t *hmp;
1480 	hammer2_inode_t *ip;
1481 	hammer2_chain_t *parent;
1482 	hammer2_chain_t *chain;
1483 	hammer2_key_t lbeg;
1484 	hammer2_key_t lend;
1485 	hammer2_off_t pbeg;
1486 	hammer2_off_t pbytes;
1487 	hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
1488 	int loff;
1489 	int ai;
1490 
1491 	/*
1492 	 * Only supported on regular files
1493 	 *
1494 	 * Only supported for read operations (required for cluster_read).
1495 	 * The block allocation is delayed for write operations.
1496 	 */
1497 	vp = ap->a_vp;
1498 	if (vp->v_type != VREG)
1499 		return (EOPNOTSUPP);
1500 	if (ap->a_cmd != BUF_CMD_READ)
1501 		return (EOPNOTSUPP);
1502 
1503 	ip = VTOI(vp);
1504 	hmp = ip->hmp;
1505 	bzero(array, sizeof(array));
1506 
1507 	/*
1508 	 * Calculate logical range
1509 	 */
1510 	KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
1511 	lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
1512 	lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
1513 	if (lend < lbeg)
1514 		lend = lbeg;
1515 	loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
1516 
1517 	parent = &ip->chain;
1518 	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS |
1519 					HAMMER2_RESOLVE_SHARED);
1520 	chain = hammer2_chain_lookup(hmp, &parent,
1521 				     lbeg, lend,
1522 				     HAMMER2_LOOKUP_NODATA |
1523 				     HAMMER2_LOOKUP_SHARED);
1524 	if (chain == NULL) {
1525 		*ap->a_doffsetp = ZFOFFSET;
1526 		hammer2_chain_unlock(hmp, parent);
1527 		return (0);
1528 	}
1529 
1530 	while (chain) {
1531 		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1532 			ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
1533 			KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
1534 			array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
1535 			array[ai][1] = chain->bytes;
1536 		}
1537 		chain = hammer2_chain_next(hmp, &parent, chain,
1538 					   lbeg, lend,
1539 					   HAMMER2_LOOKUP_NODATA |
1540 					   HAMMER2_LOOKUP_SHARED);
1541 	}
1542 	hammer2_chain_unlock(hmp, parent);
1543 
1544 	/*
1545 	 * If the requested loffset is not mappable physically we can't
1546 	 * bmap.  The caller will have to access the file data via a
1547 	 * device buffer.
1548 	 */
1549 	if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
1550 		*ap->a_doffsetp = NOOFFSET;
1551 		return (0);
1552 	}
1553 
1554 	/*
1555 	 * Calculate the physical disk offset range for array[0]
1556 	 */
1557 	pbeg = array[0][0] + loff;
1558 	pbytes = array[0][1] - loff;
1559 
1560 	for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
1561 		if (array[ai][0] != pbeg + pbytes)
1562 			break;
1563 		pbytes += array[ai][1];
1564 	}
1565 
1566 	*ap->a_doffsetp = pbeg;
1567 	if (ap->a_runp)
1568 		*ap->a_runp = pbytes;
1569 	return (0);
1570 }
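
#if 0
/*
 * Example of the run merge above: if array[0] maps to physical offset
 * P with 65536 bytes and array[1] begins exactly at P + 65536, the two
 * blocks coalesce into one 131072 byte run returned via *ap->a_runp,
 * letting cluster_read() issue a single larger device I/O.
 */
#endif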
1571 
1572 static
1573 int
1574 hammer2_vop_open(struct vop_open_args *ap)
1575 {
1576 	return vop_stdopen(ap);
1577 }
1578 
1579 /*
1580  * hammer2_vop_advlock { vp, id, op, fl, flags }
1581  */
1582 static
1583 int
1584 hammer2_vop_advlock(struct vop_advlock_args *ap)
1585 {
1586 	hammer2_inode_t *ip = VTOI(ap->a_vp);
1587 
1588 	return (lf_advlock(ap, &ip->advlock, ip->ip_data.size));
1589 }
1590 
1591 
1592 static
1593 int
1594 hammer2_vop_close(struct vop_close_args *ap)
1595 {
1596 	return vop_stdclose(ap);
1597 }
1598 
1599 /*
1600  * hammer2_vop_nlink { nch, dvp, vp, cred }
1601  *
1602  * Create a hardlink from (vp) to {dvp, nch}.
1603  */
1604 static
1605 int
1606 hammer2_vop_nlink(struct vop_nlink_args *ap)
1607 {
1608 	hammer2_inode_t *dip;	/* target directory to create link in */
1609 	hammer2_inode_t *ip;	/* inode we are hardlinking to */
1610 	hammer2_inode_t *oip;
1611 	hammer2_mount_t *hmp;
1612 	struct namecache *ncp;
1613 	const uint8_t *name;
1614 	size_t name_len;
1615 	int error;
1616 
1617 	dip = VTOI(ap->a_dvp);
1618 	hmp = dip->hmp;
1619 	if (hmp->ronly)
1620 		return (EROFS);
1621 
1622 	/*
1623 	 * (ip) is the inode we are linking to.
1624 	 */
1625 	ip = oip = VTOI(ap->a_vp);
1626 	hammer2_inode_lock_nlinks(ip);
1627 
1628 	ncp = ap->a_nch->ncp;
1629 	name = ncp->nc_name;
1630 	name_len = ncp->nc_nlen;
1631 
1632 	/*
1633 	 * Create a consolidated real file for the hardlink, adjust (ip),
1634 	 * and move the nlinks lock if necessary.  Tell the function to
1635 	 * bump the hardlink count on the consolidated file.
1636 	 */
1637 	error = hammer2_hardlink_consolidate(&ip, dip);
1638 	if (error)
1639 		goto done;
1640 
1641 	/*
1642 	 * If the consolidation changed ip to a HARDLINK pointer we have
1643 	 * to adjust the vnode to point to the actual ip.
1644 	 *
1645 	 * XXX this can race against concurrent vnode ops.
1646 	 */
1647 	if (oip != ip) {
1648 		hammer2_chain_ref(hmp, &ip->chain);
1649 		hammer2_inode_lock_ex(ip);
1650 		hammer2_inode_lock_ex(oip);
1651 		ip->vp = ap->a_vp;
1652 		ap->a_vp->v_data = ip;
1653 		oip->vp = NULL;
1654 		hammer2_inode_unlock_ex(oip);
1655 		hammer2_inode_unlock_ex(ip);
1656 		hammer2_chain_drop(hmp, &oip->chain);
1657 	}
1658 
1659 	/*
1660 	 * The act of connecting the existing (ip) will properly bump the
1661 	 * nlinks count.  However, vp will incorrectly point at the old
1662 	 * inode which has now been turned into an OBJTYPE_HARDLINK pointer.
1663 	 *
1664 	 * We must reconnect the vp.
1665 	 */
1666 	hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
1667 	error = hammer2_inode_connect(dip, ip, name, name_len);
1668 	hammer2_chain_unlock(hmp, &ip->chain);
1669 	if (error == 0) {
1670 		cache_setunresolved(ap->a_nch);
1671 		cache_setvp(ap->a_nch, ap->a_vp);
1672 	}
1673 done:
1674 	hammer2_inode_unlock_nlinks(ip);
1675 	return error;
1676 }
1677 
1678 /*
1679  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1680  *
1681  * The operating system has already ensured that the directory entry
1682  * does not exist and done all appropriate namespace locking.
1683  */
1684 static
1685 int
1686 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1687 {
1688 	hammer2_mount_t *hmp;
1689 	hammer2_inode_t *dip;
1690 	hammer2_inode_t *nip;
1691 	struct namecache *ncp;
1692 	const uint8_t *name;
1693 	size_t name_len;
1694 	int error;
1695 
1696 	dip = VTOI(ap->a_dvp);
1697 	hmp = dip->hmp;
1698 	if (hmp->ronly)
1699 		return (EROFS);
1700 
1701 	ncp = ap->a_nch->ncp;
1702 	name = ncp->nc_name;
1703 	name_len = ncp->nc_nlen;
1704 
1705 	error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1706 				     name, name_len, &nip);
1707 	if (error) {
1708 		KKASSERT(nip == NULL);
1709 		*ap->a_vpp = NULL;
1710 		return error;
1711 	}
1712 	*ap->a_vpp = hammer2_igetv(nip, &error);
1713 	hammer2_chain_unlock(hmp, &nip->chain);
1714 
1715 	if (error == 0) {
1716 		cache_setunresolved(ap->a_nch);
1717 		cache_setvp(ap->a_nch, *ap->a_vpp);
1718 	}
1719 	return error;
1720 }
1721 
1722 /*
1723  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1724  */
1725 static
1726 int
1727 hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
1728 {
1729 	hammer2_mount_t *hmp;
1730 	hammer2_inode_t *dip;
1731 	hammer2_inode_t *nip;
1732 	struct namecache *ncp;
1733 	const uint8_t *name;
1734 	size_t name_len;
1735 	int error;
1736 
1737 	dip = VTOI(ap->a_dvp);
1738 	hmp = dip->hmp;
1739 	if (hmp->ronly)
1740 		return (EROFS);
1741 
1742 	ncp = ap->a_nch->ncp;
1743 	name = ncp->nc_name;
1744 	name_len = ncp->nc_nlen;
1745 
1746 	ap->a_vap->va_type = VLNK;	/* enforce type */
1747 
1748 	error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1749 				     name, name_len, &nip);
1750 	if (error) {
1751 		KKASSERT(nip == NULL);
1752 		*ap->a_vpp = NULL;
1753 		return error;
1754 	}
1755 	*ap->a_vpp = hammer2_igetv(nip, &error);
1756 
1757 	/*
1758 	 * Build the softlink (~like file data) and finalize the namecache.
1759 	 */
1760 	if (error == 0) {
1761 		size_t bytes;
1762 		struct uio auio;
1763 		struct iovec aiov;
1764 
1765 		bytes = strlen(ap->a_target);
1766 
1767 		if (bytes <= HAMMER2_EMBEDDED_BYTES) {
1768 			KKASSERT(nip->ip_data.op_flags &
1769 				 HAMMER2_OPFLAG_DIRECTDATA);
1770 			bcopy(ap->a_target, nip->ip_data.u.data, bytes);
1771 			nip->ip_data.size = bytes;
1772 		} else {
1773 			bzero(&auio, sizeof(auio));
1774 			bzero(&aiov, sizeof(aiov));
1775 			auio.uio_iov = &aiov;
1776 			auio.uio_segflg = UIO_SYSSPACE;
1777 			auio.uio_rw = UIO_WRITE;
1778 			auio.uio_resid = bytes;
1779 			auio.uio_iovcnt = 1;
1780 			auio.uio_td = curthread;
1781 			aiov.iov_base = ap->a_target;
1782 			aiov.iov_len = bytes;
1783 			error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
1784 			/* XXX handle error */
1785 			error = 0;
1786 		}
1787 	}
1788 	hammer2_chain_unlock(hmp, &nip->chain);
1789 
1790 	/*
1791 	 * Finalize namecache
1792 	 */
1793 	if (error == 0) {
1794 		cache_setunresolved(ap->a_nch);
1795 		cache_setvp(ap->a_nch, *ap->a_vpp);
1796 		/* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
1797 	}
1798 	return error;
1799 }
1800 
1801 /*
1802  * hammer2_vop_nremove { nch, dvp, cred }
1803  */
1804 static
1805 int
1806 hammer2_vop_nremove(struct vop_nremove_args *ap)
1807 {
1808 	hammer2_inode_t *dip;
1809 	hammer2_mount_t *hmp;
1810 	struct namecache *ncp;
1811 	const uint8_t *name;
1812 	size_t name_len;
1813 	int error;
1814 
1815 	dip = VTOI(ap->a_dvp);
1816 	hmp = dip->hmp;
1817 	if (hmp->ronly)
1818 		return(EROFS);
1819 
1820 	ncp = ap->a_nch->ncp;
1821 	name = ncp->nc_name;
1822 	name_len = ncp->nc_nlen;
1823 
1824 	error = hammer2_unlink_file(dip, name, name_len, 0, NULL);
1825 
1826 	if (error == 0) {
1827 		cache_setunresolved(ap->a_nch);
1828 		cache_setvp(ap->a_nch, NULL);
1829 	}
1830 	return (error);
1831 }
1832 
1833 /*
1834  * hammer2_vop_nrmdir { nch, dvp, cred }
1835  */
1836 static
1837 int
1838 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
1839 {
1840 	hammer2_inode_t *dip;
1841 	hammer2_mount_t *hmp;
1842 	struct namecache *ncp;
1843 	const uint8_t *name;
1844 	size_t name_len;
1845 	int error;
1846 
1847 	dip = VTOI(ap->a_dvp);
1848 	hmp = dip->hmp;
1849 	if (hmp->ronly)
1850 		return(EROFS);
1851 
1852 	ncp = ap->a_nch->ncp;
1853 	name = ncp->nc_name;
1854 	name_len = ncp->nc_nlen;
1855 
1856 	error = hammer2_unlink_file(dip, name, name_len, 1, NULL);
1857 
1858 	if (error == 0) {
1859 		cache_setunresolved(ap->a_nch);
1860 		cache_setvp(ap->a_nch, NULL);
1861 	}
1862 	return (error);
1863 }
1864 
1865 /*
1866  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1867  */
1868 static
1869 int
1870 hammer2_vop_nrename(struct vop_nrename_args *ap)
1871 {
1872 	struct namecache *fncp;
1873 	struct namecache *tncp;
1874 	hammer2_inode_t *fdip;
1875 	hammer2_inode_t *tdip;
1876 	hammer2_inode_t *ip;
1877 	hammer2_mount_t *hmp;
1878 	const uint8_t *fname;
1879 	size_t fname_len;
1880 	const uint8_t *tname;
1881 	size_t tname_len;
1882 	int error;
1883 
1884 	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1885 		return(EXDEV);
1886 	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1887 		return(EXDEV);
1888 
1889 	fdip = VTOI(ap->a_fdvp);	/* source directory */
1890 	tdip = VTOI(ap->a_tdvp);	/* target directory */
1891 
1892 	hmp = fdip->hmp;		/* check read-only filesystem */
1893 	if (hmp->ronly)
1894 		return(EROFS);
1895 
1896 	fncp = ap->a_fnch->ncp;		/* entry name in source */
1897 	fname = fncp->nc_name;
1898 	fname_len = fncp->nc_nlen;
1899 
1900 	tncp = ap->a_tnch->ncp;		/* entry name in target */
1901 	tname = tncp->nc_name;
1902 	tname_len = tncp->nc_nlen;
1903 
1904 	/*
1905 	 * ip is the inode being removed.  If this is a hardlink then
1906 	 * ip represents the actual file and not the hardlink marker.
1907 	 */
1908 	ip = VTOI(fncp->nc_vp);
1909 
1910 	/*
1911 	 * Keep a tight grip on the inode as removing it should disconnect
1912 	 * it and we don't want to destroy it.
1913 	 *
1914 	 * NOTE: To avoid deadlocks we cannot lock (ip) while we are
1915 	 *	 unlinking elements from their directories.  Locking
1916 	 *	 the nlinks field does not lock the whole inode.
1917 	 */
1918 	hammer2_inode_lock_nlinks(ip);
1919 
1920 	/*
1921 	 * Remove target if it exists
1922 	 */
1923 	error = hammer2_unlink_file(tdip, tname, tname_len, -1, NULL);
1924 	if (error && error != ENOENT)
1925 		goto done;
1926 	cache_setunresolved(ap->a_tnch);
1927 	cache_setvp(ap->a_tnch, NULL);
1928 
1929 	/*
1930 	 * Disconnect (fdip, fname) from the source directory.  This will
1931 	 * disconnect (ip) if it represents a direct file.  If (ip) represents
1932 	 * a hardlink the HARDLINK pointer object will be removed but the
1933 	 * hardlink will stay intact.
1934 	 *
1935 	 * If (ip) is already hardlinked we have to resolve to a consolidated
1936 	 * file but we do not bump the nlinks count.  (ip) must hold the nlinks
1937 	 * lock & ref for the operation.  If the consolidated file has been
1938 	 * relocated (ip) will be adjusted and the related nlinks lock moved
1939 	 * along with it.
1940 	 *
1941 	 * If (ip) does not have multiple links we can just copy the physical
1942 	 * contents of the inode.
1943 	 */
1944 	if (ip->ip_data.nlinks > 1) {
1945 		error = hammer2_hardlink_consolidate(&ip, tdip);
1946 		if (error)
1947 			goto done;
1948 	}
1949 	error = hammer2_unlink_file(fdip, fname, fname_len, -1, ip);
1950 	if (error)
1951 		goto done;
1952 
1953 	/*
1954 	 * Reconnect ip to target directory.
1955 	 *
1956 	 * WARNING: chain locks can lock buffer cache buffers, to avoid
1957 	 *	    deadlocks we want to unlock before issuing a cache_*()
1958 	 *	    op (that might have to lock a vnode).
1959 	 */
1960 	hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
1961 	error = hammer2_inode_connect(tdip, ip, tname, tname_len);
1962 	hammer2_chain_unlock(hmp, &ip->chain);
1963 
1964 	if (error == 0) {
1965 		cache_rename(ap->a_fnch, ap->a_tnch);
1966 	}
1967 done:
1968 	hammer2_inode_unlock_nlinks(ip);
1969 
1970 	return (error);
1971 }
1972 
1973 static int hammer2_strategy_read(struct vop_strategy_args *ap);
1974 static int hammer2_strategy_write(struct vop_strategy_args *ap);
1975 
1976 static
1977 int
1978 hammer2_vop_strategy(struct vop_strategy_args *ap)
1979 {
1980 	struct bio *biop;
1981 	struct buf *bp;
1982 	int error;
1983 
1984 	biop = ap->a_bio;
1985 	bp = biop->bio_buf;
1986 
1987 	switch(bp->b_cmd) {
1988 	case BUF_CMD_READ:
1989 		error = hammer2_strategy_read(ap);
1990 		++hammer2_iod_file_read;
1991 		break;
1992 	case BUF_CMD_WRITE:
1993 		error = hammer2_strategy_write(ap);
1994 		++hammer2_iod_file_write;
1995 		break;
1996 	default:
1997 		bp->b_error = error = EINVAL;
1998 		bp->b_flags |= B_ERROR;
1999 		biodone(biop);
2000 		break;
2001 	}
2002 
2003 	return (error);
2004 }
2005 
2006 static
2007 int
2008 hammer2_strategy_read(struct vop_strategy_args *ap)
2009 {
2010 	struct buf *bp;
2011 	struct bio *bio;
2012 	struct bio *nbio;
2013 	hammer2_mount_t *hmp;
2014 	hammer2_inode_t *ip;
2015 	hammer2_chain_t *parent;
2016 	hammer2_chain_t *chain;
2017 	hammer2_key_t lbase;
2018 
2019 	bio = ap->a_bio;
2020 	bp = bio->bio_buf;
2021 	ip = VTOI(ap->a_vp);
2022 	hmp = ip->hmp;
2023 	nbio = push_bio(bio);
2024 
2025 	lbase = bio->bio_offset;
2026 	chain = NULL;
2027 	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
2028 
2029 	/*
2030 	 * We must characterize the logical->physical translation if it
2031 	 * has not already been cached.
2032 	 *
2033 	 * Physical data references < LBUFSIZE are never cached.  This
2034 	 * includes both small-block allocations and inode-embedded data.
2035 	 */
2036 	if (nbio->bio_offset == NOOFFSET) {
2037 		parent = &ip->chain;
2038 		hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS |
2039 						HAMMER2_RESOLVE_SHARED);
2040 
2041 		chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
2042 					     HAMMER2_LOOKUP_NODATA |
2043 					     HAMMER2_LOOKUP_SHARED);
2044 		if (chain == NULL) {
2045 			/*
2046 			 * Data is zero-fill
2047 			 */
2048 			nbio->bio_offset = ZFOFFSET;
2049 		} else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2050 			/*
2051 			 * Data is embedded in the inode (do nothing)
2052 			 */
2053 			KKASSERT(chain == parent);
2054 			hammer2_chain_unlock(hmp, chain);
2055 		} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2056 			/*
2057 			 * Data is on-media
2058 			 */
2059 			KKASSERT(bp->b_bcount == chain->bytes);
2060 			nbio->bio_offset = chain->bref.data_off &
2061 					   HAMMER2_OFF_MASK;
2062 			hammer2_chain_unlock(hmp, chain);
2063 			KKASSERT(nbio->bio_offset != 0);
2064 		} else {
2065 			panic("hammer2_strategy_read: unknown bref type");
2066 		}
2067 		hammer2_chain_unlock(hmp, parent);
2068 	}
2069 
2070 	if (hammer2_debug & 0x0020) {
2071 		kprintf("read %016jx %016jx\n",
2072 			bio->bio_offset, nbio->bio_offset);
2073 	}
2074 
2075 	if (nbio->bio_offset == ZFOFFSET) {
2076 		/*
2077 		 * Data is zero-fill
2078 		 */
2079 		bp->b_resid = 0;
2080 		bp->b_error = 0;
2081 		bzero(bp->b_data, bp->b_bcount);
2082 		biodone(nbio);
2083 	} else if (nbio->bio_offset != NOOFFSET) {
2084 		/*
2085 		 * Forward direct IO to the device
2086 		 */
2087 		vn_strategy(hmp->devvp, nbio);
2088 	} else {
2089 		/*
2090 		 * Data is embedded in inode.
2091 		 */
2092 		bcopy(chain->data->ipdata.u.data, bp->b_data,
2093 		      HAMMER2_EMBEDDED_BYTES);
2094 		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
2095 		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
2096 		bp->b_resid = 0;
2097 		bp->b_error = 0;
2098 		biodone(nbio);
2099 	}
2100 	return (0);
2101 }
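
#if 0
/*
 * Summary of the bio2 translations performed above:
 *
 *	NOOFFSET  - untranslated; resolve via chain lookup (and for the
 *		    inode-embedded case, copy from the inode directly)
 *	ZFOFFSET  - hole; zero-fill the buffer, no device I/O
 *	other	  - media offset; forward the bio to hmp->devvp
 */
#endif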
2102 
2103 static
2104 int
2105 hammer2_strategy_write(struct vop_strategy_args *ap)
2106 {
2107 	struct buf *bp;
2108 	struct bio *bio;
2109 	struct bio *nbio;
2110 	hammer2_mount_t *hmp;
2111 	hammer2_inode_t *ip;
2112 
2113 	bio = ap->a_bio;
2114 	bp = bio->bio_buf;
2115 	ip = VTOI(ap->a_vp);
2116 	hmp = ip->hmp;
2117 	nbio = push_bio(bio);
2118 
2119 	KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
2120 	KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);
2121 
2122 	if (nbio->bio_offset == NOOFFSET) {
2123 		/*
2124 		 * Must be embedded in the inode.
2125 		 */
2126 		KKASSERT(bio->bio_offset == 0);
2127 		bcopy(bp->b_data, ip->ip_data.u.data, HAMMER2_EMBEDDED_BYTES);
2128 		bp->b_resid = 0;
2129 		bp->b_error = 0;
2130 		biodone(nbio);
2131 
2132 		/*
2133 		 * This special flag does not follow the normal MODIFY rules
2134 		 * because we might deadlock on ip.  Instead we depend on
2135 		 * VOP_FSYNC() to detect the case.
2136 		 */
2137 		atomic_set_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
2138 	} else {
2139 		/*
2140 		 * Forward direct IO to the device
2141 		 */
2142 		vn_strategy(hmp->devvp, nbio);
2143 	}
2144 	return (0);
2145 }
2146 
2147 /*
2148  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2149  */
2150 static
2151 int
2152 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2153 {
2154 	hammer2_mount_t *hmp;
2155 	hammer2_inode_t *ip;
2156 	int error;
2157 
2158 	ip = VTOI(ap->a_vp);
2159 	hmp = ip->hmp;
2160 
2161 	error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2162 			      ap->a_fflag, ap->a_cred);
2163 	return (error);
2164 }
2165 
2166 static
2167 int
2168 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2169 {
2170 	struct mount *mp;
2171 	hammer2_pfsmount_t *pmp;
2172 	int rc;
2173 
2174 	switch (ap->a_op) {
2175 	case (MOUNTCTL_SET_EXPORT):
2176 		mp = ap->a_head.a_ops->head.vv_mount;
2177 		pmp = MPTOPMP(mp);
2178 
2179 		if (ap->a_ctllen != sizeof(struct export_args))
2180 			rc = (EINVAL);
2181 		else
2182 			rc = vfs_export(mp, &pmp->export,
2183 					(const struct export_args *)ap->a_ctl);
2184 		break;
2185 	default:
2186 		rc = vop_stdmountctl(ap);
2187 		break;
2188 	}
2189 	return (rc);
2190 }
2191 
2192 struct vop_ops hammer2_vnode_vops = {
2193 	.vop_default	= vop_defaultop,
2194 	.vop_fsync	= hammer2_vop_fsync,
2195 	.vop_getpages	= vop_stdgetpages,
2196 	.vop_putpages	= vop_stdputpages,
2197 	.vop_access	= hammer2_vop_access,
2198 	.vop_advlock	= hammer2_vop_advlock,
2199 	.vop_close	= hammer2_vop_close,
2200 	.vop_nlink	= hammer2_vop_nlink,
2201 	.vop_ncreate	= hammer2_vop_ncreate,
2202 	.vop_nsymlink	= hammer2_vop_nsymlink,
2203 	.vop_nremove	= hammer2_vop_nremove,
2204 	.vop_nrmdir	= hammer2_vop_nrmdir,
2205 	.vop_nrename	= hammer2_vop_nrename,
2206 	.vop_getattr	= hammer2_vop_getattr,
2207 	.vop_setattr	= hammer2_vop_setattr,
2208 	.vop_readdir	= hammer2_vop_readdir,
2209 	.vop_readlink	= hammer2_vop_readlink,
2212 	.vop_read	= hammer2_vop_read,
2213 	.vop_write	= hammer2_vop_write,
2214 	.vop_open	= hammer2_vop_open,
2215 	.vop_inactive	= hammer2_vop_inactive,
2216 	.vop_reclaim 	= hammer2_vop_reclaim,
2217 	.vop_nresolve	= hammer2_vop_nresolve,
2218 	.vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2219 	.vop_nmkdir 	= hammer2_vop_nmkdir,
2220 	.vop_ioctl	= hammer2_vop_ioctl,
2221 	.vop_mountctl	= hammer2_vop_mountctl,
2222 	.vop_bmap	= hammer2_vop_bmap,
2223 	.vop_strategy	= hammer2_vop_strategy,
2224 };
2225 
2226 struct vop_ops hammer2_spec_vops = {
2227 
2228 };
2229 
2230 struct vop_ops hammer2_fifo_vops = {
2231 
2232 };
2233