xref: /dragonfly/sys/vfs/hammer/hammer_vnops.c (revision 7eedf208)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vm/swap_pager.h>
50 #include <vfs/fifofs/fifo.h>
51 
52 #include "hammer.h"
53 
54 /*
55  * USERFS VNOPS
56  */
57 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
58 static int hammer_vop_fsync(struct vop_fsync_args *);
59 static int hammer_vop_read(struct vop_read_args *);
60 static int hammer_vop_write(struct vop_write_args *);
61 static int hammer_vop_access(struct vop_access_args *);
62 static int hammer_vop_advlock(struct vop_advlock_args *);
63 static int hammer_vop_close(struct vop_close_args *);
64 static int hammer_vop_ncreate(struct vop_ncreate_args *);
65 static int hammer_vop_getattr(struct vop_getattr_args *);
66 static int hammer_vop_nresolve(struct vop_nresolve_args *);
67 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
68 static int hammer_vop_nlink(struct vop_nlink_args *);
69 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
70 static int hammer_vop_nmknod(struct vop_nmknod_args *);
71 static int hammer_vop_open(struct vop_open_args *);
72 static int hammer_vop_print(struct vop_print_args *);
73 static int hammer_vop_readdir(struct vop_readdir_args *);
74 static int hammer_vop_readlink(struct vop_readlink_args *);
75 static int hammer_vop_nremove(struct vop_nremove_args *);
76 static int hammer_vop_nrename(struct vop_nrename_args *);
77 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
78 static int hammer_vop_markatime(struct vop_markatime_args *);
79 static int hammer_vop_setattr(struct vop_setattr_args *);
80 static int hammer_vop_strategy(struct vop_strategy_args *);
81 static int hammer_vop_bmap(struct vop_bmap_args *ap);
82 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
83 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
84 static int hammer_vop_ioctl(struct vop_ioctl_args *);
85 static int hammer_vop_mountctl(struct vop_mountctl_args *);
86 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
87 
88 static int hammer_vop_fifoclose (struct vop_close_args *);
89 static int hammer_vop_fiforead (struct vop_read_args *);
90 static int hammer_vop_fifowrite (struct vop_write_args *);
91 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
92 
93 struct vop_ops hammer_vnode_vops = {
94 	.vop_default =		vop_defaultop,
95 	.vop_fsync =		hammer_vop_fsync,
96 	.vop_getpages =		vop_stdgetpages,
97 	.vop_putpages =		vop_stdputpages,
98 	.vop_read =		hammer_vop_read,
99 	.vop_write =		hammer_vop_write,
100 	.vop_access =		hammer_vop_access,
101 	.vop_advlock =		hammer_vop_advlock,
102 	.vop_close =		hammer_vop_close,
103 	.vop_ncreate =		hammer_vop_ncreate,
104 	.vop_getattr =		hammer_vop_getattr,
105 	.vop_inactive =		hammer_vop_inactive,
106 	.vop_reclaim =		hammer_vop_reclaim,
107 	.vop_nresolve =		hammer_vop_nresolve,
108 	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
109 	.vop_nlink =		hammer_vop_nlink,
110 	.vop_nmkdir =		hammer_vop_nmkdir,
111 	.vop_nmknod =		hammer_vop_nmknod,
112 	.vop_open =		hammer_vop_open,
113 	.vop_pathconf =		vop_stdpathconf,
114 	.vop_print =		hammer_vop_print,
115 	.vop_readdir =		hammer_vop_readdir,
116 	.vop_readlink =		hammer_vop_readlink,
117 	.vop_nremove =		hammer_vop_nremove,
118 	.vop_nrename =		hammer_vop_nrename,
119 	.vop_nrmdir =		hammer_vop_nrmdir,
120 	.vop_markatime = 	hammer_vop_markatime,
121 	.vop_setattr =		hammer_vop_setattr,
122 	.vop_bmap =		hammer_vop_bmap,
123 	.vop_strategy =		hammer_vop_strategy,
124 	.vop_nsymlink =		hammer_vop_nsymlink,
125 	.vop_nwhiteout =	hammer_vop_nwhiteout,
126 	.vop_ioctl =		hammer_vop_ioctl,
127 	.vop_mountctl =		hammer_vop_mountctl,
128 	.vop_kqfilter =		hammer_vop_kqfilter
129 };
130 
131 struct vop_ops hammer_spec_vops = {
132 	.vop_default =		vop_defaultop,
133 	.vop_fsync =		hammer_vop_fsync,
134 	.vop_read =		vop_stdnoread,
135 	.vop_write =		vop_stdnowrite,
136 	.vop_access =		hammer_vop_access,
137 	.vop_close =		hammer_vop_close,
138 	.vop_markatime = 	hammer_vop_markatime,
139 	.vop_getattr =		hammer_vop_getattr,
140 	.vop_inactive =		hammer_vop_inactive,
141 	.vop_reclaim =		hammer_vop_reclaim,
142 	.vop_setattr =		hammer_vop_setattr
143 };
144 
145 struct vop_ops hammer_fifo_vops = {
146 	.vop_default =		fifo_vnoperate,
147 	.vop_fsync =		hammer_vop_fsync,
148 	.vop_read =		hammer_vop_fiforead,
149 	.vop_write =		hammer_vop_fifowrite,
150 	.vop_access =		hammer_vop_access,
151 	.vop_close =		hammer_vop_fifoclose,
152 	.vop_markatime = 	hammer_vop_markatime,
153 	.vop_getattr =		hammer_vop_getattr,
154 	.vop_inactive =		hammer_vop_inactive,
155 	.vop_reclaim =		hammer_vop_reclaim,
156 	.vop_setattr =		hammer_vop_setattr,
157 	.vop_kqfilter =		hammer_vop_fifokqfilter
158 };
159 
160 static __inline
161 void
162 hammer_knote(struct vnode *vp, int flags)
163 {
164 	if (flags)
165 		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
166 }
167 
168 #ifdef DEBUG_TRUNCATE
169 struct hammer_inode *HammerTruncIp;
170 #endif
171 
172 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
173 			   struct vnode *dvp, struct ucred *cred,
174 			   int flags, int isdir);
175 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
176 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
177 
178 #if 0
179 static
180 int
181 hammer_vop_vnoperate(struct vop_generic_args *ap)
182 {
183 	return (VOCALL(&hammer_vnode_vops, ap));
184 }
185 #endif
186 
187 /*
188  * hammer_vop_fsync { vp, waitfor }
189  *
190  * fsync() an inode to disk and wait for it to be completely committed
191  * such that the information would not be undone if a crash occurred after
192  * return.
193  *
194  * NOTE: HAMMER's fsync()s are going to remain expensive until we implement
195  *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
196  *	 operation.
197  *
198  *	 Ultimately the combination of a REDO log and use of fast storage
199  *	 to front-end cluster caches will make fsync fast, but it isn't
200  *	 here yet.  And, in any case, we need real transactional
201  *	 all-or-nothing features which are not restricted to a single file.
202  */
203 static
204 int
205 hammer_vop_fsync(struct vop_fsync_args *ap)
206 {
207 	hammer_inode_t ip = VTOI(ap->a_vp);
208 	hammer_mount_t hmp = ip->hmp;
209 	int waitfor = ap->a_waitfor;
210 	int mode;
211 
212 	lwkt_gettoken(&hmp->fs_token);
213 
214 	/*
215 	 * Fsync rule relaxation (default is either full synchronous flush
216 	 * or REDO semantics with synchronous flush).
217 	 */
218 	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
219 		switch(hammer_fsync_mode) {
220 		case 0:
221 mode0:
222 			/* no REDO, full synchronous flush */
223 			goto skip;
224 		case 1:
225 mode1:
226 			/* no REDO, full asynchronous flush */
227 			if (waitfor == MNT_WAIT)
228 				waitfor = MNT_NOWAIT;
229 			goto skip;
230 		case 2:
231 			/* REDO semantics, synchronous flush */
232 			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
233 				goto mode0;
234 			mode = HAMMER_FLUSH_UNDOS_AUTO;
235 			break;
236 		case 3:
237 			/* REDO semantics, relaxed asynchronous flush */
238 			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
239 				goto mode1;
240 			mode = HAMMER_FLUSH_UNDOS_RELAXED;
241 			if (waitfor == MNT_WAIT)
242 				waitfor = MNT_NOWAIT;
243 			break;
244 		case 4:
245 			/* ignore the fsync() system call */
246 			lwkt_reltoken(&hmp->fs_token);
247 			return(0);
248 		default:
249 			/* we have to do something */
250 			mode = HAMMER_FLUSH_UNDOS_RELAXED;
251 			if (waitfor == MNT_WAIT)
252 				waitfor = MNT_NOWAIT;
253 			break;
254 		}
255 
256 		/*
257 		 * Fast fsync only needs to flush the UNDO/REDO fifo if
258 		 * HAMMER_INODE_REDO is non-zero and the only modifications
259 		 * made to the file are write or write-extends.
260 		 */
261 		if ((ip->flags & HAMMER_INODE_REDO) &&
262 		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
263 		) {
264 			++hammer_count_fsyncs;
265 			hammer_flusher_flush_undos(hmp, mode);
266 			ip->redo_count = 0;
267 			lwkt_reltoken(&hmp->fs_token);
268 			return(0);
269 		}
270 
271 		/*
272 		 * REDO is enabled by fsync(), the idea being we really only
273 		 * want to lay down REDO records when programs are using
274 		 * fsync() heavily.  The first fsync() on the file starts
275 		 * the gravy train going and later fsync()s keep it hot by
276 		 * resetting the redo_count.
277 		 *
278 		 * We weren't running REDOs before now so we have to fall
279 		 * through and do a full fsync of what we have.
280 		 */
281 		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
282 		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
283 			ip->flags |= HAMMER_INODE_REDO;
284 			ip->redo_count = 0;
285 		}
286 	}
287 skip:
288 
289 	/*
290 	 * Do a full flush sequence.
291 	 */
292 	++hammer_count_fsyncs;
293 	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
294 	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
295 	if (waitfor == MNT_WAIT) {
296 		vn_unlock(ap->a_vp);
297 		hammer_wait_inode(ip);
298 		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
299 	}
300 	lwkt_reltoken(&hmp->fs_token);
301 	return (ip->error);
302 }
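
/*
 * Illustrative summary of the hammer_fsync_mode policy implemented
 * above.  The mode semantics are taken from the switch in
 * hammer_vop_fsync(); the sysctl name shown in the example is an
 * assumption for illustration.
 *
 *	mode 0: no REDO, full synchronous flush
 *	mode 1: no REDO, full asynchronous flush
 *	mode 2: REDO semantics, synchronous UNDO/REDO fifo flush
 *		(falls back to mode 0 on pre-version-4 volumes)
 *	mode 3: REDO semantics, relaxed asynchronous fifo flush
 *		(falls back to mode 1 on pre-version-4 volumes)
 *	mode 4: fsync() system calls are ignored entirely
 *
 * Example (hypothetical tunable name):
 *
 *	sysctl vfs.hammer.fsync_mode=3
 */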
303 
304 /*
305  * hammer_vop_read { vp, uio, ioflag, cred }
306  *
307  * MPSAFE (the cached read path does not require fs_token)
308  */
309 static
310 int
311 hammer_vop_read(struct vop_read_args *ap)
312 {
313 	struct hammer_transaction trans;
314 	hammer_inode_t ip;
315 	hammer_mount_t hmp;
316 	off_t offset;
317 	struct buf *bp;
318 	struct uio *uio;
319 	int error;
320 	int n;
321 	int seqcount;
322 	int ioseqcount;
323 	int blksize;
324 	int bigread;
325 	int got_fstoken;
326 
327 	if (ap->a_vp->v_type != VREG)
328 		return (EINVAL);
329 	ip = VTOI(ap->a_vp);
330 	hmp = ip->hmp;
331 	error = 0;
332 	uio = ap->a_uio;
333 
334 	/*
335 	 * Allow the UIO's size to override the sequential heuristic.
336 	 */
337 	blksize = hammer_blocksize(uio->uio_offset);
338 	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
339 	ioseqcount = (ap->a_ioflag >> 16);
340 	if (seqcount < ioseqcount)
341 		seqcount = ioseqcount;
342 
343 	/*
344 	 * If reading or writing a huge amount of data we have to break
345 	 * atomicity and allow the operation to be interrupted by a signal
346 	 * or it can DOS the machine.
347 	 */
348 	bigread = (uio->uio_resid > 100 * 1024 * 1024);
349 	got_fstoken = 0;
350 
351 	/*
352 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
353 	 * buffer cache, but HAMMER may use a variable block size based
354 	 * on the offset.
355 	 *
356 	 * XXX Temporary hack, delay the start transaction while we remain
357 	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
358 	 *     locked-shared.
359 	 */
360 	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
361 		int64_t base_offset;
362 		int64_t file_limit;
363 
364 		blksize = hammer_blocksize(uio->uio_offset);
365 		offset = (int)uio->uio_offset & (blksize - 1);
366 		base_offset = uio->uio_offset - offset;
367 
368 		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
369 			break;
370 
371 		/*
372 		 * MPSAFE
373 		 */
374 		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
375 		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
376 			bp->b_flags &= ~B_AGE;
377 			error = 0;
378 			goto skip;
379 		}
380 		if (ap->a_ioflag & IO_NRDELAY) {
381 			bqrelse(bp);
382 			return (EWOULDBLOCK);
383 		}
384 
385 		/*
386 		 * MPUNSAFE
387 		 */
388 		if (got_fstoken == 0) {
389 			lwkt_gettoken(&hmp->fs_token);
390 			got_fstoken = 1;
391 			hammer_start_transaction(&trans, ip->hmp);
392 		}
393 
394 		/*
395 		 * NOTE: A valid bp has already been acquired, but was not
396 		 *	 B_CACHE.
397 		 */
398 		if (hammer_cluster_enable) {
399 			/*
400 			 * Use file_limit to prevent cluster_read() from
401 			 * creating buffers of the wrong block size past
402 			 * the demarc.
403 			 */
404 			file_limit = ip->ino_data.size;
405 			if (base_offset < HAMMER_XDEMARC &&
406 			    file_limit > HAMMER_XDEMARC) {
407 				file_limit = HAMMER_XDEMARC;
408 			}
409 			error = cluster_readx(ap->a_vp,
410 					     file_limit, base_offset,
411 					     blksize, uio->uio_resid,
412 					     seqcount * BKVASIZE, &bp);
413 		} else {
414 			error = breadnx(ap->a_vp, base_offset, blksize,
415 					NULL, NULL, 0, &bp);
416 		}
417 		if (error) {
418 			brelse(bp);
419 			break;
420 		}
421 skip:
422 		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
423 			kprintf("doff %016jx read file %016jx@%016jx\n",
424 				(intmax_t)bp->b_bio2.bio_offset,
425 				(intmax_t)ip->obj_id,
426 				(intmax_t)bp->b_loffset);
427 		}
428 		bp->b_flags &= ~B_IODEBUG;
429 
430 		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
431 		n = blksize - offset;
432 		if (n > uio->uio_resid)
433 			n = uio->uio_resid;
434 		if (n > ip->ino_data.size - uio->uio_offset)
435 			n = (int)(ip->ino_data.size - uio->uio_offset);
436 		if (got_fstoken)
437 			lwkt_reltoken(&hmp->fs_token);
438 
439 		/*
440 		 * Set B_AGE, data has a lower priority than meta-data.
441 		 *
442 		 * Use a hold/unlock/drop sequence to run the uiomove
443 		 * with the buffer unlocked, avoiding deadlocks against
444 		 * read()s on mmap()'d spaces.
445 		 */
446 		bp->b_flags |= B_AGE;
447 		bqhold(bp);
448 		bqrelse(bp);
449 		error = uiomove((char *)bp->b_data + offset, n, uio);
450 		bqdrop(bp);
451 
452 		if (got_fstoken)
453 			lwkt_gettoken(&hmp->fs_token);
454 
455 		if (error)
456 			break;
457 		hammer_stats_file_read += n;
458 	}
459 
460 	/*
461 	 * Try to update the atime with just the inode lock for maximum
462 	 * concurrency.  If we can't shortcut it we have to get the full
463 	 * blown transaction.
464 	 */
465 	if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
466 		lwkt_gettoken(&hmp->fs_token);
467 		got_fstoken = 1;
468 		hammer_start_transaction(&trans, ip->hmp);
469 	}
470 
471 	if (got_fstoken) {
472 		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
473 		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
474 			ip->ino_data.atime = trans.time;
475 			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
476 		}
477 		hammer_done_transaction(&trans);
478 		lwkt_reltoken(&hmp->fs_token);
479 	}
480 	return (error);
481 }
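
#if 0
/*
 * Minimal user-space sketch of the per-iteration block arithmetic used
 * by the read loop above.  The 16KB/64KB block sizes and the 1MB
 * demarcation are illustrative assumptions standing in for
 * hammer_blocksize() and HAMMER_XDEMARC; only the masking logic itself
 * mirrors the loop.
 */
#include <stdint.h>

#define SKETCH_SMALL_BLK	16384		/* assumed HAMMER_BUFSIZE */
#define SKETCH_LARGE_BLK	65536		/* assumed HAMMER_XBUFSIZE */
#define SKETCH_DEMARC		(1024 * 1024)	/* assumed HAMMER_XDEMARC */

static int64_t
sketch_base_offset(int64_t uio_offset, int *offsetp)
{
	int blksize;

	/* variable block size based on the file offset */
	blksize = (uio_offset < SKETCH_DEMARC) ? SKETCH_SMALL_BLK :
						 SKETCH_LARGE_BLK;
	*offsetp = (int)uio_offset & (blksize - 1);	/* intra-block offset */
	return (uio_offset - *offsetp);			/* getblk() base */
}
#endif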
482 
483 /*
484  * hammer_vop_write { vp, uio, ioflag, cred }
485  */
486 static
487 int
488 hammer_vop_write(struct vop_write_args *ap)
489 {
490 	struct hammer_transaction trans;
491 	struct hammer_inode *ip;
492 	hammer_mount_t hmp;
493 	thread_t td;
494 	struct uio *uio;
495 	int offset;
496 	off_t base_offset;
497 	struct buf *bp;
498 	int kflags;
499 	int error;
500 	int n;
501 	int flags;
502 	int seqcount;
503 	int bigwrite;
504 
505 	if (ap->a_vp->v_type != VREG)
506 		return (EINVAL);
507 	ip = VTOI(ap->a_vp);
508 	hmp = ip->hmp;
509 	error = 0;
510 	kflags = 0;
511 	seqcount = ap->a_ioflag >> 16;
512 
513 	if (ip->flags & HAMMER_INODE_RO)
514 		return (EROFS);
515 
516 	/*
517 	 * Create a transaction to cover the operations we perform.
518 	 */
519 	lwkt_gettoken(&hmp->fs_token);
520 	hammer_start_transaction(&trans, hmp);
521 	uio = ap->a_uio;
522 
523 	/*
524 	 * Check append mode
525 	 */
526 	if (ap->a_ioflag & IO_APPEND)
527 		uio->uio_offset = ip->ino_data.size;
528 
529 	/*
530 	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
531 	 *
532 	 * NOTE: the base_offset assignment is required to work around what
533 	 * I consider to be a GCC-4 optimization bug.
534 	 */
535 	if (uio->uio_offset < 0) {
536 		hammer_done_transaction(&trans);
537 		lwkt_reltoken(&hmp->fs_token);
538 		return (EFBIG);
539 	}
540 	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
541 	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
542 		hammer_done_transaction(&trans);
543 		lwkt_reltoken(&hmp->fs_token);
544 		return (EFBIG);
545 	}
546 
547 	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
548 	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
549 		hammer_done_transaction(&trans);
550 		lwkt_reltoken(&hmp->fs_token);
551 		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
552 		return (EFBIG);
553 	}
554 
555 	/*
556 	 * If reading or writing a huge amount of data we have to break
557 	 * atomicity and allow the operation to be interrupted by a signal
558 	 * or it can DOS the machine.
559 	 *
560 	 * Preset redo_count so we stop generating REDOs earlier if the
561 	 * limit is exceeded.
562 	 */
563 	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
564 	if ((ip->flags & HAMMER_INODE_REDO) &&
565 	    ip->redo_count < hammer_limit_redo) {
566 		ip->redo_count += uio->uio_resid;
567 	}
568 
569 	/*
570 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
571 	 * buffer cache, but HAMMER may use a variable block size based
572 	 * on the offset.
573 	 */
574 	while (uio->uio_resid > 0) {
575 		int fixsize = 0;
576 		int blksize;
577 		int blkmask;
578 		int trivial;
579 		int endofblk;
580 		off_t nsize;
581 
582 		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
583 			break;
584 		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
585 			break;
586 
587 		blksize = hammer_blocksize(uio->uio_offset);
588 
589 		/*
590 		 * Do not allow HAMMER to blow out the buffer cache.  Very
591 		 * large UIOs can lock out other processes due to bwillwrite()
592 		 * mechanics.
593 		 *
594 		 * The hammer inode is not locked during these operations.
595 		 * The vnode is locked which can interfere with the pageout
596 		 * daemon for non-UIO_NOCOPY writes but should not interfere
597 		 * with the buffer cache.  Even so, we cannot afford to
598 		 * allow the pageout daemon to build up too many dirty buffer
599 		 * cache buffers.
600 		 *
601 		 * Only call this if we aren't being recursively called from
602 		 * a virtual disk device (vn), else we may deadlock.
603 		 */
604 		if ((ap->a_ioflag & IO_RECURSE) == 0)
605 			bwillwrite(blksize);
606 
607 		/*
608 		 * Control the number of pending records associated with
609 		 * this inode.  If too many have accumulated start a
610 		 * flush.  Try to maintain a pipeline with the flusher.
611 		 *
612 		 * NOTE: It is possible for other sources to grow the
613 		 *	 records but not necessarily issue another flush,
614 		 *	 so use a timeout and ensure that a re-flush occurs.
615 		 */
616 		if (ip->rsv_recs >= hammer_limit_inode_recs) {
617 			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
618 			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
619 				ip->flags |= HAMMER_INODE_RECSW;
620 				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
621 				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
622 			}
623 		}
624 
625 #if 0
626 		/*
627 		 * Do not allow HAMMER to blow out system memory by
628 		 * accumulating too many records.   Records are so well
629 		 * decoupled from the buffer cache that it is possible
630 		 * for userland to push data out to the media via
631 		 * direct-write, but build up the records queued to the
632 		 * backend faster than the backend can flush them out.
633 		 * HAMMER has hit its write limit but the frontend has
634 		 * no pushback to slow it down.
635 		 */
636 		if (hmp->rsv_recs > hammer_limit_recs / 2) {
637 			/*
638 			 * Get the inode on the flush list
639 			 */
640 			if (ip->rsv_recs >= 64)
641 				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
642 			else if (ip->rsv_recs >= 16)
643 				hammer_flush_inode(ip, 0);
644 
645 			/*
646 			 * Keep the flusher going if the system keeps
647 			 * queueing records.
648 			 */
649 			delta = hmp->count_newrecords -
650 				hmp->last_newrecords;
651 			if (delta < 0 || delta > hammer_limit_recs / 2) {
652 				hmp->last_newrecords = hmp->count_newrecords;
653 				hammer_sync_hmp(hmp, MNT_NOWAIT);
654 			}
655 
656 			/*
657 			 * If we have gotten behind start slowing
658 			 * down the writers.
659 			 */
660 			delta = (hmp->rsv_recs - hammer_limit_recs) *
661 				hz / hammer_limit_recs;
662 			if (delta > 0)
663 				tsleep(&trans, 0, "hmrslo", delta);
664 		}
665 #endif
666 
667 		/*
668 		 * Calculate the blocksize at the current offset and figure
669 		 * out how much we can actually write.
670 		 */
671 		blkmask = blksize - 1;
672 		offset = (int)uio->uio_offset & blkmask;
673 		base_offset = uio->uio_offset & ~(int64_t)blkmask;
674 		n = blksize - offset;
675 		if (n > uio->uio_resid) {
676 			n = uio->uio_resid;
677 			endofblk = 0;
678 		} else {
679 			endofblk = 1;
680 		}
681 		nsize = uio->uio_offset + n;
682 		if (nsize > ip->ino_data.size) {
683 			if (uio->uio_offset > ip->ino_data.size)
684 				trivial = 0;
685 			else
686 				trivial = 1;
687 			nvextendbuf(ap->a_vp,
688 				    ip->ino_data.size,
689 				    nsize,
690 				    hammer_blocksize(ip->ino_data.size),
691 				    hammer_blocksize(nsize),
692 				    hammer_blockoff(ip->ino_data.size),
693 				    hammer_blockoff(nsize),
694 				    trivial);
695 			fixsize = 1;
696 			kflags |= NOTE_EXTEND;
697 		}
698 
699 		if (uio->uio_segflg == UIO_NOCOPY) {
700 			/*
701 			 * Issuing a write with the same data backing the
702 			 * buffer.  Instantiate the buffer to collect the
703 			 * backing vm pages, then read-in any missing bits.
704 			 *
705 			 * This case is used by vop_stdputpages().
706 			 */
707 			bp = getblk(ap->a_vp, base_offset,
708 				    blksize, GETBLK_BHEAVY, 0);
709 			if ((bp->b_flags & B_CACHE) == 0) {
710 				bqrelse(bp);
711 				error = bread(ap->a_vp, base_offset,
712 					      blksize, &bp);
713 			}
714 		} else if (offset == 0 && uio->uio_resid >= blksize) {
715 			/*
716 			 * Even though we are entirely overwriting the buffer
717 			 * we may still have to zero it out to avoid a
718 			 * mmap/write visibility issue.
719 			 */
720 			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
721 			if ((bp->b_flags & B_CACHE) == 0)
722 				vfs_bio_clrbuf(bp);
723 		} else if (base_offset >= ip->ino_data.size) {
724 			/*
725 			 * If the base offset of the buffer is beyond the
726 			 * file EOF, we don't have to issue a read.
727 			 */
728 			bp = getblk(ap->a_vp, base_offset,
729 				    blksize, GETBLK_BHEAVY, 0);
730 			vfs_bio_clrbuf(bp);
731 		} else {
732 			/*
733 			 * Partial overwrite, read in any missing bits then
734 			 * replace the portion being written.
735 			 */
736 			error = bread(ap->a_vp, base_offset, blksize, &bp);
737 			if (error == 0)
738 				bheavy(bp);
739 		}
740 		if (error == 0) {
741 			lwkt_reltoken(&hmp->fs_token);
742 			error = uiomove(bp->b_data + offset, n, uio);
743 			lwkt_gettoken(&hmp->fs_token);
744 		}
745 
746 		/*
747 		 * Generate REDO records if enabled and redo_count will not
748 		 * exceed the limit.
749 		 *
750 		 * If redo_count exceeds the limit we stop generating records
751 		 * and clear HAMMER_INODE_REDO.  This will cause the next
752 		 * fsync() to do a full meta-data sync instead of just an
753 		 * UNDO/REDO fifo update.
754 		 *
755 		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
756 		 * will still be tracked.  The tracks will be terminated
757 		 * when the related meta-data (including possible data
758 		 * modifications which are not tracked via REDO) is
759 		 * flushed.
760 		 */
761 		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
762 			if (ip->redo_count < hammer_limit_redo) {
763 				bp->b_flags |= B_VFSFLAG1;
764 				error = hammer_generate_redo(&trans, ip,
765 						     base_offset + offset,
766 						     HAMMER_REDO_WRITE,
767 						     bp->b_data + offset,
768 						     (size_t)n);
769 			} else {
770 				ip->flags &= ~HAMMER_INODE_REDO;
771 			}
772 		}
773 
774 		/*
775 		 * If we screwed up we have to undo any VM size changes we
776 		 * made.
777 		 */
778 		if (error) {
779 			brelse(bp);
780 			if (fixsize) {
781 				nvtruncbuf(ap->a_vp, ip->ino_data.size,
782 					  hammer_blocksize(ip->ino_data.size),
783 					  hammer_blockoff(ip->ino_data.size));
784 			}
785 			break;
786 		}
787 		kflags |= NOTE_WRITE;
788 		hammer_stats_file_write += n;
789 		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
790 		if (ip->ino_data.size < uio->uio_offset) {
791 			ip->ino_data.size = uio->uio_offset;
792 			flags = HAMMER_INODE_SDIRTY;
793 		} else {
794 			flags = 0;
795 		}
796 		ip->ino_data.mtime = trans.time;
797 		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
798 		hammer_modify_inode(&trans, ip, flags);
799 
800 		/*
801 		 * Once we dirty the buffer any cached zone-X offset
802 		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
803 		 * allow overwriting the same data sector unless
804 		 * we provide UNDOs for the old data, which we don't.
805 		 */
806 		bp->b_bio2.bio_offset = NOOFFSET;
807 
808 		/*
809 		 * Final buffer disposition.
810 		 *
811 		 * Because meta-data updates are deferred, HAMMER is
812 		 * especially sensitive to excessive bdwrite()s because
813 		 * the I/O stream is not broken up by disk reads.  So the
814 		 * buffer cache simply cannot keep up.
815 		 *
816 		 * WARNING!  blksize is variable.  cluster_write() is
817 		 *	     expected to not blow up if it encounters
818 		 *	     buffers that do not match the passed blksize.
819 		 *
820 		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
821 		 *	  The ip->rsv_recs check should burst-flush the data.
822 		 *	  If we queue it immediately the buf could be left
823 		 *	  locked on the device queue for a very long time.
824 		 *
825 		 *	  However, failing to flush a dirty buffer out when
826 		 *	  issued from the pageout daemon can result in a low
827 		 *	  memory deadlock against bio_page_alloc(), so we
828 		 *	  have to bawrite() on IO_ASYNC as well.
829 		 *
830 		 * NOTE!  To avoid degenerate stalls due to mismatched block
831 		 *	  sizes we only honor IO_DIRECT on the write which
832 		 *	  abuts the end of the buffer.  However, we must
833 		 *	  honor IO_SYNC in case someone is silly enough to
834 		 *	  configure a HAMMER file as swap, or when HAMMER
835 		 *	  is serving NFS (for commits).  Ick ick.
836 		 */
837 		bp->b_flags |= B_AGE;
838 		if (ap->a_ioflag & IO_SYNC) {
839 			bwrite(bp);
840 		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
841 			bawrite(bp);
842 		} else if (ap->a_ioflag & IO_ASYNC) {
843 			bawrite(bp);
844 		} else {
845 #if 0
846 		if (offset + n == blksize) {
847 			if (hammer_cluster_enable == 0 ||
848 			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
849 				bawrite(bp);
850 			} else {
851 				cluster_write(bp, ip->ino_data.size,
852 					      blksize, seqcount);
853 			}
854 		} else {
855 #endif
856 			bdwrite(bp);
857 		}
858 	}
859 	hammer_done_transaction(&trans);
860 	hammer_knote(ap->a_vp, kflags);
861 	lwkt_reltoken(&hmp->fs_token);
862 	return (error);
863 }
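
#if 0
/*
 * Condensed sketch of the final buffer disposition policy at the
 * bottom of the write loop above (illustration only, restating the
 * if/else chain and its rationale).
 */
static void
sketch_dispose(struct buf *bp, int ioflag, int endofblk)
{
	if (ioflag & IO_SYNC)
		bwrite(bp);	/* swap/NFS commits: synchronous write */
	else if ((ioflag & IO_DIRECT) && endofblk)
		bawrite(bp);	/* only honored on the end-of-block write */
	else if (ioflag & IO_ASYNC)
		bawrite(bp);	/* pageout daemon: avoid low-memory deadlock */
	else
		bdwrite(bp);	/* default: delayed write, burst-flushed */
}
#endif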
864 
865 /*
866  * hammer_vop_access { vp, mode, cred }
867  *
868  * MPSAFE - does not require fs_token
869  */
870 static
871 int
872 hammer_vop_access(struct vop_access_args *ap)
873 {
874 	struct hammer_inode *ip = VTOI(ap->a_vp);
875 	uid_t uid;
876 	gid_t gid;
877 	int error;
878 
879 	++hammer_stats_file_iopsr;
880 	uid = hammer_to_unix_xid(&ip->ino_data.uid);
881 	gid = hammer_to_unix_xid(&ip->ino_data.gid);
882 
883 	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
884 				  ip->ino_data.uflags);
885 	return (error);
886 }
887 
888 /*
889  * hammer_vop_advlock { vp, id, op, fl, flags }
890  *
891  * MPSAFE - does not require fs_token
892  */
893 static
894 int
895 hammer_vop_advlock(struct vop_advlock_args *ap)
896 {
897 	hammer_inode_t ip = VTOI(ap->a_vp);
898 
899 	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
900 }
901 
902 /*
903  * hammer_vop_close { vp, fflag }
904  *
905  * We can only sync-on-close for normal closes.  XXX disabled for now.
906  */
907 static
908 int
909 hammer_vop_close(struct vop_close_args *ap)
910 {
911 #if 0
912 	struct vnode *vp = ap->a_vp;
913 	hammer_inode_t ip = VTOI(vp);
914 	int waitfor;
915 	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
916 		if (vn_islocked(vp) == LK_EXCLUSIVE &&
917 		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
918 			if (ip->flags & HAMMER_INODE_CLOSESYNC)
919 				waitfor = MNT_WAIT;
920 			else
921 				waitfor = MNT_NOWAIT;
922 			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
923 				       HAMMER_INODE_CLOSEASYNC);
924 			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
925 		}
926 	}
927 #endif
928 	return (vop_stdclose(ap));
929 }
930 
931 /*
932  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
933  *
934  * The operating system has already ensured that the directory entry
935  * does not exist and done all appropriate namespace locking.
936  */
937 static
938 int
939 hammer_vop_ncreate(struct vop_ncreate_args *ap)
940 {
941 	struct hammer_transaction trans;
942 	struct hammer_inode *dip;
943 	struct hammer_inode *nip;
944 	struct nchandle *nch;
945 	hammer_mount_t hmp;
946 	int error;
947 
948 	nch = ap->a_nch;
949 	dip = VTOI(ap->a_dvp);
950 	hmp = dip->hmp;
951 
952 	if (dip->flags & HAMMER_INODE_RO)
953 		return (EROFS);
954 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
955 		return (error);
956 
957 	/*
958 	 * Create a transaction to cover the operations we perform.
959 	 */
960 	lwkt_gettoken(&hmp->fs_token);
961 	hammer_start_transaction(&trans, hmp);
962 	++hammer_stats_file_iopsw;
963 
964 	/*
965 	 * Create a new filesystem object of the requested type.  The
966 	 * returned inode will be referenced and shared-locked to prevent
967 	 * it from being moved to the flusher.
968 	 */
969 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
970 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
971 				    NULL, &nip);
972 	if (error) {
973 		hkprintf("hammer_create_inode error %d\n", error);
974 		hammer_done_transaction(&trans);
975 		*ap->a_vpp = NULL;
976 		lwkt_reltoken(&hmp->fs_token);
977 		return (error);
978 	}
979 
980 	/*
981 	 * Add the new filesystem object to the directory.  This will also
982 	 * bump the inode's link count.
983 	 */
984 	error = hammer_ip_add_directory(&trans, dip,
985 					nch->ncp->nc_name, nch->ncp->nc_nlen,
986 					nip);
987 	if (error)
988 		hkprintf("hammer_ip_add_directory error %d\n", error);
989 
990 	/*
991 	 * Finish up.
992 	 */
993 	if (error) {
994 		hammer_rel_inode(nip, 0);
995 		hammer_done_transaction(&trans);
996 		*ap->a_vpp = NULL;
997 	} else {
998 		error = hammer_get_vnode(nip, ap->a_vpp);
999 		hammer_done_transaction(&trans);
1000 		hammer_rel_inode(nip, 0);
1001 		if (error == 0) {
1002 			cache_setunresolved(ap->a_nch);
1003 			cache_setvp(ap->a_nch, *ap->a_vpp);
1004 		}
1005 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1006 	}
1007 	lwkt_reltoken(&hmp->fs_token);
1008 	return (error);
1009 }
1010 
1011 /*
1012  * hammer_vop_getattr { vp, vap }
1013  *
1014  * Retrieve an inode's attribute information.  When accessing inodes
1015  * historically we fake the atime field to ensure consistent results.
1016  * The atime field is stored in the B-Tree element and allowed to be
1017  * updated without cycling the element.
1018  *
1019  * MPSAFE - does not require fs_token
1020  */
1021 static
1022 int
1023 hammer_vop_getattr(struct vop_getattr_args *ap)
1024 {
1025 	struct hammer_inode *ip = VTOI(ap->a_vp);
1026 	struct vattr *vap = ap->a_vap;
1027 
1028 	/*
1029 	 * We want the fsid to be different when accessing a filesystem
1030 	 * with different as-of's so programs like diff don't think
1031 	 * the files are the same.
1032 	 *
1033 	 * We also want the fsid to be the same when comparing snapshots,
1034 	 * or when comparing mirrors (which might be backed by different
1035 	 * physical devices).  HAMMER fsids are based on the PFS's
1036 	 * shared_uuid field.
1037 	 *
1038 	 * XXX there is a chance of collision here.  The va_fsid reported
1039 	 * by stat is different from the more involved fsid used in the
1040 	 * mount structure.
1041 	 */
1042 	++hammer_stats_file_iopsr;
1043 	hammer_lock_sh(&ip->lock);
1044 	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
1045 		       (u_int32_t)(ip->obj_asof >> 32);
1046 
1047 	vap->va_fileid = ip->ino_leaf.base.obj_id;
1048 	vap->va_mode = ip->ino_data.mode;
1049 	vap->va_nlink = ip->ino_data.nlinks;
1050 	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1051 	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1052 	vap->va_rmajor = 0;
1053 	vap->va_rminor = 0;
1054 	vap->va_size = ip->ino_data.size;
1055 
1056 	/*
1057 	 * Special case for @@PFS softlinks.  The actual size of the
1058 	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
1059 	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
1060 	 */
1061 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
1062 	    ip->ino_data.size == 10 &&
1063 	    ip->obj_asof == HAMMER_MAX_TID &&
1064 	    ip->obj_localization == 0 &&
1065 	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
1066 		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
1067 			    vap->va_size = 26;
1068 		    else
1069 			    vap->va_size = 10;
1070 	}
1071 
1072 	/*
1073 	 * We must provide a consistent atime and mtime for snapshots
1074 	 * so people can do a 'tar cf - ... | md5' on them and get
1075 	 * consistent results.
1076 	 */
1077 	if (ip->flags & HAMMER_INODE_RO) {
1078 		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
1079 		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
1080 	} else {
1081 		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
1082 		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
1083 	}
1084 	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
1085 	vap->va_flags = ip->ino_data.uflags;
1086 	vap->va_gen = 1;	/* hammer inums are unique for all time */
1087 	vap->va_blocksize = HAMMER_BUFSIZE;
1088 	if (ip->ino_data.size >= HAMMER_XDEMARC) {
1089 		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
1090 				~HAMMER_XBUFMASK64;
1091 	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
1092 		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
1093 				~HAMMER_BUFMASK64;
1094 	} else {
1095 		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
1096 	}
1097 
1098 	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
1099 	vap->va_filerev = 0; 	/* XXX */
1100 	vap->va_uid_uuid = ip->ino_data.uid;
1101 	vap->va_gid_uuid = ip->ino_data.gid;
1102 	vap->va_fsid_uuid = ip->hmp->fsid;
1103 	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
1104 			  VA_FSID_UUID_VALID;
1105 
1106 	switch (ip->ino_data.obj_type) {
1107 	case HAMMER_OBJTYPE_CDEV:
1108 	case HAMMER_OBJTYPE_BDEV:
1109 		vap->va_rmajor = ip->ino_data.rmajor;
1110 		vap->va_rminor = ip->ino_data.rminor;
1111 		break;
1112 	default:
1113 		break;
1114 	}
1115 	hammer_unlock(&ip->lock);
1116 	return(0);
1117 }
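
#if 0
/*
 * User-space sketch of the va_bytes rounding above.  The 64KB, 16KB
 * and 1MB constants are illustrative assumptions standing in for
 * HAMMER_XBUFMASK64, HAMMER_BUFMASK64 and HAMMER_XDEMARC.
 */
#include <stdint.h>

static int64_t
sketch_va_bytes(int64_t size)
{
	if (size >= 1024 * 1024)			/* big-block files */
		return ((size + 65535) & ~(int64_t)65535);
	if (size > 16384 / 2)				/* small-block files */
		return ((size + 16383) & ~(int64_t)16383);
	return ((size + 15) & ~(int64_t)15);		/* tiny files */
}
#endif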
1118 
1119 /*
1120  * hammer_vop_nresolve { nch, dvp, cred }
1121  *
1122  * Locate the requested directory entry.
1123  */
1124 static
1125 int
1126 hammer_vop_nresolve(struct vop_nresolve_args *ap)
1127 {
1128 	struct hammer_transaction trans;
1129 	struct namecache *ncp;
1130 	hammer_mount_t hmp;
1131 	hammer_inode_t dip;
1132 	hammer_inode_t ip;
1133 	hammer_tid_t asof;
1134 	struct hammer_cursor cursor;
1135 	struct vnode *vp;
1136 	int64_t namekey;
1137 	int error;
1138 	int i;
1139 	int nlen;
1140 	int flags;
1141 	int ispfs;
1142 	int64_t obj_id;
1143 	u_int32_t localization;
1144 	u_int32_t max_iterations;
1145 
1146 	/*
1147 	 * Misc initialization, plus handle as-of name extensions.  Look for
1148 	 * the '@@' extension.  Note that as-of files and directories cannot
1149 	 * be modified.
1150 	 */
1151 	dip = VTOI(ap->a_dvp);
1152 	ncp = ap->a_nch->ncp;
1153 	asof = dip->obj_asof;
1154 	localization = dip->obj_localization;	/* for code consistency */
1155 	nlen = ncp->nc_nlen;
1156 	flags = dip->flags & HAMMER_INODE_RO;
1157 	ispfs = 0;
1158 	hmp = dip->hmp;
1159 
1160 	lwkt_gettoken(&hmp->fs_token);
1161 	hammer_simple_transaction(&trans, hmp);
1162 	++hammer_stats_file_iopsr;
1163 
1164 	for (i = 0; i < nlen; ++i) {
1165 		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
1166 			error = hammer_str_to_tid(ncp->nc_name + i + 2,
1167 						  &ispfs, &asof, &localization);
1168 			if (error != 0) {
1169 				i = nlen;
1170 				break;
1171 			}
1172 			if (asof != HAMMER_MAX_TID)
1173 				flags |= HAMMER_INODE_RO;
1174 			break;
1175 		}
1176 	}
1177 	nlen = i;
1178 
1179 	/*
1180 	 * If this is a PFS softlink we dive into the PFS
1181 	 */
1182 	if (ispfs && nlen == 0) {
1183 		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1184 				      asof, localization,
1185 				      flags, &error);
1186 		if (error == 0) {
1187 			error = hammer_get_vnode(ip, &vp);
1188 			hammer_rel_inode(ip, 0);
1189 		} else {
1190 			vp = NULL;
1191 		}
1192 		if (error == 0) {
1193 			vn_unlock(vp);
1194 			cache_setvp(ap->a_nch, vp);
1195 			vrele(vp);
1196 		}
1197 		goto done;
1198 	}
1199 
1200 	/*
1201 	 * If there is no path component the time extension is relative to dip.
1202 	 * e.g. "fubar/@@<snapshot>"
1203 	 *
1204 	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
1205 	 * e.g. "fubar/.@@<snapshot>"
1206 	 *
1207 	 * ".." is handled by the kernel.  We do not currently handle
1208 	 * "..@<snapshot>".
1209 	 */
1210 	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
1211 		ip = hammer_get_inode(&trans, dip, dip->obj_id,
1212 				      asof, dip->obj_localization,
1213 				      flags, &error);
1214 		if (error == 0) {
1215 			error = hammer_get_vnode(ip, &vp);
1216 			hammer_rel_inode(ip, 0);
1217 		} else {
1218 			vp = NULL;
1219 		}
1220 		if (error == 0) {
1221 			vn_unlock(vp);
1222 			cache_setvp(ap->a_nch, vp);
1223 			vrele(vp);
1224 		}
1225 		goto done;
1226 	}
1227 
1228 	/*
1229 	 * Calculate the namekey and setup the key range for the scan.  This
1230 	 * works kinda like a chained hash table where the lower 32 bits
1231 	 * of the namekey synthesize the chain.
1232 	 *
1233 	 * The key range is inclusive of both key_beg and key_end.
1234 	 */
1235 	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1236 					   &max_iterations);
1237 
1238 	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
1239 	cursor.key_beg.localization = dip->obj_localization +
1240 				      hammer_dir_localization(dip);
1241 	cursor.key_beg.obj_id = dip->obj_id;
1242 	cursor.key_beg.key = namekey;
1243 	cursor.key_beg.create_tid = 0;
1244 	cursor.key_beg.delete_tid = 0;
1245 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1246 	cursor.key_beg.obj_type = 0;
1247 
1248 	cursor.key_end = cursor.key_beg;
1249 	cursor.key_end.key += max_iterations;
1250 	cursor.asof = asof;
1251 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1252 
1253 	/*
1254 	 * Scan all matching records (the chain), locate the one matching
1255 	 * the requested path component.
1256 	 *
1257 	 * The hammer_ip_*() functions merge in-memory records with on-disk
1258 	 * records for the purposes of the search.
1259 	 */
1260 	obj_id = 0;
1261 	localization = HAMMER_DEF_LOCALIZATION;
1262 
1263 	if (error == 0) {
1264 		error = hammer_ip_first(&cursor);
1265 		while (error == 0) {
1266 			error = hammer_ip_resolve_data(&cursor);
1267 			if (error)
1268 				break;
1269 			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1270 			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1271 				obj_id = cursor.data->entry.obj_id;
1272 				localization = cursor.data->entry.localization;
1273 				break;
1274 			}
1275 			error = hammer_ip_next(&cursor);
1276 		}
1277 	}
1278 	hammer_done_cursor(&cursor);
1279 
1280 	/*
1281 	 * Lookup the obj_id.  This should always succeed.  If it does not
1282 	 * the filesystem may be damaged and we return a dummy inode.
1283 	 */
1284 	if (error == 0) {
1285 		ip = hammer_get_inode(&trans, dip, obj_id,
1286 				      asof, localization,
1287 				      flags, &error);
1288 		if (error == ENOENT) {
1289 			kprintf("HAMMER: WARNING: Missing "
1290 				"inode for dirent \"%s\"\n"
1291 				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1292 				ncp->nc_name,
1293 				(long long)obj_id, (long long)asof,
1294 				localization);
1295 			error = 0;
1296 			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1297 						    asof, localization,
1298 						    flags, &error);
1299 		}
1300 		if (error == 0) {
1301 			error = hammer_get_vnode(ip, &vp);
1302 			hammer_rel_inode(ip, 0);
1303 		} else {
1304 			vp = NULL;
1305 		}
1306 		if (error == 0) {
1307 			vn_unlock(vp);
1308 			cache_setvp(ap->a_nch, vp);
1309 			vrele(vp);
1310 		}
1311 	} else if (error == ENOENT) {
1312 		cache_setvp(ap->a_nch, NULL);
1313 	}
1314 done:
1315 	hammer_done_transaction(&trans);
1316 	lwkt_reltoken(&hmp->fs_token);
1317 	return (error);
1318 }
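
/*
 * Illustrative examples of the '@@' as-of extension parsed above (the
 * TID value is invented for the example):
 *
 *	ls -l fubar@@0x00000001061a8ba0		# fubar as of that TID
 *	cd .@@0x00000001061a8ba0		# snapshot view of the cwd
 *
 * Any extension other than HAMMER_MAX_TID forces the result read-only
 * via HAMMER_INODE_RO.
 */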
1319 
1320 /*
1321  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1322  *
1323  * Locate the parent directory of a directory vnode.
1324  *
1325  * dvp is referenced but not locked.  *vpp must be returned referenced and
1326  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1327  * at the root, instead it could indicate that the directory we were in was
1328  * removed.
1329  *
1330  * NOTE: as-of sequences are not linked into the directory structure.  If
1331  * we are at the root with a different asof than the mount point, reload
1332  * the same directory with the mount point's asof.   I'm not sure what this
1333  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1334  * get confused, but it hasn't been tested.
1335  */
1336 static
1337 int
1338 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1339 {
1340 	struct hammer_transaction trans;
1341 	struct hammer_inode *dip;
1342 	struct hammer_inode *ip;
1343 	hammer_mount_t hmp;
1344 	int64_t parent_obj_id;
1345 	u_int32_t parent_obj_localization;
1346 	hammer_tid_t asof;
1347 	int error;
1348 
1349 	dip = VTOI(ap->a_dvp);
1350 	asof = dip->obj_asof;
1351 	hmp = dip->hmp;
1352 
1353 	/*
1354 	 * Who is our parent?  This could be the root of a pseudo-filesystem
1355 	 * whose parent is in another localization domain.
1356 	 */
1357 	lwkt_gettoken(&hmp->fs_token);
1358 	parent_obj_id = dip->ino_data.parent_obj_id;
1359 	if (dip->obj_id == HAMMER_OBJID_ROOT)
1360 		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1361 	else
1362 		parent_obj_localization = dip->obj_localization;
1363 
1364 	if (parent_obj_id == 0) {
1365 		if (dip->obj_id == HAMMER_OBJID_ROOT &&
1366 		   asof != hmp->asof) {
1367 			parent_obj_id = dip->obj_id;
1368 			asof = hmp->asof;
1369 			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1370 			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1371 				  (long long)dip->obj_asof);
1372 		} else {
1373 			*ap->a_vpp = NULL;
1374 			lwkt_reltoken(&hmp->fs_token);
1375 			return ENOENT;
1376 		}
1377 	}
1378 
1379 	hammer_simple_transaction(&trans, hmp);
1380 	++hammer_stats_file_iopsr;
1381 
1382 	ip = hammer_get_inode(&trans, dip, parent_obj_id,
1383 			      asof, parent_obj_localization,
1384 			      dip->flags, &error);
1385 	if (ip) {
1386 		error = hammer_get_vnode(ip, ap->a_vpp);
1387 		hammer_rel_inode(ip, 0);
1388 	} else {
1389 		*ap->a_vpp = NULL;
1390 	}
1391 	hammer_done_transaction(&trans);
1392 	lwkt_reltoken(&hmp->fs_token);
1393 	return (error);
1394 }
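
/*
 * Illustrative note on the fake-name path above: when ".." is resolved
 * at the root with a non-default as-of, the same directory is reloaded
 * at the mount's as-of and the kernel is handed a synthetic component
 * name of the form "0x%016llx" (18 characters plus NUL, hence the
 * 19-byte allocation).
 */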
1395 
1396 /*
1397  * hammer_vop_nlink { nch, dvp, vp, cred }
1398  */
1399 static
1400 int
1401 hammer_vop_nlink(struct vop_nlink_args *ap)
1402 {
1403 	struct hammer_transaction trans;
1404 	struct hammer_inode *dip;
1405 	struct hammer_inode *ip;
1406 	struct nchandle *nch;
1407 	hammer_mount_t hmp;
1408 	int error;
1409 
1410 	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1411 		return(EXDEV);
1412 
1413 	nch = ap->a_nch;
1414 	dip = VTOI(ap->a_dvp);
1415 	ip = VTOI(ap->a_vp);
1416 	hmp = dip->hmp;
1417 
1418 	if (dip->obj_localization != ip->obj_localization)
1419 		return(EXDEV);
1420 
1421 	if (dip->flags & HAMMER_INODE_RO)
1422 		return (EROFS);
1423 	if (ip->flags & HAMMER_INODE_RO)
1424 		return (EROFS);
1425 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1426 		return (error);
1427 
1428 	/*
1429 	 * Create a transaction to cover the operations we perform.
1430 	 */
1431 	lwkt_gettoken(&hmp->fs_token);
1432 	hammer_start_transaction(&trans, hmp);
1433 	++hammer_stats_file_iopsw;
1434 
1435 	/*
1436 	 * Add the filesystem object to the directory.  Note that neither
1437 	 * dip nor ip are referenced or locked, but their vnodes are
1438 	 * referenced.  This function will bump the inode's link count.
1439 	 */
1440 	error = hammer_ip_add_directory(&trans, dip,
1441 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1442 					ip);
1443 
1444 	/*
1445 	 * Finish up.
1446 	 */
1447 	if (error == 0) {
1448 		cache_setunresolved(nch);
1449 		cache_setvp(nch, ap->a_vp);
1450 	}
1451 	hammer_done_transaction(&trans);
1452 	hammer_knote(ap->a_vp, NOTE_LINK);
1453 	hammer_knote(ap->a_dvp, NOTE_WRITE);
1454 	lwkt_reltoken(&hmp->fs_token);
1455 	return (error);
1456 }
1457 
1458 /*
1459  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1460  *
1461  * The operating system has already ensured that the directory entry
1462  * does not exist and done all appropriate namespace locking.
1463  */
1464 static
1465 int
1466 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1467 {
1468 	struct hammer_transaction trans;
1469 	struct hammer_inode *dip;
1470 	struct hammer_inode *nip;
1471 	struct nchandle *nch;
1472 	hammer_mount_t hmp;
1473 	int error;
1474 
1475 	nch = ap->a_nch;
1476 	dip = VTOI(ap->a_dvp);
1477 	hmp = dip->hmp;
1478 
1479 	if (dip->flags & HAMMER_INODE_RO)
1480 		return (EROFS);
1481 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1482 		return (error);
1483 
1484 	/*
1485 	 * Create a transaction to cover the operations we perform.
1486 	 */
1487 	lwkt_gettoken(&hmp->fs_token);
1488 	hammer_start_transaction(&trans, hmp);
1489 	++hammer_stats_file_iopsw;
1490 
1491 	/*
1492 	 * Create a new filesystem object of the requested type.  The
1493 	 * returned inode will be referenced but not locked.
1494 	 */
1495 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1496 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1497 				    NULL, &nip);
1498 	if (error) {
1499 		hkprintf("hammer_mkdir error %d\n", error);
1500 		hammer_done_transaction(&trans);
1501 		*ap->a_vpp = NULL;
1502 		lwkt_reltoken(&hmp->fs_token);
1503 		return (error);
1504 	}
1505 	/*
1506 	 * Add the new filesystem object to the directory.  This will also
1507 	 * bump the inode's link count.
1508 	 */
1509 	error = hammer_ip_add_directory(&trans, dip,
1510 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1511 					nip);
1512 	if (error)
1513 		hkprintf("hammer_mkdir (add) error %d\n", error);
1514 
1515 	/*
1516 	 * Finish up.
1517 	 */
1518 	if (error) {
1519 		hammer_rel_inode(nip, 0);
1520 		*ap->a_vpp = NULL;
1521 	} else {
1522 		error = hammer_get_vnode(nip, ap->a_vpp);
1523 		hammer_rel_inode(nip, 0);
1524 		if (error == 0) {
1525 			cache_setunresolved(ap->a_nch);
1526 			cache_setvp(ap->a_nch, *ap->a_vpp);
1527 		}
1528 	}
1529 	hammer_done_transaction(&trans);
1530 	if (error == 0)
1531 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1532 	lwkt_reltoken(&hmp->fs_token);
1533 	return (error);
1534 }
1535 
1536 /*
1537  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1538  *
1539  * The operating system has already ensured that the directory entry
1540  * does not exist and done all appropriate namespace locking.
1541  */
1542 static
1543 int
1544 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1545 {
1546 	struct hammer_transaction trans;
1547 	struct hammer_inode *dip;
1548 	struct hammer_inode *nip;
1549 	struct nchandle *nch;
1550 	hammer_mount_t hmp;
1551 	int error;
1552 
1553 	nch = ap->a_nch;
1554 	dip = VTOI(ap->a_dvp);
1555 	hmp = dip->hmp;
1556 
1557 	if (dip->flags & HAMMER_INODE_RO)
1558 		return (EROFS);
1559 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1560 		return (error);
1561 
1562 	/*
1563 	 * Create a transaction to cover the operations we perform.
1564 	 */
1565 	lwkt_gettoken(&hmp->fs_token);
1566 	hammer_start_transaction(&trans, hmp);
1567 	++hammer_stats_file_iopsw;
1568 
1569 	/*
1570 	 * Create a new filesystem object of the requested type.  The
1571 	 * returned inode will be referenced but not locked.
1572 	 *
1573 	 * If mknod specifies a directory a pseudo-fs is created.
1574 	 */
1575 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1576 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1577 				    NULL, &nip);
1578 	if (error) {
1579 		hammer_done_transaction(&trans);
1580 		*ap->a_vpp = NULL;
1581 		lwkt_reltoken(&hmp->fs_token);
1582 		return (error);
1583 	}
1584 
1585 	/*
1586 	 * Add the new filesystem object to the directory.  This will also
1587 	 * bump the inode's link count.
1588 	 */
1589 	error = hammer_ip_add_directory(&trans, dip,
1590 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1591 					nip);
1592 
1593 	/*
1594 	 * Finish up.
1595 	 */
1596 	if (error) {
1597 		hammer_rel_inode(nip, 0);
1598 		*ap->a_vpp = NULL;
1599 	} else {
1600 		error = hammer_get_vnode(nip, ap->a_vpp);
1601 		hammer_rel_inode(nip, 0);
1602 		if (error == 0) {
1603 			cache_setunresolved(ap->a_nch);
1604 			cache_setvp(ap->a_nch, *ap->a_vpp);
1605 		}
1606 	}
1607 	hammer_done_transaction(&trans);
1608 	if (error == 0)
1609 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1610 	lwkt_reltoken(&hmp->fs_token);
1611 	return (error);
1612 }
1613 
1614 /*
1615  * hammer_vop_open { vp, mode, cred, fp }
1616  *
1617  * MPSAFE (does not require fs_token)
1618  */
1619 static
1620 int
1621 hammer_vop_open(struct vop_open_args *ap)
1622 {
1623 	hammer_inode_t ip;
1624 
1625 	++hammer_stats_file_iopsr;
1626 	ip = VTOI(ap->a_vp);
1627 
1628 	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1629 		return (EROFS);
1630 	return(vop_stdopen(ap));
1631 }
1632 
1633 /*
1634  * hammer_vop_print { vp }
1635  */
1636 static
1637 int
1638 hammer_vop_print(struct vop_print_args *ap)
1639 {
1640 	return EOPNOTSUPP;
1641 }
1642 
1643 /*
1644  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1645  */
1646 static
1647 int
1648 hammer_vop_readdir(struct vop_readdir_args *ap)
1649 {
1650 	struct hammer_transaction trans;
1651 	struct hammer_cursor cursor;
1652 	struct hammer_inode *ip;
1653 	hammer_mount_t hmp;
1654 	struct uio *uio;
1655 	hammer_base_elm_t base;
1656 	int error;
1657 	int cookie_index;
1658 	int ncookies;
1659 	off_t *cookies;
1660 	off_t saveoff;
1661 	int r;
1662 	int dtype;
1663 
1664 	++hammer_stats_file_iopsr;
1665 	ip = VTOI(ap->a_vp);
1666 	uio = ap->a_uio;
1667 	saveoff = uio->uio_offset;
1668 	hmp = ip->hmp;
1669 
1670 	if (ap->a_ncookies) {
1671 		ncookies = uio->uio_resid / 16 + 1;
1672 		if (ncookies > 1024)
1673 			ncookies = 1024;
1674 		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1675 		cookie_index = 0;
1676 	} else {
1677 		ncookies = -1;
1678 		cookies = NULL;
1679 		cookie_index = 0;
1680 	}
1681 
1682 	lwkt_gettoken(&hmp->fs_token);
1683 	hammer_simple_transaction(&trans, hmp);
1684 
1685 	/*
1686 	 * Handle artificial entries
1687 	 *
1688 	 * It should be noted that the minimum value for a directory
1689 	 * hash key on-media is 0x0000000100000000, so we can use anything
1690 	 * less than that to represent our 'special' key space.
1691 	 */
1692 	error = 0;
1693 	if (saveoff == 0) {
1694 		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1695 		if (r)
1696 			goto done;
1697 		if (cookies)
1698 			cookies[cookie_index] = saveoff;
1699 		++saveoff;
1700 		++cookie_index;
1701 		if (cookie_index == ncookies)
1702 			goto done;
1703 	}
1704 	if (saveoff == 1) {
1705 		if (ip->ino_data.parent_obj_id) {
1706 			r = vop_write_dirent(&error, uio,
1707 					     ip->ino_data.parent_obj_id,
1708 					     DT_DIR, 2, "..");
1709 		} else {
1710 			r = vop_write_dirent(&error, uio,
1711 					     ip->obj_id, DT_DIR, 2, "..");
1712 		}
1713 		if (r)
1714 			goto done;
1715 		if (cookies)
1716 			cookies[cookie_index] = saveoff;
1717 		++saveoff;
1718 		++cookie_index;
1719 		if (cookie_index == ncookies)
1720 			goto done;
1721 	}
1722 
1723 	/*
1724 	 * Key range (begin and end inclusive) to scan.  Directory keys
1725 	 * directly translate to a 64 bit 'seek' position.
1726 	 */
1727 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1728 	cursor.key_beg.localization = ip->obj_localization +
1729 				      hammer_dir_localization(ip);
1730 	cursor.key_beg.obj_id = ip->obj_id;
1731 	cursor.key_beg.create_tid = 0;
1732 	cursor.key_beg.delete_tid = 0;
1733 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1734 	cursor.key_beg.obj_type = 0;
1735 	cursor.key_beg.key = saveoff;
1736 
1737 	cursor.key_end = cursor.key_beg;
1738 	cursor.key_end.key = HAMMER_MAX_KEY;
1739 	cursor.asof = ip->obj_asof;
1740 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1741 
1742 	error = hammer_ip_first(&cursor);
1743 
1744 	while (error == 0) {
1745 		error = hammer_ip_resolve_data(&cursor);
1746 		if (error)
1747 			break;
1748 		base = &cursor.leaf->base;
1749 		saveoff = base->key;
1750 		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1751 
1752 		if (base->obj_id != ip->obj_id)
1753 			panic("readdir: bad record at %p", cursor.node);
1754 
1755 		/*
1756 		 * Convert pseudo-filesystems into softlinks
1757 		 */
1758 		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1759 		r = vop_write_dirent(
1760 			     &error, uio, cursor.data->entry.obj_id,
1761 			     dtype,
1762 			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1763 			     (void *)cursor.data->entry.name);
1764 		if (r)
1765 			break;
1766 		++saveoff;
1767 		if (cookies)
1768 			cookies[cookie_index] = base->key;
1769 		++cookie_index;
1770 		if (cookie_index == ncookies)
1771 			break;
1772 		error = hammer_ip_next(&cursor);
1773 	}
1774 	hammer_done_cursor(&cursor);
1775 
1776 done:
1777 	hammer_done_transaction(&trans);
1778 
1779 	if (ap->a_eofflag)
1780 		*ap->a_eofflag = (error == ENOENT);
1781 	uio->uio_offset = saveoff;
1782 	if (error && cookie_index == 0) {
1783 		if (error == ENOENT)
1784 			error = 0;
1785 		if (cookies) {
1786 			kfree(cookies, M_TEMP);
1787 			*ap->a_ncookies = 0;
1788 			*ap->a_cookies = NULL;
1789 		}
1790 	} else {
1791 		if (error == ENOENT)
1792 			error = 0;
1793 		if (cookies) {
1794 			*ap->a_ncookies = cookie_index;
1795 			*ap->a_cookies = cookies;
1796 		}
1797 	}
1798 	lwkt_reltoken(&hmp->fs_token);
1799 	return(error);
1800 }
1801 
1802 /*
1803  * hammer_vop_readlink { vp, uio, cred }
1804  */
1805 static
1806 int
1807 hammer_vop_readlink(struct vop_readlink_args *ap)
1808 {
1809 	struct hammer_transaction trans;
1810 	struct hammer_cursor cursor;
1811 	struct hammer_inode *ip;
1812 	hammer_mount_t hmp;
1813 	char buf[32];
1814 	u_int32_t localization;
1815 	hammer_pseudofs_inmem_t pfsm;
1816 	int error;
1817 
1818 	ip = VTOI(ap->a_vp);
1819 	hmp = ip->hmp;
1820 
1821 	lwkt_gettoken(&hmp->fs_token);
1822 
1823 	/*
1824 	 * Shortcut if the symlink data was stuffed into ino_data.
1825 	 *
1826 	 * Also expand special "@@PFS%05d" softlinks (expansion only
1827 	 * occurs for non-historical (current) accesses made from the
1828 	 * primary filesystem).
1829 	 */
1830 	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1831 		char *ptr;
1832 		int bytes;
1833 
1834 		ptr = ip->ino_data.ext.symlink;
1835 		bytes = (int)ip->ino_data.size;
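		/*
		 * A stuffed "@@PFS%05d" softlink is exactly 10 bytes:
		 * the 5-byte "@@PFS" prefix plus a 5-digit PFS id,
		 * hence the length test below.
		 */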
1836 		if (bytes == 10 &&
1837 		    ip->obj_asof == HAMMER_MAX_TID &&
1838 		    ip->obj_localization == 0 &&
1839 		    strncmp(ptr, "@@PFS", 5) == 0) {
1840 			hammer_simple_transaction(&trans, hmp);
1841 			bcopy(ptr + 5, buf, 5);
1842 			buf[5] = 0;
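			/*
			 * Parse the 5 decimal digits following "@@PFS";
			 * the PFS id occupies the upper 16 bits of the
			 * localization field.
			 */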
1843 			localization = strtoul(buf, NULL, 10) << 16;
1844 			pfsm = hammer_load_pseudofs(&trans, localization,
1845 						    &error);
1846 			if (error == 0) {
1847 				if (pfsm->pfsd.mirror_flags &
1848 				    HAMMER_PFSD_SLAVE) {
1849 					/* vap->va_size == 26 */
1850 					ksnprintf(buf, sizeof(buf),
1851 						  "@@0x%016llx:%05d",
1852 						  (long long)pfsm->pfsd.sync_end_tid,
1853 						  localization >> 16);
1854 				} else {
1855 					/* vap->va_size == 10 */
1856 					ksnprintf(buf, sizeof(buf),
1857 						  "@@-1:%05d",
1858 						  localization >> 16);
1859 #if 0
1860 					ksnprintf(buf, sizeof(buf),
1861 						  "@@0x%016llx:%05d",
1862 						  (long long)HAMMER_MAX_TID,
1863 						  localization >> 16);
1864 #endif
1865 				}
1866 				ptr = buf;
1867 				bytes = strlen(buf);
1868 			}
1869 			if (pfsm)
1870 				hammer_rel_pseudofs(hmp, pfsm);
1871 			hammer_done_transaction(&trans);
1872 		}
1873 		error = uiomove(ptr, bytes, ap->a_uio);
1874 		lwkt_reltoken(&hmp->fs_token);
1875 		return(error);
1876 	}
1877 
1878 	/*
1879 	 * Long version
1880 	 */
1881 	hammer_simple_transaction(&trans, hmp);
1882 	++hammer_stats_file_iopsr;
1883 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1884 
1885 	/*
1886 	 * Lookup the FIX record containing the symlink data.  The key
1887 	 * is a fixed, well-known value (HAMMER_FIXKEY_SYMLINK).
1888 	 */
1889 	cursor.key_beg.localization = ip->obj_localization +
1890 				      HAMMER_LOCALIZE_MISC;
1891 	cursor.key_beg.obj_id = ip->obj_id;
1892 	cursor.key_beg.create_tid = 0;
1893 	cursor.key_beg.delete_tid = 0;
1894 	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1895 	cursor.key_beg.obj_type = 0;
1896 	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1897 	cursor.asof = ip->obj_asof;
1898 	cursor.flags |= HAMMER_CURSOR_ASOF;
1899 
1900 	error = hammer_ip_lookup(&cursor);
1901 	if (error == 0) {
1902 		error = hammer_ip_resolve_data(&cursor);
1903 		if (error == 0) {
1904 			KKASSERT(cursor.leaf->data_len >=
1905 				 HAMMER_SYMLINK_NAME_OFF);
1906 			error = uiomove(cursor.data->symlink.name,
1907 					cursor.leaf->data_len -
1908 						HAMMER_SYMLINK_NAME_OFF,
1909 					ap->a_uio);
1910 		}
1911 	}
1912 	hammer_done_cursor(&cursor);
1913 	hammer_done_transaction(&trans);
1914 	lwkt_reltoken(&hmp->fs_token);
1915 	return(error);
1916 }
1917 
1918 /*
1919  * hammer_vop_nremove { nch, dvp, cred }
1920  */
1921 static
1922 int
1923 hammer_vop_nremove(struct vop_nremove_args *ap)
1924 {
1925 	struct hammer_transaction trans;
1926 	struct hammer_inode *dip;
1927 	hammer_mount_t hmp;
1928 	int error;
1929 
1930 	dip = VTOI(ap->a_dvp);
1931 	hmp = dip->hmp;
1932 
1933 	if (hammer_nohistory(dip) == 0 &&
1934 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1935 		return (error);
1936 	}
1937 
1938 	lwkt_gettoken(&hmp->fs_token);
1939 	hammer_start_transaction(&trans, hmp);
1940 	++hammer_stats_file_iopsw;
1941 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1942 	hammer_done_transaction(&trans);
1943 	if (error == 0)
1944 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1945 	lwkt_reltoken(&hmp->fs_token);
1946 	return (error);
1947 }
1948 
1949 /*
1950  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1951  */
1952 static
1953 int
1954 hammer_vop_nrename(struct vop_nrename_args *ap)
1955 {
1956 	struct hammer_transaction trans;
1957 	struct namecache *fncp;
1958 	struct namecache *tncp;
1959 	struct hammer_inode *fdip;
1960 	struct hammer_inode *tdip;
1961 	struct hammer_inode *ip;
1962 	hammer_mount_t hmp;
1963 	struct hammer_cursor cursor;
1964 	int64_t namekey;
1965 	u_int32_t max_iterations;
1966 	int nlen, error;
1967 
1968 	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1969 		return(EXDEV);
1970 	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1971 		return(EXDEV);
1972 
1973 	fdip = VTOI(ap->a_fdvp);
1974 	tdip = VTOI(ap->a_tdvp);
1975 	fncp = ap->a_fnch->ncp;
1976 	tncp = ap->a_tnch->ncp;
1977 	ip = VTOI(fncp->nc_vp);
1978 	KKASSERT(ip != NULL);
1979 
1980 	hmp = ip->hmp;
1981 
1982 	if (fdip->obj_localization != tdip->obj_localization)
1983 		return(EXDEV);
1984 	if (fdip->obj_localization != ip->obj_localization)
1985 		return(EXDEV);
1986 
1987 	if (fdip->flags & HAMMER_INODE_RO)
1988 		return (EROFS);
1989 	if (tdip->flags & HAMMER_INODE_RO)
1990 		return (EROFS);
1991 	if (ip->flags & HAMMER_INODE_RO)
1992 		return (EROFS);
1993 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1994 		return (error);
1995 
1996 	lwkt_gettoken(&hmp->fs_token);
1997 	hammer_start_transaction(&trans, hmp);
1998 	++hammer_stats_file_iopsw;
1999 
2000 	/*
2001 	 * Remove tncp from the target directory and then link ip as
2002 	 * tncp. XXX pass trans to dounlink
2003 	 *
2004 	 * Force the inode sync-time to match the transaction so it is
2005 	 * in-sync with the creation of the target directory entry.
2006 	 */
2007 	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
2008 				ap->a_cred, 0, -1);
2009 	if (error == 0 || error == ENOENT) {
2010 		error = hammer_ip_add_directory(&trans, tdip,
2011 						tncp->nc_name, tncp->nc_nlen,
2012 						ip);
2013 		if (error == 0) {
2014 			ip->ino_data.parent_obj_id = tdip->obj_id;
2015 			ip->ino_data.ctime = trans.time;
2016 			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
2017 		}
2018 	}
2019 	if (error)
2020 		goto failed; /* XXX */
2021 
2022 	/*
2023 	 * Locate the record in the originating directory and remove it.
2024 	 *
2025 	 * Calculate the namekey and setup the key range for the scan.  This
2026 	 * works kinda like a chained hash table where the lower 32 bits
2027 	 * of the namekey synthesize the chain.
2028 	 *
2029 	 * The key range is inclusive of both key_beg and key_end.
2030 	 */
2031 	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2032 					   &max_iterations);
2033 retry:
2034 	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2035 	cursor.key_beg.localization = fdip->obj_localization +
2036 				      hammer_dir_localization(fdip);
2037 	cursor.key_beg.obj_id = fdip->obj_id;
2038 	cursor.key_beg.key = namekey;
2039 	cursor.key_beg.create_tid = 0;
2040 	cursor.key_beg.delete_tid = 0;
2041 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2042 	cursor.key_beg.obj_type = 0;
2043 
2044 	cursor.key_end = cursor.key_beg;
2045 	cursor.key_end.key += max_iterations;
2046 	cursor.asof = fdip->obj_asof;
2047 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
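	/*
	 * The resulting inclusive range [namekey, namekey+max_iterations]
	 * spans the entire collision chain for this filename's hash.
	 */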
2048 
2049 	/*
2050 	 * Scan all matching records (the chain), locate the one matching
2051 	 * the requested path component.
2052 	 *
2053 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2054 	 * records for the purposes of the search.
2055 	 */
2056 	error = hammer_ip_first(&cursor);
2057 	while (error == 0) {
2058 		if (hammer_ip_resolve_data(&cursor) != 0)
2059 			break;
2060 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2061 		KKASSERT(nlen > 0);
2062 		if (fncp->nc_nlen == nlen &&
2063 		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2064 			break;
2065 		}
2066 		error = hammer_ip_next(&cursor);
2067 	}
2068 
2069 	/*
2070 	 * If all is ok we have to get the inode so we can adjust nlinks.
2071 	 *
2072 	 * WARNING: hammer_ip_del_directory() may have to terminate the
2073 	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2074 	 * twice.
2075 	 */
2076 	if (error == 0)
2077 		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2078 
2079 	/*
2080 	 * XXX A deadlock here will break rename's atomicity for the purposes
2081 	 * of crash recovery.
2082 	 */
2083 	if (error == EDEADLK) {
2084 		hammer_done_cursor(&cursor);
2085 		goto retry;
2086 	}
2087 
2088 	/*
2089 	 * Cleanup and tell the kernel that the rename succeeded.
2090 	 *
2091 	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2092 	 *	 without formally acquiring the vp since the vp might
2093 	 *	 have zero refs on it, or in the middle of a reclaim,
2094 	 *	 etc.
2095 	 */
2096 	hammer_done_cursor(&cursor);
2097 	if (error == 0) {
2098 		cache_rename(ap->a_fnch, ap->a_tnch);
2099 		hammer_knote(ap->a_fdvp, NOTE_WRITE);
2100 		hammer_knote(ap->a_tdvp, NOTE_WRITE);
2101 		while (ip->vp) {
2102 			struct vnode *vp;
2103 
2104 			error = hammer_get_vnode(ip, &vp);
2105 			if (error == 0 && vp) {
2106 				vn_unlock(vp);
2107 				hammer_knote(ip->vp, NOTE_RENAME);
2108 				vrele(vp);
2109 				break;
2110 			}
2111 			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2112 		}
2113 	}
2114 
2115 failed:
2116 	hammer_done_transaction(&trans);
2117 	lwkt_reltoken(&hmp->fs_token);
2118 	return (error);
2119 }
2120 
2121 /*
2122  * hammer_vop_nrmdir { nch, dvp, cred }
2123  */
2124 static
2125 int
2126 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2127 {
2128 	struct hammer_transaction trans;
2129 	struct hammer_inode *dip;
2130 	hammer_mount_t hmp;
2131 	int error;
2132 
2133 	dip = VTOI(ap->a_dvp);
2134 	hmp = dip->hmp;
2135 
2136 	if (hammer_nohistory(dip) == 0 &&
2137 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2138 		return (error);
2139 	}
2140 
2141 	lwkt_gettoken(&hmp->fs_token);
2142 	hammer_start_transaction(&trans, hmp);
2143 	++hammer_stats_file_iopsw;
2144 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2145 	hammer_done_transaction(&trans);
2146 	if (error == 0)
2147 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2148 	lwkt_reltoken(&hmp->fs_token);
2149 	return (error);
2150 }
2151 
2152 /*
2153  * hammer_vop_markatime { vp, cred }
2154  */
2155 static
2156 int
2157 hammer_vop_markatime(struct vop_markatime_args *ap)
2158 {
2159 	struct hammer_transaction trans;
2160 	struct hammer_inode *ip;
2161 	hammer_mount_t hmp;
2162 
2163 	ip = VTOI(ap->a_vp);
2164 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2165 		return (EROFS);
2166 	if (ip->flags & HAMMER_INODE_RO)
2167 		return (EROFS);
2168 	hmp = ip->hmp;
2169 	if (hmp->mp->mnt_flag & MNT_NOATIME)
2170 		return (0);
2171 	lwkt_gettoken(&hmp->fs_token);
2172 	hammer_start_transaction(&trans, hmp);
2173 	++hammer_stats_file_iopsw;
2174 
2175 	ip->ino_data.atime = trans.time;
2176 	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2177 	hammer_done_transaction(&trans);
2178 	hammer_knote(ap->a_vp, NOTE_ATTRIB);
2179 	lwkt_reltoken(&hmp->fs_token);
2180 	return (0);
2181 }
2182 
2183 /*
2184  * hammer_vop_setattr { vp, vap, cred }
2185  */
2186 static
2187 int
2188 hammer_vop_setattr(struct vop_setattr_args *ap)
2189 {
2190 	struct hammer_transaction trans;
2191 	struct hammer_inode *ip;
2192 	struct vattr *vap;
2193 	hammer_mount_t hmp;
2194 	int modflags;
2195 	int error;
2196 	int truncating;
2197 	int blksize;
2198 	int kflags;
2199 #if 0
2200 	int64_t aligned_size;
2201 #endif
2202 	u_int32_t flags;
2203 
2204 	vap = ap->a_vap;
2205 	ip = ap->a_vp->v_data;
2206 	modflags = 0;
2207 	kflags = 0;
2208 	hmp = ip->hmp;
2209 
2210 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2211 		return(EROFS);
2212 	if (ip->flags & HAMMER_INODE_RO)
2213 		return (EROFS);
2214 	if (hammer_nohistory(ip) == 0 &&
2215 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2216 		return (error);
2217 	}
2218 
2219 	lwkt_gettoken(&hmp->fs_token);
2220 	hammer_start_transaction(&trans, hmp);
2221 	++hammer_stats_file_iopsw;
2222 	error = 0;
2223 
2224 	if (vap->va_flags != VNOVAL) {
2225 		flags = ip->ino_data.uflags;
2226 		error = vop_helper_setattr_flags(&flags, vap->va_flags,
2227 					 hammer_to_unix_xid(&ip->ino_data.uid),
2228 					 ap->a_cred);
2229 		if (error == 0) {
2230 			if (ip->ino_data.uflags != flags) {
2231 				ip->ino_data.uflags = flags;
2232 				ip->ino_data.ctime = trans.time;
2233 				modflags |= HAMMER_INODE_DDIRTY;
2234 				kflags |= NOTE_ATTRIB;
2235 			}
2236 			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2237 				error = 0;
2238 				goto done;
2239 			}
2240 		}
2241 		goto done;
2242 	}
2243 	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2244 		error = EPERM;
2245 		goto done;
2246 	}
2247 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2248 		mode_t cur_mode = ip->ino_data.mode;
2249 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2250 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2251 		uuid_t uuid_uid;
2252 		uuid_t uuid_gid;
2253 
2254 		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2255 					 ap->a_cred,
2256 					 &cur_uid, &cur_gid, &cur_mode);
2257 		if (error == 0) {
2258 			hammer_guid_to_uuid(&uuid_uid, cur_uid);
2259 			hammer_guid_to_uuid(&uuid_gid, cur_gid);
2260 			if (bcmp(&uuid_uid, &ip->ino_data.uid,
2261 				 sizeof(uuid_uid)) ||
2262 			    bcmp(&uuid_gid, &ip->ino_data.gid,
2263 				 sizeof(uuid_gid)) ||
2264 			    ip->ino_data.mode != cur_mode
2265 			) {
2266 				ip->ino_data.uid = uuid_uid;
2267 				ip->ino_data.gid = uuid_gid;
2268 				ip->ino_data.mode = cur_mode;
2269 				ip->ino_data.ctime = trans.time;
2270 				modflags |= HAMMER_INODE_DDIRTY;
2271 			}
2272 			kflags |= NOTE_ATTRIB;
2273 		}
2274 	}
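	/*
	 * Resize the file if requested.  The while() below executes
	 * at most once; it is structured as a loop so the individual
	 * cases can use break to bail out early.
	 */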
2275 	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2276 		switch(ap->a_vp->v_type) {
2277 		case VREG:
2278 			if (vap->va_size == ip->ino_data.size)
2279 				break;
2280 
2281 			/*
2282 			 * Log the operation if in fast-fsync mode or if
2283 			 * there are unterminated redo write records present.
2284 			 *
2285 			 * The second check is needed so the recovery code
2286 			 * properly truncates write redos even if nominal
2287 			 * REDO operation has been turned off due to excessive
2288 			 * writes, because the related records might be
2289 			 * destroyed and never lay down a TERM_WRITE.
2290 			 */
2291 			if ((ip->flags & HAMMER_INODE_REDO) ||
2292 			    (ip->flags & HAMMER_INODE_RDIRTY)) {
2293 				error = hammer_generate_redo(&trans, ip,
2294 							     vap->va_size,
2295 							     HAMMER_REDO_TRUNC,
2296 							     NULL, 0);
2297 			}
2298 			blksize = hammer_blocksize(vap->va_size);
2299 
2300 			/*
2301 			 * XXX break atomicity, we can deadlock the backend
2302 			 * if we do not release the lock.  Probably not a
2303 			 * big deal here.
2304 			 */
2305 			if (vap->va_size < ip->ino_data.size) {
2306 				nvtruncbuf(ap->a_vp, vap->va_size,
2307 					   blksize,
2308 					   hammer_blockoff(vap->va_size));
2309 				truncating = 1;
2310 				kflags |= NOTE_WRITE;
2311 			} else {
2312 				nvextendbuf(ap->a_vp,
2313 					    ip->ino_data.size,
2314 					    vap->va_size,
2315 					    hammer_blocksize(ip->ino_data.size),
2316 					    hammer_blocksize(vap->va_size),
2317 					    hammer_blockoff(ip->ino_data.size),
2318 					    hammer_blockoff(vap->va_size),
2319 					    0);
2320 				truncating = 0;
2321 				kflags |= NOTE_WRITE | NOTE_EXTEND;
2322 			}
2323 			ip->ino_data.size = vap->va_size;
2324 			ip->ino_data.mtime = trans.time;
2325 			/* XXX safe to use SDIRTY instead of DDIRTY here? */
2326 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2327 
2328 			/*
2329 			 * On-media truncation is cached in the inode until
2330 			 * the inode is synchronized.  We must immediately
2331 			 * handle any frontend records.
2332 			 */
2333 			if (truncating) {
2334 				hammer_ip_frontend_trunc(ip, vap->va_size);
2335 #ifdef DEBUG_TRUNCATE
2336 				if (HammerTruncIp == NULL)
2337 					HammerTruncIp = ip;
2338 #endif
2339 				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2340 					ip->flags |= HAMMER_INODE_TRUNCATED;
2341 					ip->trunc_off = vap->va_size;
2342 #ifdef DEBUG_TRUNCATE
2343 					if (ip == HammerTruncIp)
2344 					kprintf("truncate1 %016llx\n",
2345 						(long long)ip->trunc_off);
2346 #endif
2347 				} else if (ip->trunc_off > vap->va_size) {
2348 					ip->trunc_off = vap->va_size;
2349 #ifdef DEBUG_TRUNCATE
2350 					if (ip == HammerTruncIp)
2351 					kprintf("truncate2 %016llx\n",
2352 						(long long)ip->trunc_off);
2353 #endif
2354 				} else {
2355 #ifdef DEBUG_TRUNCATE
2356 					if (ip == HammerTruncIp)
2357 					kprintf("truncate3 %016llx (ignored)\n",
2358 						(long long)vap->va_size);
2359 #endif
2360 				}
2361 			}
2362 
2363 #if 0
2364 			/*
2365 			 * When truncating, nvtruncbuf() may have cleaned out
2366 			 * a portion of the last block on-disk in the buffer
2367 			 * cache.  We must clean out any frontend records
2368 			 * for blocks beyond the new last block.
2369 			 */
2370 			aligned_size = (vap->va_size + (blksize - 1)) &
2371 				       ~(int64_t)(blksize - 1);
2372 			if (truncating && vap->va_size < aligned_size) {
2373 				aligned_size -= blksize;
2374 				hammer_ip_frontend_trunc(ip, aligned_size);
2375 			}
2376 #endif
2377 			break;
2378 		case VDATABASE:
2379 			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2380 				ip->flags |= HAMMER_INODE_TRUNCATED;
2381 				ip->trunc_off = vap->va_size;
2382 			} else if (ip->trunc_off > vap->va_size) {
2383 				ip->trunc_off = vap->va_size;
2384 			}
2385 			hammer_ip_frontend_trunc(ip, vap->va_size);
2386 			ip->ino_data.size = vap->va_size;
2387 			ip->ino_data.mtime = trans.time;
2388 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2389 			kflags |= NOTE_ATTRIB;
2390 			break;
2391 		default:
2392 			error = EINVAL;
2393 			goto done;
2394 		}
2395 		break;
2396 	}
2397 	if (vap->va_atime.tv_sec != VNOVAL) {
2398 		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2399 		modflags |= HAMMER_INODE_ATIME;
2400 		kflags |= NOTE_ATTRIB;
2401 	}
2402 	if (vap->va_mtime.tv_sec != VNOVAL) {
2403 		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2404 		modflags |= HAMMER_INODE_MTIME;
2405 		kflags |= NOTE_ATTRIB;
2406 	}
2407 	if (vap->va_mode != (mode_t)VNOVAL) {
2408 		mode_t   cur_mode = ip->ino_data.mode;
2409 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2410 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2411 
2412 		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2413 					 cur_uid, cur_gid, &cur_mode);
2414 		if (error == 0 && ip->ino_data.mode != cur_mode) {
2415 			ip->ino_data.mode = cur_mode;
2416 			ip->ino_data.ctime = trans.time;
2417 			modflags |= HAMMER_INODE_DDIRTY;
2418 			kflags |= NOTE_ATTRIB;
2419 		}
2420 	}
2421 done:
2422 	if (error == 0)
2423 		hammer_modify_inode(&trans, ip, modflags);
2424 	hammer_done_transaction(&trans);
2425 	hammer_knote(ap->a_vp, kflags);
2426 	lwkt_reltoken(&hmp->fs_token);
2427 	return (error);
2428 }
2429 
2430 /*
2431  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2432  */
2433 static
2434 int
2435 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2436 {
2437 	struct hammer_transaction trans;
2438 	struct hammer_inode *dip;
2439 	struct hammer_inode *nip;
2440 	hammer_record_t record;
2441 	struct nchandle *nch;
2442 	hammer_mount_t hmp;
2443 	int error;
2444 	int bytes;
2445 
2446 	ap->a_vap->va_type = VLNK;
2447 
2448 	nch = ap->a_nch;
2449 	dip = VTOI(ap->a_dvp);
2450 	hmp = dip->hmp;
2451 
2452 	if (dip->flags & HAMMER_INODE_RO)
2453 		return (EROFS);
2454 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2455 		return (error);
2456 
2457 	/*
2458 	 * Create a transaction to cover the operations we perform.
2459 	 */
2460 	lwkt_gettoken(&hmp->fs_token);
2461 	hammer_start_transaction(&trans, hmp);
2462 	++hammer_stats_file_iopsw;
2463 
2464 	/*
2465 	 * Create a new filesystem object of the requested type.  The
2466 	 * returned inode will be referenced but not locked.
2467 	 */
2468 
2469 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2470 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2471 				    NULL, &nip);
2472 	if (error) {
2473 		hammer_done_transaction(&trans);
2474 		*ap->a_vpp = NULL;
2475 		lwkt_reltoken(&hmp->fs_token);
2476 		return (error);
2477 	}
2478 
2479 	/*
2480 	 * Add a record representing the symlink.  symlink stores the link
2481 	 * as pure data, not a string, and is not \0-terminated.
2482 	 */
2483 	if (error == 0) {
2484 		bytes = strlen(ap->a_target);
2485 
2486 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
2487 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2488 		} else {
2489 			record = hammer_alloc_mem_record(nip, bytes);
2490 			record->type = HAMMER_MEM_RECORD_GENERAL;
2491 
2492 			record->leaf.base.localization = nip->obj_localization +
2493 							 HAMMER_LOCALIZE_MISC;
2494 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2495 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2496 			record->leaf.data_len = bytes;
2497 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2498 			bcopy(ap->a_target, record->data->symlink.name, bytes);
2499 			error = hammer_ip_add_record(&trans, record);
2500 		}
2501 
2502 		/*
2503 		 * Set the file size to the length of the link.
2504 		 */
2505 		if (error == 0) {
2506 			nip->ino_data.size = bytes;
2507 			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2508 		}
2509 	}
2510 	if (error == 0)
2511 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2512 						nch->ncp->nc_nlen, nip);
2513 
2514 	/*
2515 	 * Finish up.
2516 	 */
2517 	if (error) {
2518 		hammer_rel_inode(nip, 0);
2519 		*ap->a_vpp = NULL;
2520 	} else {
2521 		error = hammer_get_vnode(nip, ap->a_vpp);
2522 		hammer_rel_inode(nip, 0);
2523 		if (error == 0) {
2524 			cache_setunresolved(ap->a_nch);
2525 			cache_setvp(ap->a_nch, *ap->a_vpp);
2526 			hammer_knote(ap->a_dvp, NOTE_WRITE);
2527 		}
2528 	}
2529 	hammer_done_transaction(&trans);
2530 	lwkt_reltoken(&hmp->fs_token);
2531 	return (error);
2532 }
2533 
2534 /*
2535  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2536  */
2537 static
2538 int
2539 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2540 {
2541 	struct hammer_transaction trans;
2542 	struct hammer_inode *dip;
2543 	hammer_mount_t hmp;
2544 	int error;
2545 
2546 	dip = VTOI(ap->a_dvp);
2547 	hmp = dip->hmp;
2548 
2549 	if (hammer_nohistory(dip) == 0 &&
2550 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2551 		return (error);
2552 	}
2553 
2554 	lwkt_gettoken(&hmp->fs_token);
2555 	hammer_start_transaction(&trans, hmp);
2556 	++hammer_stats_file_iopsw;
2557 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2558 				ap->a_cred, ap->a_flags, -1);
2559 	hammer_done_transaction(&trans);
2560 	lwkt_reltoken(&hmp->fs_token);
2561 
2562 	return (error);
2563 }
2564 
2565 /*
2566  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2567  */
2568 static
2569 int
2570 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2571 {
2572 	struct hammer_inode *ip = ap->a_vp->v_data;
2573 	hammer_mount_t hmp = ip->hmp;
2574 	int error;
2575 
2576 	++hammer_stats_file_iopsr;
2577 	lwkt_gettoken(&hmp->fs_token);
2578 	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2579 			     ap->a_fflag, ap->a_cred);
2580 	lwkt_reltoken(&hmp->fs_token);
2581 	return (error);
2582 }
2583 
2584 static
2585 int
2586 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2587 {
2588 	static const struct mountctl_opt extraopt[] = {
2589 		{ HMNT_NOHISTORY,	"nohistory" },
2590 		{ HMNT_MASTERID,	"master" },
2591 		{ 0, NULL }
2593 	};
2594 	struct hammer_mount *hmp;
2595 	struct mount *mp;
2596 	int usedbytes;
2597 	int error;
2598 
2599 	error = 0;
2600 	usedbytes = 0;
2601 	mp = ap->a_head.a_ops->head.vv_mount;
2602 	KKASSERT(mp->mnt_data != NULL);
2603 	hmp = (struct hammer_mount *)mp->mnt_data;
2604 
2605 	lwkt_gettoken(&hmp->fs_token);
2606 
2607 	switch(ap->a_op) {
2608 	case MOUNTCTL_SET_EXPORT:
2609 		if (ap->a_ctllen != sizeof(struct export_args))
2610 			error = EINVAL;
2611 		else
2612 			error = hammer_vfs_export(mp, ap->a_op,
2613 				      (const struct export_args *)ap->a_ctl);
2614 		break;
2615 	case MOUNTCTL_MOUNTFLAGS:
2616 	{
2617 		/*
2618 		 * Call standard mountctl VOP function
2619 		 * so we get user mount flags.
2620 		 */
2621 		error = vop_stdmountctl(ap);
2622 		if (error)
2623 			break;
2624 
2625 		usedbytes = *ap->a_res;
2626 
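		/*
		 * Append HAMMER-specific flag strings (extraopt) to the
		 * standard options when room remains in the caller's
		 * buffer.
		 */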
2627 		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2628 			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2629 						    ap->a_buf,
2630 						    ap->a_buflen - usedbytes,
2631 						    &error);
2632 		}
2633 
2634 		*ap->a_res += usedbytes;
2635 		break;
2636 	}
2637 	default:
2638 		error = vop_stdmountctl(ap);
2639 		break;
2640 	}
2641 	lwkt_reltoken(&hmp->fs_token);
2642 	return(error);
2643 }
2644 
2645 /*
2646  * hammer_vop_strategy { vp, bio }
2647  *
2648  * Strategy call, used for regular file read & write only.  Note that the
2649  * bp may represent a cluster.
2650  *
2651  * To simplify operation and allow better optimizations in the future,
2652  * this code does not make any assumptions with regards to buffer alignment
2653  * or size.
2654  */
2655 static
2656 int
2657 hammer_vop_strategy(struct vop_strategy_args *ap)
2658 {
2659 	struct buf *bp;
2660 	int error;
2661 
2662 	bp = ap->a_bio->bio_buf;
2663 
2664 	switch(bp->b_cmd) {
2665 	case BUF_CMD_READ:
2666 		error = hammer_vop_strategy_read(ap);
2667 		break;
2668 	case BUF_CMD_WRITE:
2669 		error = hammer_vop_strategy_write(ap);
2670 		break;
2671 	default:
2672 		bp->b_error = error = EINVAL;
2673 		bp->b_flags |= B_ERROR;
2674 		biodone(ap->a_bio);
2675 		break;
2676 	}
2677 
2678 	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2679 
2680 	return (error);
2681 }
2682 
2683 /*
2684  * Read from a regular file.  Iterate the related records and fill in the
2685  * BIO/BUF.  Gaps are zero-filled.
2686  *
2687  * The support code in hammer_object.c should be used to deal with mixed
2688  * in-memory and on-disk records.
2689  *
2690  * NOTE: Can be called from the cluster code with an oversized buf.
2691  *
2692  * XXX atime update
2693  */
2694 static
2695 int
2696 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2697 {
2698 	struct hammer_transaction trans;
2699 	struct hammer_inode *ip;
2700 	struct hammer_inode *dip;
2701 	hammer_mount_t hmp;
2702 	struct hammer_cursor cursor;
2703 	hammer_base_elm_t base;
2704 	hammer_off_t disk_offset;
2705 	struct bio *bio;
2706 	struct bio *nbio;
2707 	struct buf *bp;
2708 	int64_t rec_offset;
2709 	int64_t ran_end;
2710 	int64_t tmp64;
2711 	int error;
2712 	int boff;
2713 	int roff;
2714 	int n;
2715 	int isdedupable;
2716 
2717 	bio = ap->a_bio;
2718 	bp = bio->bio_buf;
2719 	ip = ap->a_vp->v_data;
2720 	hmp = ip->hmp;
2721 
2722 	/*
2723 	 * The zone-2 disk offset may have been set by the cluster code via
2724 	 * a BMAP operation, or else should be NOOFFSET.
2725 	 *
2726 	 * Checking the high bits for a match against zone-2 should suffice.
2727 	 *
2728 	 * In cases where a lot of data duplication is present it may be
2729 	 * more beneficial to drop through and double-buffer through the
2730 	 * device.
2731 	 */
2732 	nbio = push_bio(bio);
2733 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2734 	    HAMMER_ZONE_LARGE_DATA) {
2735 		if (hammer_double_buffer == 0) {
2736 			lwkt_gettoken(&hmp->fs_token);
2737 			error = hammer_io_direct_read(hmp, nbio, NULL);
2738 			lwkt_reltoken(&hmp->fs_token);
2739 			return (error);
2740 		}
2741 
2742 		/*
2743 		 * Try to shortcut requests for double_buffer mode too.
2744 		 * Since this mode runs through the device buffer cache
2745 		 * only compatible buffer sizes (meaning those generated
2746 		 * by normal filesystem buffers) are legal.
2747 		 */
2748 		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2749 			error = hammer_io_indirect_read(hmp, nbio, NULL);
2750 			return (error);
2751 		}
2752 	}
2753 
2754 	/*
2755 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2756 	 * aligned we may still be able to issue a direct-read.
2757 	 */
2758 	lwkt_gettoken(&hmp->fs_token);
2759 	hammer_simple_transaction(&trans, hmp);
2760 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2761 
2762 	/*
2763 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2764 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2765 	 * first record containing bio_offset will have a key > bio_offset.
2766 	 */
2767 	cursor.key_beg.localization = ip->obj_localization +
2768 				      HAMMER_LOCALIZE_MISC;
2769 	cursor.key_beg.obj_id = ip->obj_id;
2770 	cursor.key_beg.create_tid = 0;
2771 	cursor.key_beg.delete_tid = 0;
2772 	cursor.key_beg.obj_type = 0;
2773 	cursor.key_beg.key = bio->bio_offset + 1;
2774 	cursor.asof = ip->obj_asof;
2775 	cursor.flags |= HAMMER_CURSOR_ASOF;
2776 
2777 	cursor.key_end = cursor.key_beg;
2778 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2779 #if 0
2780 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2781 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2782 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2783 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2784 	} else
2785 #endif
2786 	{
2787 		ran_end = bio->bio_offset + bp->b_bufsize;
2788 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2789 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
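		/*
		 * Clamp the end key if the 64-bit addition would
		 * overflow past the maximum positive key.
		 */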
2790 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2791 		if (tmp64 < ran_end)
2792 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2793 		else
2794 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2795 	}
2796 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2797 
2798 	/*
2799 	 * Set NOSWAPCACHE for cursor data extraction if double buffering
2800 	 * is disabled, or if the file is not marked cacheable via chflags
2801 	 * while vm.swapcache_use_chflags is enabled.
2802 	 */
2803 	if (hammer_double_buffer == 0 ||
2804 	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2805 	     vm_swapcache_use_chflags)) {
2806 		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2807 	}
2808 
2809 	error = hammer_ip_first(&cursor);
2810 	boff = 0;
2811 
2812 	while (error == 0) {
2813 		/*
2814 		 * Get the base file offset of the record.  The key for
2815 		 * data records is (base + bytes) rather than (base).
2816 		 */
2817 		base = &cursor.leaf->base;
2818 		rec_offset = base->key - cursor.leaf->data_len;
2819 
2820 		/*
2821 		 * Calculate the gap, if any, and zero-fill it.
2822 		 *
2823 		 * n is the offset of the start of the record versus our
2824 		 * current seek offset in the bio.
2825 		 */
2826 		n = (int)(rec_offset - (bio->bio_offset + boff));
2827 		if (n > 0) {
2828 			if (n > bp->b_bufsize - boff)
2829 				n = bp->b_bufsize - boff;
2830 			bzero((char *)bp->b_data + boff, n);
2831 			boff += n;
2832 			n = 0;
2833 		}
2834 
2835 		/*
2836 		 * Calculate the data offset in the record and the number
2837 		 * of bytes we can copy.
2838 		 *
2839 		 * There are two degenerate cases.  First, boff may already
2840 		 * be at bp->b_bufsize.  Secondly, the data offset within
2841 		 * the record may exceed the record's size.
2842 		 */
2843 		roff = -n;
2844 		rec_offset += roff;
2845 		n = cursor.leaf->data_len - roff;
2846 		if (n <= 0) {
2847 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2848 			n = 0;
2849 		} else if (n > bp->b_bufsize - boff) {
2850 			n = bp->b_bufsize - boff;
2851 		}
2852 
2853 		/*
2854 		 * Deal with cached truncations.  This cool bit of code
2855 		 * allows truncate()/ftruncate() to avoid having to sync
2856 		 * the file.
2857 		 *
2858 		 * If the frontend is truncated then all backend records are
2859 		 * subject to the frontend's truncation.
2860 		 *
2861 		 * If the backend is truncated then backend records on-disk
2862 		 * (but not in-memory) are subject to the backend's
2863 		 * truncation.  In-memory records owned by the backend
2864 		 * represent data written after the truncation point on the
2865 		 * backend and must not be truncated.
2866 		 *
2867 		 * Truncate operations deal with frontend buffer cache
2868 		 * buffers and frontend-owned in-memory records synchronously.
2869 		 */
2870 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2871 			if (hammer_cursor_ondisk(&cursor)/* ||
2872 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2873 				if (ip->trunc_off <= rec_offset)
2874 					n = 0;
2875 				else if (ip->trunc_off < rec_offset + n)
2876 					n = (int)(ip->trunc_off - rec_offset);
2877 			}
2878 		}
2879 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2880 			if (hammer_cursor_ondisk(&cursor)) {
2881 				if (ip->sync_trunc_off <= rec_offset)
2882 					n = 0;
2883 				else if (ip->sync_trunc_off < rec_offset + n)
2884 					n = (int)(ip->sync_trunc_off - rec_offset);
2885 			}
2886 		}
2887 
2888 		/*
2889 		 * Try to issue a direct read into our bio if possible,
2890 		 * otherwise resolve the element data into a hammer_buffer
2891 		 * and copy.
2892 		 *
2893 		 * The buffer on-disk should be zeroed past any real
2894 		 * truncation point, but may not be for any synthesized
2895 		 * truncation point from above.
2896 		 *
2897 		 * NOTE: disk_offset is only valid if the cursor data is
2898 		 *	 on-disk.
2899 		 */
2900 		disk_offset = cursor.leaf->data_offset + roff;
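		/*
		 * The record is dedup/direct-read capable only when it
		 * spans the entire buffer, lives on-media, and its disk
		 * offset is buffer-aligned.
		 */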
2901 		isdedupable = (boff == 0 && n == bp->b_bufsize &&
2902 			       hammer_cursor_ondisk(&cursor) &&
2903 			       ((int)disk_offset & HAMMER_BUFMASK) == 0);
2904 
2905 		if (isdedupable && hammer_double_buffer == 0) {
2906 			/*
2907 			 * Direct read case
2908 			 */
2909 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2910 				 HAMMER_ZONE_LARGE_DATA);
2911 			nbio->bio_offset = disk_offset;
2912 			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2913 			if (hammer_live_dedup && error == 0)
2914 				hammer_dedup_cache_add(ip, cursor.leaf);
2915 			goto done;
2916 		} else if (isdedupable) {
2917 			/*
2918 			 * Async I/O case for reading from backing store
2919 			 * and copying the data to the filesystem buffer.
2920 			 * live-dedup has to verify the data anyway if it
2921 			 * gets a hit later so we can just add the entry
2922 			 * now.
2923 			 */
2924 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2925 				 HAMMER_ZONE_LARGE_DATA);
2926 			nbio->bio_offset = disk_offset;
2927 			if (hammer_live_dedup)
2928 				hammer_dedup_cache_add(ip, cursor.leaf);
2929 			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2930 			goto done;
2931 		} else if (n) {
2932 			error = hammer_ip_resolve_data(&cursor);
2933 			if (error == 0) {
2934 				if (hammer_live_dedup && isdedupable)
2935 					hammer_dedup_cache_add(ip, cursor.leaf);
2936 				bcopy((char *)cursor.data + roff,
2937 				      (char *)bp->b_data + boff, n);
2938 			}
2939 		}
2940 		if (error)
2941 			break;
2942 
2943 		/*
2944 		 * We have to be sure that the only elements added to the
2945 		 * dedup cache are those which are already on-media.
2946 		 */
2947 		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2948 			hammer_dedup_cache_add(ip, cursor.leaf);
2949 
2950 		/*
2951 		 * Iterate until we have filled the request.
2952 		 */
2953 		boff += n;
2954 		if (boff == bp->b_bufsize)
2955 			break;
2956 		error = hammer_ip_next(&cursor);
2957 	}
2958 
2959 	/*
2960 	 * There may have been a gap after the last record
2961 	 */
2962 	if (error == ENOENT)
2963 		error = 0;
2964 	if (error == 0 && boff != bp->b_bufsize) {
2965 		KKASSERT(boff < bp->b_bufsize);
2966 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2967 		/* boff = bp->b_bufsize; */
2968 	}
2969 
2970 	/*
2971 	 * Disallow swapcache operation on the vnode buffer if double
2972 	 * buffering is enabled, the swapcache will get the data via
2973 	 * the block device buffer.
2974 	 */
2975 	if (hammer_double_buffer)
2976 		bp->b_flags |= B_NOTMETA;
2977 
2978 	/*
2979 	 * Cleanup
2980 	 */
2981 	bp->b_resid = 0;
2982 	bp->b_error = error;
2983 	if (error)
2984 		bp->b_flags |= B_ERROR;
2985 	biodone(ap->a_bio);
2986 
2987 done:
2988 	/*
2989 	 * Cache the b-tree node for the last data read in cache[1].
2990 	 *
2991 	 * If we hit the file EOF then also cache the node in the
2992 	 * governing directory's cache[3]; it will be used to initialize
2993 	 * the inode's cache[1] for any inodes looked up via the directory.
2994 	 *
2995 	 * This doesn't reduce disk accesses since the B-Tree chain is
2996 	 * likely cached, but it does reduce cpu overhead when looking
2997 	 * up file offsets for cpdup/tar/cpio style iterations.
2998 	 */
2999 	if (cursor.node)
3000 		hammer_cache_node(&ip->cache[1], cursor.node);
3001 	if (ran_end >= ip->ino_data.size) {
3002 		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
3003 					ip->obj_asof, ip->obj_localization);
3004 		if (dip) {
3005 			hammer_cache_node(&dip->cache[3], cursor.node);
3006 			hammer_rel_inode(dip, 0);
3007 		}
3008 	}
3009 	hammer_done_cursor(&cursor);
3010 	hammer_done_transaction(&trans);
3011 	lwkt_reltoken(&hmp->fs_token);
3012 	return(error);
3013 }
3014 
3015 /*
3016  * BMAP operation - used to support cluster_read() only.
3017  *
3018  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3019  *
3020  * This routine may return EOPNOTSUPP if the operation is not supported for
3021  * the specified offset.  The contents of the pointer arguments do not
3022  * need to be initialized in that case.
3023  *
3024  * If a disk address is available and properly aligned return 0 with
3025  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3026  * to the run-length relative to that offset.  Callers may assume that
3027  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
3028  * *doffsetp is valid whenever 0 is returned, even if *runp is not
3029  * sufficiently large, so return EOPNOTSUPP when the run is too small.
3030 static
3031 int
3032 hammer_vop_bmap(struct vop_bmap_args *ap)
3033 {
3034 	struct hammer_transaction trans;
3035 	struct hammer_inode *ip;
3036 	hammer_mount_t hmp;
3037 	struct hammer_cursor cursor;
3038 	hammer_base_elm_t base;
3039 	int64_t rec_offset;
3040 	int64_t ran_end;
3041 	int64_t tmp64;
3042 	int64_t base_offset;
3043 	int64_t base_disk_offset;
3044 	int64_t last_offset;
3045 	hammer_off_t last_disk_offset;
3046 	hammer_off_t disk_offset;
3047 	int	rec_len;
3048 	int	error;
3049 	int	blksize;
3050 
3051 	++hammer_stats_file_iopsr;
3052 	ip = ap->a_vp->v_data;
3053 	hmp = ip->hmp;
3054 
3055 	/*
3056 	 * We can only BMAP regular files.  We can't BMAP database files,
3057 	 * directories, etc.
3058 	 */
3059 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
3060 		return(EOPNOTSUPP);
3061 
3062 	/*
3063 	 * bmap is typically called with runp/runb both NULL when used
3064 	 * for writing.  We do not support BMAP for writing at this time.
3065 	 */
3066 	if (ap->a_cmd != BUF_CMD_READ)
3067 		return(EOPNOTSUPP);
3068 
3069 	/*
3070 	 * Scan the B-Tree to acquire blockmap addresses, then translate
3071 	 * to raw addresses.
3072 	 */
3073 	lwkt_gettoken(&hmp->fs_token);
3074 	hammer_simple_transaction(&trans, hmp);
3075 #if 0
3076 	kprintf("bmap_beg %016llx ip->cache %p\n",
3077 		(long long)ap->a_loffset, ip->cache[1]);
3078 #endif
3079 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3080 
3081 	/*
3082 	 * Key range (begin and end inclusive) to scan.  Note that the keys
3083 	 * stored in the actual records represent BASE+LEN, not BASE.  The
3084 	 * first record covering a_loffset will have a key > a_loffset.
3085 	 */
3086 	cursor.key_beg.localization = ip->obj_localization +
3087 				      HAMMER_LOCALIZE_MISC;
3088 	cursor.key_beg.obj_id = ip->obj_id;
3089 	cursor.key_beg.create_tid = 0;
3090 	cursor.key_beg.delete_tid = 0;
3091 	cursor.key_beg.obj_type = 0;
3092 	if (ap->a_runb)
3093 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3094 	else
3095 		cursor.key_beg.key = ap->a_loffset + 1;
3096 	if (cursor.key_beg.key < 0)
3097 		cursor.key_beg.key = 0;
3098 	cursor.asof = ip->obj_asof;
3099 	cursor.flags |= HAMMER_CURSOR_ASOF;
3100 
3101 	cursor.key_end = cursor.key_beg;
3102 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3103 
3104 	ran_end = ap->a_loffset + MAXPHYS;
3105 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3106 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3107 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
3108 	if (tmp64 < ran_end)
3109 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3110 	else
3111 		cursor.key_end.key = ran_end + MAXPHYS + 1;
3112 
3113 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3114 
3115 	error = hammer_ip_first(&cursor);
3116 	base_offset = last_offset = 0;
3117 	base_disk_offset = last_disk_offset = 0;
3118 
3119 	while (error == 0) {
3120 		/*
3121 		 * Get the base file offset of the record.  The key for
3122 		 * data records is (base + bytes) rather than (base).
3123 		 *
3124 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
3125 		 * The extra bytes should be zero on-disk and the BMAP op
3126 		 * should still be ok.
3127 		 */
3128 		base = &cursor.leaf->base;
3129 		rec_offset = base->key - cursor.leaf->data_len;
3130 		rec_len    = cursor.leaf->data_len;
3131 
3132 		/*
3133 		 * Incorporate any cached truncation.
3134 		 *
3135 		 * NOTE: Modifications to rec_len based on synthesized
3136 		 * truncation points remove the guarantee that any extended
3137 		 * data on disk is zero (since the truncations may not have
3138 		 * taken place on-media yet).
3139 		 */
3140 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
3141 			if (hammer_cursor_ondisk(&cursor) ||
3142 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3143 				if (ip->trunc_off <= rec_offset)
3144 					rec_len = 0;
3145 				else if (ip->trunc_off < rec_offset + rec_len)
3146 					rec_len = (int)(ip->trunc_off - rec_offset);
3147 			}
3148 		}
3149 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3150 			if (hammer_cursor_ondisk(&cursor)) {
3151 				if (ip->sync_trunc_off <= rec_offset)
3152 					rec_len = 0;
3153 				else if (ip->sync_trunc_off < rec_offset + rec_len)
3154 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
3155 			}
3156 		}
3157 
3158 		/*
3159 		 * Accumulate information.  If we have hit a discontiguous
3160 		 * block, reset base_offset unless we are already beyond the
3161 		 * requested offset.  If we are, that's it, we stop.
3162 		 */
3163 		if (error)
3164 			break;
3165 		if (hammer_cursor_ondisk(&cursor)) {
3166 			disk_offset = cursor.leaf->data_offset;
3167 			if (rec_offset != last_offset ||
3168 			    disk_offset != last_disk_offset) {
3169 				if (rec_offset > ap->a_loffset)
3170 					break;
3171 				base_offset = rec_offset;
3172 				base_disk_offset = disk_offset;
3173 			}
3174 			last_offset = rec_offset + rec_len;
3175 			last_disk_offset = disk_offset + rec_len;
3176 
3177 			if (hammer_live_dedup)
3178 				hammer_dedup_cache_add(ip, cursor.leaf);
3179 		}
3180 
3181 		error = hammer_ip_next(&cursor);
3182 	}
3183 
3184 #if 0
3185 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
3186 		(long long)ap->a_loffset,
3187 		(long long)base_offset,
3188 		(long long)last_offset);
3189 	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3190 		(long long)base_disk_offset,
3191 		(long long)last_disk_offset);
3192 #endif
3193 
3194 	if (cursor.node) {
3195 		hammer_cache_node(&ip->cache[1], cursor.node);
3196 #if 0
3197 		kprintf("bmap_end2 %016llx ip->cache %p\n",
3198 			(long long)ap->a_loffset, ip->cache[1]);
3199 #endif
3200 	}
3201 	hammer_done_cursor(&cursor);
3202 	hammer_done_transaction(&trans);
3203 	lwkt_reltoken(&hmp->fs_token);
3204 
3205 	/*
3206 	 * If we couldn't find any records or the records we did find were
3207 	 * all behind the requested offset, return failure.  A forward
3208 	 * truncation can leave a hole w/ no on-disk records.
3209 	 */
3210 	if (last_offset == 0 || last_offset < ap->a_loffset)
3211 		return (EOPNOTSUPP);
3212 
3213 	/*
3214 	 * Figure out the block size at the requested offset and adjust
3215 	 * our limits so the cluster_read() does not create inappropriately
3216 	 * sized buffer cache buffers.
3217 	 */
3218 	blksize = hammer_blocksize(ap->a_loffset);
3219 	if (hammer_blocksize(base_offset) != blksize) {
3220 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3221 	}
3222 	if (last_offset != ap->a_loffset &&
3223 	    hammer_blocksize(last_offset - 1) != blksize) {
3224 		last_offset = hammer_blockdemarc(ap->a_loffset,
3225 						 last_offset - 1);
3226 	}
3227 
3228 	/*
3229 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3230 	 * from occurring.
3231 	 */
3232 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3233 
3234 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3235 		/*
3236 		 * Only large-data zones can be direct-IOd
3237 		 */
3238 		error = EOPNOTSUPP;
3239 	} else if ((disk_offset & HAMMER_BUFMASK) ||
3240 		   (last_offset - ap->a_loffset) < blksize) {
3241 		/*
3242 		 * doffsetp is not aligned or the forward run size does
3243 		 * not cover a whole buffer, disallow the direct I/O.
3244 		 */
3245 		error = EOPNOTSUPP;
3246 	} else {
3247 		/*
3248 		 * We're good.
3249 		 */
3250 		*ap->a_doffsetp = disk_offset;
3251 		if (ap->a_runb) {
3252 			*ap->a_runb = ap->a_loffset - base_offset;
3253 			KKASSERT(*ap->a_runb >= 0);
3254 		}
3255 		if (ap->a_runp) {
3256 			*ap->a_runp = last_offset - ap->a_loffset;
3257 			KKASSERT(*ap->a_runp >= 0);
3258 		}
3259 		error = 0;
3260 	}
3261 	return(error);
3262 }
3263 
3264 /*
3265  * Write to a regular file.   Because this is a strategy call the OS is
3266  * trying to actually get data onto the media.
3267  */
3268 static
3269 int
3270 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3271 {
3272 	hammer_record_t record;
3273 	hammer_mount_t hmp;
3274 	hammer_inode_t ip;
3275 	struct bio *bio;
3276 	struct buf *bp;
3277 	int blksize;
3278 	int bytes;
3279 	int error;
3280 
3281 	bio = ap->a_bio;
3282 	bp = bio->bio_buf;
3283 	ip = ap->a_vp->v_data;
3284 	hmp = ip->hmp;
3285 
3286 	blksize = hammer_blocksize(bio->bio_offset);
3287 	KKASSERT(bp->b_bufsize == blksize);
3288 
3289 	if (ip->flags & HAMMER_INODE_RO) {
3290 		bp->b_error = EROFS;
3291 		bp->b_flags |= B_ERROR;
3292 		biodone(ap->a_bio);
3293 		return(EROFS);
3294 	}
3295 
3296 	lwkt_gettoken(&hmp->fs_token);
3297 
3298 	/*
3299 	 * Disallow swapcache operation on the vnode buffer if double
3300 	 * buffering is enabled, the swapcache will get the data via
3301 	 * the block device buffer.
3302 	 */
3303 	if (hammer_double_buffer)
3304 		bp->b_flags |= B_NOTMETA;
3305 
3306 	/*
3307 	 * Interlock with inode destruction (no in-kernel or directory
3308 	 * topology visibility).  If we queue new IO while trying to
3309 	 * destroy the inode we can deadlock the vtrunc call in
3310 	 * hammer_inode_unloadable_check().
3311 	 *
3312 	 * Besides, there's no point flushing a bp associated with an
3313 	 * inode that is being destroyed on-media and has no kernel
3314 	 * references.
3315 	 */
3316 	if ((ip->flags | ip->sync_flags) &
3317 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3318 		bp->b_resid = 0;
3319 		biodone(ap->a_bio);
3320 		lwkt_reltoken(&hmp->fs_token);
3321 		return(0);
3322 	}
3323 
3324 	/*
3325 	 * Reserve space and issue a direct-write from the front-end.
3326 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
3327 	 * allocations.
3328 	 *
3329 	 * An in-memory record will be installed to reference the storage
3330 	 * until the flusher can get to it.
3331 	 *
3332 	 * Since we own the high level bio the front-end will not try to
3333 	 * do a direct-read until the write completes.
3334 	 *
3335 	 * NOTE: The only time we do not reserve a full-sized buffer's
3336 	 * worth of data is if the file is small.  We do not try to
3337 	 * allocate a fragment (from the small-data zone) at the end of
3338 	 * an otherwise large file as this can lead to wildly separated
3339 	 * data.
3340 	 */
3341 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3342 	KKASSERT(bio->bio_offset < ip->ino_data.size);
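	/*
	 * Small files starting at offset 0 and no larger than half a
	 * buffer reserve only the file size rounded up to a 16-byte
	 * boundary; everything else reserves a full buffer.
	 */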
3343 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3344 		bytes = bp->b_bufsize;
3345 	else
3346 		bytes = ((int)ip->ino_data.size + 15) & ~15;
3347 
3348 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3349 				    bytes, &error);
3350 
3351 	/*
3352 	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3353 	 * in hammer_vop_write().  We must flag the record so the proper
3354 	 * REDO_TERM_WRITE entry is generated during the flush.
3355 	 */
3356 	if (record) {
3357 		if (bp->b_flags & B_VFSFLAG1) {
3358 			record->flags |= HAMMER_RECF_REDO;
3359 			bp->b_flags &= ~B_VFSFLAG1;
3360 		}
3361 		if (record->flags & HAMMER_RECF_DEDUPED) {
3362 			bp->b_resid = 0;
3363 			hammer_ip_replace_bulk(hmp, record);
3364 			biodone(ap->a_bio);
3365 		} else {
3366 			hammer_io_direct_write(hmp, bio, record);
3367 		}
3368 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3369 			hammer_flush_inode(ip, 0);
3370 	} else {
3371 		bp->b_bio2.bio_offset = NOOFFSET;
3372 		bp->b_error = error;
3373 		bp->b_flags |= B_ERROR;
3374 		biodone(ap->a_bio);
3375 	}
3376 	lwkt_reltoken(&hmp->fs_token);
3377 	return(error);
3378 }
3379 
3380 /*
3381  * dounlink - disconnect a directory entry
3382  *
3383  * XXX whiteout support not really in yet
3384  */
3385 static int
3386 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3387 		struct vnode *dvp, struct ucred *cred,
3388 		int flags, int isdir)
3389 {
3390 	struct namecache *ncp;
3391 	hammer_inode_t dip;
3392 	hammer_inode_t ip;
3393 	hammer_mount_t hmp;
3394 	struct hammer_cursor cursor;
3395 	int64_t namekey;
3396 	u_int32_t max_iterations;
3397 	int nlen, error;
3398 
3399 	/*
3400 	 * Calculate the namekey and setup the key range for the scan.  This
3401 	 * works kinda like a chained hash table where the lower 32 bits
3402 	 * of the namekey synthesize the chain.
3403 	 *
3404 	 * The key range is inclusive of both key_beg and key_end.
3405 	 */
3406 	dip = VTOI(dvp);
3407 	ncp = nch->ncp;
3408 	hmp = dip->hmp;
3409 
3410 	if (dip->flags & HAMMER_INODE_RO)
3411 		return (EROFS);
3412 
3413 	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3414 					   &max_iterations);
3415 retry:
3416 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3417 	cursor.key_beg.localization = dip->obj_localization +
3418 				      hammer_dir_localization(dip);
3419 	cursor.key_beg.obj_id = dip->obj_id;
3420 	cursor.key_beg.key = namekey;
3421 	cursor.key_beg.create_tid = 0;
3422 	cursor.key_beg.delete_tid = 0;
3423 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3424 	cursor.key_beg.obj_type = 0;
3425 
3426 	cursor.key_end = cursor.key_beg;
3427 	cursor.key_end.key += max_iterations;
3428 	cursor.asof = dip->obj_asof;
3429 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3430 
3431 	/*
3432 	 * Scan all matching records (the chain), locate the one matching
3433 	 * the requested path component.  Upon termination of the scan,
3434 	 * the error code may be 0, ENOENT, or some other error from
3435 	 * the search.
3436 	 *
3437 	 * The hammer_ip_*() functions merge in-memory records with on-disk
3438 	 * records for the purposes of the search.
3439 	 */
3440 	error = hammer_ip_first(&cursor);
3441 
3442 	while (error == 0) {
3443 		error = hammer_ip_resolve_data(&cursor);
3444 		if (error)
3445 			break;
3446 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3447 		KKASSERT(nlen > 0);
3448 		if (ncp->nc_nlen == nlen &&
3449 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3450 			break;
3451 		}
3452 		error = hammer_ip_next(&cursor);
3453 	}
3454 
3455 	/*
3456 	 * If all is ok we have to get the inode so we can adjust nlinks.
3457 	 * To avoid a deadlock with the flusher we must release the inode
3458 	 * lock on the directory when acquiring the inode for the entry.
3459 	 *
3460 	 * If the target is a directory, it must be empty.
3461 	 */
3462 	if (error == 0) {
3463 		hammer_unlock(&cursor.ip->lock);
3464 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3465 				      hmp->asof,
3466 				      cursor.data->entry.localization,
3467 				      0, &error);
3468 		hammer_lock_sh(&cursor.ip->lock);
3469 		if (error == ENOENT) {
3470 			kprintf("HAMMER: WARNING: Removing "
3471 				"dirent w/missing inode \"%s\"\n"
3472 				"\tobj_id = %016llx\n",
3473 				ncp->nc_name,
3474 				(long long)cursor.data->entry.obj_id);
3475 			error = 0;
3476 		}
3477 
3478 		/*
3479 		 * If isdir >= 0 we validate that the entry is or is not a
3480 		 * directory.  If isdir < 0 we don't care.
3481 		 */
3482 		if (error == 0 && isdir >= 0 && ip) {
3483 			if (isdir &&
3484 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3485 				error = ENOTDIR;
3486 			} else if (isdir == 0 &&
3487 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3488 				error = EISDIR;
3489 			}
3490 		}
3491 
3492 		/*
3493 		 * If we are trying to remove a directory the directory must
3494 		 * be empty.
3495 		 *
3496 		 * The check directory code can loop and deadlock/retry.  Our
3497 		 * own cursor's node locks must be released to avoid a 3-way
3498 		 * deadlock with the flusher if the check directory code
3499 		 * blocks.
3500 		 *
3501 		 * If any changes whatsoever have been made to the cursor
3502 		 * set EDEADLK and retry.
3503 		 *
3504 		 * WARNING: See warnings in hammer_unlock_cursor()
3505 		 *	    function.
3506 		 */
3507 		if (error == 0 && ip && ip->ino_data.obj_type ==
3508 				        HAMMER_OBJTYPE_DIRECTORY) {
3509 			hammer_unlock_cursor(&cursor);
3510 			error = hammer_ip_check_directory_empty(trans, ip);
3511 			hammer_lock_cursor(&cursor);
3512 			if (cursor.flags & HAMMER_CURSOR_RETEST) {
3513 				kprintf("HAMMER: Warning: avoided deadlock "
3514 					"on rmdir '%s'\n",
3515 					ncp->nc_name);
3516 				error = EDEADLK;
3517 			}
3518 		}
3519 
3520 		/*
3521 		 * Delete the directory entry.
3522 		 *
3523 		 * WARNING: hammer_ip_del_directory() may have to terminate
3524 		 * the cursor to avoid a deadlock.  It is ok to call
3525 		 * hammer_done_cursor() twice.
3526 		 */
3527 		if (error == 0) {
3528 			error = hammer_ip_del_directory(trans, &cursor,
3529 							dip, ip);
3530 		}
3531 		hammer_done_cursor(&cursor);
3532 		if (error == 0) {
3533 			cache_setunresolved(nch);
3534 			cache_setvp(nch, NULL);
3535 
3536 			/*
3537 			 * NOTE: ip->vp, if non-NULL, cannot be directly
3538 			 *	 referenced without formally acquiring the
3539 			 *	 vp since the vp might have zero refs on it,
3540 			 *	 or in the middle of a reclaim, etc.
3541 			 *
3542 			 * NOTE: The cache_setunresolved() can rip the vp
3543 			 *	 out from under us since the vp may not have
3544 			 *	 any refs, in which case ip->vp will be NULL
3545 			 *	 from the outset.
3546 			 */
3547 			while (ip && ip->vp) {
3548 				struct vnode *vp;
3549 
3550 				error = hammer_get_vnode(ip, &vp);
3551 				if (error == 0 && vp) {
3552 					vn_unlock(vp);
3553 					hammer_knote(ip->vp, NOTE_DELETE);
3554 					cache_inval_vp(ip->vp, CINV_DESTROY);
3555 					vrele(vp);
3556 					break;
3557 				}
3558 				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3559 			}
3560 		}
3561 		if (ip)
3562 			hammer_rel_inode(ip, 0);
3563 	} else {
3564 		hammer_done_cursor(&cursor);
3565 	}
3566 	if (error == EDEADLK)
3567 		goto retry;
3568 
3569 	return (error);
3570 }
3571 
3572 /************************************************************************
3573  *			    FIFO AND SPECFS OPS				*
3574  ************************************************************************
3575  *
3576  */
3577 static int
3578 hammer_vop_fifoclose (struct vop_close_args *ap)
3579 {
3580 	/* XXX update itimes */
3581 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3582 }
3583 
3584 static int
3585 hammer_vop_fiforead (struct vop_read_args *ap)
3586 {
3587 	int error;
3588 
3589 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3590 	/* XXX update access time */
3591 	return (error);
3592 }
3593 
3594 static int
3595 hammer_vop_fifowrite (struct vop_write_args *ap)
3596 {
3597 	int error;
3598 
3599 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3600 	/* XXX update access time */
3601 	return (error);
3602 }
3603 
3604 static
3605 int
3606 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3607 {
3608 	int error;
3609 
3610 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3611 	if (error)
3612 		error = hammer_vop_kqfilter(ap);
3613 	return(error);
3614 }
3615 
3616 /************************************************************************
3617  *			    KQFILTER OPS				*
3618  ************************************************************************
3619  *
3620  */
3621 static void filt_hammerdetach(struct knote *kn);
3622 static int filt_hammerread(struct knote *kn, long hint);
3623 static int filt_hammerwrite(struct knote *kn, long hint);
3624 static int filt_hammervnode(struct knote *kn, long hint);
3625 
3626 static struct filterops hammerread_filtops =
3627 	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
3628 static struct filterops hammerwrite_filtops =
3629 	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
3630 static struct filterops hammervnode_filtops =
3631 	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
3632 
3633 static
3634 int
3635 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3636 {
3637 	struct vnode *vp = ap->a_vp;
3638 	struct knote *kn = ap->a_kn;
3639 
3640 	switch (kn->kn_filter) {
3641 	case EVFILT_READ:
3642 		kn->kn_fop = &hammerread_filtops;
3643 		break;
3644 	case EVFILT_WRITE:
3645 		kn->kn_fop = &hammerwrite_filtops;
3646 		break;
3647 	case EVFILT_VNODE:
3648 		kn->kn_fop = &hammervnode_filtops;
3649 		break;
3650 	default:
3651 		return (EOPNOTSUPP);
3652 	}
3653 
3654 	kn->kn_hook = (caddr_t)vp;
3655 
3656 	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3657 
3658 	return(0);
3659 }
3660 
3661 static void
3662 filt_hammerdetach(struct knote *kn)
3663 {
3664 	struct vnode *vp = (void *)kn->kn_hook;
3665 
3666 	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3667 }
3668 
3669 static int
3670 filt_hammerread(struct knote *kn, long hint)
3671 {
3672 	struct vnode *vp = (void *)kn->kn_hook;
3673 	hammer_inode_t ip = VTOI(vp);
3674 	hammer_mount_t hmp = ip->hmp;
3675 	off_t off;
3676 
3677 	if (hint == NOTE_REVOKE) {
3678 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3679 		return(1);
3680 	}
3681 	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
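	/*
	 * Report the number of bytes available for reading, clamped
	 * to INTPTR_MAX to fit kn_data (presumably an intptr_t).
	 */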
3682 	off = ip->ino_data.size - kn->kn_fp->f_offset;
3683 	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
3684 	lwkt_reltoken(&hmp->fs_token);
3685 	if (kn->kn_sfflags & NOTE_OLDAPI)
3686 		return(1);
3687 	return (kn->kn_data != 0);
3688 }
3689 
3690 static int
3691 filt_hammerwrite(struct knote *kn, long hint)
3692 {
3693 	if (hint == NOTE_REVOKE)
3694 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3695 	kn->kn_data = 0;
3696 	return (1);
3697 }
3698 
3699 static int
3700 filt_hammervnode(struct knote *kn, long hint)
3701 {
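	/*
	 * Latch only the event bits the caller subscribed to; a
	 * revoke forces EOF/NODATA and always fires.
	 */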
3702 	if (kn->kn_sfflags & hint)
3703 		kn->kn_fflags |= hint;
3704 	if (hint == NOTE_REVOKE) {
3705 		kn->kn_flags |= (EV_EOF | EV_NODATA);
3706 		return (1);
3707 	}
3708 	return (kn->kn_fflags != 0);
3709 }
3710 
3711