xref: /dragonfly/sys/vfs/hammer/hammer_vnops.c (revision 3bafb5c1)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vm/swap_pager.h>
50 #include <vfs/fifofs/fifo.h>
51 
52 #include "hammer.h"
53 
54 /*
55  * USERFS VNOPS
56  */
57 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
58 static int hammer_vop_fsync(struct vop_fsync_args *);
59 static int hammer_vop_read(struct vop_read_args *);
60 static int hammer_vop_write(struct vop_write_args *);
61 static int hammer_vop_access(struct vop_access_args *);
62 static int hammer_vop_advlock(struct vop_advlock_args *);
63 static int hammer_vop_close(struct vop_close_args *);
64 static int hammer_vop_ncreate(struct vop_ncreate_args *);
65 static int hammer_vop_getattr(struct vop_getattr_args *);
66 static int hammer_vop_nresolve(struct vop_nresolve_args *);
67 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
68 static int hammer_vop_nlink(struct vop_nlink_args *);
69 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
70 static int hammer_vop_nmknod(struct vop_nmknod_args *);
71 static int hammer_vop_open(struct vop_open_args *);
72 static int hammer_vop_print(struct vop_print_args *);
73 static int hammer_vop_readdir(struct vop_readdir_args *);
74 static int hammer_vop_readlink(struct vop_readlink_args *);
75 static int hammer_vop_nremove(struct vop_nremove_args *);
76 static int hammer_vop_nrename(struct vop_nrename_args *);
77 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
78 static int hammer_vop_markatime(struct vop_markatime_args *);
79 static int hammer_vop_setattr(struct vop_setattr_args *);
80 static int hammer_vop_strategy(struct vop_strategy_args *);
81 static int hammer_vop_bmap(struct vop_bmap_args *ap);
82 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
83 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
84 static int hammer_vop_ioctl(struct vop_ioctl_args *);
85 static int hammer_vop_mountctl(struct vop_mountctl_args *);
86 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
87 
88 static int hammer_vop_fifoclose (struct vop_close_args *);
89 static int hammer_vop_fiforead (struct vop_read_args *);
90 static int hammer_vop_fifowrite (struct vop_write_args *);
91 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
92 
93 struct vop_ops hammer_vnode_vops = {
94 	.vop_default =		vop_defaultop,
95 	.vop_fsync =		hammer_vop_fsync,
96 	.vop_getpages =		vop_stdgetpages,
97 	.vop_putpages =		vop_stdputpages,
98 	.vop_read =		hammer_vop_read,
99 	.vop_write =		hammer_vop_write,
100 	.vop_access =		hammer_vop_access,
101 	.vop_advlock =		hammer_vop_advlock,
102 	.vop_close =		hammer_vop_close,
103 	.vop_ncreate =		hammer_vop_ncreate,
104 	.vop_getattr =		hammer_vop_getattr,
105 	.vop_inactive =		hammer_vop_inactive,
106 	.vop_reclaim =		hammer_vop_reclaim,
107 	.vop_nresolve =		hammer_vop_nresolve,
108 	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
109 	.vop_nlink =		hammer_vop_nlink,
110 	.vop_nmkdir =		hammer_vop_nmkdir,
111 	.vop_nmknod =		hammer_vop_nmknod,
112 	.vop_open =		hammer_vop_open,
113 	.vop_pathconf =		vop_stdpathconf,
114 	.vop_print =		hammer_vop_print,
115 	.vop_readdir =		hammer_vop_readdir,
116 	.vop_readlink =		hammer_vop_readlink,
117 	.vop_nremove =		hammer_vop_nremove,
118 	.vop_nrename =		hammer_vop_nrename,
119 	.vop_nrmdir =		hammer_vop_nrmdir,
120 	.vop_markatime =	hammer_vop_markatime,
121 	.vop_setattr =		hammer_vop_setattr,
122 	.vop_bmap =		hammer_vop_bmap,
123 	.vop_strategy =		hammer_vop_strategy,
124 	.vop_nsymlink =		hammer_vop_nsymlink,
125 	.vop_nwhiteout =	hammer_vop_nwhiteout,
126 	.vop_ioctl =		hammer_vop_ioctl,
127 	.vop_mountctl =		hammer_vop_mountctl,
128 	.vop_kqfilter =		hammer_vop_kqfilter
129 };
130 
131 struct vop_ops hammer_spec_vops = {
132 	.vop_default =		vop_defaultop,
133 	.vop_fsync =		hammer_vop_fsync,
134 	.vop_read =		vop_stdnoread,
135 	.vop_write =		vop_stdnowrite,
136 	.vop_access =		hammer_vop_access,
137 	.vop_close =		hammer_vop_close,
138 	.vop_markatime =	hammer_vop_markatime,
139 	.vop_getattr =		hammer_vop_getattr,
140 	.vop_inactive =		hammer_vop_inactive,
141 	.vop_reclaim =		hammer_vop_reclaim,
142 	.vop_setattr =		hammer_vop_setattr
143 };
144 
145 struct vop_ops hammer_fifo_vops = {
146 	.vop_default =		fifo_vnoperate,
147 	.vop_fsync =		hammer_vop_fsync,
148 	.vop_read =		hammer_vop_fiforead,
149 	.vop_write =		hammer_vop_fifowrite,
150 	.vop_access =		hammer_vop_access,
151 	.vop_close =		hammer_vop_fifoclose,
152 	.vop_markatime =	hammer_vop_markatime,
153 	.vop_getattr =		hammer_vop_getattr,
154 	.vop_inactive =		hammer_vop_inactive,
155 	.vop_reclaim =		hammer_vop_reclaim,
156 	.vop_setattr =		hammer_vop_setattr,
157 	.vop_kqfilter =		hammer_vop_fifokqfilter
158 };
159 
160 static __inline
161 void
162 hammer_knote(struct vnode *vp, int flags)
163 {
164 	if (flags)
165 		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
166 }
167 
168 #ifdef DEBUG_TRUNCATE
169 struct hammer_inode *HammerTruncIp;
170 #endif
171 
172 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
173 			   struct vnode *dvp, struct ucred *cred,
174 			   int flags, int isdir);
175 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
176 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
177 
178 #if 0
179 static
180 int
181 hammer_vop_vnoperate(struct vop_generic_args *ap)
182 {
183 	return (VOCALL(&hammer_vnode_vops, ap));
184 }
185 #endif
186 
187 /*
188  * hammer_vop_fsync { vp, waitfor }
189  *
190  * fsync() an inode to disk and wait for it to be completely committed
191  * such that the information would not be undone if a crash occured after
192  * such that the information would not be undone if a crash occurred after
193  *
194  * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
195  *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
196  *	 operation.
197  *
198  *	 Ultimately the combination of a REDO log and use of fast storage
199  *	 to front-end cluster caches will make fsync fast, but it isn't
200  *	 here yet.  And, in any case, we need real transactional
201  *	 all-or-nothing features which are not restricted to a single file.
202  */
203 static
204 int
205 hammer_vop_fsync(struct vop_fsync_args *ap)
206 {
207 	hammer_inode_t ip = VTOI(ap->a_vp);
208 	hammer_mount_t hmp = ip->hmp;
209 	int waitfor = ap->a_waitfor;
210 	int mode;
211 
212 	lwkt_gettoken(&hmp->fs_token);
213 
214 	/*
215 	 * Fsync rule relaxation (default is either full synchronous flush
216 	 * or REDO semantics with synchronous flush).
217 	 */
218 	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
219 		switch(hammer_fsync_mode) {
220 		case 0:
221 mode0:
222 			/* no REDO, full synchronous flush */
223 			goto skip;
224 		case 1:
225 mode1:
226 			/* no REDO, full asynchronous flush */
227 			if (waitfor == MNT_WAIT)
228 				waitfor = MNT_NOWAIT;
229 			goto skip;
230 		case 2:
231 			/* REDO semantics, synchronous flush */
232 			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
233 				goto mode0;
234 			mode = HAMMER_FLUSH_UNDOS_AUTO;
235 			break;
236 		case 3:
237 			/* REDO semantics, relaxed asynchronous flush */
238 			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
239 				goto mode1;
240 			mode = HAMMER_FLUSH_UNDOS_RELAXED;
241 			if (waitfor == MNT_WAIT)
242 				waitfor = MNT_NOWAIT;
243 			break;
244 		case 4:
245 			/* ignore the fsync() system call */
246 			lwkt_reltoken(&hmp->fs_token);
247 			return(0);
248 		default:
249 			/* we have to do something */
250 			mode = HAMMER_FLUSH_UNDOS_RELAXED;
251 			if (waitfor == MNT_WAIT)
252 				waitfor = MNT_NOWAIT;
253 			break;
254 		}
255 
256 		/*
257 		 * Fast fsync only needs to flush the UNDO/REDO fifo if
258 		 * HAMMER_INODE_REDO is non-zero and the only modifications
259 		 * made to the file are write or write-extends.
260 		 */
261 		if ((ip->flags & HAMMER_INODE_REDO) &&
262 		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
263 		) {
264 			++hammer_count_fsyncs;
265 			hammer_flusher_flush_undos(hmp, mode);
266 			ip->redo_count = 0;
267 			lwkt_reltoken(&hmp->fs_token);
268 			return(0);
269 		}
270 
271 		/*
272 		 * REDO is enabled by fsync(), the idea being we really only
273 		 * want to lay down REDO records when programs are using
274 		 * fsync() heavily.  The first fsync() on the file starts
275 		 * the gravy train going and later fsync()s keep it hot by
276 		 * resetting the redo_count.
277 		 *
278 		 * We weren't running REDOs before now so we have to fall
279 		 * through and do a full fsync of what we have.
280 		 */
281 		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
282 		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
283 			ip->flags |= HAMMER_INODE_REDO;
284 			ip->redo_count = 0;
285 		}
286 	}
287 skip:
288 
289 	/*
290 	 * Do a full flush sequence.
291 	 *
292 	 * Attempt to release the vnode while waiting for the inode to
293 	 * finish flushing.  This can really mess up inactive->reclaim
294 	 * sequences so only do it if the vnode is active.
295 	 */
296 	++hammer_count_fsyncs;
297 	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
298 	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
299 	if (waitfor == MNT_WAIT) {
300 		if ((ap->a_vp->v_flag & VINACTIVE) == 0)
301 			vn_unlock(ap->a_vp);
302 		hammer_wait_inode(ip);
303 		if ((ap->a_vp->v_flag & VINACTIVE) == 0)
304 			vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
305 	}
306 	lwkt_reltoken(&hmp->fs_token);
307 	return (ip->error);
308 }
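
#if 0
/*
 * Illustrative timeline of the REDO fast path above (a sketch, not
 * compiled; assumes a version >= 4 volume and relaxed semantics
 * selected via the hammer_fsync_mode sysctl):
 *
 *	write(fd, buf, n);
 *	fsync(fd);		-- no REDO yet: falls through to the full
 *				   flush and sets HAMMER_INODE_REDO
 *	write(fd, buf, n);	-- the write path now lays down REDO records
 *	fsync(fd);		-- only the UNDO/REDO FIFO is flushed
 */
#endif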
309 
310 /*
311  * hammer_vop_read { vp, uio, ioflag, cred }
312  *
313  * MPSAFE (reads satisfied from the buffer cache do not require fs_token)
314  */
315 static
316 int
317 hammer_vop_read(struct vop_read_args *ap)
318 {
319 	struct hammer_transaction trans;
320 	hammer_inode_t ip;
321 	hammer_mount_t hmp;
322 	off_t offset;
323 	struct buf *bp;
324 	struct uio *uio;
325 	int error;
326 	int n;
327 	int seqcount;
328 	int ioseqcount;
329 	int blksize;
330 	int bigread;
331 	int got_fstoken;
332 
333 	if (ap->a_vp->v_type != VREG)
334 		return (EINVAL);
335 	ip = VTOI(ap->a_vp);
336 	hmp = ip->hmp;
337 	error = 0;
338 	uio = ap->a_uio;
339 
340 	/*
341 	 * Allow the UIO's size to override the sequential heuristic.
342 	 */
343 	blksize = hammer_blocksize(uio->uio_offset);
344 	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
345 	ioseqcount = (ap->a_ioflag >> 16);
346 	if (seqcount < ioseqcount)
347 		seqcount = ioseqcount;
348 
349 	/*
350 	 * If reading or writing a huge amount of data we have to break
351 	 * atomicity and allow the operation to be interrupted by a signal
352 	 * or it can DOS the machine.
353 	 */
354 	bigread = (uio->uio_resid > 100 * 1024 * 1024);
355 	got_fstoken = 0;
356 
357 	/*
358 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
359 	 * buffer cache, but HAMMER may use a variable block size based
360 	 * on the offset.
361 	 *
362 	 * XXX Temporary hack, delay the start transaction while we remain
363 	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
364 	 *     locked-shared.
365 	 */
366 	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
367 		int64_t base_offset;
368 		int64_t file_limit;
369 
370 		blksize = hammer_blocksize(uio->uio_offset);
371 		offset = (int)uio->uio_offset & (blksize - 1);
372 		base_offset = uio->uio_offset - offset;
373 
374 		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
375 			break;
376 
377 		/*
378 		 * MPSAFE
379 		 */
380 		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
381 		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
382 			bp->b_flags &= ~B_AGE;
383 			error = 0;
384 			goto skip;
385 		}
386 		if (ap->a_ioflag & IO_NRDELAY) {
387 			bqrelse(bp);
388 			return (EWOULDBLOCK);
389 		}
390 
391 		/*
392 		 * MPUNSAFE
393 		 */
394 		if (got_fstoken == 0) {
395 			lwkt_gettoken(&hmp->fs_token);
396 			got_fstoken = 1;
397 			hammer_start_transaction(&trans, ip->hmp);
398 		}
399 
400 		/*
401 		 * NOTE: A valid bp has already been acquired, but was not
402 		 *	 B_CACHE.
403 		 */
404 		if (hammer_cluster_enable) {
405 			/*
406 			 * Use file_limit to prevent cluster_read() from
407 			 * creating buffers of the wrong block size past
408 			 * the demarc.
409 			 */
410 			file_limit = ip->ino_data.size;
411 			if (base_offset < HAMMER_XDEMARC &&
412 			    file_limit > HAMMER_XDEMARC) {
413 				file_limit = HAMMER_XDEMARC;
414 			}
415 			error = cluster_readx(ap->a_vp,
416 					     file_limit, base_offset,
417 					     blksize, uio->uio_resid,
418 					     seqcount * BKVASIZE, &bp);
419 		} else {
420 			error = breadnx(ap->a_vp, base_offset, blksize,
421 					NULL, NULL, 0, &bp);
422 		}
423 		if (error) {
424 			brelse(bp);
425 			break;
426 		}
427 skip:
428 		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
429 			kprintf("doff %016jx read file %016jx@%016jx\n",
430 				(intmax_t)bp->b_bio2.bio_offset,
431 				(intmax_t)ip->obj_id,
432 				(intmax_t)bp->b_loffset);
433 		}
434 		bp->b_flags &= ~B_IODEBUG;
435 
436 		bp->b_flags |= B_CLUSTEROK;
437 		n = blksize - offset;
438 		if (n > uio->uio_resid)
439 			n = uio->uio_resid;
440 		if (n > ip->ino_data.size - uio->uio_offset)
441 			n = (int)(ip->ino_data.size - uio->uio_offset);
442 		if (got_fstoken)
443 			lwkt_reltoken(&hmp->fs_token);
444 
445 		/*
446 		 * Set B_AGE, data has a lower priority than meta-data.
447 		 *
448 		 * Use a hold/unlock/drop sequence to run the uiomove
449 		 * with the buffer unlocked, avoiding deadlocks against
450 		 * read()s on mmap()'d spaces.
451 		 */
452 		bp->b_flags |= B_AGE;
453 		bqhold(bp);
454 		bqrelse(bp);
455 		error = uiomove((char *)bp->b_data + offset, n, uio);
456 		bqdrop(bp);
457 
458 		if (got_fstoken)
459 			lwkt_gettoken(&hmp->fs_token);
460 
461 		if (error)
462 			break;
463 		hammer_stats_file_read += n;
464 	}
465 
466 	/*
467 	 * Try to update the atime with just the inode lock for maximum
468 	 * concurrency.  If we can't shortcut it we have to get the full
469 	 * blown transaction.
470 	 */
471 	if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
472 		lwkt_gettoken(&hmp->fs_token);
473 		got_fstoken = 1;
474 		hammer_start_transaction(&trans, ip->hmp);
475 	}
476 
477 	if (got_fstoken) {
478 		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
479 		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
480 			ip->ino_data.atime = trans.time;
481 			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
482 		}
483 		hammer_done_transaction(&trans);
484 		lwkt_reltoken(&hmp->fs_token);
485 	}
486 	return (error);
487 }
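
#if 0
/*
 * Sketch of the block decomposition used by the read loop above and
 * the write loop below.  Not compiled; the real hammer_blocksize()
 * lives in hammer.h, and the constants are assumed to be the usual
 * 16K buffers below the HAMMER_XDEMARC offset and 64K buffers at or
 * beyond it.
 */
static void
example_decompose(int64_t uoff)
{
	int blksize;		/* 16K or 64K depending on the zone */
	int offset;		/* byte offset within the block */
	int64_t base_offset;	/* block-aligned buffer cache offset */

	blksize = (uoff < HAMMER_XDEMARC) ?
		  HAMMER_BUFSIZE : HAMMER_XBUFSIZE;
	offset = (int)uoff & (blksize - 1);
	base_offset = uoff - offset;	/* getblk()/bread() key */
}
#endif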
488 
489 /*
490  * hammer_vop_write { vp, uio, ioflag, cred }
491  */
492 static
493 int
494 hammer_vop_write(struct vop_write_args *ap)
495 {
496 	struct hammer_transaction trans;
497 	struct hammer_inode *ip;
498 	hammer_mount_t hmp;
499 	thread_t td;
500 	struct uio *uio;
501 	int offset;
502 	off_t base_offset;
503 	int64_t cluster_eof;
504 	struct buf *bp;
505 	int kflags;
506 	int error;
507 	int n;
508 	int flags;
509 	int seqcount;
510 	int bigwrite;
511 
512 	if (ap->a_vp->v_type != VREG)
513 		return (EINVAL);
514 	ip = VTOI(ap->a_vp);
515 	hmp = ip->hmp;
516 	error = 0;
517 	kflags = 0;
518 	seqcount = ap->a_ioflag >> 16;
519 
520 	if (ip->flags & HAMMER_INODE_RO)
521 		return (EROFS);
522 
523 	/*
524 	 * Create a transaction to cover the operations we perform.
525 	 */
526 	lwkt_gettoken(&hmp->fs_token);
527 	hammer_start_transaction(&trans, hmp);
528 	uio = ap->a_uio;
529 
530 	/*
531 	 * Check append mode
532 	 */
533 	if (ap->a_ioflag & IO_APPEND)
534 		uio->uio_offset = ip->ino_data.size;
535 
536 	/*
537 	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
538 	 *
539 	 * NOTE: the base_offset assignment is required to work around what
540 	 * I consider to be a GCC-4 optimization bug.
541 	 */
542 	if (uio->uio_offset < 0) {
543 		hammer_done_transaction(&trans);
544 		lwkt_reltoken(&hmp->fs_token);
545 		return (EFBIG);
546 	}
547 	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
548 	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
549 		hammer_done_transaction(&trans);
550 		lwkt_reltoken(&hmp->fs_token);
551 		return (EFBIG);
552 	}
553 
554 	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
555 	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
556 		hammer_done_transaction(&trans);
557 		lwkt_reltoken(&hmp->fs_token);
558 		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
559 		return (EFBIG);
560 	}
561 
562 	/*
563 	 * If reading or writing a huge amount of data we have to break
564 	 * atomicity and allow the operation to be interrupted by a signal
565 	 * or it can DOS the machine.
566 	 *
567 	 * Preset redo_count so we stop generating REDOs earlier if the
568 	 * limit is exceeded.
569 	 */
570 	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
571 	if ((ip->flags & HAMMER_INODE_REDO) &&
572 	    ip->redo_count < hammer_limit_redo) {
573 		ip->redo_count += uio->uio_resid;
574 	}
575 
576 	/*
577 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
578 	 * buffer cache, but HAMMER may use a variable block size based
579 	 * on the offset.
580 	 */
581 	while (uio->uio_resid > 0) {
582 		int fixsize = 0;
583 		int blksize;
584 		int blkmask;
585 		int trivial;
586 		int endofblk;
587 		off_t nsize;
588 
589 		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
590 			break;
591 		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
592 			break;
593 
594 		blksize = hammer_blocksize(uio->uio_offset);
595 
596 		/*
597 		 * Do not allow HAMMER to blow out the buffer cache.  Very
598 		 * large UIOs can lock out other processes due to bwillwrite()
599 		 * mechanics.
600 		 *
601 		 * The hammer inode is not locked during these operations.
602 		 * The vnode is locked which can interfere with the pageout
603 		 * daemon for non-UIO_NOCOPY writes but should not interfere
604 		 * with the buffer cache.  Even so, we cannot afford to
605 		 * allow the pageout daemon to build up too many dirty buffer
606 		 * cache buffers.
607 		 *
608 		 * Only call this if we aren't being recursively called from
609 		 * a virtual disk device (vn), else we may deadlock.
610 		 */
611 		if ((ap->a_ioflag & IO_RECURSE) == 0)
612 			bwillwrite(blksize);
613 
614 		/*
615 		 * Control the number of pending records associated with
616 		 * this inode.  If too many have accumulated start a
617 		 * flush.  Try to maintain a pipeline with the flusher.
618 		 *
619 		 * NOTE: It is possible for other sources to grow the
620 		 *	 records but not necessarily issue another flush,
621 		 *	 so use a timeout and ensure that a re-flush occurs.
622 		 */
623 		if (ip->rsv_recs >= hammer_limit_inode_recs) {
624 			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
625 			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
626 				ip->flags |= HAMMER_INODE_RECSW;
627 				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
628 				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
629 			}
630 		}
631 
632 #if 0
633 		/*
634 		 * Do not allow HAMMER to blow out system memory by
635 		 * accumulating too many records.   Records are so well
636 		 * decoupled from the buffer cache that it is possible
637 		 * for userland to push data out to the media via
638 		 * direct-write, but build up the records queued to the
639 		 * backend faster than the backend can flush them out.
640 		 * HAMMER has hit its write limit but the frontend has
641 		 * no pushback to slow it down.
642 		 */
643 		if (hmp->rsv_recs > hammer_limit_recs / 2) {
644 			/*
645 			 * Get the inode on the flush list
646 			 */
647 			if (ip->rsv_recs >= 64)
648 				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
649 			else if (ip->rsv_recs >= 16)
650 				hammer_flush_inode(ip, 0);
651 
652 			/*
653 			 * Keep the flusher going if the system keeps
654 			 * queueing records.
655 			 */
656 			delta = hmp->count_newrecords -
657 				hmp->last_newrecords;
658 			if (delta < 0 || delta > hammer_limit_recs / 2) {
659 				hmp->last_newrecords = hmp->count_newrecords;
660 				hammer_sync_hmp(hmp, MNT_NOWAIT);
661 			}
662 
663 			/*
664 			 * If we have gotten behind start slowing
665 			 * down the writers.
666 			 */
667 			delta = (hmp->rsv_recs - hammer_limit_recs) *
668 				hz / hammer_limit_recs;
669 			if (delta > 0)
670 				tsleep(&trans, 0, "hmrslo", delta);
671 		}
672 #endif
673 
674 		/*
675 		 * Calculate the blocksize at the current offset and figure
676 		 * out how much we can actually write.
677 		 */
678 		blkmask = blksize - 1;
679 		offset = (int)uio->uio_offset & blkmask;
680 		base_offset = uio->uio_offset & ~(int64_t)blkmask;
681 		n = blksize - offset;
682 		if (n > uio->uio_resid) {
683 			n = uio->uio_resid;
684 			endofblk = 0;
685 		} else {
686 			endofblk = 1;
687 		}
688 		nsize = uio->uio_offset + n;
689 		if (nsize > ip->ino_data.size) {
690 			if (uio->uio_offset > ip->ino_data.size)
691 				trivial = 0;
692 			else
693 				trivial = 1;
694 			nvextendbuf(ap->a_vp,
695 				    ip->ino_data.size,
696 				    nsize,
697 				    hammer_blocksize(ip->ino_data.size),
698 				    hammer_blocksize(nsize),
699 				    hammer_blockoff(ip->ino_data.size),
700 				    hammer_blockoff(nsize),
701 				    trivial);
702 			fixsize = 1;
703 			kflags |= NOTE_EXTEND;
704 		}
705 
706 		if (uio->uio_segflg == UIO_NOCOPY) {
707 			/*
708 			 * Issuing a write with the same data backing the
709 			 * buffer.  Instantiate the buffer to collect the
710 			 * backing vm pages, then read-in any missing bits.
711 			 *
712 			 * This case is used by vop_stdputpages().
713 			 */
714 			bp = getblk(ap->a_vp, base_offset,
715 				    blksize, GETBLK_BHEAVY, 0);
716 			if ((bp->b_flags & B_CACHE) == 0) {
717 				bqrelse(bp);
718 				error = bread(ap->a_vp, base_offset,
719 					      blksize, &bp);
720 			}
721 		} else if (offset == 0 && uio->uio_resid >= blksize) {
722 			/*
723 			 * Even though we are entirely overwriting the buffer
724 			 * we may still have to zero it out to avoid a
725 			 * mmap/write visibility issue.
726 			 */
727 			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
728 			if ((bp->b_flags & B_CACHE) == 0)
729 				vfs_bio_clrbuf(bp);
730 		} else if (base_offset >= ip->ino_data.size) {
731 			/*
732 			 * If the base offset of the buffer is beyond the
733 			 * file EOF, we don't have to issue a read.
734 			 */
735 			bp = getblk(ap->a_vp, base_offset,
736 				    blksize, GETBLK_BHEAVY, 0);
737 			vfs_bio_clrbuf(bp);
738 		} else {
739 			/*
740 			 * Partial overwrite, read in any missing bits then
741 			 * replace the portion being written.
742 			 */
743 			error = bread(ap->a_vp, base_offset, blksize, &bp);
744 			if (error == 0)
745 				bheavy(bp);
746 		}
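		/*
		 * (Summary of the four acquisition strategies above:
		 * UIO_NOCOPY writes re-instantiate the backing pages and
		 * read in any missing bits; full-block overwrites and
		 * writes beyond EOF skip the read and just zero the
		 * buffer; partial overwrites must read the existing
		 * block first.)
		 */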
747 		if (error == 0) {
748 			lwkt_reltoken(&hmp->fs_token);
749 			error = uiomove(bp->b_data + offset, n, uio);
750 			lwkt_gettoken(&hmp->fs_token);
751 		}
752 
753 		/*
754 		 * Generate REDO records if enabled and redo_count will not
755 		 * exceed the limit.
756 		 *
757 		 * If redo_count exceeds the limit we stop generating records
758 		 * and clear HAMMER_INODE_REDO.  This will cause the next
759 		 * fsync() to do a full meta-data sync instead of just an
760 		 * UNDO/REDO fifo update.
761 		 *
762 		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
763 		 * will still be tracked.  The tracks will be terminated
764 		 * when the related meta-data (including possible data
765 		 * modifications which are not tracked via REDO) is
766 		 * flushed.
767 		 */
768 		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
769 			if (ip->redo_count < hammer_limit_redo) {
770 				bp->b_flags |= B_VFSFLAG1;
771 				error = hammer_generate_redo(&trans, ip,
772 						     base_offset + offset,
773 						     HAMMER_REDO_WRITE,
774 						     bp->b_data + offset,
775 						     (size_t)n);
776 			} else {
777 				ip->flags &= ~HAMMER_INODE_REDO;
778 			}
779 		}
780 
781 		/*
782 		 * If we screwed up we have to undo any VM size changes we
783 		 * made.
784 		 */
785 		if (error) {
786 			brelse(bp);
787 			if (fixsize) {
788 				nvtruncbuf(ap->a_vp, ip->ino_data.size,
789 					  hammer_blocksize(ip->ino_data.size),
790 					  hammer_blockoff(ip->ino_data.size),
791 					  0);
792 			}
793 			break;
794 		}
795 		kflags |= NOTE_WRITE;
796 		hammer_stats_file_write += n;
797 		bp->b_flags |= B_CLUSTEROK;
798 		if (ip->ino_data.size < uio->uio_offset) {
799 			ip->ino_data.size = uio->uio_offset;
800 			flags = HAMMER_INODE_SDIRTY;
801 		} else {
802 			flags = 0;
803 		}
804 		ip->ino_data.mtime = trans.time;
805 		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
806 		hammer_modify_inode(&trans, ip, flags);
807 
808 		/*
809 		 * Once we dirty the buffer any cached zone-X offset
810 		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
811 		 * allow overwriting over the same data sector unless
812 		 * we provide UNDOs for the old data, which we don't.
813 		 */
814 		bp->b_bio2.bio_offset = NOOFFSET;
815 
816 		/*
817 		 * Final buffer disposition.
818 		 *
819 		 * Because meta-data updates are deferred, HAMMER is
820 		 * especially sensitive to excessive bdwrite()s because
821 		 * the I/O stream is not broken up by disk reads.  So the
822 		 * buffer cache simply cannot keep up.
823 		 *
824 		 * WARNING!  blksize is variable.  cluster_write() is
825 		 *	     expected to not blow up if it encounters
826 		 *	     buffers that do not match the passed blksize.
827 		 *
828 		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
829 		 *	  The ip->rsv_recs check should burst-flush the data.
830 		 *	  If we queue it immediately the buf could be left
831 		 *	  locked on the device queue for a very long time.
832 		 *
833 		 *	  However, failing to flush a dirty buffer out when
834 		 *	  issued from the pageout daemon can result in a low
835 		 *	  memory deadlock against bio_page_alloc(), so we
836 		 *	  have to bawrite() on IO_ASYNC as well.
837 		 *
838 		 * NOTE!  To avoid degenerate stalls due to mismatched block
839 		 *	  sizes we only honor IO_DIRECT on the write which
840 		 *	  abuts the end of the buffer.  However, we must
841 		 *	  honor IO_SYNC in case someone is silly enough to
842 		 *	  configure a HAMMER file as swap, or when HAMMER
843 		 *	  is serving NFS (for commits).  Ick ick.
844 		 */
845 		bp->b_flags |= B_AGE | B_CLUSTEROK;
846 		if (ap->a_ioflag & IO_SYNC) {
847 			bwrite(bp);
848 		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
849 			bawrite(bp);
850 		} else if (ap->a_ioflag & IO_ASYNC) {
851 			bawrite(bp);
852 		} else if (hammer_cluster_enable &&
853 			   !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
854 			if (base_offset < HAMMER_XDEMARC)
855 				cluster_eof = hammer_blockdemarc(base_offset,
856 							 ip->ino_data.size);
857 			else
858 				cluster_eof = ip->ino_data.size;
859 			cluster_write(bp, cluster_eof, blksize, seqcount);
860 		} else {
861 			bdwrite(bp);
862 		}
863 	}
864 	hammer_done_transaction(&trans);
865 	hammer_knote(ap->a_vp, kflags);
866 	lwkt_reltoken(&hmp->fs_token);
867 	return (error);
868 }
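
#if 0
/*
 * The final buffer disposition above, tabulated for reference (a
 * restatement of the if/else chain, first match wins; not additional
 * policy):
 *
 *	IO_SYNC					bwrite()  (synchronous)
 *	IO_DIRECT, write abutting block end	bawrite()
 *	IO_ASYNC (e.g. pageout daemon)		bawrite() (deadlock avoidance)
 *	clustering on and !MNT_NOCLUSTERW	cluster_write()
 *	otherwise				bdwrite() (delayed write)
 */
#endif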
869 
870 /*
871  * hammer_vop_access { vp, mode, cred }
872  *
873  * MPSAFE - does not require fs_token
874  */
875 static
876 int
877 hammer_vop_access(struct vop_access_args *ap)
878 {
879 	struct hammer_inode *ip = VTOI(ap->a_vp);
880 	uid_t uid;
881 	gid_t gid;
882 	int error;
883 
884 	++hammer_stats_file_iopsr;
885 	uid = hammer_to_unix_xid(&ip->ino_data.uid);
886 	gid = hammer_to_unix_xid(&ip->ino_data.gid);
887 
888 	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
889 				  ip->ino_data.uflags);
890 	return (error);
891 }
892 
893 /*
894  * hammer_vop_advlock { vp, id, op, fl, flags }
895  *
896  * MPSAFE - does not require fs_token
897  */
898 static
899 int
900 hammer_vop_advlock(struct vop_advlock_args *ap)
901 {
902 	hammer_inode_t ip = VTOI(ap->a_vp);
903 
904 	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
905 }
906 
907 /*
908  * hammer_vop_close { vp, fflag }
909  *
910  * We can only sync-on-close for normal closes.  XXX disabled for now.
911  */
912 static
913 int
914 hammer_vop_close(struct vop_close_args *ap)
915 {
916 #if 0
917 	struct vnode *vp = ap->a_vp;
918 	hammer_inode_t ip = VTOI(vp);
919 	int waitfor;
920 	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
921 		if (vn_islocked(vp) == LK_EXCLUSIVE &&
922 		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
923 			if (ip->flags & HAMMER_INODE_CLOSESYNC)
924 				waitfor = MNT_WAIT;
925 			else
926 				waitfor = MNT_NOWAIT;
927 			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
928 				       HAMMER_INODE_CLOSEASYNC);
929 			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
930 		}
931 	}
932 #endif
933 	return (vop_stdclose(ap));
934 }
935 
936 /*
937  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
938  *
939  * The operating system has already ensured that the directory entry
940  * does not exist and done all appropriate namespace locking.
941  */
942 static
943 int
944 hammer_vop_ncreate(struct vop_ncreate_args *ap)
945 {
946 	struct hammer_transaction trans;
947 	struct hammer_inode *dip;
948 	struct hammer_inode *nip;
949 	struct nchandle *nch;
950 	hammer_mount_t hmp;
951 	int error;
952 
953 	nch = ap->a_nch;
954 	dip = VTOI(ap->a_dvp);
955 	hmp = dip->hmp;
956 
957 	if (dip->flags & HAMMER_INODE_RO)
958 		return (EROFS);
959 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
960 		return (error);
961 
962 	/*
963 	 * Create a transaction to cover the operations we perform.
964 	 */
965 	lwkt_gettoken(&hmp->fs_token);
966 	hammer_start_transaction(&trans, hmp);
967 	++hammer_stats_file_iopsw;
968 
969 	/*
970 	 * Create a new filesystem object of the requested type.  The
971 	 * returned inode will be referenced and shared-locked to prevent
972 	 * it from being moved to the flusher.
973 	 */
974 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
975 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
976 				    NULL, &nip);
977 	if (error) {
978 		hkprintf("hammer_create_inode error %d\n", error);
979 		hammer_done_transaction(&trans);
980 		*ap->a_vpp = NULL;
981 		lwkt_reltoken(&hmp->fs_token);
982 		return (error);
983 	}
984 
985 	/*
986 	 * Add the new filesystem object to the directory.  This will also
987 	 * bump the inode's link count.
988 	 */
989 	error = hammer_ip_add_directory(&trans, dip,
990 					nch->ncp->nc_name, nch->ncp->nc_nlen,
991 					nip);
992 	if (error)
993 		hkprintf("hammer_ip_add_directory error %d\n", error);
994 
995 	/*
996 	 * Finish up.
997 	 */
998 	if (error) {
999 		hammer_rel_inode(nip, 0);
1000 		hammer_done_transaction(&trans);
1001 		*ap->a_vpp = NULL;
1002 	} else {
1003 		error = hammer_get_vnode(nip, ap->a_vpp);
1004 		hammer_done_transaction(&trans);
1005 		hammer_rel_inode(nip, 0);
1006 		if (error == 0) {
1007 			cache_setunresolved(ap->a_nch);
1008 			cache_setvp(ap->a_nch, *ap->a_vpp);
1009 		}
1010 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1011 	}
1012 	lwkt_reltoken(&hmp->fs_token);
1013 	return (error);
1014 }
1015 
1016 /*
1017  * hammer_vop_getattr { vp, vap }
1018  *
1019  * Retrieve an inode's attribute information.  When accessing inodes
1020  * historically we fake the atime field to ensure consistent results.
1021  * The atime field is stored in the B-Tree element and allowed to be
1022  * updated without cycling the element.
1023  *
1024  * MPSAFE - does not require fs_token
1025  */
1026 static
1027 int
1028 hammer_vop_getattr(struct vop_getattr_args *ap)
1029 {
1030 	struct hammer_inode *ip = VTOI(ap->a_vp);
1031 	struct vattr *vap = ap->a_vap;
1032 
1033 	/*
1034 	 * We want the fsid to be different when accessing a filesystem
1035 	 * with different as-of's so programs like diff don't think
1036 	 * the files are the same.
1037 	 *
1038 	 * We also want the fsid to be the same when comparing snapshots,
1039 	 * or when comparing mirrors (which might be backed by different
1040 	 * physical devices).  HAMMER fsids are based on the PFS's
1041 	 * shared_uuid field.
1042 	 *
1043 	 * XXX there is a chance of collision here.  The va_fsid reported
1044 	 * by stat is different from the more involved fsid used in the
1045 	 * mount structure.
1046 	 */
1047 	++hammer_stats_file_iopsr;
1048 	hammer_lock_sh(&ip->lock);
1049 	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
1050 		       (u_int32_t)(ip->obj_asof >> 32);
1051 
1052 	vap->va_fileid = ip->ino_leaf.base.obj_id;
1053 	vap->va_mode = ip->ino_data.mode;
1054 	vap->va_nlink = ip->ino_data.nlinks;
1055 	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1056 	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1057 	vap->va_rmajor = 0;
1058 	vap->va_rminor = 0;
1059 	vap->va_size = ip->ino_data.size;
1060 
1061 	/*
1062 	 * Special case for @@PFS softlinks.  The actual size of the
1063 	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
1064 	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
1065 	 */
1066 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
1067 	    ip->ino_data.size == 10 &&
1068 	    ip->obj_asof == HAMMER_MAX_TID &&
1069 	    ip->obj_localization == 0 &&
1070 	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
1071 		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
1072 			    vap->va_size = 26;
1073 		    else
1074 			    vap->va_size = 10;
1075 	}
1076 
1077 	/*
1078 	 * We must provide a consistent atime and mtime for snapshots
1079 	 * so people can do a 'tar cf - ... | md5' on them and get
1080 	 * consistent results.
1081 	 */
1082 	if (ip->flags & HAMMER_INODE_RO) {
1083 		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
1084 		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
1085 	} else {
1086 		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
1087 		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
1088 	}
1089 	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
1090 	vap->va_flags = ip->ino_data.uflags;
1091 	vap->va_gen = 1;	/* hammer inums are unique for all time */
1092 	vap->va_blocksize = HAMMER_BUFSIZE;
1093 	if (ip->ino_data.size >= HAMMER_XDEMARC) {
1094 		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
1095 				~HAMMER_XBUFMASK64;
1096 	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
1097 		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
1098 				~HAMMER_BUFMASK64;
1099 	} else {
1100 		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
1101 	}
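
	/*
	 * Worked example of the rounding above (illustrative sizes,
	 * assuming the usual 16K/64K buffer constants): a 100 byte
	 * inline file reports 112 bytes ((100+15) & ~15), a 20000 byte
	 * file reports 32768 bytes (two 16K buffers), and a file past
	 * the demarc, say 3000000 bytes, reports 3014656 (46 x 64K).
	 */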
1102 
1103 	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
1104 	vap->va_filerev = 0; 	/* XXX */
1105 	vap->va_uid_uuid = ip->ino_data.uid;
1106 	vap->va_gid_uuid = ip->ino_data.gid;
1107 	vap->va_fsid_uuid = ip->hmp->fsid;
1108 	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
1109 			  VA_FSID_UUID_VALID;
1110 
1111 	switch (ip->ino_data.obj_type) {
1112 	case HAMMER_OBJTYPE_CDEV:
1113 	case HAMMER_OBJTYPE_BDEV:
1114 		vap->va_rmajor = ip->ino_data.rmajor;
1115 		vap->va_rminor = ip->ino_data.rminor;
1116 		break;
1117 	default:
1118 		break;
1119 	}
1120 	hammer_unlock(&ip->lock);
1121 	return(0);
1122 }
1123 
1124 /*
1125  * hammer_vop_nresolve { nch, dvp, cred }
1126  *
1127  * Locate the requested directory entry.
1128  */
1129 static
1130 int
1131 hammer_vop_nresolve(struct vop_nresolve_args *ap)
1132 {
1133 	struct hammer_transaction trans;
1134 	struct namecache *ncp;
1135 	hammer_mount_t hmp;
1136 	hammer_inode_t dip;
1137 	hammer_inode_t ip;
1138 	hammer_tid_t asof;
1139 	struct hammer_cursor cursor;
1140 	struct vnode *vp;
1141 	int64_t namekey;
1142 	int error;
1143 	int i;
1144 	int nlen;
1145 	int flags;
1146 	int ispfs;
1147 	int64_t obj_id;
1148 	u_int32_t localization;
1149 	u_int32_t max_iterations;
1150 
1151 	/*
1152 	 * Misc initialization, plus handle as-of name extensions.  Look for
1153 	 * the '@@' extension.  Note that as-of files and directories cannot
1154 	 * be modified.
1155 	 */
1156 	dip = VTOI(ap->a_dvp);
1157 	ncp = ap->a_nch->ncp;
1158 	asof = dip->obj_asof;
1159 	localization = dip->obj_localization;	/* for code consistency */
1160 	nlen = ncp->nc_nlen;
1161 	flags = dip->flags & HAMMER_INODE_RO;
1162 	ispfs = 0;
1163 	hmp = dip->hmp;
1164 
1165 	lwkt_gettoken(&hmp->fs_token);
1166 	hammer_simple_transaction(&trans, hmp);
1167 	++hammer_stats_file_iopsr;
1168 
1169 	for (i = 0; i < nlen; ++i) {
1170 		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
1171 			error = hammer_str_to_tid(ncp->nc_name + i + 2,
1172 						  &ispfs, &asof, &localization);
1173 			if (error != 0) {
1174 				i = nlen;
1175 				break;
1176 			}
1177 			if (asof != HAMMER_MAX_TID)
1178 				flags |= HAMMER_INODE_RO;
1179 			break;
1180 		}
1181 	}
1182 	nlen = i;
1183 
1184 	/*
1185 	 * If this is a PFS softlink we dive into the PFS
1186 	 */
1187 	if (ispfs && nlen == 0) {
1188 		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1189 				      asof, localization,
1190 				      flags, &error);
1191 		if (error == 0) {
1192 			error = hammer_get_vnode(ip, &vp);
1193 			hammer_rel_inode(ip, 0);
1194 		} else {
1195 			vp = NULL;
1196 		}
1197 		if (error == 0) {
1198 			vn_unlock(vp);
1199 			cache_setvp(ap->a_nch, vp);
1200 			vrele(vp);
1201 		}
1202 		goto done;
1203 	}
1204 
1205 	/*
1206 	 * If there is no path component the time extension is relative to dip.
1207 	 * e.g. "fubar/@@<snapshot>"
1208 	 *
1209 	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
1210 	 * e.g. "fubar/.@@<snapshot>"
1211 	 *
1212 	 * ".." is handled by the kernel.  We do not currently handle
1213 	 * "..@@<snapshot>".
1214 	 */
1215 	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
1216 		ip = hammer_get_inode(&trans, dip, dip->obj_id,
1217 				      asof, dip->obj_localization,
1218 				      flags, &error);
1219 		if (error == 0) {
1220 			error = hammer_get_vnode(ip, &vp);
1221 			hammer_rel_inode(ip, 0);
1222 		} else {
1223 			vp = NULL;
1224 		}
1225 		if (error == 0) {
1226 			vn_unlock(vp);
1227 			cache_setvp(ap->a_nch, vp);
1228 			vrele(vp);
1229 		}
1230 		goto done;
1231 	}
1232 
1233 	/*
1234 	 * Calculate the namekey and setup the key range for the scan.  This
1235 	 * works kinda like a chained hash table where the lower 32 bits
1236 	 * of the namekey synthesize the chain.
1237 	 *
1238 	 * The key range is inclusive of both key_beg and key_end.
1239 	 */
1240 	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1241 					   &max_iterations);
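
	/*
	 * Worked example (illustrative numbers): if the name hashes to
	 * namekey 0x1234567800000000 with max_iterations 0x00FFFFFF,
	 * every colliding directory entry lives in the inclusive key
	 * range [0x1234567800000000, 0x1234567800FFFFFF] and the scan
	 * below picks the entry whose stored name matches exactly.
	 */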
1242 
1243 	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
1244 	cursor.key_beg.localization = dip->obj_localization +
1245 				      hammer_dir_localization(dip);
1246 	cursor.key_beg.obj_id = dip->obj_id;
1247 	cursor.key_beg.key = namekey;
1248 	cursor.key_beg.create_tid = 0;
1249 	cursor.key_beg.delete_tid = 0;
1250 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1251 	cursor.key_beg.obj_type = 0;
1252 
1253 	cursor.key_end = cursor.key_beg;
1254 	cursor.key_end.key += max_iterations;
1255 	cursor.asof = asof;
1256 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1257 
1258 	/*
1259 	 * Scan all matching records (the chain), locate the one matching
1260 	 * the requested path component.
1261 	 *
1262 	 * The hammer_ip_*() functions merge in-memory records with on-disk
1263 	 * records for the purposes of the search.
1264 	 */
1265 	obj_id = 0;
1266 	localization = HAMMER_DEF_LOCALIZATION;
1267 
1268 	if (error == 0) {
1269 		error = hammer_ip_first(&cursor);
1270 		while (error == 0) {
1271 			error = hammer_ip_resolve_data(&cursor);
1272 			if (error)
1273 				break;
1274 			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1275 			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1276 				obj_id = cursor.data->entry.obj_id;
1277 				localization = cursor.data->entry.localization;
1278 				break;
1279 			}
1280 			error = hammer_ip_next(&cursor);
1281 		}
1282 	}
1283 	hammer_done_cursor(&cursor);
1284 
1285 	/*
1286 	 * Lookup the obj_id.  This should always succeed.  If it does not
1287 	 * the filesystem may be damaged and we return a dummy inode.
1288 	 */
1289 	if (error == 0) {
1290 		ip = hammer_get_inode(&trans, dip, obj_id,
1291 				      asof, localization,
1292 				      flags, &error);
1293 		if (error == ENOENT) {
1294 			kprintf("HAMMER: WARNING: Missing "
1295 				"inode for dirent \"%s\"\n"
1296 				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1297 				ncp->nc_name,
1298 				(long long)obj_id, (long long)asof,
1299 				localization);
1300 			error = 0;
1301 			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1302 						    asof, localization,
1303 						    flags, &error);
1304 		}
1305 		if (error == 0) {
1306 			error = hammer_get_vnode(ip, &vp);
1307 			hammer_rel_inode(ip, 0);
1308 		} else {
1309 			vp = NULL;
1310 		}
1311 		if (error == 0) {
1312 			vn_unlock(vp);
1313 			cache_setvp(ap->a_nch, vp);
1314 			vrele(vp);
1315 		}
1316 	} else if (error == ENOENT) {
1317 		cache_setvp(ap->a_nch, NULL);
1318 	}
1319 done:
1320 	hammer_done_transaction(&trans);
1321 	lwkt_reltoken(&hmp->fs_token);
1322 	return (error);
1323 }
1324 
1325 /*
1326  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1327  *
1328  * Locate the parent directory of a directory vnode.
1329  *
1330  * dvp is referenced but not locked.  *vpp must be returned referenced and
1331  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1332  * at the root, instead it could indicate that the directory we were in was
1333  * removed.
1334  *
1335  * NOTE: as-of sequences are not linked into the directory structure.  If
1336  * we are at the root with a different asof than the mount point, reload
1337  * the same directory with the mount point's asof.  I'm not sure what this
1338  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1339  * get confused, but it hasn't been tested.
1340  */
1341 static
1342 int
1343 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1344 {
1345 	struct hammer_transaction trans;
1346 	struct hammer_inode *dip;
1347 	struct hammer_inode *ip;
1348 	hammer_mount_t hmp;
1349 	int64_t parent_obj_id;
1350 	u_int32_t parent_obj_localization;
1351 	hammer_tid_t asof;
1352 	int error;
1353 
1354 	dip = VTOI(ap->a_dvp);
1355 	asof = dip->obj_asof;
1356 	hmp = dip->hmp;
1357 
1358 	/*
1359 	 * Who is our parent?  This could be the root of a pseudo-filesystem
1360 	 * whose parent is in another localization domain.
1361 	 */
1362 	lwkt_gettoken(&hmp->fs_token);
1363 	parent_obj_id = dip->ino_data.parent_obj_id;
1364 	if (dip->obj_id == HAMMER_OBJID_ROOT)
1365 		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1366 	else
1367 		parent_obj_localization = dip->obj_localization;
1368 
1369 	if (parent_obj_id == 0) {
1370 		if (dip->obj_id == HAMMER_OBJID_ROOT &&
1371 		   asof != hmp->asof) {
1372 			parent_obj_id = dip->obj_id;
1373 			asof = hmp->asof;
1374 			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1375 			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1376 				  (long long)dip->obj_asof);
1377 		} else {
1378 			*ap->a_vpp = NULL;
1379 			lwkt_reltoken(&hmp->fs_token);
1380 			return ENOENT;
1381 		}
1382 	}
1383 
1384 	hammer_simple_transaction(&trans, hmp);
1385 	++hammer_stats_file_iopsr;
1386 
1387 	ip = hammer_get_inode(&trans, dip, parent_obj_id,
1388 			      asof, parent_obj_localization,
1389 			      dip->flags, &error);
1390 	if (ip) {
1391 		error = hammer_get_vnode(ip, ap->a_vpp);
1392 		hammer_rel_inode(ip, 0);
1393 	} else {
1394 		*ap->a_vpp = NULL;
1395 	}
1396 	hammer_done_transaction(&trans);
1397 	lwkt_reltoken(&hmp->fs_token);
1398 	return (error);
1399 }
1400 
1401 /*
1402  * hammer_vop_nlink { nch, dvp, vp, cred }
1403  */
1404 static
1405 int
1406 hammer_vop_nlink(struct vop_nlink_args *ap)
1407 {
1408 	struct hammer_transaction trans;
1409 	struct hammer_inode *dip;
1410 	struct hammer_inode *ip;
1411 	struct nchandle *nch;
1412 	hammer_mount_t hmp;
1413 	int error;
1414 
1415 	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1416 		return(EXDEV);
1417 
1418 	nch = ap->a_nch;
1419 	dip = VTOI(ap->a_dvp);
1420 	ip = VTOI(ap->a_vp);
1421 	hmp = dip->hmp;
1422 
1423 	if (dip->obj_localization != ip->obj_localization)
1424 		return(EXDEV);
1425 
1426 	if (dip->flags & HAMMER_INODE_RO)
1427 		return (EROFS);
1428 	if (ip->flags & HAMMER_INODE_RO)
1429 		return (EROFS);
1430 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1431 		return (error);
1432 
1433 	/*
1434 	 * Create a transaction to cover the operations we perform.
1435 	 */
1436 	lwkt_gettoken(&hmp->fs_token);
1437 	hammer_start_transaction(&trans, hmp);
1438 	++hammer_stats_file_iopsw;
1439 
1440 	/*
1441 	 * Add the filesystem object to the directory.  Note that neither
1442 	 * dip nor ip are referenced or locked, but their vnodes are
1443 	 * referenced.  This function will bump the inode's link count.
1444 	 */
1445 	error = hammer_ip_add_directory(&trans, dip,
1446 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1447 					ip);
1448 
1449 	/*
1450 	 * Finish up.
1451 	 */
1452 	if (error == 0) {
1453 		cache_setunresolved(nch);
1454 		cache_setvp(nch, ap->a_vp);
1455 	}
1456 	hammer_done_transaction(&trans);
1457 	hammer_knote(ap->a_vp, NOTE_LINK);
1458 	hammer_knote(ap->a_dvp, NOTE_WRITE);
1459 	lwkt_reltoken(&hmp->fs_token);
1460 	return (error);
1461 }
1462 
1463 /*
1464  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1465  *
1466  * The operating system has already ensured that the directory entry
1467  * does not exist and done all appropriate namespace locking.
1468  */
1469 static
1470 int
1471 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1472 {
1473 	struct hammer_transaction trans;
1474 	struct hammer_inode *dip;
1475 	struct hammer_inode *nip;
1476 	struct nchandle *nch;
1477 	hammer_mount_t hmp;
1478 	int error;
1479 
1480 	nch = ap->a_nch;
1481 	dip = VTOI(ap->a_dvp);
1482 	hmp = dip->hmp;
1483 
1484 	if (dip->flags & HAMMER_INODE_RO)
1485 		return (EROFS);
1486 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1487 		return (error);
1488 
1489 	/*
1490 	 * Create a transaction to cover the operations we perform.
1491 	 */
1492 	lwkt_gettoken(&hmp->fs_token);
1493 	hammer_start_transaction(&trans, hmp);
1494 	++hammer_stats_file_iopsw;
1495 
1496 	/*
1497 	 * Create a new filesystem object of the requested type.  The
1498 	 * returned inode will be referenced but not locked.
1499 	 */
1500 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1501 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1502 				    NULL, &nip);
1503 	if (error) {
1504 		hkprintf("hammer_mkdir error %d\n", error);
1505 		hammer_done_transaction(&trans);
1506 		*ap->a_vpp = NULL;
1507 		lwkt_reltoken(&hmp->fs_token);
1508 		return (error);
1509 	}
1510 	/*
1511 	 * Add the new filesystem object to the directory.  This will also
1512 	 * bump the inode's link count.
1513 	 */
1514 	error = hammer_ip_add_directory(&trans, dip,
1515 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1516 					nip);
1517 	if (error)
1518 		hkprintf("hammer_mkdir (add) error %d\n", error);
1519 
1520 	/*
1521 	 * Finish up.
1522 	 */
1523 	if (error) {
1524 		hammer_rel_inode(nip, 0);
1525 		*ap->a_vpp = NULL;
1526 	} else {
1527 		error = hammer_get_vnode(nip, ap->a_vpp);
1528 		hammer_rel_inode(nip, 0);
1529 		if (error == 0) {
1530 			cache_setunresolved(ap->a_nch);
1531 			cache_setvp(ap->a_nch, *ap->a_vpp);
1532 		}
1533 	}
1534 	hammer_done_transaction(&trans);
1535 	if (error == 0)
1536 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1537 	lwkt_reltoken(&hmp->fs_token);
1538 	return (error);
1539 }
1540 
1541 /*
1542  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1543  *
1544  * The operating system has already ensured that the directory entry
1545  * does not exist and done all appropriate namespace locking.
1546  */
1547 static
1548 int
1549 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1550 {
1551 	struct hammer_transaction trans;
1552 	struct hammer_inode *dip;
1553 	struct hammer_inode *nip;
1554 	struct nchandle *nch;
1555 	hammer_mount_t hmp;
1556 	int error;
1557 
1558 	nch = ap->a_nch;
1559 	dip = VTOI(ap->a_dvp);
1560 	hmp = dip->hmp;
1561 
1562 	if (dip->flags & HAMMER_INODE_RO)
1563 		return (EROFS);
1564 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1565 		return (error);
1566 
1567 	/*
1568 	 * Create a transaction to cover the operations we perform.
1569 	 */
1570 	lwkt_gettoken(&hmp->fs_token);
1571 	hammer_start_transaction(&trans, hmp);
1572 	++hammer_stats_file_iopsw;
1573 
1574 	/*
1575 	 * Create a new filesystem object of the requested type.  The
1576 	 * returned inode will be referenced but not locked.
1577 	 *
1578 	 * If mknod specifies a directory a pseudo-fs is created.
1579 	 */
1580 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1581 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1582 				    NULL, &nip);
1583 	if (error) {
1584 		hammer_done_transaction(&trans);
1585 		*ap->a_vpp = NULL;
1586 		lwkt_reltoken(&hmp->fs_token);
1587 		return (error);
1588 	}
1589 
1590 	/*
1591 	 * Add the new filesystem object to the directory.  This will also
1592 	 * bump the inode's link count.
1593 	 */
1594 	error = hammer_ip_add_directory(&trans, dip,
1595 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1596 					nip);
1597 
1598 	/*
1599 	 * Finish up.
1600 	 */
1601 	if (error) {
1602 		hammer_rel_inode(nip, 0);
1603 		*ap->a_vpp = NULL;
1604 	} else {
1605 		error = hammer_get_vnode(nip, ap->a_vpp);
1606 		hammer_rel_inode(nip, 0);
1607 		if (error == 0) {
1608 			cache_setunresolved(ap->a_nch);
1609 			cache_setvp(ap->a_nch, *ap->a_vpp);
1610 		}
1611 	}
1612 	hammer_done_transaction(&trans);
1613 	if (error == 0)
1614 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1615 	lwkt_reltoken(&hmp->fs_token);
1616 	return (error);
1617 }
1618 
1619 /*
1620  * hammer_vop_open { vp, mode, cred, fp }
1621  *
1622  * MPSAFE (does not require fs_token)
1623  */
1624 static
1625 int
1626 hammer_vop_open(struct vop_open_args *ap)
1627 {
1628 	hammer_inode_t ip;
1629 
1630 	++hammer_stats_file_iopsr;
1631 	ip = VTOI(ap->a_vp);
1632 
1633 	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1634 		return (EROFS);
1635 	return(vop_stdopen(ap));
1636 }
1637 
1638 /*
1639  * hammer_vop_print { vp }
1640  */
1641 static
1642 int
1643 hammer_vop_print(struct vop_print_args *ap)
1644 {
1645 	return EOPNOTSUPP;
1646 }
1647 
1648 /*
1649  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1650  */
1651 static
1652 int
1653 hammer_vop_readdir(struct vop_readdir_args *ap)
1654 {
1655 	struct hammer_transaction trans;
1656 	struct hammer_cursor cursor;
1657 	struct hammer_inode *ip;
1658 	hammer_mount_t hmp;
1659 	struct uio *uio;
1660 	hammer_base_elm_t base;
1661 	int error;
1662 	int cookie_index;
1663 	int ncookies;
1664 	off_t *cookies;
1665 	off_t saveoff;
1666 	int r;
1667 	int dtype;
1668 
1669 	++hammer_stats_file_iopsr;
1670 	ip = VTOI(ap->a_vp);
1671 	uio = ap->a_uio;
1672 	saveoff = uio->uio_offset;
1673 	hmp = ip->hmp;
1674 
1675 	if (ap->a_ncookies) {
1676 		ncookies = uio->uio_resid / 16 + 1;
1677 		if (ncookies > 1024)
1678 			ncookies = 1024;
1679 		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1680 		cookie_index = 0;
1681 	} else {
1682 		ncookies = -1;
1683 		cookies = NULL;
1684 		cookie_index = 0;
1685 	}
1686 
1687 	lwkt_gettoken(&hmp->fs_token);
1688 	hammer_simple_transaction(&trans, hmp);
1689 
1690 	/*
1691 	 * Handle artificial entries
1692 	 *
1693 	 * It should be noted that the minimum value for a directory
1694 	 * hash key on-media is 0x0000000100000000, so we can use anything
1695 	 * less than that to represent our 'special' key space.
1696 	 */
1697 	error = 0;
1698 	if (saveoff == 0) {
1699 		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1700 		if (r)
1701 			goto done;
1702 		if (cookies)
1703 			cookies[cookie_index] = saveoff;
1704 		++saveoff;
1705 		++cookie_index;
1706 		if (cookie_index == ncookies)
1707 			goto done;
1708 	}
1709 	if (saveoff == 1) {
1710 		if (ip->ino_data.parent_obj_id) {
1711 			r = vop_write_dirent(&error, uio,
1712 					     ip->ino_data.parent_obj_id,
1713 					     DT_DIR, 2, "..");
1714 		} else {
1715 			r = vop_write_dirent(&error, uio,
1716 					     ip->obj_id, DT_DIR, 2, "..");
1717 		}
1718 		if (r)
1719 			goto done;
1720 		if (cookies)
1721 			cookies[cookie_index] = saveoff;
1722 		++saveoff;
1723 		++cookie_index;
1724 		if (cookie_index == ncookies)
1725 			goto done;
1726 	}
1727 
1728 	/*
1729 	 * Key range (begin and end inclusive) to scan.  Directory keys
1730 	 * directly translate to a 64 bit 'seek' position.
1731 	 */
1732 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1733 	cursor.key_beg.localization = ip->obj_localization +
1734 				      hammer_dir_localization(ip);
1735 	cursor.key_beg.obj_id = ip->obj_id;
1736 	cursor.key_beg.create_tid = 0;
1737 	cursor.key_beg.delete_tid = 0;
1738 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1739 	cursor.key_beg.obj_type = 0;
1740 	cursor.key_beg.key = saveoff;
1741 
1742 	cursor.key_end = cursor.key_beg;
1743 	cursor.key_end.key = HAMMER_MAX_KEY;
1744 	cursor.asof = ip->obj_asof;
1745 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1746 
1747 	error = hammer_ip_first(&cursor);
1748 
1749 	while (error == 0) {
1750 		error = hammer_ip_resolve_data(&cursor);
1751 		if (error)
1752 			break;
1753 		base = &cursor.leaf->base;
1754 		saveoff = base->key;
1755 		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1756 
1757 		if (base->obj_id != ip->obj_id)
1758 			panic("readdir: bad record at %p", cursor.node);
1759 
1760 		/*
1761 		 * Convert pseudo-filesystems into softlinks
1762 		 */
1763 		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1764 		r = vop_write_dirent(
1765 			     &error, uio, cursor.data->entry.obj_id,
1766 			     dtype,
1767 			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1768 			     (void *)cursor.data->entry.name);
1769 		if (r)
1770 			break;
1771 		++saveoff;
1772 		if (cookies)
1773 			cookies[cookie_index] = base->key;
1774 		++cookie_index;
1775 		if (cookie_index == ncookies)
1776 			break;
1777 		error = hammer_ip_next(&cursor);
1778 	}
1779 	hammer_done_cursor(&cursor);
1780 
1781 done:
1782 	hammer_done_transaction(&trans);
1783 
1784 	if (ap->a_eofflag)
1785 		*ap->a_eofflag = (error == ENOENT);
1786 	uio->uio_offset = saveoff;
1787 	if (error && cookie_index == 0) {
1788 		if (error == ENOENT)
1789 			error = 0;
1790 		if (cookies) {
1791 			kfree(cookies, M_TEMP);
1792 			*ap->a_ncookies = 0;
1793 			*ap->a_cookies = NULL;
1794 		}
1795 	} else {
1796 		if (error == ENOENT)
1797 			error = 0;
1798 		if (cookies) {
1799 			*ap->a_ncookies = cookie_index;
1800 			*ap->a_cookies = cookies;
1801 		}
1802 	}
1803 	lwkt_reltoken(&hmp->fs_token);
1804 	return(error);
1805 }
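
/*
 * Compiled-out userland sketch: the uio_offset saved above doubles as
 * the directory cookie, so a partially consumed scan can be resumed via
 * the offset returned by getdirentries().  The path and buffer size are
 * illustrative only.
 */
#if 0
	char buf[4096];
	long base;
	int fd = open("/mnt/hammer", O_RDONLY | O_DIRECTORY);
	int n = getdirentries(fd, buf, sizeof(buf), &base);

	/* 'base' now holds the cookie to resume the scan from */
#endif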
1806 
1807 /*
1808  * hammer_vop_readlink { vp, uio, cred }
1809  */
1810 static
1811 int
1812 hammer_vop_readlink(struct vop_readlink_args *ap)
1813 {
1814 	struct hammer_transaction trans;
1815 	struct hammer_cursor cursor;
1816 	struct hammer_inode *ip;
1817 	hammer_mount_t hmp;
1818 	char buf[32];
1819 	u_int32_t localization;
1820 	hammer_pseudofs_inmem_t pfsm;
1821 	int error;
1822 
1823 	ip = VTOI(ap->a_vp);
1824 	hmp = ip->hmp;
1825 
1826 	lwkt_gettoken(&hmp->fs_token);
1827 
1828 	/*
1829 	 * Shortcut if the symlink data was stuffed into ino_data.
1830 	 *
1831 	 * Also expand special "@@PFS%05d" softlinks (expansion only
1832 	 * occurs for non-historical (current) accesses made from the
1833 	 * primary filesystem).
1834 	 */
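	/*
	 * Compiled-out illustration of the expansion for a hypothetical
	 * PFS #3: the stored 10 byte link "@@PFS00003" reads back as
	 * "@@-1:00003" on a master, or as "@@0x<sync_end_tid>:00003" on
	 * a slave so lookups resolve to the last synchronized TID.
	 */
#if 0
	localization = 3 << 16;
	ksnprintf(buf, sizeof(buf), "@@-1:%05d", localization >> 16);
	/* buf now reads "@@-1:00003" */
#endif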
1835 	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1836 		char *ptr;
1837 		int bytes;
1838 
1839 		ptr = ip->ino_data.ext.symlink;
1840 		bytes = (int)ip->ino_data.size;
1841 		if (bytes == 10 &&
1842 		    ip->obj_asof == HAMMER_MAX_TID &&
1843 		    ip->obj_localization == 0 &&
1844 		    strncmp(ptr, "@@PFS", 5) == 0) {
1845 			hammer_simple_transaction(&trans, hmp);
1846 			bcopy(ptr + 5, buf, 5);
1847 			buf[5] = 0;
1848 			localization = strtoul(buf, NULL, 10) << 16;
1849 			pfsm = hammer_load_pseudofs(&trans, localization,
1850 						    &error);
1851 			if (error == 0) {
1852 				if (pfsm->pfsd.mirror_flags &
1853 				    HAMMER_PFSD_SLAVE) {
1854 					/* vap->va_size == 26 */
1855 					ksnprintf(buf, sizeof(buf),
1856 						  "@@0x%016llx:%05d",
1857 						  (long long)pfsm->pfsd.sync_end_tid,
1858 						  localization >> 16);
1859 				} else {
1860 					/* vap->va_size == 10 */
1861 					ksnprintf(buf, sizeof(buf),
1862 						  "@@-1:%05d",
1863 						  localization >> 16);
1864 #if 0
1865 					ksnprintf(buf, sizeof(buf),
1866 						  "@@0x%016llx:%05d",
1867 						  (long long)HAMMER_MAX_TID,
1868 						  localization >> 16);
1869 #endif
1870 				}
1871 				ptr = buf;
1872 				bytes = strlen(buf);
1873 			}
1874 			if (pfsm)
1875 				hammer_rel_pseudofs(hmp, pfsm);
1876 			hammer_done_transaction(&trans);
1877 		}
1878 		error = uiomove(ptr, bytes, ap->a_uio);
1879 		lwkt_reltoken(&hmp->fs_token);
1880 		return(error);
1881 	}
1882 
1883 	/*
1884 	 * Long version
1885 	 */
1886 	hammer_simple_transaction(&trans, hmp);
1887 	++hammer_stats_file_iopsr;
1888 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1889 
1890 	/*
1891 	 * The link target is stored as a single HAMMER_RECTYPE_FIX record
1892 	 * keyed by HAMMER_FIXKEY_SYMLINK, so this is a point lookup.
1893 	 */
1894 	cursor.key_beg.localization = ip->obj_localization +
1895 				      HAMMER_LOCALIZE_MISC;
1896 	cursor.key_beg.obj_id = ip->obj_id;
1897 	cursor.key_beg.create_tid = 0;
1898 	cursor.key_beg.delete_tid = 0;
1899 	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1900 	cursor.key_beg.obj_type = 0;
1901 	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1902 	cursor.asof = ip->obj_asof;
1903 	cursor.flags |= HAMMER_CURSOR_ASOF;
1904 
1905 	error = hammer_ip_lookup(&cursor);
1906 	if (error == 0) {
1907 		error = hammer_ip_resolve_data(&cursor);
1908 		if (error == 0) {
1909 			KKASSERT(cursor.leaf->data_len >=
1910 				 HAMMER_SYMLINK_NAME_OFF);
1911 			error = uiomove(cursor.data->symlink.name,
1912 					cursor.leaf->data_len -
1913 						HAMMER_SYMLINK_NAME_OFF,
1914 					ap->a_uio);
1915 		}
1916 	}
1917 	hammer_done_cursor(&cursor);
1918 	hammer_done_transaction(&trans);
1919 	lwkt_reltoken(&hmp->fs_token);
1920 	return(error);
1921 }
1922 
1923 /*
1924  * hammer_vop_nremove { nch, dvp, cred }
1925  */
1926 static
1927 int
1928 hammer_vop_nremove(struct vop_nremove_args *ap)
1929 {
1930 	struct hammer_transaction trans;
1931 	struct hammer_inode *dip;
1932 	hammer_mount_t hmp;
1933 	int error;
1934 
1935 	dip = VTOI(ap->a_dvp);
1936 	hmp = dip->hmp;
1937 
1938 	if (hammer_nohistory(dip) == 0 &&
1939 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1940 		return (error);
1941 	}
1942 
1943 	lwkt_gettoken(&hmp->fs_token);
1944 	hammer_start_transaction(&trans, hmp);
1945 	++hammer_stats_file_iopsw;
1946 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1947 	hammer_done_transaction(&trans);
1948 	if (error == 0)
1949 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1950 	lwkt_reltoken(&hmp->fs_token);
1951 	return (error);
1952 }
1953 
1954 /*
1955  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1956  */
1957 static
1958 int
1959 hammer_vop_nrename(struct vop_nrename_args *ap)
1960 {
1961 	struct hammer_transaction trans;
1962 	struct namecache *fncp;
1963 	struct namecache *tncp;
1964 	struct hammer_inode *fdip;
1965 	struct hammer_inode *tdip;
1966 	struct hammer_inode *ip;
1967 	hammer_mount_t hmp;
1968 	struct hammer_cursor cursor;
1969 	int64_t namekey;
1970 	u_int32_t max_iterations;
1971 	int nlen, error;
1972 
1973 	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1974 		return(EXDEV);
1975 	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1976 		return(EXDEV);
1977 
1978 	fdip = VTOI(ap->a_fdvp);
1979 	tdip = VTOI(ap->a_tdvp);
1980 	fncp = ap->a_fnch->ncp;
1981 	tncp = ap->a_tnch->ncp;
1982 	ip = VTOI(fncp->nc_vp);
1983 	KKASSERT(ip != NULL);
1984 
1985 	hmp = ip->hmp;
1986 
1987 	if (fdip->obj_localization != tdip->obj_localization)
1988 		return(EXDEV);
1989 	if (fdip->obj_localization != ip->obj_localization)
1990 		return(EXDEV);
1991 
1992 	if (fdip->flags & HAMMER_INODE_RO)
1993 		return (EROFS);
1994 	if (tdip->flags & HAMMER_INODE_RO)
1995 		return (EROFS);
1996 	if (ip->flags & HAMMER_INODE_RO)
1997 		return (EROFS);
1998 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1999 		return (error);
2000 
2001 	lwkt_gettoken(&hmp->fs_token);
2002 	hammer_start_transaction(&trans, hmp);
2003 	++hammer_stats_file_iopsw;
2004 
2005 	/*
2006 	 * Remove tncp from the target directory and then link ip as
2007 	 * tncp.
2008 	 *
2009 	 * Force the inode sync-time to match the transaction so it is
2010 	 * in-sync with the creation of the target directory entry.
2011 	 */
2012 	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
2013 				ap->a_cred, 0, -1);
2014 	if (error == 0 || error == ENOENT) {
2015 		error = hammer_ip_add_directory(&trans, tdip,
2016 						tncp->nc_name, tncp->nc_nlen,
2017 						ip);
2018 		if (error == 0) {
2019 			ip->ino_data.parent_obj_id = tdip->obj_id;
2020 			ip->ino_data.ctime = trans.time;
2021 			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
2022 		}
2023 	}
2024 	if (error)
2025 		goto failed; /* XXX */
2026 
2027 	/*
2028 	 * Locate the record in the originating directory and remove it.
2029 	 *
2030 	 * Calculate the namekey and setup the key range for the scan.  This
2031 	 * works kinda like a chained hash table where the lower 32 bits
2032 	 * of the namekey synthesize the chain.
2033 	 *
2034 	 * The key range is inclusive of both key_beg and key_end.
2035 	 */
2036 	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2037 					   &max_iterations);
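	/*
	 * Compiled-out sketch of the collision chain: colliding names
	 * share the hash portion of the key and the scan window is
	 * exactly [namekey, namekey + max_iterations] inclusive.
	 */
#if 0
	{
		int64_t key;

		for (key = namekey; key <= namekey + (int64_t)max_iterations;
		     ++key) {
			/* candidate dirent; the stored name must still
			   be byte-compared against fncp->nc_name */
		}
	}
#endif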
2038 retry:
2039 	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2040 	cursor.key_beg.localization = fdip->obj_localization +
2041 				      hammer_dir_localization(fdip);
2042 	cursor.key_beg.obj_id = fdip->obj_id;
2043 	cursor.key_beg.key = namekey;
2044 	cursor.key_beg.create_tid = 0;
2045 	cursor.key_beg.delete_tid = 0;
2046 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2047 	cursor.key_beg.obj_type = 0;
2048 
2049 	cursor.key_end = cursor.key_beg;
2050 	cursor.key_end.key += max_iterations;
2051 	cursor.asof = fdip->obj_asof;
2052 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2053 
2054 	/*
2055 	 * Scan all matching records (the chain), locate the one matching
2056 	 * the requested path component.
2057 	 *
2058 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2059 	 * records for the purposes of the search.
2060 	 */
2061 	error = hammer_ip_first(&cursor);
2062 	while (error == 0) {
2063 		if (hammer_ip_resolve_data(&cursor) != 0)
2064 			break;
2065 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2066 		KKASSERT(nlen > 0);
2067 		if (fncp->nc_nlen == nlen &&
2068 		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2069 			break;
2070 		}
2071 		error = hammer_ip_next(&cursor);
2072 	}
2073 
2074 	/*
2075 	 * If all is ok we have to get the inode so we can adjust nlinks.
2076 	 *
2077 	 * WARNING: hammer_ip_del_directory() may have to terminate the
2078 	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2079 	 * twice.
2080 	 */
2081 	if (error == 0)
2082 		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2083 
2084 	/*
2085 	 * XXX A deadlock here will break rename's atomicity for the
2086 	 * purposes of crash recovery.
2087 	 */
2088 	if (error == EDEADLK) {
2089 		hammer_done_cursor(&cursor);
2090 		goto retry;
2091 	}
2092 
2093 	/*
2094 	 * Cleanup and tell the kernel that the rename succeeded.
2095 	 *
2096 	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2097 	 *	 without formally acquiring the vp since the vp might
2098 	 *	 have zero refs on it, or in the middle of a reclaim,
2099 	 *	 etc.
2100 	 */
2101 	hammer_done_cursor(&cursor);
2102 	if (error == 0) {
2103 		cache_rename(ap->a_fnch, ap->a_tnch);
2104 		hammer_knote(ap->a_fdvp, NOTE_WRITE);
2105 		hammer_knote(ap->a_tdvp, NOTE_WRITE);
2106 		while (ip->vp) {
2107 			struct vnode *vp;
2108 
2109 			error = hammer_get_vnode(ip, &vp);
2110 			if (error == 0 && vp) {
2111 				vn_unlock(vp);
2112 				hammer_knote(ip->vp, NOTE_RENAME);
2113 				vrele(vp);
2114 				break;
2115 			}
2116 			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2117 		}
2118 	}
2119 
2120 failed:
2121 	hammer_done_transaction(&trans);
2122 	lwkt_reltoken(&hmp->fs_token);
2123 	return (error);
2124 }
2125 
2126 /*
2127  * hammer_vop_nrmdir { nch, dvp, cred }
2128  */
2129 static
2130 int
2131 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2132 {
2133 	struct hammer_transaction trans;
2134 	struct hammer_inode *dip;
2135 	hammer_mount_t hmp;
2136 	int error;
2137 
2138 	dip = VTOI(ap->a_dvp);
2139 	hmp = dip->hmp;
2140 
2141 	if (hammer_nohistory(dip) == 0 &&
2142 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2143 		return (error);
2144 	}
2145 
2146 	lwkt_gettoken(&hmp->fs_token);
2147 	hammer_start_transaction(&trans, hmp);
2148 	++hammer_stats_file_iopsw;
2149 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2150 	hammer_done_transaction(&trans);
2151 	if (error == 0)
2152 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2153 	lwkt_reltoken(&hmp->fs_token);
2154 	return (error);
2155 }
2156 
2157 /*
2158  * hammer_vop_markatime { vp, cred }
2159  */
2160 static
2161 int
2162 hammer_vop_markatime(struct vop_markatime_args *ap)
2163 {
2164 	struct hammer_transaction trans;
2165 	struct hammer_inode *ip;
2166 	hammer_mount_t hmp;
2167 
2168 	ip = VTOI(ap->a_vp);
2169 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2170 		return (EROFS);
2171 	if (ip->flags & HAMMER_INODE_RO)
2172 		return (EROFS);
2173 	hmp = ip->hmp;
2174 	if (hmp->mp->mnt_flag & MNT_NOATIME)
2175 		return (0);
2176 	lwkt_gettoken(&hmp->fs_token);
2177 	hammer_start_transaction(&trans, hmp);
2178 	++hammer_stats_file_iopsw;
2179 
2180 	ip->ino_data.atime = trans.time;
2181 	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2182 	hammer_done_transaction(&trans);
2183 	hammer_knote(ap->a_vp, NOTE_ATTRIB);
2184 	lwkt_reltoken(&hmp->fs_token);
2185 	return (0);
2186 }
2187 
2188 /*
2189  * hammer_vop_setattr { vp, vap, cred }
2190  */
2191 static
2192 int
2193 hammer_vop_setattr(struct vop_setattr_args *ap)
2194 {
2195 	struct hammer_transaction trans;
2196 	struct hammer_inode *ip;
2197 	struct vattr *vap;
2198 	hammer_mount_t hmp;
2199 	int modflags;
2200 	int error;
2201 	int truncating;
2202 	int blksize;
2203 	int kflags;
2204 #if 0
2205 	int64_t aligned_size;
2206 #endif
2207 	u_int32_t flags;
2208 
2209 	vap = ap->a_vap;
2210 	ip = ap->a_vp->v_data;
2211 	modflags = 0;
2212 	kflags = 0;
2213 	hmp = ip->hmp;
2214 
2215 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2216 		return(EROFS);
2217 	if (ip->flags & HAMMER_INODE_RO)
2218 		return (EROFS);
2219 	if (hammer_nohistory(ip) == 0 &&
2220 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2221 		return (error);
2222 	}
2223 
2224 	lwkt_gettoken(&hmp->fs_token);
2225 	hammer_start_transaction(&trans, hmp);
2226 	++hammer_stats_file_iopsw;
2227 	error = 0;
2228 
2229 	if (vap->va_flags != VNOVAL) {
2230 		flags = ip->ino_data.uflags;
2231 		error = vop_helper_setattr_flags(&flags, vap->va_flags,
2232 					 hammer_to_unix_xid(&ip->ino_data.uid),
2233 					 ap->a_cred);
2234 		if (error == 0) {
2235 			if (ip->ino_data.uflags != flags) {
2236 				ip->ino_data.uflags = flags;
2237 				ip->ino_data.ctime = trans.time;
2238 				modflags |= HAMMER_INODE_DDIRTY;
2239 				kflags |= NOTE_ATTRIB;
2240 			}
2241 			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2242 				error = 0;
2243 				goto done;
2244 			}
2245 		}
2246 		goto done;
2247 	}
2248 	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2249 		error = EPERM;
2250 		goto done;
2251 	}
2252 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2253 		mode_t cur_mode = ip->ino_data.mode;
2254 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2255 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2256 		uuid_t uuid_uid;
2257 		uuid_t uuid_gid;
2258 
2259 		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2260 					 ap->a_cred,
2261 					 &cur_uid, &cur_gid, &cur_mode);
2262 		if (error == 0) {
2263 			hammer_guid_to_uuid(&uuid_uid, cur_uid);
2264 			hammer_guid_to_uuid(&uuid_gid, cur_gid);
2265 			if (bcmp(&uuid_uid, &ip->ino_data.uid,
2266 				 sizeof(uuid_uid)) ||
2267 			    bcmp(&uuid_gid, &ip->ino_data.gid,
2268 				 sizeof(uuid_gid)) ||
2269 			    ip->ino_data.mode != cur_mode
2270 			) {
2271 				ip->ino_data.uid = uuid_uid;
2272 				ip->ino_data.gid = uuid_gid;
2273 				ip->ino_data.mode = cur_mode;
2274 				ip->ino_data.ctime = trans.time;
2275 				modflags |= HAMMER_INODE_DDIRTY;
2276 			}
2277 			kflags |= NOTE_ATTRIB;
2278 		}
2279 	}
2280 	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2281 		switch(ap->a_vp->v_type) {
2282 		case VREG:
2283 			if (vap->va_size == ip->ino_data.size)
2284 				break;
2285 
2286 			/*
2287 			 * Log the operation if in fast-fsync mode or if
2288 			 * there are unterminated redo write records present.
2289 			 *
2290 			 * The second check is needed so the recovery code
2291 			 * properly truncates write redos even if nominal
2292 			 * REDO operations is turned off due to excessive
2293 	 * REDO operations are turned off due to excessive
2294 			 * destroyed and never lay down a TERM_WRITE.
2295 			 */
2296 			if ((ip->flags & HAMMER_INODE_REDO) ||
2297 			    (ip->flags & HAMMER_INODE_RDIRTY)) {
2298 				error = hammer_generate_redo(&trans, ip,
2299 							     vap->va_size,
2300 							     HAMMER_REDO_TRUNC,
2301 							     NULL, 0);
2302 			}
2303 			blksize = hammer_blocksize(vap->va_size);
2304 
2305 			/*
2306 	 * XXX break atomicity, we can deadlock the backend
2307 			 * if we do not release the lock.  Probably not a
2308 			 * big deal here.
2309 			 */
2310 			if (vap->va_size < ip->ino_data.size) {
2311 				nvtruncbuf(ap->a_vp, vap->va_size,
2312 					   blksize,
2313 					   hammer_blockoff(vap->va_size),
2314 					   0);
2315 				truncating = 1;
2316 				kflags |= NOTE_WRITE;
2317 			} else {
2318 				nvextendbuf(ap->a_vp,
2319 					    ip->ino_data.size,
2320 					    vap->va_size,
2321 					    hammer_blocksize(ip->ino_data.size),
2322 					    hammer_blocksize(vap->va_size),
2323 					    hammer_blockoff(ip->ino_data.size),
2324 					    hammer_blockoff(vap->va_size),
2325 					    0);
2326 				truncating = 0;
2327 				kflags |= NOTE_WRITE | NOTE_EXTEND;
2328 			}
2329 			ip->ino_data.size = vap->va_size;
2330 			ip->ino_data.mtime = trans.time;
2331 			/* XXX safe to use SDIRTY instead of DDIRTY here? */
2332 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2333 
2334 			/*
2335 			 * On-media truncation is cached in the inode until
2336 			 * the inode is synchronized.  We must immediately
2337 			 * handle any frontend records.
2338 			 */
2339 			if (truncating) {
2340 				hammer_ip_frontend_trunc(ip, vap->va_size);
2341 #ifdef DEBUG_TRUNCATE
2342 				if (HammerTruncIp == NULL)
2343 					HammerTruncIp = ip;
2344 #endif
2345 				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2346 					ip->flags |= HAMMER_INODE_TRUNCATED;
2347 					ip->trunc_off = vap->va_size;
2348 #ifdef DEBUG_TRUNCATE
2349 					if (ip == HammerTruncIp)
2350 					kprintf("truncate1 %016llx\n",
2351 						(long long)ip->trunc_off);
2352 #endif
2353 				} else if (ip->trunc_off > vap->va_size) {
2354 					ip->trunc_off = vap->va_size;
2355 #ifdef DEBUG_TRUNCATE
2356 					if (ip == HammerTruncIp)
2357 					kprintf("truncate2 %016llx\n",
2358 						(long long)ip->trunc_off);
2359 #endif
2360 				} else {
2361 #ifdef DEBUG_TRUNCATE
2362 					if (ip == HammerTruncIp)
2363 					kprintf("truncate3 %016llx (ignored)\n",
2364 						(long long)vap->va_size);
2365 #endif
2366 				}
2367 			}
2368 
2369 #if 0
2370 			/*
2371 			 * When truncating, nvtruncbuf() may have cleaned out
2372 			 * a portion of the last block on-disk in the buffer
2373 			 * cache.  We must clean out any frontend records
2374 			 * for blocks beyond the new last block.
2375 			 */
2376 			aligned_size = (vap->va_size + (blksize - 1)) &
2377 				       ~(int64_t)(blksize - 1);
2378 			if (truncating && vap->va_size < aligned_size) {
2379 				aligned_size -= blksize;
2380 				hammer_ip_frontend_trunc(ip, aligned_size);
2381 			}
2382 #endif
2383 			break;
2384 		case VDATABASE:
2385 			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2386 				ip->flags |= HAMMER_INODE_TRUNCATED;
2387 				ip->trunc_off = vap->va_size;
2388 			} else if (ip->trunc_off > vap->va_size) {
2389 				ip->trunc_off = vap->va_size;
2390 			}
2391 			hammer_ip_frontend_trunc(ip, vap->va_size);
2392 			ip->ino_data.size = vap->va_size;
2393 			ip->ino_data.mtime = trans.time;
2394 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2395 			kflags |= NOTE_ATTRIB;
2396 			break;
2397 		default:
2398 			error = EINVAL;
2399 			goto done;
2400 		}
2401 		break;
2402 	}
2403 	if (vap->va_atime.tv_sec != VNOVAL) {
2404 		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2405 		modflags |= HAMMER_INODE_ATIME;
2406 		kflags |= NOTE_ATTRIB;
2407 	}
2408 	if (vap->va_mtime.tv_sec != VNOVAL) {
2409 		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2410 		modflags |= HAMMER_INODE_MTIME;
2411 		kflags |= NOTE_ATTRIB;
2412 	}
2413 	if (vap->va_mode != (mode_t)VNOVAL) {
2414 		mode_t   cur_mode = ip->ino_data.mode;
2415 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2416 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2417 
2418 		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2419 					 cur_uid, cur_gid, &cur_mode);
2420 		if (error == 0 && ip->ino_data.mode != cur_mode) {
2421 			ip->ino_data.mode = cur_mode;
2422 			ip->ino_data.ctime = trans.time;
2423 			modflags |= HAMMER_INODE_DDIRTY;
2424 			kflags |= NOTE_ATTRIB;
2425 		}
2426 	}
2427 done:
2428 	if (error == 0)
2429 		hammer_modify_inode(&trans, ip, modflags);
2430 	hammer_done_transaction(&trans);
2431 	hammer_knote(ap->a_vp, kflags);
2432 	lwkt_reltoken(&hmp->fs_token);
2433 	return (error);
2434 }
2435 
2436 /*
2437  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2438  */
2439 static
2440 int
2441 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2442 {
2443 	struct hammer_transaction trans;
2444 	struct hammer_inode *dip;
2445 	struct hammer_inode *nip;
2446 	hammer_record_t record;
2447 	struct nchandle *nch;
2448 	hammer_mount_t hmp;
2449 	int error;
2450 	int bytes;
2451 
2452 	ap->a_vap->va_type = VLNK;
2453 
2454 	nch = ap->a_nch;
2455 	dip = VTOI(ap->a_dvp);
2456 	hmp = dip->hmp;
2457 
2458 	if (dip->flags & HAMMER_INODE_RO)
2459 		return (EROFS);
2460 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2461 		return (error);
2462 
2463 	/*
2464 	 * Create a transaction to cover the operations we perform.
2465 	 */
2466 	lwkt_gettoken(&hmp->fs_token);
2467 	hammer_start_transaction(&trans, hmp);
2468 	++hammer_stats_file_iopsw;
2469 
2470 	/*
2471 	 * Create a new filesystem object of the requested type.  The
2472 	 * returned inode will be referenced but not locked.
2473 	 */
2475 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2476 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2477 				    NULL, &nip);
2478 	if (error) {
2479 		hammer_done_transaction(&trans);
2480 		*ap->a_vpp = NULL;
2481 		lwkt_reltoken(&hmp->fs_token);
2482 		return (error);
2483 	}
2484 
2485 	/*
2486 	 * Add a record representing the symlink.  The symlink is stored
2487 	 * as pure data, not a string, and is not \0 terminated.
2488 	 */
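	/*
	 * Compiled-out sketch: short targets are inlined in the inode
	 * itself, longer targets become a separate HAMMER_RECTYPE_FIX
	 * record keyed by HAMMER_FIXKEY_SYMLINK (see below).
	 */
#if 0
	if (strlen(ap->a_target) <= HAMMER_INODE_BASESYMLEN) {
		/* fits in ino_data.ext.symlink, no extra record */
	} else {
		/* stored as record data; readlink scans for the record */
	}
#endif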
2489 	if (error == 0) {
2490 		bytes = strlen(ap->a_target);
2491 
2492 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
2493 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2494 		} else {
2495 			record = hammer_alloc_mem_record(nip, bytes);
2496 			record->type = HAMMER_MEM_RECORD_GENERAL;
2497 
2498 			record->leaf.base.localization = nip->obj_localization +
2499 							 HAMMER_LOCALIZE_MISC;
2500 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2501 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2502 			record->leaf.data_len = bytes;
2503 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2504 			bcopy(ap->a_target, record->data->symlink.name, bytes);
2505 			error = hammer_ip_add_record(&trans, record);
2506 		}
2507 
2508 		/*
2509 		 * Set the file size to the length of the link.
2510 		 */
2511 		if (error == 0) {
2512 			nip->ino_data.size = bytes;
2513 			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2514 		}
2515 	}
2516 	if (error == 0)
2517 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2518 						nch->ncp->nc_nlen, nip);
2519 
2520 	/*
2521 	 * Finish up.
2522 	 */
2523 	if (error) {
2524 		hammer_rel_inode(nip, 0);
2525 		*ap->a_vpp = NULL;
2526 	} else {
2527 		error = hammer_get_vnode(nip, ap->a_vpp);
2528 		hammer_rel_inode(nip, 0);
2529 		if (error == 0) {
2530 			cache_setunresolved(ap->a_nch);
2531 			cache_setvp(ap->a_nch, *ap->a_vpp);
2532 			hammer_knote(ap->a_dvp, NOTE_WRITE);
2533 		}
2534 	}
2535 	hammer_done_transaction(&trans);
2536 	lwkt_reltoken(&hmp->fs_token);
2537 	return (error);
2538 }
2539 
2540 /*
2541  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2542  */
2543 static
2544 int
2545 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2546 {
2547 	struct hammer_transaction trans;
2548 	struct hammer_inode *dip;
2549 	hammer_mount_t hmp;
2550 	int error;
2551 
2552 	dip = VTOI(ap->a_dvp);
2553 	hmp = dip->hmp;
2554 
2555 	if (hammer_nohistory(dip) == 0 &&
2556 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2557 		return (error);
2558 	}
2559 
2560 	lwkt_gettoken(&hmp->fs_token);
2561 	hammer_start_transaction(&trans, hmp);
2562 	++hammer_stats_file_iopsw;
2563 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2564 				ap->a_cred, ap->a_flags, -1);
2565 	hammer_done_transaction(&trans);
2566 	lwkt_reltoken(&hmp->fs_token);
2567 
2568 	return (error);
2569 }
2570 
2571 /*
2572  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2573  */
2574 static
2575 int
2576 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2577 {
2578 	struct hammer_inode *ip = ap->a_vp->v_data;
2579 	hammer_mount_t hmp = ip->hmp;
2580 	int error;
2581 
2582 	++hammer_stats_file_iopsr;
2583 	lwkt_gettoken(&hmp->fs_token);
2584 	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2585 			     ap->a_fflag, ap->a_cred);
2586 	lwkt_reltoken(&hmp->fs_token);
2587 	return (error);
2588 }
2589 
2590 static
2591 int
2592 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2593 {
2594 	static const struct mountctl_opt extraopt[] = {
2595 		{ HMNT_NOHISTORY,	"nohistory" },
2596 		{ HMNT_MASTERID,	"master" },
2597 		{ 0, NULL }
2599 	};
2600 	struct hammer_mount *hmp;
2601 	struct mount *mp;
2602 	int usedbytes;
2603 	int error;
2604 
2605 	error = 0;
2606 	usedbytes = 0;
2607 	mp = ap->a_head.a_ops->head.vv_mount;
2608 	KKASSERT(mp->mnt_data != NULL);
2609 	hmp = (struct hammer_mount *)mp->mnt_data;
2610 
2611 	lwkt_gettoken(&hmp->fs_token);
2612 
2613 	switch(ap->a_op) {
2614 	case MOUNTCTL_SET_EXPORT:
2615 		if (ap->a_ctllen != sizeof(struct export_args))
2616 			error = EINVAL;
2617 		else
2618 			error = hammer_vfs_export(mp, ap->a_op,
2619 				      (const struct export_args *)ap->a_ctl);
2620 		break;
2621 	case MOUNTCTL_MOUNTFLAGS:
2622 	{
2623 		/*
2624 		 * Call standard mountctl VOP function
2625 		 * so we get user mount flags.
2626 		 */
2627 		error = vop_stdmountctl(ap);
2628 		if (error)
2629 			break;
2630 
2631 		usedbytes = *ap->a_res;
2632 
2633 		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2634 			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2635 						    ap->a_buf,
2636 						    ap->a_buflen - usedbytes,
2637 						    &error);
2638 		}
2639 
2640 		*ap->a_res += usedbytes;
2641 		break;
2642 	}
2643 	default:
2644 		error = vop_stdmountctl(ap);
2645 		break;
2646 	}
2647 	lwkt_reltoken(&hmp->fs_token);
2648 	return(error);
2649 }
2650 
2651 /*
2652  * hammer_vop_strategy { vp, bio }
2653  *
2654  * Strategy call, used for regular file read & write only.  Note that the
2655  * bp may represent a cluster.
2656  *
2657  * To simplify operation and allow better optimizations in the future,
2658  * this code does not make any assumptions with regard to buffer alignment
2659  * or size.
2660  */
2661 static
2662 int
2663 hammer_vop_strategy(struct vop_strategy_args *ap)
2664 {
2665 	struct buf *bp;
2666 	int error;
2667 
2668 	bp = ap->a_bio->bio_buf;
2669 
2670 	switch(bp->b_cmd) {
2671 	case BUF_CMD_READ:
2672 		error = hammer_vop_strategy_read(ap);
2673 		break;
2674 	case BUF_CMD_WRITE:
2675 		error = hammer_vop_strategy_write(ap);
2676 		break;
2677 	default:
2678 		bp->b_error = error = EINVAL;
2679 		bp->b_flags |= B_ERROR;
2680 		biodone(ap->a_bio);
2681 		break;
2682 	}
2683 
2684 	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2685 
2686 	return (error);
2687 }
2688 
2689 /*
2690  * Read from a regular file.  Iterate the related records and fill in the
2691  * BIO/BUF.  Gaps are zero-filled.
2692  *
2693  * The support code in hammer_object.c should be used to deal with mixed
2694  * in-memory and on-disk records.
2695  *
2696  * NOTE: Can be called from the cluster code with an oversized buf.
2697  *
2698  * XXX atime update
2699  */
2700 static
2701 int
2702 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2703 {
2704 	struct hammer_transaction trans;
2705 	struct hammer_inode *ip;
2706 	struct hammer_inode *dip;
2707 	hammer_mount_t hmp;
2708 	struct hammer_cursor cursor;
2709 	hammer_base_elm_t base;
2710 	hammer_off_t disk_offset;
2711 	struct bio *bio;
2712 	struct bio *nbio;
2713 	struct buf *bp;
2714 	int64_t rec_offset;
2715 	int64_t ran_end;
2716 	int64_t tmp64;
2717 	int error;
2718 	int boff;
2719 	int roff;
2720 	int n;
2721 	int isdedupable;
2722 
2723 	bio = ap->a_bio;
2724 	bp = bio->bio_buf;
2725 	ip = ap->a_vp->v_data;
2726 	hmp = ip->hmp;
2727 
2728 	/*
2729 	 * The zone-2 disk offset may have been set by the cluster code via
2730 	 * a BMAP operation, or else should be NOOFFSET.
2731 	 *
2732 	 * Checking the high bits for a match against zone-2 should suffice.
2733 	 *
2734 	 * In cases where a lot of data duplication is present it may be
2735 	 * more beneficial to drop through and double-buffer through the
2736 	 * device.
2737 	 */
2738 	nbio = push_bio(bio);
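	/*
	 * Compiled-out sketch, assuming the conventional HAMMER layout
	 * in which the top 4 bits of a 64 bit offset encode the zone
	 * index; this is what the mask compare below actually tests.
	 */
#if 0
	{
		int zone = (int)(nbio->bio_offset >> 60) & 0x0f;

		/* a large-data zone index means BMAP already translated
		   the offset; anything else should be NOOFFSET */
	}
#endif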
2739 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2740 	    HAMMER_ZONE_LARGE_DATA) {
2741 		if (hammer_double_buffer == 0) {
2742 			lwkt_gettoken(&hmp->fs_token);
2743 			error = hammer_io_direct_read(hmp, nbio, NULL);
2744 			lwkt_reltoken(&hmp->fs_token);
2745 			return (error);
2746 		}
2747 
2748 		/*
2749 		 * Try to shortcut requests for double_buffer mode too.
2750 	 * Since this mode runs through the device buffer cache,
2751 		 * only compatible buffer sizes (meaning those generated
2752 		 * by normal filesystem buffers) are legal.
2753 		 */
2754 		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2755 			error = hammer_io_indirect_read(hmp, nbio, NULL);
2756 			return (error);
2757 		}
2758 	}
2759 
2760 	/*
2761 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2762 	 * aligned we may still be able to issue a direct-read.
2763 	 */
2764 	lwkt_gettoken(&hmp->fs_token);
2765 	hammer_simple_transaction(&trans, hmp);
2766 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2767 
2768 	/*
2769 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2770 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2771 	 * first record containing bio_offset will have a key > bio_offset.
2772 	 */
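	/*
	 * Worked example (compiled out): a 16KB record covering file
	 * offsets [0, 16384) is keyed at 16384, so a read at offset 0
	 * probes key 0 + 1 and recovers the base as key - data_len.
	 */
#if 0
	{
		int64_t key = 16384;			/* BASE + LEN */
		int32_t data_len = 16384;
		int64_t base_off = key - data_len;	/* == 0 */
	}
#endif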
2773 	cursor.key_beg.localization = ip->obj_localization +
2774 				      HAMMER_LOCALIZE_MISC;
2775 	cursor.key_beg.obj_id = ip->obj_id;
2776 	cursor.key_beg.create_tid = 0;
2777 	cursor.key_beg.delete_tid = 0;
2778 	cursor.key_beg.obj_type = 0;
2779 	cursor.key_beg.key = bio->bio_offset + 1;
2780 	cursor.asof = ip->obj_asof;
2781 	cursor.flags |= HAMMER_CURSOR_ASOF;
2782 
2783 	cursor.key_end = cursor.key_beg;
2784 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2785 #if 0
2786 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2787 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2788 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2789 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2790 	} else
2791 #endif
2792 	{
2793 		ran_end = bio->bio_offset + bp->b_bufsize;
2794 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2795 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2796 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2797 		if (tmp64 < ran_end)
2798 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2799 		else
2800 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2801 	}
2802 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2803 
2804 	/*
2805 	 * Set NOSWAPCACHE for cursor data extraction if double buffering
2806 	 * is disabled, or if the file is not marked cacheable via chflags
2807 	 * and vm.swapcache_use_chflags is enabled.
2808 	 */
2809 	if (hammer_double_buffer == 0 ||
2810 	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2811 	     vm_swapcache_use_chflags)) {
2812 		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2813 	}
2814 
2815 	error = hammer_ip_first(&cursor);
2816 	boff = 0;
2817 
2818 	while (error == 0) {
2819 		/*
2820 		 * Get the base file offset of the record.  The key for
2821 		 * data records is (base + bytes) rather than (base).
2822 		 */
2823 		base = &cursor.leaf->base;
2824 		rec_offset = base->key - cursor.leaf->data_len;
2825 
2826 		/*
2827 		 * Calculate the gap, if any, and zero-fill it.
2828 		 *
2829 		 * n is the offset of the start of the record versus our
2830 		 * current seek offset in the bio.
2831 		 */
2832 		n = (int)(rec_offset - (bio->bio_offset + boff));
2833 		if (n > 0) {
2834 			if (n > bp->b_bufsize - boff)
2835 				n = bp->b_bufsize - boff;
2836 			bzero((char *)bp->b_data + boff, n);
2837 			boff += n;
2838 			n = 0;
2839 		}
2840 
2841 		/*
2842 		 * Calculate the data offset in the record and the number
2843 		 * of bytes we can copy.
2844 		 *
2845 		 * There are two degenerate cases.  First, boff may already
2846 		 * be at bp->b_bufsize.  Secondly, the data offset within
2847 		 * the record may exceed the record's size.
2848 		 */
2849 		roff = -n;
2850 		rec_offset += roff;
2851 		n = cursor.leaf->data_len - roff;
2852 		if (n <= 0) {
2853 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2854 			n = 0;
2855 		} else if (n > bp->b_bufsize - boff) {
2856 			n = bp->b_bufsize - boff;
2857 		}
2858 
2859 		/*
2860 		 * Deal with cached truncations.  This cool bit of code
2861 		 * allows truncate()/ftruncate() to avoid having to sync
2862 		 * the file.
2863 		 *
2864 		 * If the frontend is truncated then all backend records are
2865 		 * subject to the frontend's truncation.
2866 		 *
2867 		 * If the backend is truncated then backend records on-disk
2868 		 * (but not in-memory) are subject to the backend's
2869 		 * truncation.  In-memory records owned by the backend
2870 		 * represent data written after the truncation point on the
2871 		 * backend and must not be truncated.
2872 		 *
2873 		 * Truncate operations deal with frontend buffer cache
2874 		 * buffers and frontend-owned in-memory records synchronously.
2875 		 */
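		/*
		 * Worked example (compiled out): with trunc_off at 6000,
		 * a record covering [4096, 8192) is clipped to 6000 -
		 * 4096 = 1904 copyable bytes, while a record based at or
		 * beyond 6000 contributes nothing.
		 */
#if 0
		{
			int64_t ex_trunc = 6000;	/* illustrative */
			int64_t ex_base = 4096;
			int ex_n = (int)(ex_trunc - ex_base);	/* 1904 */
		}
#endif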
2876 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2877 			if (hammer_cursor_ondisk(&cursor)/* ||
2878 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2879 				if (ip->trunc_off <= rec_offset)
2880 					n = 0;
2881 				else if (ip->trunc_off < rec_offset + n)
2882 					n = (int)(ip->trunc_off - rec_offset);
2883 			}
2884 		}
2885 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2886 			if (hammer_cursor_ondisk(&cursor)) {
2887 				if (ip->sync_trunc_off <= rec_offset)
2888 					n = 0;
2889 				else if (ip->sync_trunc_off < rec_offset + n)
2890 					n = (int)(ip->sync_trunc_off - rec_offset);
2891 			}
2892 		}
2893 
2894 		/*
2895 		 * Try to issue a direct read into our bio if possible,
2896 		 * otherwise resolve the element data into a hammer_buffer
2897 		 * and copy.
2898 		 *
2899 		 * The buffer on-disk should be zeroed past any real
2900 		 * truncation point, but may not be for any synthesized
2901 		 * truncation point from above.
2902 		 *
2903 		 * NOTE: disk_offset is only valid if the cursor data is
2904 		 *	 on-disk.
2905 		 */
2906 		disk_offset = cursor.leaf->data_offset + roff;
2907 		isdedupable = (boff == 0 && n == bp->b_bufsize &&
2908 			       hammer_cursor_ondisk(&cursor) &&
2909 			       ((int)disk_offset & HAMMER_BUFMASK) == 0);
2910 
2911 		if (isdedupable && hammer_double_buffer == 0) {
2912 			/*
2913 			 * Direct read case
2914 			 */
2915 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2916 				 HAMMER_ZONE_LARGE_DATA);
2917 			nbio->bio_offset = disk_offset;
2918 			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2919 			if (hammer_live_dedup && error == 0)
2920 				hammer_dedup_cache_add(ip, cursor.leaf);
2921 			goto done;
2922 		} else if (isdedupable) {
2923 			/*
2924 			 * Async I/O case for reading from backing store
2925 			 * and copying the data to the filesystem buffer.
2926 			 * live-dedup has to verify the data anyway if it
2927 			 * gets a hit later so we can just add the entry
2928 			 * now.
2929 			 */
2930 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2931 				 HAMMER_ZONE_LARGE_DATA);
2932 			nbio->bio_offset = disk_offset;
2933 			if (hammer_live_dedup)
2934 				hammer_dedup_cache_add(ip, cursor.leaf);
2935 			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2936 			goto done;
2937 		} else if (n) {
2938 			error = hammer_ip_resolve_data(&cursor);
2939 			if (error == 0) {
2940 				if (hammer_live_dedup && isdedupable)
2941 					hammer_dedup_cache_add(ip, cursor.leaf);
2942 				bcopy((char *)cursor.data + roff,
2943 				      (char *)bp->b_data + boff, n);
2944 			}
2945 		}
2946 		if (error)
2947 			break;
2948 
2949 		/*
2950 		 * We have to be sure that the only elements added to the
2951 		 * dedup cache are those which are already on-media.
2952 		 */
2953 		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2954 			hammer_dedup_cache_add(ip, cursor.leaf);
2955 
2956 		/*
2957 		 * Iterate until we have filled the request.
2958 		 */
2959 		boff += n;
2960 		if (boff == bp->b_bufsize)
2961 			break;
2962 		error = hammer_ip_next(&cursor);
2963 	}
2964 
2965 	/*
2966 	 * There may have been a gap after the last record
2967 	 */
2968 	if (error == ENOENT)
2969 		error = 0;
2970 	if (error == 0 && boff != bp->b_bufsize) {
2971 		KKASSERT(boff < bp->b_bufsize);
2972 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2973 		/* boff = bp->b_bufsize; */
2974 	}
2975 
2976 	/*
2977 	 * Disallow swapcache operation on the vnode buffer if double
2978 	 * buffering is enabled; the swapcache will get the data via
2979 	 * the block device buffer.
2980 	 */
2981 	if (hammer_double_buffer)
2982 		bp->b_flags |= B_NOTMETA;
2983 
2984 	/*
2985 	 * Cleanup
2986 	 */
2987 	bp->b_resid = 0;
2988 	bp->b_error = error;
2989 	if (error)
2990 		bp->b_flags |= B_ERROR;
2991 	biodone(ap->a_bio);
2992 
2993 done:
2994 	/*
2995 	 * Cache the b-tree node for the last data read in cache[1].
2996 	 *
2997 	 * If we hit the file EOF then also cache the node in the
2998 	 * governing directory's cache[3]; it will be used to initialize
2999 	 * the inode's cache[1] for any inodes looked up via the directory.
3000 	 *
3001 	 * This doesn't reduce disk accesses since the B-Tree chain is
3002 	 * likely cached, but it does reduce cpu overhead when looking
3003 	 * up file offsets for cpdup/tar/cpio style iterations.
3004 	 */
3005 	if (cursor.node)
3006 		hammer_cache_node(&ip->cache[1], cursor.node);
3007 	if (ran_end >= ip->ino_data.size) {
3008 		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
3009 					ip->obj_asof, ip->obj_localization);
3010 		if (dip) {
3011 			hammer_cache_node(&dip->cache[3], cursor.node);
3012 			hammer_rel_inode(dip, 0);
3013 		}
3014 	}
3015 	hammer_done_cursor(&cursor);
3016 	hammer_done_transaction(&trans);
3017 	lwkt_reltoken(&hmp->fs_token);
3018 	return(error);
3019 }
3020 
3021 /*
3022  * BMAP operation - used to support cluster_read() only.
3023  *
3024  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3025  *
3026  * This routine may return EOPNOTSUPP if the operation is not supported for
3027  * the specified offset.  The contents of the pointer arguments do not
3028  * need to be initialized in that case.
3029  *
3030  * If a disk address is available and properly aligned return 0 with
3031  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3032  * to the run-length relative to that offset.  Callers may assume that
3033  * *doffsetp is valid whenever 0 is returned, even if *runp is small,
3034  * so return EOPNOTSUPP instead of 0 when the run is not large enough.
3035  */
3036 static
3037 int
3038 hammer_vop_bmap(struct vop_bmap_args *ap)
3039 {
3040 	struct hammer_transaction trans;
3041 	struct hammer_inode *ip;
3042 	hammer_mount_t hmp;
3043 	struct hammer_cursor cursor;
3044 	hammer_base_elm_t base;
3045 	int64_t rec_offset;
3046 	int64_t ran_end;
3047 	int64_t tmp64;
3048 	int64_t base_offset;
3049 	int64_t base_disk_offset;
3050 	int64_t last_offset;
3051 	hammer_off_t last_disk_offset;
3052 	hammer_off_t disk_offset;
3053 	int	rec_len;
3054 	int	error;
3055 	int	blksize;
3056 
3057 	++hammer_stats_file_iopsr;
3058 	ip = ap->a_vp->v_data;
3059 	hmp = ip->hmp;
3060 
3061 	/*
3062 	 * We can only BMAP regular files.  We can't BMAP database files,
3063 	 * directories, etc.
3064 	 */
3065 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
3066 		return(EOPNOTSUPP);
3067 
3068 	/*
3069 	 * bmap is typically called with runp/runb both NULL when used
3070 	 * for writing.  We do not support BMAP for writing atm.
3071 	 */
3072 	if (ap->a_cmd != BUF_CMD_READ)
3073 		return(EOPNOTSUPP);
3074 
3075 	/*
3076 	 * Scan the B-Tree to acquire blockmap addresses, then translate
3077 	 * to raw addresses.
3078 	 */
3079 	lwkt_gettoken(&hmp->fs_token);
3080 	hammer_simple_transaction(&trans, hmp);
3081 #if 0
3082 	kprintf("bmap_beg %016llx ip->cache %p\n",
3083 		(long long)ap->a_loffset, ip->cache[1]);
3084 #endif
3085 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3086 
3087 	/*
3088 	 * Key range (begin and end inclusive) to scan.  Note that the keys
3089 	 * stored in the actual records represent BASE+LEN, not BASE.  The
3090 	 * first record containing bio_offset will have a key > bio_offset.
3091 	 */
3092 	cursor.key_beg.localization = ip->obj_localization +
3093 				      HAMMER_LOCALIZE_MISC;
3094 	cursor.key_beg.obj_id = ip->obj_id;
3095 	cursor.key_beg.create_tid = 0;
3096 	cursor.key_beg.delete_tid = 0;
3097 	cursor.key_beg.obj_type = 0;
3098 	if (ap->a_runb)
3099 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3100 	else
3101 		cursor.key_beg.key = ap->a_loffset + 1;
3102 	if (cursor.key_beg.key < 0)
3103 		cursor.key_beg.key = 0;
3104 	cursor.asof = ip->obj_asof;
3105 	cursor.flags |= HAMMER_CURSOR_ASOF;
3106 
3107 	cursor.key_end = cursor.key_beg;
3108 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3109 
3110 	ran_end = ap->a_loffset + MAXPHYS;
3111 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3112 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3113 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
3114 	if (tmp64 < ran_end)
3115 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3116 	else
3117 		cursor.key_end.key = ran_end + MAXPHYS + 1;
3118 
3119 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3120 
3121 	error = hammer_ip_first(&cursor);
3122 	base_offset = last_offset = 0;
3123 	base_disk_offset = last_disk_offset = 0;
3124 
3125 	while (error == 0) {
3126 		/*
3127 		 * Get the base file offset of the record.  The key for
3128 		 * data records is (base + bytes) rather than (base).
3129 		 *
3130 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
3131 		 * The extra bytes should be zero on-disk and the BMAP op
3132 		 * should still be ok.
3133 		 */
3134 		base = &cursor.leaf->base;
3135 		rec_offset = base->key - cursor.leaf->data_len;
3136 		rec_len    = cursor.leaf->data_len;
3137 
3138 		/*
3139 		 * Incorporate any cached truncation.
3140 		 *
3141 		 * NOTE: Modifications to rec_len based on synthesized
3142 		 * truncation points remove the guarantee that any extended
3143 		 * data on disk is zero (since the truncations may not have
3144 		 * taken place on-media yet).
3145 		 */
3146 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
3147 			if (hammer_cursor_ondisk(&cursor) ||
3148 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3149 				if (ip->trunc_off <= rec_offset)
3150 					rec_len = 0;
3151 				else if (ip->trunc_off < rec_offset + rec_len)
3152 					rec_len = (int)(ip->trunc_off - rec_offset);
3153 			}
3154 		}
3155 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3156 			if (hammer_cursor_ondisk(&cursor)) {
3157 				if (ip->sync_trunc_off <= rec_offset)
3158 					rec_len = 0;
3159 				else if (ip->sync_trunc_off < rec_offset + rec_len)
3160 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
3161 			}
3162 		}
3163 
3164 		/*
3165 		 * Accumulate information.  If we have hit a discontiguous
3166 	 * block, reset base_offset unless we are already beyond the
3167 		 * requested offset.  If we are, that's it, we stop.
3168 		 */
3169 		if (error)
3170 			break;
3171 		if (hammer_cursor_ondisk(&cursor)) {
3172 			disk_offset = cursor.leaf->data_offset;
3173 			if (rec_offset != last_offset ||
3174 			    disk_offset != last_disk_offset) {
3175 				if (rec_offset > ap->a_loffset)
3176 					break;
3177 				base_offset = rec_offset;
3178 				base_disk_offset = disk_offset;
3179 			}
3180 			last_offset = rec_offset + rec_len;
3181 			last_disk_offset = disk_offset + rec_len;
3182 
3183 			if (hammer_live_dedup)
3184 				hammer_dedup_cache_add(ip, cursor.leaf);
3185 		}
3186 
3187 		error = hammer_ip_next(&cursor);
3188 	}
3189 
3190 #if 0
3191 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
3192 		(long long)ap->a_loffset,
3193 		(long long)base_offset,
3194 		(long long)last_offset);
3195 	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3196 		(long long)base_disk_offset,
3197 		(long long)last_disk_offset);
3198 #endif
3199 
3200 	if (cursor.node) {
3201 		hammer_cache_node(&ip->cache[1], cursor.node);
3202 #if 0
3203 		kprintf("bmap_end2 %016llx ip->cache %p\n",
3204 			(long long)ap->a_loffset, ip->cache[1]);
3205 #endif
3206 	}
3207 	hammer_done_cursor(&cursor);
3208 	hammer_done_transaction(&trans);
3209 	lwkt_reltoken(&hmp->fs_token);
3210 
3211 	/*
3212 	 * If we couldn't find any records or the records we did find were
3213 	 * all behind the requested offset, return failure.  A forward
3214 	 * truncation can leave a hole w/ no on-disk records.
3215 	 */
3216 	if (last_offset == 0 || last_offset < ap->a_loffset)
3217 		return (EOPNOTSUPP);
3218 
3219 	/*
3220 	 * Figure out the block size at the requested offset and adjust
3221 	 * our limits so the cluster_read() does not create inappropriately
3222 	 * sized buffer cache buffers.
3223 	 */
3224 	blksize = hammer_blocksize(ap->a_loffset);
3225 	if (hammer_blocksize(base_offset) != blksize) {
3226 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3227 	}
3228 	if (last_offset != ap->a_loffset &&
3229 	    hammer_blocksize(last_offset - 1) != blksize) {
3230 		last_offset = hammer_blockdemarc(ap->a_loffset,
3231 						 last_offset - 1);
3232 	}
3233 
3234 	/*
3235 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3236 	 * from occurring.
3237 	 */
3238 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3239 
3240 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3241 		/*
3242 		 * Only large-data zones can be direct-IOd
3243 		 */
3244 		error = EOPNOTSUPP;
3245 	} else if ((disk_offset & HAMMER_BUFMASK) ||
3246 		   (last_offset - ap->a_loffset) < blksize) {
3247 		/*
3248 		 * doffsetp is not aligned or the forward run size does
3249 		 * not cover a whole buffer, disallow the direct I/O.
3250 		 */
3251 		error = EOPNOTSUPP;
3252 	} else {
3253 		/*
3254 		 * We're good.
3255 		 */
3256 		*ap->a_doffsetp = disk_offset;
3257 		if (ap->a_runb) {
3258 			*ap->a_runb = ap->a_loffset - base_offset;
3259 			KKASSERT(*ap->a_runb >= 0);
3260 		}
3261 		if (ap->a_runp) {
3262 			*ap->a_runp = last_offset - ap->a_loffset;
3263 			KKASSERT(*ap->a_runp >= 0);
3264 		}
3265 		error = 0;
3266 	}
3267 	return(error);
3268 }
3269 
3270 /*
3271  * Write to a regular file.   Because this is a strategy call the OS is
3272  * trying to actually get data onto the media.
3273  */
3274 static
3275 int
3276 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3277 {
3278 	hammer_record_t record;
3279 	hammer_mount_t hmp;
3280 	hammer_inode_t ip;
3281 	struct bio *bio;
3282 	struct buf *bp;
3283 	int blksize;
3284 	int bytes;
3285 	int error;
3286 
3287 	bio = ap->a_bio;
3288 	bp = bio->bio_buf;
3289 	ip = ap->a_vp->v_data;
3290 	hmp = ip->hmp;
3291 
3292 	blksize = hammer_blocksize(bio->bio_offset);
3293 	KKASSERT(bp->b_bufsize == blksize);
3294 
3295 	if (ip->flags & HAMMER_INODE_RO) {
3296 		bp->b_error = EROFS;
3297 		bp->b_flags |= B_ERROR;
3298 		biodone(ap->a_bio);
3299 		return(EROFS);
3300 	}
3301 
3302 	lwkt_gettoken(&hmp->fs_token);
3303 
3304 	/*
3305 	 * Disallow swapcache operation on the vnode buffer if double
3306 	 * buffering is enabled; the swapcache will get the data via
3307 	 * the block device buffer.
3308 	 */
3309 	if (hammer_double_buffer)
3310 		bp->b_flags |= B_NOTMETA;
3311 
3312 	/*
3313 	 * Interlock with inode destruction (no in-kernel or directory
3314 	 * topology visibility).  If we queue new IO while trying to
3315 	 * destroy the inode we can deadlock the vtrunc call in
3316 	 * hammer_inode_unloadable_check().
3317 	 *
3318 	 * Besides, there's no point flushing a bp associated with an
3319 	 * inode that is being destroyed on-media and has no kernel
3320 	 * references.
3321 	 */
3322 	if ((ip->flags | ip->sync_flags) &
3323 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3324 		bp->b_resid = 0;
3325 		biodone(ap->a_bio);
3326 		lwkt_reltoken(&hmp->fs_token);
3327 		return(0);
3328 	}
3329 
3330 	/*
3331 	 * Reserve space and issue a direct-write from the front-end.
3332 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
3333 	 * allocations.
3334 	 *
3335 	 * An in-memory record will be installed to reference the storage
3336 	 * until the flusher can get to it.
3337 	 *
3338 	 * Since we own the high level bio the front-end will not try to
3339 	 * do a direct-read until the write completes.
3340 	 *
3341 	 * NOTE: The only time we do not reserve a full-sized buffer's
3342 	 * worth of data is if the file is small.  We do not try to
3343 	 * allocate a fragment (from the small-data zone) at the end of
3344 	 * an otherwise large file as this can lead to wildly separated
3345 	 * data.
3346 	 */
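	/*
	 * Worked example (compiled out): a 100 byte file reserves
	 * (100 + 15) & ~15 == 112 bytes, a 16 byte aligned fragment,
	 * instead of a full buffer.
	 */
#if 0
	{
		int ex_bytes = (100 + 15) & ~15;	/* == 112 */
	}
#endif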
3347 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3348 	KKASSERT(bio->bio_offset < ip->ino_data.size);
3349 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3350 		bytes = bp->b_bufsize;
3351 	else
3352 		bytes = ((int)ip->ino_data.size + 15) & ~15;
3353 
3354 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3355 				    bytes, &error);
3356 
3357 	/*
3358 	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3359 	 * in hammer_vop_write().  We must flag the record so the proper
3360 	 * REDO_TERM_WRITE entry is generated during the flush.
3361 	 */
3362 	if (record) {
3363 		if (bp->b_flags & B_VFSFLAG1) {
3364 			record->flags |= HAMMER_RECF_REDO;
3365 			bp->b_flags &= ~B_VFSFLAG1;
3366 		}
3367 		if (record->flags & HAMMER_RECF_DEDUPED) {
3368 			bp->b_resid = 0;
3369 			hammer_ip_replace_bulk(hmp, record);
3370 			biodone(ap->a_bio);
3371 		} else {
3372 			hammer_io_direct_write(hmp, bio, record);
3373 		}
3374 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3375 			hammer_flush_inode(ip, 0);
3376 	} else {
3377 		bp->b_bio2.bio_offset = NOOFFSET;
3378 		bp->b_error = error;
3379 		bp->b_flags |= B_ERROR;
3380 		biodone(ap->a_bio);
3381 	}
3382 	lwkt_reltoken(&hmp->fs_token);
3383 	return(error);
3384 }
3385 
3386 /*
3387  * dounlink - disconnect a directory entry
3388  *
3389  * XXX whiteout support not really in yet
3390  */
3391 static int
3392 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3393 		struct vnode *dvp, struct ucred *cred,
3394 		int flags, int isdir)
3395 {
3396 	struct namecache *ncp;
3397 	hammer_inode_t dip;
3398 	hammer_inode_t ip;
3399 	hammer_mount_t hmp;
3400 	struct hammer_cursor cursor;
3401 	int64_t namekey;
3402 	u_int32_t max_iterations;
3403 	int nlen, error;
3404 
3405 	/*
3406 	 * Calculate the namekey and setup the key range for the scan.  This
3407 	 * works kinda like a chained hash table where the lower 32 bits
3408 	 * of the namekey synthesize the chain.
3409 	 *
3410 	 * The key range is inclusive of both key_beg and key_end.
3411 	 */
3412 	dip = VTOI(dvp);
3413 	ncp = nch->ncp;
3414 	hmp = dip->hmp;
3415 
3416 	if (dip->flags & HAMMER_INODE_RO)
3417 		return (EROFS);
3418 
3419 	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3420 					   &max_iterations);
3421 retry:
3422 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3423 	cursor.key_beg.localization = dip->obj_localization +
3424 				      hammer_dir_localization(dip);
3425 	cursor.key_beg.obj_id = dip->obj_id;
3426 	cursor.key_beg.key = namekey;
3427 	cursor.key_beg.create_tid = 0;
3428 	cursor.key_beg.delete_tid = 0;
3429 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3430 	cursor.key_beg.obj_type = 0;
3431 
3432 	cursor.key_end = cursor.key_beg;
3433 	cursor.key_end.key += max_iterations;
3434 	cursor.asof = dip->obj_asof;
3435 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3436 
3437 	/*
3438 	 * Scan all matching records (the chain), locate the one matching
3439 	 * the requested path component.  info->last_error contains the
3440 	 * the requested path component.  The error code on search
3441 	 * termination could be 0, ENOENT, or something else.
3443 	 * The hammer_ip_*() functions merge in-memory records with on-disk
3444 	 * records for the purposes of the search.
3445 	 */
3446 	error = hammer_ip_first(&cursor);
3447 
3448 	while (error == 0) {
3449 		error = hammer_ip_resolve_data(&cursor);
3450 		if (error)
3451 			break;
3452 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3453 		KKASSERT(nlen > 0);
3454 		if (ncp->nc_nlen == nlen &&
3455 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3456 			break;
3457 		}
3458 		error = hammer_ip_next(&cursor);
3459 	}
3460 
3461 	/*
3462 	 * If all is ok we have to get the inode so we can adjust nlinks.
3463 	 * To avoid a deadlock with the flusher we must release the inode
3464 	 * lock on the directory when acquiring the inode for the entry.
3465 	 *
3466 	 * If the target is a directory, it must be empty.
3467 	 */
3468 	if (error == 0) {
3469 		hammer_unlock(&cursor.ip->lock);
3470 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3471 				      hmp->asof,
3472 				      cursor.data->entry.localization,
3473 				      0, &error);
3474 		hammer_lock_sh(&cursor.ip->lock);
3475 		if (error == ENOENT) {
3476 			kprintf("HAMMER: WARNING: Removing "
3477 				"dirent w/missing inode \"%s\"\n"
3478 				"\tobj_id = %016llx\n",
3479 				ncp->nc_name,
3480 				(long long)cursor.data->entry.obj_id);
3481 			error = 0;
3482 		}
3483 
3484 		/*
3485 		 * If isdir >= 0 we validate that the entry is or is not a
3486 		 * directory.  If isdir < 0 we don't care.
3487 		 */
3488 		if (error == 0 && isdir >= 0 && ip) {
3489 			if (isdir &&
3490 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3491 				error = ENOTDIR;
3492 			} else if (isdir == 0 &&
3493 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3494 				error = EISDIR;
3495 			}
3496 		}
3497 
3498 		/*
3499 		 * If we are trying to remove a directory, the directory must
3500 		 * be empty.
3501 		 *
3502 		 * The check directory code can loop and deadlock/retry.  Our
3503 		 * own cursor's node locks must be released to avoid a 3-way
3504 		 * deadlock with the flusher if the check directory code
3505 		 * blocks.
3506 		 *
3507 		 * If any changes whatsoever have been made to the cursor
3508 		 * set EDEADLK and retry.
3509 		 *
3510 		 * WARNING: See warnings in hammer_unlock_cursor()
3511 		 *	    function.
3512 		 */
3513 		if (error == 0 && ip && ip->ino_data.obj_type ==
3514 				        HAMMER_OBJTYPE_DIRECTORY) {
3515 			hammer_unlock_cursor(&cursor);
3516 			error = hammer_ip_check_directory_empty(trans, ip);
3517 			hammer_lock_cursor(&cursor);
3518 			if (cursor.flags & HAMMER_CURSOR_RETEST) {
3519 				kprintf("HAMMER: Warning: avoided deadlock "
3520 					"on rmdir '%s'\n",
3521 					ncp->nc_name);
3522 				error = EDEADLK;
3523 			}
3524 		}
3525 
3526 		/*
3527 		 * Delete the directory entry.
3528 		 *
3529 		 * WARNING: hammer_ip_del_directory() may have to terminate
3530 		 * the cursor to avoid a deadlock.  It is ok to call
3531 		 * hammer_done_cursor() twice.
3532 		 */
3533 		if (error == 0) {
3534 			error = hammer_ip_del_directory(trans, &cursor,
3535 							dip, ip);
3536 		}
3537 		hammer_done_cursor(&cursor);
3538 		if (error == 0) {
3539 			cache_setunresolved(nch);
3540 			cache_setvp(nch, NULL);
3541 
3542 			/*
3543 			 * NOTE: ip->vp, if non-NULL, cannot be directly
3544 			 *	 referenced without formally acquiring the
3545 			 *	 vp since the vp might have zero refs on it,
3546 			 *	 or in the middle of a reclaim, etc.
3547 			 *
3548 			 * NOTE: The cache_setunresolved() can rip the vp
3549 			 *	 out from under us since the vp may not have
3550 			 *	 any refs, in which case ip->vp will be NULL
3551 			 *	 from the outset.
3552 			 */
3553 			while (ip && ip->vp) {
3554 				struct vnode *vp;
3555 
3556 				error = hammer_get_vnode(ip, &vp);
3557 				if (error == 0 && vp) {
3558 					vn_unlock(vp);
3559 					hammer_knote(ip->vp, NOTE_DELETE);
3560 					cache_inval_vp(ip->vp, CINV_DESTROY);
3561 					vrele(vp);
3562 					break;
3563 				}
3564 				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3565 			}
3566 		}
3567 		if (ip)
3568 			hammer_rel_inode(ip, 0);
3569 	} else {
3570 		hammer_done_cursor(&cursor);
3571 	}
3572 	if (error == EDEADLK)
3573 		goto retry;
3574 
3575 	return (error);
3576 }
3577 
3578 /************************************************************************
3579  *			    FIFO AND SPECFS OPS				*
3580  ************************************************************************
3581  *
3582  */
3583 static int
3584 hammer_vop_fifoclose (struct vop_close_args *ap)
3585 {
3586 	/* XXX update itimes */
3587 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3588 }
3589 
3590 static int
3591 hammer_vop_fiforead (struct vop_read_args *ap)
3592 {
3593 	int error;
3594 
3595 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3596 	/* XXX update access time */
3597 	return (error);
3598 }
3599 
3600 static int
3601 hammer_vop_fifowrite (struct vop_write_args *ap)
3602 {
3603 	int error;
3604 
3605 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3606 	/* XXX update modification time */
3607 	return (error);
3608 }
3609 
3610 static int
3612 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3613 {
3614 	int error;
3615 
3616 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3617 	if (error)
3618 		error = hammer_vop_kqfilter(ap);
3619 	return(error);
3620 }
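/*
 * The wrappers above share one delegation pattern: VOCALL() re-issues
 * the vop through the fifofs vector so FIFO semantics are handled by
 * fifofs, while HAMMER keeps a fallback for what fifofs cannot do (the
 * EVFILT_VNODE case in hammer_vop_fifokqfilter()).  A sketch of the
 * shape any additional wrapper would take; hammer_vop_fifoopen() is
 * hypothetical and assumes fifofs implements the vop:
 *
 *	static int
 *	hammer_vop_fifoopen(struct vop_open_args *ap)
 *	{
 *		return (VOCALL(&fifo_vnode_vops, &ap->a_head));
 *	}
 */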
3621 
3622 /************************************************************************
3623  *			    KQFILTER OPS				*
3624  ************************************************************************
3625  *
3626  */
3627 static void filt_hammerdetach(struct knote *kn);
3628 static int filt_hammerread(struct knote *kn, long hint);
3629 static int filt_hammerwrite(struct knote *kn, long hint);
3630 static int filt_hammervnode(struct knote *kn, long hint);
3631 
3632 static struct filterops hammerread_filtops =
3633 	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
3634 static struct filterops hammerwrite_filtops =
3635 	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
3636 static struct filterops hammervnode_filtops =
3637 	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
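/*
 * The initializers above fill struct filterops positionally; assuming
 * the usual DragonFly layout, the fields are f_flags, f_attach,
 * f_detach and f_event.  For example,
 *
 *	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread }
 *
 * declares a descriptor-based filter with no attach hook, the shared
 * detach hook, and a read-specific event callback.
 */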
3638 
3639 static int
3641 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3642 {
3643 	struct vnode *vp = ap->a_vp;
3644 	struct knote *kn = ap->a_kn;
3645 
3646 	switch (kn->kn_filter) {
3647 	case EVFILT_READ:
3648 		kn->kn_fop = &hammerread_filtops;
3649 		break;
3650 	case EVFILT_WRITE:
3651 		kn->kn_fop = &hammerwrite_filtops;
3652 		break;
3653 	case EVFILT_VNODE:
3654 		kn->kn_fop = &hammervnode_filtops;
3655 		break;
3656 	default:
3657 		return (EOPNOTSUPP);
3658 	}
3659 
3660 	kn->kn_hook = (caddr_t)vp;
3661 
3662 	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3663 
3664 	return(0);
3665 }
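/*
 * Illustrative userland view (not kernel code): a process watching a
 * HAMMER file for deletion arrives here with kn_filter set to
 * EVFILT_VNODE.  Standard kqueue(2) usage:
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	       NOTE_DELETE | NOTE_WRITE, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The hammer_knote(ip->vp, NOTE_DELETE) call in hammer_dounlink() is
 * what ultimately wakes such a watcher.
 */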
3666 
3667 static void
3668 filt_hammerdetach(struct knote *kn)
3669 {
3670 	struct vnode *vp = (void *)kn->kn_hook;
3671 
3672 	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3673 }
3674 
3675 static int
3676 filt_hammerread(struct knote *kn, long hint)
3677 {
3678 	struct vnode *vp = (void *)kn->kn_hook;
3679 	hammer_inode_t ip = VTOI(vp);
3680 	hammer_mount_t hmp = ip->hmp;
3681 	off_t off;
3682 
3683 	if (hint == NOTE_REVOKE) {
3684 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3685 		return(1);
3686 	}
3687 	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
3688 	off = ip->ino_data.size - kn->kn_fp->f_offset;
3689 	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
3690 	lwkt_reltoken(&hmp->fs_token);
3691 	if (kn->kn_sfflags & NOTE_OLDAPI)
3692 		return(1);
3693 	return (kn->kn_data != 0);
3694 }
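/*
 * Worked example of the kn_data computation above: with
 * ip->ino_data.size == 1000 and kn_fp->f_offset == 400, off is 600 and
 * kn_data reports 600 readable bytes, so the filter fires.  The
 * INTPTR_MAX clamp prevents the 64-bit off_t difference from
 * overflowing kn_data on platforms where intptr_t is narrower than
 * off_t.
 */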
3695 
3696 static int
3697 filt_hammerwrite(struct knote *kn, long hint)
3698 {
3699 	if (hint == NOTE_REVOKE)
3700 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3701 	kn->kn_data = 0;
3702 	return (1);
3703 }
3704 
3705 static int
3706 filt_hammervnode(struct knote *kn, long hint)
3707 {
3708 	if (kn->kn_sfflags & hint)
3709 		kn->kn_fflags |= hint;
3710 	if (hint == NOTE_REVOKE) {
3711 		kn->kn_flags |= (EV_EOF | EV_NODATA);
3712 		return (1);
3713 	}
3714 	return (kn->kn_fflags != 0);
3715 }
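/*
 * Example of the accumulation above: a knote registered with
 * kn_sfflags = NOTE_WRITE | NOTE_DELETE ignores a NOTE_ATTRIB hint
 * (kn_fflags stays 0, the filter returns 0), but a NOTE_WRITE hint is
 * latched into kn_fflags and the filter fires.  NOTE_REVOKE always
 * fires and marks the knote EV_EOF | EV_NODATA regardless of which
 * flags were registered.
 */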
3716 
3717