/* xref: /dragonfly/sys/vfs/hammer/hammer_vnops.c (revision 267c04fd) */
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};

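/*
 * Post kqueue events (NOTE_WRITE, NOTE_EXTEND, NOTE_LINK, etc) on the
 * vnode's knote list so kevent() watchers are told about changes.
 */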
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred,
			   int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it ain't
 *	 here yet.  And, in any case, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 */
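	/*
	 * hammer_fsync_mode summary (mirrors the cases below):
	 *   0 - no REDO, full synchronous flush
	 *   1 - no REDO, full asynchronous flush
	 *   2 - REDO semantics with synchronous flush
	 *   3 - REDO semantics with relaxed asynchronous flush
	 *   4 - ignore the fsync() system call entirely
	 */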
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
				vclrisdirty(ip->vp);
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:

	/*
	 * Do a full flush sequence.
	 *
	 * Attempt to release the vnode while waiting for the inode to
	 * finish flushing.  This can really mess up inactive->reclaim
	 * sequences so only do it if the vnode is active.
	 *
	 * WARNING! The VX lock functions must be used.  vn_lock() will
	 *	    fail when this is part of a VOP_RECLAIM sequence.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		int dorelock;

		if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
			vx_unlock(ap->a_vp);
			dorelock = 1;
		} else {
			dorelock = 0;
		}
		hammer_wait_inode(ip);
		if (dorelock)
			vx_lock(ap->a_vp);
	}
	if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
		vclrisdirty(ip->vp);
	lwkt_reltoken(&hmp->fs_token);
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (the fully cached path does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int bigread;
	int got_trans;
	size_t resid;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	got_trans = 0;
	uio = ap->a_uio;

	/*
	 * Attempt to shortcut directly to the VM object using lwbufs.
	 * This is much faster than instantiating buffer cache buffers.
	 */
	resid = uio->uio_resid;
	error = vop_helper_read_shortcut(ap);
	hammer_stats_file_read += resid - uio->uio_resid;
	if (error)
		return (error);
	if (uio->uio_resid == 0)
		goto finished;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	ioseqcount = (ap->a_ioflag >> 16);
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;
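	/*
	 * (note) The upper 16 bits of a_ioflag carry the file descriptor's
	 * sequential-access heuristic; both it and the size-derived count
	 * above are expressed in BKVASIZE chunks, and the larger one wins.
	 */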

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
			bp->b_flags &= ~B_AGE;
			error = 0;
			goto skip;
		}
		if (ap->a_ioflag & IO_NRDELAY) {
			bqrelse(bp);
			return (EWOULDBLOCK);
		}

		/*
		 * MPUNSAFE
		 */
		if (got_trans == 0) {
			hammer_start_transaction(&trans, ip->hmp);
			got_trans = 1;
		}

		/*
		 * NOTE: A valid bp has already been acquired, but was not
		 *	 B_CACHE.
		 */
		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_readx(ap->a_vp,
					     file_limit, base_offset,
					     blksize, uio->uio_resid,
					     seqcount * BKVASIZE, &bp);
		} else {
			error = breadnx(ap->a_vp, base_offset, blksize,
					NULL, NULL, 0, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			kprintf("doff %016jx read file %016jx@%016jx\n",
				(intmax_t)bp->b_bio2.bio_offset,
				(intmax_t)ip->obj_id,
				(intmax_t)bp->b_loffset);
		}
		bp->b_flags &= ~B_IODEBUG;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);

		/*
		 * Set B_AGE, data has a lower priority than meta-data.
		 *
		 * Use a hold/unlock/drop sequence to run the uiomove
		 * with the buffer unlocked, avoiding deadlocks against
		 * read()s on mmap()'d spaces.
		 */
		bp->b_flags |= B_AGE;
		error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
		bqrelse(bp);

		if (error)
			break;
		hammer_stats_file_read += n;
	}

finished:

	/*
	 * Try to update the atime with just the inode lock for maximum
	 * concurrency.  If we can't shortcut it we have to get the full
	 * blown transaction.
	 */
	if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) {
		hammer_start_transaction(&trans, ip->hmp);
		got_trans = 1;
	}

	if (got_trans) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			lwkt_gettoken(&hmp->fs_token);
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
			hammer_done_transaction(&trans);
			lwkt_reltoken(&hmp->fs_token);
		} else {
			hammer_done_transaction(&trans);
		}
	}
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	thread_t td;
	struct uio *uio;
	int offset;
	off_t base_offset;
	int64_t cluster_eof;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_offset assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		hammer_done_transaction(&trans);
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Preset redo_count so we stop generating REDOs earlier if the
	 * limit is exceeded.
	 *
	 * redo_count is a heuristic; SMP races are ok
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    ip->redo_count < hammer_limit_redo) {
		ip->redo_count += uio->uio_resid;
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;
		int trivial;
		int endofblk;
		off_t nsize;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 *
		 * NOTE: It is possible for other sources to grow the
		 *	 records but not necessarily issue another flush,
		 *	 so use a timeout and ensure that a re-flush occurs.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			lwkt_gettoken(&hmp->fs_token);
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
				ip->flags |= HAMMER_INODE_RECSW;
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			}
			lwkt_reltoken(&hmp->fs_token);
		}

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			endofblk = 0;
		} else {
			endofblk = 1;
		}
		nsize = uio->uio_offset + n;
		if (nsize > ip->ino_data.size) {
			if (uio->uio_offset > ip->ino_data.size)
				trivial = 0;
			else
				trivial = 1;
			nvextendbuf(ap->a_vp,
				    ip->ino_data.size,
				    nsize,
				    hammer_blocksize(ip->ino_data.size),
				    hammer_blocksize(nsize),
				    hammer_blockoff(ip->ino_data.size),
				    hammer_blockoff(nsize),
				    trivial);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0)
			error = uiomovebp(bp, bp->b_data + offset, n, uio);

		lwkt_gettoken(&hmp->fs_token);

		/*
		 * Generate REDO records if enabled and redo_count will not
		 * exceed the limit.
		 *
		 * If redo_count exceeds the limit we stop generating records
		 * and clear HAMMER_INODE_REDO.  This will cause the next
		 * fsync() to do a full meta-data sync instead of just an
		 * UNDO/REDO fifo update.
		 *
		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
		 * will still be tracked.  The tracks will be terminated
		 * when the related meta-data (including possible data
		 * modifications which are not tracked via REDO) is
		 * flushed.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
			if (ip->redo_count < hammer_limit_redo) {
				bp->b_flags |= B_VFSFLAG1;
				error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     HAMMER_REDO_WRITE,
						     bp->b_data + offset,
						     (size_t)n);
			} else {
				ip->flags &= ~HAMMER_INODE_REDO;
			}
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				nvtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size),
					  hammer_blockoff(ip->ino_data.size),
					  0);
			}
			lwkt_reltoken(&hmp->fs_token);
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting of the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		lwkt_reltoken(&hmp->fs_token);

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 *	     expected to not blow up if it encounters
		 *	     buffers that do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 *
		 *	  However, failing to flush a dirty buffer out when
		 *	  issued from the pageout daemon can result in a low
		 *	  memory deadlock against bio_page_alloc(), so we
		 *	  have to bawrite() on IO_ASYNC as well.
		 *
		 * NOTE!  To avoid degenerate stalls due to mismatched block
		 *	  sizes we only honor IO_DIRECT on the write which
		 *	  abuts the end of the buffer.  However, we must
		 *	  honor IO_SYNC in case someone is silly enough to
		 *	  configure a HAMMER file as swap, or when HAMMER
		 *	  is serving NFS (for commits).  Ick ick.
		 */
		bp->b_flags |= B_AGE;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
			bawrite(bp);
		} else if (ap->a_ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer_cluster_enable &&
			   !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
			if (base_offset < HAMMER_XDEMARC)
				cluster_eof = hammer_blockdemarc(base_offset,
							 ip->ino_data.size);
			else
				cluster_eof = ip->ino_data.size;
			cluster_write(bp, cluster_eof, blksize, seqcount);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);

	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	hammer_lock_sh(&ip->lock);
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
	 *
	 * Note that the userspace hammer command does not allow users to
	 * create a @@PFS softlink under an existing non-root PFS (id != 0),
	 * so the ip localization here for a @@PFS softlink is always 0.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			vap->va_size = 26;
		else
			vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}
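	/*
	 * Illustrative examples of the va_bytes rounding above: a 100
	 * byte file reports (100 + 15) & ~15 == 112 bytes, a mid-sized
	 * file rounds up to the next HAMMER_BUFSIZE buffer boundary, and
	 * a file at or past HAMMER_XDEMARC rounds up to the next
	 * large-buffer (HAMMER_XBUFSIZE) boundary.
	 */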

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
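	/*
	 * Illustrative example: a lookup of "foo@@0x00000001061a8ba0"
	 * resolves "foo" as-of that transaction id, and any asof other
	 * than HAMMER_MAX_TID marks the result read-only below.
	 */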
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);
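	/*
	 * (note) max_iterations returned above bounds how far key_end may
	 * range past key_beg to sweep hash collisions chained in the low
	 * bits of the namekey.
	 */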

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root; instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who's our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	/*
	 * It's probably a PFS root when dip->ino_data.parent_obj_id is 0.
	 */
	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);
	hmp = dip->hmp;

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 *
 * MPSAFE (does not require fs_token)
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less than that to represent our 'special' key space.
	 */
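	/*
	 * (note) Artificial keys 0 and 1 below stand in for "." and "..";
	 * the on-media B-Tree scan then resumes at saveoff.
	 */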
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;

	lwkt_gettoken(&hmp->fs_token);

1820 	/*
1821 	 * Shortcut if the symlink data was stuffed into ino_data.
1822 	 *
1823 	 * Also expand special "@@PFS%05d" softlinks (expansion only
1824 	 * occurs for non-historical (current) accesses made from the
1825 	 * primary filesystem).
1826 	 *
1827 	 * Note that the userspace hammer command does not allow users to
1828 	 * create a @@PFS softlink under another existing PFS (id != 0),
1829 	 * so the ip localization here for a @@PFS softlink is always 0.
1830 	 */
1831 	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1832 		char *ptr;
1833 		int bytes;
1834 
1835 		ptr = ip->ino_data.ext.symlink;
1836 		bytes = (int)ip->ino_data.size;
1837 		if (bytes == 10 &&
1838 		    ip->obj_asof == HAMMER_MAX_TID &&
1839 		    ip->obj_localization == 0 &&
1840 		    strncmp(ptr, "@@PFS", 5) == 0) {
1841 			hammer_simple_transaction(&trans, hmp);
1842 			bcopy(ptr + 5, buf, 5);
1843 			buf[5] = 0;
1844 			localization = strtoul(buf, NULL, 10) << 16;
1845 			pfsm = hammer_load_pseudofs(&trans, localization,
1846 						    &error);
1847 			if (error == 0) {
1848 				if (pfsm->pfsd.mirror_flags &
1849 				    HAMMER_PFSD_SLAVE) {
1850 					/* vap->va_size == 26 */
1851 					ksnprintf(buf, sizeof(buf),
1852 						  "@@0x%016llx:%05d",
1853 						  (long long)pfsm->pfsd.sync_end_tid,
1854 						  localization >> 16);
1855 				} else {
1856 					/* vap->va_size == 10 */
1857 					ksnprintf(buf, sizeof(buf),
1858 						  "@@-1:%05d",
1859 						  localization >> 16);
1860 #if 0
1861 					ksnprintf(buf, sizeof(buf),
1862 						  "@@0x%016llx:%05d",
1863 						  (long long)HAMMER_MAX_TID,
1864 						  localization >> 16);
1865 #endif
1866 				}
1867 				ptr = buf;
1868 				bytes = strlen(buf);
1869 			}
1870 			if (pfsm)
1871 				hammer_rel_pseudofs(hmp, pfsm);
1872 			hammer_done_transaction(&trans);
1873 		}
1874 		error = uiomove(ptr, bytes, ap->a_uio);
1875 		lwkt_reltoken(&hmp->fs_token);
1876 		return(error);
1877 	}
1878 
1879 	/*
1880 	 * Long version
1881 	 */
1882 	hammer_simple_transaction(&trans, hmp);
1883 	++hammer_stats_file_iopsr;
1884 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1885 
1886 	/*
1887 	 * Key range (begin and end inclusive) to scan.  Directory keys
1888 	 * directly translate to a 64 bit 'seek' position.
1889 	 */
1890 	cursor.key_beg.localization = ip->obj_localization +
1891 				      HAMMER_LOCALIZE_MISC;
1892 	cursor.key_beg.obj_id = ip->obj_id;
1893 	cursor.key_beg.create_tid = 0;
1894 	cursor.key_beg.delete_tid = 0;
1895 	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1896 	cursor.key_beg.obj_type = 0;
1897 	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1898 	cursor.asof = ip->obj_asof;
1899 	cursor.flags |= HAMMER_CURSOR_ASOF;
1900 
1901 	error = hammer_ip_lookup(&cursor);
1902 	if (error == 0) {
1903 		error = hammer_ip_resolve_data(&cursor);
1904 		if (error == 0) {
1905 			KKASSERT(cursor.leaf->data_len >=
1906 				 HAMMER_SYMLINK_NAME_OFF);
1907 			error = uiomove(cursor.data->symlink.name,
1908 					cursor.leaf->data_len -
1909 						HAMMER_SYMLINK_NAME_OFF,
1910 					ap->a_uio);
1911 		}
1912 	}
1913 	hammer_done_cursor(&cursor);
1914 	hammer_done_transaction(&trans);
1915 	lwkt_reltoken(&hmp->fs_token);
1916 	return(error);
1917 }
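
/*
 * Illustration of the "@@PFS%05d" expansion performed above, as a
 * minimal userland sketch (hypothetical helper, master-PFS case only):
 * "@@PFS00003" parses to pfs_id 3 (localization 3 << 16) and expands
 * to the link text "@@-1:00003".
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
expand_pfs_link_master(const char *link, char *buf, size_t bufsize)
{
	char digits[6];
	unsigned long localization;

	/* caller guarantees link starts with "@@PFS" + 5 digits */
	memcpy(digits, link + 5, 5);
	digits[5] = 0;
	localization = strtoul(digits, NULL, 10) << 16;

	/* a master (non-slave) PFS links to the current version */
	snprintf(buf, bufsize, "@@-1:%05lu", localization >> 16);
}
#endif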
1918 
1919 /*
1920  * hammer_vop_nremove { nch, dvp, cred }
1921  */
1922 static
1923 int
1924 hammer_vop_nremove(struct vop_nremove_args *ap)
1925 {
1926 	struct hammer_transaction trans;
1927 	struct hammer_inode *dip;
1928 	hammer_mount_t hmp;
1929 	int error;
1930 
1931 	dip = VTOI(ap->a_dvp);
1932 	hmp = dip->hmp;
1933 
1934 	if (hammer_nohistory(dip) == 0 &&
1935 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1936 		return (error);
1937 	}
1938 
1939 	lwkt_gettoken(&hmp->fs_token);
1940 	hammer_start_transaction(&trans, hmp);
1941 	++hammer_stats_file_iopsw;
1942 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1943 	hammer_done_transaction(&trans);
1944 	if (error == 0)
1945 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1946 	lwkt_reltoken(&hmp->fs_token);
1947 	return (error);
1948 }
1949 
1950 /*
1951  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1952  */
1953 static
1954 int
1955 hammer_vop_nrename(struct vop_nrename_args *ap)
1956 {
1957 	struct hammer_transaction trans;
1958 	struct namecache *fncp;
1959 	struct namecache *tncp;
1960 	struct hammer_inode *fdip;
1961 	struct hammer_inode *tdip;
1962 	struct hammer_inode *ip;
1963 	hammer_mount_t hmp;
1964 	struct hammer_cursor cursor;
1965 	int64_t namekey;
1966 	u_int32_t max_iterations;
1967 	int nlen, error;
1968 
1969 	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1970 		return(EXDEV);
1971 	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1972 		return(EXDEV);
1973 
1974 	fdip = VTOI(ap->a_fdvp);
1975 	tdip = VTOI(ap->a_tdvp);
1976 	fncp = ap->a_fnch->ncp;
1977 	tncp = ap->a_tnch->ncp;
1978 	ip = VTOI(fncp->nc_vp);
1979 	KKASSERT(ip != NULL);
1980 
1981 	hmp = ip->hmp;
1982 
1983 	if (fdip->obj_localization != tdip->obj_localization)
1984 		return(EXDEV);
1985 	if (fdip->obj_localization != ip->obj_localization)
1986 		return(EXDEV);
1987 
1988 	if (fdip->flags & HAMMER_INODE_RO)
1989 		return (EROFS);
1990 	if (tdip->flags & HAMMER_INODE_RO)
1991 		return (EROFS);
1992 	if (ip->flags & HAMMER_INODE_RO)
1993 		return (EROFS);
1994 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1995 		return (error);
1996 
1997 	lwkt_gettoken(&hmp->fs_token);
1998 	hammer_start_transaction(&trans, hmp);
1999 	++hammer_stats_file_iopsw;
2000 
2001 	/*
2002 	 * Remove tncp from the target directory and then link ip as
2003 	 * tncp. XXX pass trans to dounlink
2004 	 *
2005 	 * Force the inode sync-time to match the transaction so it is
2006 	 * in-sync with the creation of the target directory entry.
2007 	 */
2008 	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
2009 				ap->a_cred, 0, -1);
2010 	if (error == 0 || error == ENOENT) {
2011 		error = hammer_ip_add_directory(&trans, tdip,
2012 						tncp->nc_name, tncp->nc_nlen,
2013 						ip);
2014 		if (error == 0) {
2015 			ip->ino_data.parent_obj_id = tdip->obj_id;
2016 			ip->ino_data.ctime = trans.time;
2017 			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
2018 		}
2019 	}
2020 	if (error)
2021 		goto failed; /* XXX */
2022 
2023 	/*
2024 	 * Locate the record in the originating directory and remove it.
2025 	 *
2026 	 * Calculate the namekey and setup the key range for the scan.  This
2027 	 * works kinda like a chained hash table where the lower 32 bits
2028 	 * of the namekey synthesize the chain.
2029 	 *
2030 	 * The key range is inclusive of both key_beg and key_end.
2031 	 */
2032 	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2033 					   &max_iterations);
2034 retry:
2035 	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2036 	cursor.key_beg.localization = fdip->obj_localization +
2037 				      hammer_dir_localization(fdip);
2038 	cursor.key_beg.obj_id = fdip->obj_id;
2039 	cursor.key_beg.key = namekey;
2040 	cursor.key_beg.create_tid = 0;
2041 	cursor.key_beg.delete_tid = 0;
2042 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2043 	cursor.key_beg.obj_type = 0;
2044 
2045 	cursor.key_end = cursor.key_beg;
2046 	cursor.key_end.key += max_iterations;
2047 	cursor.asof = fdip->obj_asof;
2048 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2049 
2050 	/*
2051 	 * Scan all matching records (the chain), locate the one matching
2052 	 * the requested path component.
2053 	 *
2054 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2055 	 * records for the purposes of the search.
2056 	 */
2057 	error = hammer_ip_first(&cursor);
2058 	while (error == 0) {
2059 		if (hammer_ip_resolve_data(&cursor) != 0)
2060 			break;
2061 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2062 		KKASSERT(nlen > 0);
2063 		if (fncp->nc_nlen == nlen &&
2064 		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2065 			break;
2066 		}
2067 		error = hammer_ip_next(&cursor);
2068 	}
2069 
2070 	/*
2071 	 * If all is ok we have to get the inode so we can adjust nlinks.
2072 	 *
2073 	 * WARNING: hammer_ip_del_directory() may have to terminate the
2074 	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2075 	 * twice.
2076 	 */
2077 	if (error == 0)
2078 		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2079 
2080 	/*
2081 	 * XXX A deadlock here will break rename's atomicity for the purposes
2082 	 * of crash recovery.
2083 	 */
2084 	if (error == EDEADLK) {
2085 		hammer_done_cursor(&cursor);
2086 		goto retry;
2087 	}
2088 
2089 	/*
2090 	 * Cleanup and tell the kernel that the rename succeeded.
2091 	 *
2092 	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2093 	 *	 without formally acquiring the vp since the vp might
2094 	 *	 have zero refs on it, or in the middle of a reclaim,
2095 	 *	 etc.
2096 	 */
2097 	hammer_done_cursor(&cursor);
2098 	if (error == 0) {
2099 		cache_rename(ap->a_fnch, ap->a_tnch);
2100 		hammer_knote(ap->a_fdvp, NOTE_WRITE);
2101 		hammer_knote(ap->a_tdvp, NOTE_WRITE);
2102 		while (ip->vp) {
2103 			struct vnode *vp;
2104 
2105 			error = hammer_get_vnode(ip, &vp);
2106 			if (error == 0 && vp) {
2107 				vn_unlock(vp);
2108 				hammer_knote(ip->vp, NOTE_RENAME);
2109 				vrele(vp);
2110 				break;
2111 			}
2112 			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2113 		}
2114 	}
2115 
2116 failed:
2117 	hammer_done_transaction(&trans);
2118 	lwkt_reltoken(&hmp->fs_token);
2119 	return (error);
2120 }
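
/*
 * Sketch (hypothetical, simplified types) of the directory-entry chain
 * scan shared by hammer_vop_nrename() and hammer_dounlink(): the low
 * 32 bits of the 64-bit directory key enumerate the hash chain, so a
 * lookup scans the inclusive range [namekey, namekey + max_iterations]
 * comparing the stored names against the requested path component.
 */
#if 0
static int
dirent_chain_lookup(example_cursor_t *cursor, const char *name, int nlen,
		    int64_t namekey, u_int32_t max_iterations)
{
	int error;

	cursor->key_beg.key = namekey;
	cursor->key_end.key = namekey + max_iterations;	/* inclusive */

	error = example_first(cursor);			/* hypothetical */
	while (error == 0) {
		if (cursor->nlen == nlen &&
		    bcmp(name, cursor->name, nlen) == 0) {
			return(0);	/* found the path component */
		}
		error = example_next(cursor);		/* hypothetical */
	}
	return(error);		/* typically ENOENT */
}
#endif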
2121 
2122 /*
2123  * hammer_vop_nrmdir { nch, dvp, cred }
2124  */
2125 static
2126 int
2127 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2128 {
2129 	struct hammer_transaction trans;
2130 	struct hammer_inode *dip;
2131 	hammer_mount_t hmp;
2132 	int error;
2133 
2134 	dip = VTOI(ap->a_dvp);
2135 	hmp = dip->hmp;
2136 
2137 	if (hammer_nohistory(dip) == 0 &&
2138 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2139 		return (error);
2140 	}
2141 
2142 	lwkt_gettoken(&hmp->fs_token);
2143 	hammer_start_transaction(&trans, hmp);
2144 	++hammer_stats_file_iopsw;
2145 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2146 	hammer_done_transaction(&trans);
2147 	if (error == 0)
2148 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2149 	lwkt_reltoken(&hmp->fs_token);
2150 	return (error);
2151 }
2152 
2153 /*
2154  * hammer_vop_markatime { vp, cred }
2155  */
2156 static
2157 int
2158 hammer_vop_markatime(struct vop_markatime_args *ap)
2159 {
2160 	struct hammer_transaction trans;
2161 	struct hammer_inode *ip;
2162 	hammer_mount_t hmp;
2163 
2164 	ip = VTOI(ap->a_vp);
2165 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2166 		return (EROFS);
2167 	if (ip->flags & HAMMER_INODE_RO)
2168 		return (EROFS);
2169 	hmp = ip->hmp;
2170 	if (hmp->mp->mnt_flag & MNT_NOATIME)
2171 		return (0);
2172 	lwkt_gettoken(&hmp->fs_token);
2173 	hammer_start_transaction(&trans, hmp);
2174 	++hammer_stats_file_iopsw;
2175 
2176 	ip->ino_data.atime = trans.time;
2177 	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2178 	hammer_done_transaction(&trans);
2179 	hammer_knote(ap->a_vp, NOTE_ATTRIB);
2180 	lwkt_reltoken(&hmp->fs_token);
2181 	return (0);
2182 }
2183 
2184 /*
2185  * hammer_vop_setattr { vp, vap, cred }
2186  */
2187 static
2188 int
2189 hammer_vop_setattr(struct vop_setattr_args *ap)
2190 {
2191 	struct hammer_transaction trans;
2192 	struct hammer_inode *ip;
2193 	struct vattr *vap;
2194 	hammer_mount_t hmp;
2195 	int modflags;
2196 	int error;
2197 	int truncating;
2198 	int blksize;
2199 	int kflags;
2200 #if 0
2201 	int64_t aligned_size;
2202 #endif
2203 	u_int32_t flags;
2204 
2205 	vap = ap->a_vap;
2206 	ip = ap->a_vp->v_data;
2207 	modflags = 0;
2208 	kflags = 0;
2209 	hmp = ip->hmp;
2210 
2211 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2212 		return(EROFS);
2213 	if (ip->flags & HAMMER_INODE_RO)
2214 		return (EROFS);
2215 	if (hammer_nohistory(ip) == 0 &&
2216 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2217 		return (error);
2218 	}
2219 
2220 	lwkt_gettoken(&hmp->fs_token);
2221 	hammer_start_transaction(&trans, hmp);
2222 	++hammer_stats_file_iopsw;
2223 	error = 0;
2224 
2225 	if (vap->va_flags != VNOVAL) {
2226 		flags = ip->ino_data.uflags;
2227 		error = vop_helper_setattr_flags(&flags, vap->va_flags,
2228 					 hammer_to_unix_xid(&ip->ino_data.uid),
2229 					 ap->a_cred);
2230 		if (error == 0) {
2231 			if (ip->ino_data.uflags != flags) {
2232 				ip->ino_data.uflags = flags;
2233 				ip->ino_data.ctime = trans.time;
2234 				modflags |= HAMMER_INODE_DDIRTY;
2235 				kflags |= NOTE_ATTRIB;
2236 			}
2237 			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2238 				error = 0;
2239 				goto done;
2240 			}
2241 		}
2242 		goto done;
2243 	}
2244 	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2245 		error = EPERM;
2246 		goto done;
2247 	}
2248 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2249 		mode_t cur_mode = ip->ino_data.mode;
2250 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2251 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2252 		uuid_t uuid_uid;
2253 		uuid_t uuid_gid;
2254 
2255 		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2256 					 ap->a_cred,
2257 					 &cur_uid, &cur_gid, &cur_mode);
2258 		if (error == 0) {
2259 			hammer_guid_to_uuid(&uuid_uid, cur_uid);
2260 			hammer_guid_to_uuid(&uuid_gid, cur_gid);
2261 			if (bcmp(&uuid_uid, &ip->ino_data.uid,
2262 				 sizeof(uuid_uid)) ||
2263 			    bcmp(&uuid_gid, &ip->ino_data.gid,
2264 				 sizeof(uuid_gid)) ||
2265 			    ip->ino_data.mode != cur_mode
2266 			) {
2267 				ip->ino_data.uid = uuid_uid;
2268 				ip->ino_data.gid = uuid_gid;
2269 				ip->ino_data.mode = cur_mode;
2270 				ip->ino_data.ctime = trans.time;
2271 				modflags |= HAMMER_INODE_DDIRTY;
2272 			}
2273 			kflags |= NOTE_ATTRIB;
2274 		}
2275 	}
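	/*
	 * Note: the while() below runs at most once; the unconditional
	 * break at the bottom makes it a one-shot block so the
	 * size-change cases can share a common exit.
	 */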
2276 	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2277 		switch(ap->a_vp->v_type) {
2278 		case VREG:
2279 			if (vap->va_size == ip->ino_data.size)
2280 				break;
2281 
2282 			/*
2283 			 * Log the operation if in fast-fsync mode or if
2284 			 * there are unterminated redo write records present.
2285 			 *
2286 			 * The second check is needed so the recovery code
2287 			 * properly truncates write redos even if nominal
2288 			 * REDO operation is turned off due to excessive
2289 			 * writes, because the related records might be
2290 			 * destroyed and never lay down a TERM_WRITE.
2291 			 */
2292 			if ((ip->flags & HAMMER_INODE_REDO) ||
2293 			    (ip->flags & HAMMER_INODE_RDIRTY)) {
2294 				error = hammer_generate_redo(&trans, ip,
2295 							     vap->va_size,
2296 							     HAMMER_REDO_TRUNC,
2297 							     NULL, 0);
2298 			}
2299 			blksize = hammer_blocksize(vap->va_size);
2300 
2301 			/*
2302 			 * XXX break atomicity, we can deadlock the backend
2303 			 * if we do not release the lock.  Probably not a
2304 			 * big deal here.
2305 			 */
2306 			if (vap->va_size < ip->ino_data.size) {
2307 				nvtruncbuf(ap->a_vp, vap->va_size,
2308 					   blksize,
2309 					   hammer_blockoff(vap->va_size),
2310 					   0);
2311 				truncating = 1;
2312 				kflags |= NOTE_WRITE;
2313 			} else {
2314 				nvextendbuf(ap->a_vp,
2315 					    ip->ino_data.size,
2316 					    vap->va_size,
2317 					    hammer_blocksize(ip->ino_data.size),
2318 					    hammer_blocksize(vap->va_size),
2319 					    hammer_blockoff(ip->ino_data.size),
2320 					    hammer_blockoff(vap->va_size),
2321 					    0);
2322 				truncating = 0;
2323 				kflags |= NOTE_WRITE | NOTE_EXTEND;
2324 			}
2325 			ip->ino_data.size = vap->va_size;
2326 			ip->ino_data.mtime = trans.time;
2327 			/* XXX safe to use SDIRTY instead of DDIRTY here? */
2328 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2329 
2330 			/*
2331 			 * On-media truncation is cached in the inode until
2332 			 * the inode is synchronized.  We must immediately
2333 			 * handle any frontend records.
2334 			 */
2335 			if (truncating) {
2336 				hammer_ip_frontend_trunc(ip, vap->va_size);
2337 #ifdef DEBUG_TRUNCATE
2338 				if (HammerTruncIp == NULL)
2339 					HammerTruncIp = ip;
2340 #endif
2341 				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2342 					ip->flags |= HAMMER_INODE_TRUNCATED;
2343 					ip->trunc_off = vap->va_size;
2344 					hammer_inode_dirty(ip);
2345 #ifdef DEBUG_TRUNCATE
2346 					if (ip == HammerTruncIp)
2347 					kprintf("truncate1 %016llx\n",
2348 						(long long)ip->trunc_off);
2349 #endif
2350 				} else if (ip->trunc_off > vap->va_size) {
2351 					ip->trunc_off = vap->va_size;
2352 #ifdef DEBUG_TRUNCATE
2353 					if (ip == HammerTruncIp)
2354 					kprintf("truncate2 %016llx\n",
2355 						(long long)ip->trunc_off);
2356 #endif
2357 				} else {
2358 #ifdef DEBUG_TRUNCATE
2359 					if (ip == HammerTruncIp)
2360 					kprintf("truncate3 %016llx (ignored)\n",
2361 						(long long)vap->va_size);
2362 #endif
2363 				}
2364 			}
2365 
2366 #if 0
2367 			/*
2368 			 * When truncating, nvtruncbuf() may have cleaned out
2369 			 * a portion of the last block on-disk in the buffer
2370 			 * cache.  We must clean out any frontend records
2371 			 * for blocks beyond the new last block.
2372 			 */
2373 			aligned_size = (vap->va_size + (blksize - 1)) &
2374 				       ~(int64_t)(blksize - 1);
2375 			if (truncating && vap->va_size < aligned_size) {
2376 				aligned_size -= blksize;
2377 				hammer_ip_frontend_trunc(ip, aligned_size);
2378 			}
2379 #endif
2380 			break;
2381 		case VDATABASE:
2382 			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2383 				ip->flags |= HAMMER_INODE_TRUNCATED;
2384 				ip->trunc_off = vap->va_size;
2385 				hammer_inode_dirty(ip);
2386 			} else if (ip->trunc_off > vap->va_size) {
2387 				ip->trunc_off = vap->va_size;
2388 			}
2389 			hammer_ip_frontend_trunc(ip, vap->va_size);
2390 			ip->ino_data.size = vap->va_size;
2391 			ip->ino_data.mtime = trans.time;
2392 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2393 			kflags |= NOTE_ATTRIB;
2394 			break;
2395 		default:
2396 			error = EINVAL;
2397 			goto done;
2398 		}
2399 		break;
2400 	}
2401 	if (vap->va_atime.tv_sec != VNOVAL) {
2402 		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2403 		modflags |= HAMMER_INODE_ATIME;
2404 		kflags |= NOTE_ATTRIB;
2405 	}
2406 	if (vap->va_mtime.tv_sec != VNOVAL) {
2407 		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2408 		modflags |= HAMMER_INODE_MTIME;
2409 		kflags |= NOTE_ATTRIB;
2410 	}
2411 	if (vap->va_mode != (mode_t)VNOVAL) {
2412 		mode_t   cur_mode = ip->ino_data.mode;
2413 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2414 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2415 
2416 		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2417 					 cur_uid, cur_gid, &cur_mode);
2418 		if (error == 0 && ip->ino_data.mode != cur_mode) {
2419 			ip->ino_data.mode = cur_mode;
2420 			ip->ino_data.ctime = trans.time;
2421 			modflags |= HAMMER_INODE_DDIRTY;
2422 			kflags |= NOTE_ATTRIB;
2423 		}
2424 	}
2425 done:
2426 	if (error == 0)
2427 		hammer_modify_inode(&trans, ip, modflags);
2428 	hammer_done_transaction(&trans);
2429 	hammer_knote(ap->a_vp, kflags);
2430 	lwkt_reltoken(&hmp->fs_token);
2431 	return (error);
2432 }
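
/*
 * Sketch of how the cached truncation above coalesces: the first
 * truncation latches HAMMER_INODE_TRUNCATED and records trunc_off,
 * later truncations only ever lower it (simplified; the real code
 * also calls hammer_inode_dirty()).
 */
#if 0
static void
cache_truncation(hammer_inode_t ip, int64_t new_size)
{
	if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
		ip->flags |= HAMMER_INODE_TRUNCATED;
		ip->trunc_off = new_size;
	} else if (ip->trunc_off > new_size) {
		ip->trunc_off = new_size;	/* keep the minimum */
	}
}
#endif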
2433 
2434 /*
2435  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2436  */
2437 static
2438 int
2439 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2440 {
2441 	struct hammer_transaction trans;
2442 	struct hammer_inode *dip;
2443 	struct hammer_inode *nip;
2444 	hammer_record_t record;
2445 	struct nchandle *nch;
2446 	hammer_mount_t hmp;
2447 	int error;
2448 	int bytes;
2449 
2450 	ap->a_vap->va_type = VLNK;
2451 
2452 	nch = ap->a_nch;
2453 	dip = VTOI(ap->a_dvp);
2454 	hmp = dip->hmp;
2455 
2456 	if (dip->flags & HAMMER_INODE_RO)
2457 		return (EROFS);
2458 	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2459 		return (error);
2460 
2461 	/*
2462 	 * Create a transaction to cover the operations we perform.
2463 	 */
2464 	lwkt_gettoken(&hmp->fs_token);
2465 	hammer_start_transaction(&trans, hmp);
2466 	++hammer_stats_file_iopsw;
2467 
2468 	/*
2469 	 * Create a new filesystem object of the requested type.  The
2470 	 * returned inode will be referenced but not locked.
2471 	 */
2472 
2473 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2474 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2475 				    NULL, &nip);
2476 	if (error) {
2477 		hammer_done_transaction(&trans);
2478 		*ap->a_vpp = NULL;
2479 		lwkt_reltoken(&hmp->fs_token);
2480 		return (error);
2481 	}
2482 
2483 	/*
2484 	 * Add a record representing the symlink.  symlink stores the link
2485 	 * as pure data, not a string, and is not \0-terminated.
2486 	 */
2487 	if (error == 0) {
2488 		bytes = strlen(ap->a_target);
2489 
2490 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
2491 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2492 		} else {
2493 			record = hammer_alloc_mem_record(nip, bytes);
2494 			record->type = HAMMER_MEM_RECORD_GENERAL;
2495 
2496 			record->leaf.base.localization = nip->obj_localization +
2497 							 HAMMER_LOCALIZE_MISC;
2498 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2499 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2500 			record->leaf.data_len = bytes;
2501 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2502 			bcopy(ap->a_target, record->data->symlink.name, bytes);
2503 			error = hammer_ip_add_record(&trans, record);
2504 		}
2505 
2506 		/*
2507 		 * Set the file size to the length of the link.
2508 		 */
2509 		if (error == 0) {
2510 			nip->ino_data.size = bytes;
2511 			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2512 		}
2513 	}
2514 	if (error == 0)
2515 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2516 						nch->ncp->nc_nlen, nip);
2517 
2518 	/*
2519 	 * Finish up.
2520 	 */
2521 	if (error) {
2522 		hammer_rel_inode(nip, 0);
2523 		*ap->a_vpp = NULL;
2524 	} else {
2525 		error = hammer_get_vnode(nip, ap->a_vpp);
2526 		hammer_rel_inode(nip, 0);
2527 		if (error == 0) {
2528 			cache_setunresolved(ap->a_nch);
2529 			cache_setvp(ap->a_nch, *ap->a_vpp);
2530 			hammer_knote(ap->a_dvp, NOTE_WRITE);
2531 		}
2532 	}
2533 	hammer_done_transaction(&trans);
2534 	lwkt_reltoken(&hmp->fs_token);
2535 	return (error);
2536 }
2537 
2538 /*
2539  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2540  */
2541 static
2542 int
2543 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2544 {
2545 	struct hammer_transaction trans;
2546 	struct hammer_inode *dip;
2547 	hammer_mount_t hmp;
2548 	int error;
2549 
2550 	dip = VTOI(ap->a_dvp);
2551 	hmp = dip->hmp;
2552 
2553 	if (hammer_nohistory(dip) == 0 &&
2554 	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2555 		return (error);
2556 	}
2557 
2558 	lwkt_gettoken(&hmp->fs_token);
2559 	hammer_start_transaction(&trans, hmp);
2560 	++hammer_stats_file_iopsw;
2561 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2562 				ap->a_cred, ap->a_flags, -1);
2563 	hammer_done_transaction(&trans);
2564 	lwkt_reltoken(&hmp->fs_token);
2565 
2566 	return (error);
2567 }
2568 
2569 /*
2570  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2571  */
2572 static
2573 int
2574 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2575 {
2576 	struct hammer_inode *ip = ap->a_vp->v_data;
2577 	hammer_mount_t hmp = ip->hmp;
2578 	int error;
2579 
2580 	++hammer_stats_file_iopsr;
2581 	lwkt_gettoken(&hmp->fs_token);
2582 	error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2583 			     ap->a_fflag, ap->a_cred);
2584 	lwkt_reltoken(&hmp->fs_token);
2585 	return (error);
2586 }
2587 
2588 static
2589 int
2590 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2591 {
2592 	static const struct mountctl_opt extraopt[] = {
2593 		{ HMNT_NOHISTORY,	"nohistory" },
2594 		{ HMNT_MASTERID,	"master" },
2595 		{ 0, NULL }
2597 	};
2598 	struct hammer_mount *hmp;
2599 	struct mount *mp;
2600 	int usedbytes;
2601 	int error;
2602 
2603 	error = 0;
2604 	usedbytes = 0;
2605 	mp = ap->a_head.a_ops->head.vv_mount;
2606 	KKASSERT(mp->mnt_data != NULL);
2607 	hmp = (struct hammer_mount *)mp->mnt_data;
2608 
2609 	lwkt_gettoken(&hmp->fs_token);
2610 
2611 	switch(ap->a_op) {
2612 	case MOUNTCTL_SET_EXPORT:
2613 		if (ap->a_ctllen != sizeof(struct export_args))
2614 			error = EINVAL;
2615 		else
2616 			error = hammer_vfs_export(mp, ap->a_op,
2617 				      (const struct export_args *)ap->a_ctl);
2618 		break;
2619 	case MOUNTCTL_MOUNTFLAGS:
2620 	{
2621 		/*
2622 		 * Call standard mountctl VOP function
2623 		 * so we get user mount flags.
2624 		 */
2625 		error = vop_stdmountctl(ap);
2626 		if (error)
2627 			break;
2628 
2629 		usedbytes = *ap->a_res;
2630 
2631 		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2632 			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2633 						    ap->a_buf,
2634 						    ap->a_buflen - usedbytes,
2635 						    &error);
2636 		}
2637 
2638 		*ap->a_res += usedbytes;
2639 		break;
2640 	}
2641 	default:
2642 		error = vop_stdmountctl(ap);
2643 		break;
2644 	}
2645 	lwkt_reltoken(&hmp->fs_token);
2646 	return(error);
2647 }
2648 
2649 /*
2650  * hammer_vop_strategy { vp, bio }
2651  *
2652  * Strategy call, used for regular file read & write only.  Note that the
2653  * bp may represent a cluster.
2654  *
2655  * To simplify operation and allow better optimizations in the future,
2656  * this code does not make any assumptions with regards to buffer alignment
2657  * or size.
2658  */
2659 static
2660 int
2661 hammer_vop_strategy(struct vop_strategy_args *ap)
2662 {
2663 	struct buf *bp;
2664 	int error;
2665 
2666 	bp = ap->a_bio->bio_buf;
2667 
2668 	switch(bp->b_cmd) {
2669 	case BUF_CMD_READ:
2670 		error = hammer_vop_strategy_read(ap);
2671 		break;
2672 	case BUF_CMD_WRITE:
2673 		error = hammer_vop_strategy_write(ap);
2674 		break;
2675 	default:
2676 		bp->b_error = error = EINVAL;
2677 		bp->b_flags |= B_ERROR;
2678 		biodone(ap->a_bio);
2679 		break;
2680 	}
2681 
2682 	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2683 
2684 	return (error);
2685 }
2686 
2687 /*
2688  * Read from a regular file.  Iterate the related records and fill in the
2689  * BIO/BUF.  Gaps are zero-filled.
2690  *
2691  * The support code in hammer_object.c should be used to deal with mixed
2692  * in-memory and on-disk records.
2693  *
2694  * NOTE: Can be called from the cluster code with an oversized buf.
2695  *
2696  * XXX atime update
2697  */
2698 static
2699 int
2700 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2701 {
2702 	struct hammer_transaction trans;
2703 	struct hammer_inode *ip;
2704 	struct hammer_inode *dip;
2705 	hammer_mount_t hmp;
2706 	struct hammer_cursor cursor;
2707 	hammer_base_elm_t base;
2708 	hammer_off_t disk_offset;
2709 	struct bio *bio;
2710 	struct bio *nbio;
2711 	struct buf *bp;
2712 	int64_t rec_offset;
2713 	int64_t ran_end;
2714 	int64_t tmp64;
2715 	int error;
2716 	int boff;
2717 	int roff;
2718 	int n;
2719 	int isdedupable;
2720 
2721 	bio = ap->a_bio;
2722 	bp = bio->bio_buf;
2723 	ip = ap->a_vp->v_data;
2724 	hmp = ip->hmp;
2725 
2726 	/*
2727 	 * The zone-2 disk offset may have been set by the cluster code via
2728 	 * a BMAP operation, or else should be NOOFFSET.
2729 	 *
2730 	 * Checking the high bits for a match against zone-2 should suffice.
2731 	 *
2732 	 * In cases where a lot of data duplication is present it may be
2733 	 * more beneficial to drop through and double-buffer through the
2734 	 * device.
2735 	 */
2736 	nbio = push_bio(bio);
2737 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2738 	    HAMMER_ZONE_LARGE_DATA) {
2739 		if (hammer_double_buffer == 0) {
2740 			lwkt_gettoken(&hmp->fs_token);
2741 			error = hammer_io_direct_read(hmp, nbio, NULL);
2742 			lwkt_reltoken(&hmp->fs_token);
2743 			return (error);
2744 		}
2745 
2746 		/*
2747 		 * Try to shortcut requests for double_buffer mode too.
2748 		 * Since this mode runs through the device buffer cache
2749 		 * Since this mode runs through the device buffer cache,
2750 		 * by normal filesystem buffers) are legal.
2751 		 */
2752 		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2753 			lwkt_gettoken(&hmp->fs_token);
2754 			error = hammer_io_indirect_read(hmp, nbio, NULL);
2755 			lwkt_reltoken(&hmp->fs_token);
2756 			return (error);
2757 		}
2758 	}
2759 
2760 	/*
2761 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2762 	 * aligned we may still be able to issue a direct-read.
2763 	 */
2764 	lwkt_gettoken(&hmp->fs_token);
2765 	hammer_simple_transaction(&trans, hmp);
2766 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2767 
2768 	/*
2769 	 * Key range (begin and end inclusive) to scan.  Note that the key's
2770 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2771 	 * first record containing bio_offset will have a key > bio_offset.
2772 	 */
2773 	cursor.key_beg.localization = ip->obj_localization +
2774 				      HAMMER_LOCALIZE_MISC;
2775 	cursor.key_beg.obj_id = ip->obj_id;
2776 	cursor.key_beg.create_tid = 0;
2777 	cursor.key_beg.delete_tid = 0;
2778 	cursor.key_beg.obj_type = 0;
2779 	cursor.key_beg.key = bio->bio_offset + 1;
2780 	cursor.asof = ip->obj_asof;
2781 	cursor.flags |= HAMMER_CURSOR_ASOF;
2782 
2783 	cursor.key_end = cursor.key_beg;
2784 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2785 #if 0
2786 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2787 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2788 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2789 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2790 	} else
2791 #endif
2792 	{
2793 		ran_end = bio->bio_offset + bp->b_bufsize;
2794 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2795 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2796 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2797 		if (tmp64 < ran_end)
2798 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2799 		else
2800 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2801 	}
2802 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2803 
2804 	/*
2805 	 * Set NOSWAPCACHE for cursor data extraction if double buffering
2806 	 * is disabled or (if the file is not marked cacheable via chflags
2807 	 * and vm.swapcache_use_chflags is enabled).
2808 	 */
2809 	if (hammer_double_buffer == 0 ||
2810 	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2811 	     vm_swapcache_use_chflags)) {
2812 		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2813 	}
2814 
2815 	error = hammer_ip_first(&cursor);
2816 	boff = 0;
2817 
2818 	while (error == 0) {
2819 		/*
2820 		 * Get the base file offset of the record.  The key for
2821 		 * data records is (base + bytes) rather than (base).
2822 		 */
2823 		base = &cursor.leaf->base;
2824 		rec_offset = base->key - cursor.leaf->data_len;
2825 
2826 		/*
2827 		 * Calculate the gap, if any, and zero-fill it.
2828 		 *
2829 		 * n is the offset of the start of the record versus our
2830 		 * current seek offset in the bio.
2831 		 */
2832 		n = (int)(rec_offset - (bio->bio_offset + boff));
2833 		if (n > 0) {
2834 			if (n > bp->b_bufsize - boff)
2835 				n = bp->b_bufsize - boff;
2836 			bzero((char *)bp->b_data + boff, n);
2837 			boff += n;
2838 			n = 0;
2839 		}
2840 
2841 		/*
2842 		 * Calculate the data offset in the record and the number
2843 		 * of bytes we can copy.
2844 		 *
2845 		 * There are two degenerate cases.  First, boff may already
2846 		 * be at bp->b_bufsize.  Secondly, the data offset within
2847 		 * the record may exceed the record's size.
2848 		 */
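		/*
		 * At this point n <= 0; -n is how far our seek point
		 * (bio->bio_offset + boff) sits inside the record.
		 */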
2849 		roff = -n;
2850 		rec_offset += roff;
2851 		n = cursor.leaf->data_len - roff;
2852 		if (n <= 0) {
2853 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2854 			n = 0;
2855 		} else if (n > bp->b_bufsize - boff) {
2856 			n = bp->b_bufsize - boff;
2857 		}
2858 
2859 		/*
2860 		 * Deal with cached truncations.  This cool bit of code
2861 		 * allows truncate()/ftruncate() to avoid having to sync
2862 		 * the file.
2863 		 *
2864 		 * If the frontend is truncated then all backend records are
2865 		 * subject to the frontend's truncation.
2866 		 *
2867 		 * If the backend is truncated then backend records on-disk
2868 		 * (but not in-memory) are subject to the backend's
2869 		 * truncation.  In-memory records owned by the backend
2870 		 * represent data written after the truncation point on the
2871 		 * backend and must not be truncated.
2872 		 *
2873 		 * Truncate operations deal with frontend buffer cache
2874 		 * buffers and frontend-owned in-memory records synchronously.
2875 		 */
2876 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2877 			if (hammer_cursor_ondisk(&cursor)/* ||
2878 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2879 				if (ip->trunc_off <= rec_offset)
2880 					n = 0;
2881 				else if (ip->trunc_off < rec_offset + n)
2882 					n = (int)(ip->trunc_off - rec_offset);
2883 			}
2884 		}
2885 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2886 			if (hammer_cursor_ondisk(&cursor)) {
2887 				if (ip->sync_trunc_off <= rec_offset)
2888 					n = 0;
2889 				else if (ip->sync_trunc_off < rec_offset + n)
2890 					n = (int)(ip->sync_trunc_off - rec_offset);
2891 			}
2892 		}
2893 
2894 		/*
2895 		 * Try to issue a direct read into our bio if possible,
2896 		 * otherwise resolve the element data into a hammer_buffer
2897 		 * and copy.
2898 		 *
2899 		 * The buffer on-disk should be zeroed past any real
2900 		 * truncation point, but may not be for any synthesized
2901 		 * truncation point from above.
2902 		 *
2903 		 * NOTE: disk_offset is only valid if the cursor data is
2904 		 *	 on-disk.
2905 		 */
2906 		disk_offset = cursor.leaf->data_offset + roff;
2907 		isdedupable = (boff == 0 && n == bp->b_bufsize &&
2908 			       hammer_cursor_ondisk(&cursor) &&
2909 			       ((int)disk_offset & HAMMER_BUFMASK) == 0);
2910 
2911 		if (isdedupable && hammer_double_buffer == 0) {
2912 			/*
2913 			 * Direct read case
2914 			 */
2915 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2916 				 HAMMER_ZONE_LARGE_DATA);
2917 			nbio->bio_offset = disk_offset;
2918 			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2919 			if (hammer_live_dedup && error == 0)
2920 				hammer_dedup_cache_add(ip, cursor.leaf);
2921 			goto done;
2922 		} else if (isdedupable) {
2923 			/*
2924 			 * Async I/O case for reading from backing store
2925 			 * and copying the data to the filesystem buffer.
2926 			 * live-dedup has to verify the data anyway if it
2927 			 * gets a hit later so we can just add the entry
2928 			 * now.
2929 			 */
2930 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2931 				 HAMMER_ZONE_LARGE_DATA);
2932 			nbio->bio_offset = disk_offset;
2933 			if (hammer_live_dedup)
2934 				hammer_dedup_cache_add(ip, cursor.leaf);
2935 			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2936 			goto done;
2937 		} else if (n) {
2938 			error = hammer_ip_resolve_data(&cursor);
2939 			if (error == 0) {
2940 				if (hammer_live_dedup && isdedupable)
2941 					hammer_dedup_cache_add(ip, cursor.leaf);
2942 				bcopy((char *)cursor.data + roff,
2943 				      (char *)bp->b_data + boff, n);
2944 			}
2945 		}
2946 		if (error)
2947 			break;
2948 
2949 		/*
2950 		 * We have to be sure that the only elements added to the
2951 		 * dedup cache are those which are already on-media.
2952 		 */
2953 		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2954 			hammer_dedup_cache_add(ip, cursor.leaf);
2955 
2956 		/*
2957 		 * Iterate until we have filled the request.
2958 		 */
2959 		boff += n;
2960 		if (boff == bp->b_bufsize)
2961 			break;
2962 		error = hammer_ip_next(&cursor);
2963 	}
2964 
2965 	/*
2966 	 * There may have been a gap after the last record
2967 	 */
2968 	if (error == ENOENT)
2969 		error = 0;
2970 	if (error == 0 && boff != bp->b_bufsize) {
2971 		KKASSERT(boff < bp->b_bufsize);
2972 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2973 		/* boff = bp->b_bufsize; */
2974 	}
2975 
2976 	/*
2977 	 * Disallow swapcache operation on the vnode buffer if double
2978 	 * buffering is enabled, the swapcache will get the data via
2979 	 * the block device buffer.
2980 	 */
2981 	if (hammer_double_buffer)
2982 		bp->b_flags |= B_NOTMETA;
2983 
2984 	/*
2985 	 * Cleanup
2986 	 */
2987 	bp->b_resid = 0;
2988 	bp->b_error = error;
2989 	if (error)
2990 		bp->b_flags |= B_ERROR;
2991 	biodone(ap->a_bio);
2992 
2993 done:
2994 	/*
2995 	 * Cache the b-tree node for the last data read in cache[1].
2996 	 *
2997 	 * If we hit the file EOF then also cache the node in the
2998 	 * governing directory's cache[3], it will be used to initialize
2999 	 * the inode's cache[1] for any inodes looked up via the directory.
3000 	 *
3001 	 * This doesn't reduce disk accesses since the B-Tree chain is
3002 	 * likely cached, but it does reduce cpu overhead when looking
3003 	 * up file offsets for cpdup/tar/cpio style iterations.
3004 	 */
3005 	if (cursor.node)
3006 		hammer_cache_node(&ip->cache[1], cursor.node);
3007 	if (ran_end >= ip->ino_data.size) {
3008 		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
3009 					ip->obj_asof, ip->obj_localization);
3010 		if (dip) {
3011 			hammer_cache_node(&dip->cache[3], cursor.node);
3012 			hammer_rel_inode(dip, 0);
3013 		}
3014 	}
3015 	hammer_done_cursor(&cursor);
3016 	hammer_done_transaction(&trans);
3017 	lwkt_reltoken(&hmp->fs_token);
3018 	return(error);
3019 }
3020 
3021 /*
3022  * BMAP operation - used to support cluster_read() only.
3023  *
3024  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3025  *
3026  * This routine may return EOPNOTSUPP if the operation is not supported for
3027  * the specified offset.  The contents of the pointer arguments do not
3028  * need to be initialized in that case.
3029  *
3030  * If a disk address is available and properly aligned return 0 with
3031  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3032  * to the run-length relative to that offset.  Callers may assume that
3033  * *doffsetp is valid whenever 0 is returned, even if *runp is small,
3034  * so we return EOPNOTSUPP rather than an insufficiently large run.
3035  */
3036 static
3037 int
3038 hammer_vop_bmap(struct vop_bmap_args *ap)
3039 {
3040 	struct hammer_transaction trans;
3041 	struct hammer_inode *ip;
3042 	hammer_mount_t hmp;
3043 	struct hammer_cursor cursor;
3044 	hammer_base_elm_t base;
3045 	int64_t rec_offset;
3046 	int64_t ran_end;
3047 	int64_t tmp64;
3048 	int64_t base_offset;
3049 	int64_t base_disk_offset;
3050 	int64_t last_offset;
3051 	hammer_off_t last_disk_offset;
3052 	hammer_off_t disk_offset;
3053 	int	rec_len;
3054 	int	error;
3055 	int	blksize;
3056 
3057 	++hammer_stats_file_iopsr;
3058 	ip = ap->a_vp->v_data;
3059 	hmp = ip->hmp;
3060 
3061 	/*
3062 	 * We can only BMAP regular files.  We can't BMAP database files,
3063 	 * directories, etc.
3064 	 */
3065 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
3066 		return(EOPNOTSUPP);
3067 
3068 	/*
3069 	 * bmap is typically called with runp/runb both NULL when used
3070 	 * for writing.  We do not support BMAP for writing atm.
3071 	 */
3072 	if (ap->a_cmd != BUF_CMD_READ)
3073 		return(EOPNOTSUPP);
3074 
3075 	/*
3076 	 * Scan the B-Tree to acquire blockmap addresses, then translate
3077 	 * to raw addresses.
3078 	 */
3079 	lwkt_gettoken(&hmp->fs_token);
3080 	hammer_simple_transaction(&trans, hmp);
3081 #if 0
3082 	kprintf("bmap_beg %016llx ip->cache %p\n",
3083 		(long long)ap->a_loffset, ip->cache[1]);
3084 #endif
3085 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3086 
3087 	/*
3088 	 * Key range (begin and end inclusive) to scan.  Note that the key's
3089 	 * stored in the actual records represent BASE+LEN, not BASE.  The
3090 	 * first record containing bio_offset will have a key > bio_offset.
3091 	 */
3092 	cursor.key_beg.localization = ip->obj_localization +
3093 				      HAMMER_LOCALIZE_MISC;
3094 	cursor.key_beg.obj_id = ip->obj_id;
3095 	cursor.key_beg.create_tid = 0;
3096 	cursor.key_beg.delete_tid = 0;
3097 	cursor.key_beg.obj_type = 0;
3098 	if (ap->a_runb)
3099 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3100 	else
3101 		cursor.key_beg.key = ap->a_loffset + 1;
3102 	if (cursor.key_beg.key < 0)
3103 		cursor.key_beg.key = 0;
3104 	cursor.asof = ip->obj_asof;
3105 	cursor.flags |= HAMMER_CURSOR_ASOF;
3106 
3107 	cursor.key_end = cursor.key_beg;
3108 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3109 
3110 	ran_end = ap->a_loffset + MAXPHYS;
3111 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3112 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3113 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
3114 	if (tmp64 < ran_end)
3115 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3116 	else
3117 		cursor.key_end.key = ran_end + MAXPHYS + 1;
3118 
3119 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3120 
3121 	error = hammer_ip_first(&cursor);
3122 	base_offset = last_offset = 0;
3123 	base_disk_offset = last_disk_offset = 0;
3124 
3125 	while (error == 0) {
3126 		/*
3127 		 * Get the base file offset of the record.  The key for
3128 		 * data records is (base + bytes) rather than (base).
3129 		 *
3130 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
3131 		 * The extra bytes should be zero on-disk and the BMAP op
3132 		 * should still be ok.
3133 		 */
3134 		base = &cursor.leaf->base;
3135 		rec_offset = base->key - cursor.leaf->data_len;
3136 		rec_len    = cursor.leaf->data_len;
3137 
3138 		/*
3139 		 * Incorporate any cached truncation.
3140 		 *
3141 		 * NOTE: Modifications to rec_len based on synthesized
3142 		 * truncation points remove the guarantee that any extended
3143 		 * data on disk is zero (since the truncations may not have
3144 		 * taken place on-media yet).
3145 		 */
3146 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
3147 			if (hammer_cursor_ondisk(&cursor) ||
3148 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3149 				if (ip->trunc_off <= rec_offset)
3150 					rec_len = 0;
3151 				else if (ip->trunc_off < rec_offset + rec_len)
3152 					rec_len = (int)(ip->trunc_off - rec_offset);
3153 			}
3154 		}
3155 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3156 			if (hammer_cursor_ondisk(&cursor)) {
3157 				if (ip->sync_trunc_off <= rec_offset)
3158 					rec_len = 0;
3159 				else if (ip->sync_trunc_off < rec_offset + rec_len)
3160 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
3161 			}
3162 		}
3163 
3164 		/*
3165 		 * Accumulate information.  If we have hit a discontiguous
3166 		 * block reset base_offset unless we are already beyond the
3167 		 * requested offset.  If we are, that's it, we stop.
3168 		 */
3169 		if (error)
3170 			break;
3171 		if (hammer_cursor_ondisk(&cursor)) {
3172 			disk_offset = cursor.leaf->data_offset;
3173 			if (rec_offset != last_offset ||
3174 			    disk_offset != last_disk_offset) {
3175 				if (rec_offset > ap->a_loffset)
3176 					break;
3177 				base_offset = rec_offset;
3178 				base_disk_offset = disk_offset;
3179 			}
3180 			last_offset = rec_offset + rec_len;
3181 			last_disk_offset = disk_offset + rec_len;
3182 
3183 			if (hammer_live_dedup)
3184 				hammer_dedup_cache_add(ip, cursor.leaf);
3185 		}
3186 
3187 		error = hammer_ip_next(&cursor);
3188 	}
3189 
3190 #if 0
3191 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
3192 		(long long)ap->a_loffset,
3193 		(long long)base_offset,
3194 		(long long)last_offset);
3195 	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3196 		(long long)base_disk_offset,
3197 		(long long)last_disk_offset);
3198 #endif
3199 
3200 	if (cursor.node) {
3201 		hammer_cache_node(&ip->cache[1], cursor.node);
3202 #if 0
3203 		kprintf("bmap_end2 %016llx ip->cache %p\n",
3204 			(long long)ap->a_loffset, ip->cache[1]);
3205 #endif
3206 	}
3207 	hammer_done_cursor(&cursor);
3208 	hammer_done_transaction(&trans);
3209 	lwkt_reltoken(&hmp->fs_token);
3210 
3211 	/*
3212 	 * If we couldn't find any records or the records we did find were
3213 	 * all behind the requested offset, return failure.  A forward
3214 	 * truncation can leave a hole w/ no on-disk records.
3215 	 */
3216 	if (last_offset == 0 || last_offset < ap->a_loffset)
3217 		return (EOPNOTSUPP);
3218 
3219 	/*
3220 	 * Figure out the block size at the requested offset and adjust
3221 	 * our limits so the cluster_read() does not create inappropriately
3222 	 * sized buffer cache buffers.
3223 	 */
3224 	blksize = hammer_blocksize(ap->a_loffset);
3225 	if (hammer_blocksize(base_offset) != blksize) {
3226 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3227 	}
3228 	if (last_offset != ap->a_loffset &&
3229 	    hammer_blocksize(last_offset - 1) != blksize) {
3230 		last_offset = hammer_blockdemarc(ap->a_loffset,
3231 						 last_offset - 1);
3232 	}
3233 
3234 	/*
3235 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3236 	 * from occurring.
3237 	 */
3238 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3239 
3240 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3241 		/*
3242 		 * Only large-data zones can be direct-IOd
3243 		 */
3244 		error = EOPNOTSUPP;
3245 	} else if ((disk_offset & HAMMER_BUFMASK) ||
3246 		   (last_offset - ap->a_loffset) < blksize) {
3247 		/*
3248 		 * doffsetp is not aligned or the forward run size does
3249 		 * not cover a whole buffer, disallow the direct I/O.
3250 		 */
3251 		error = EOPNOTSUPP;
3252 	} else {
3253 		/*
3254 		 * We're good.
3255 		 */
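		/*
		 * Example with hypothetical offsets: base_offset 0x10000,
		 * loffset 0x14000, last_offset 0x20000 gives
		 * *doffsetp = base_disk_offset + 0x4000, *runb = 0x4000
		 * and *runp = 0xc000.
		 */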
3256 		*ap->a_doffsetp = disk_offset;
3257 		if (ap->a_runb) {
3258 			*ap->a_runb = ap->a_loffset - base_offset;
3259 			KKASSERT(*ap->a_runb >= 0);
3260 		}
3261 		if (ap->a_runp) {
3262 			*ap->a_runp = last_offset - ap->a_loffset;
3263 			KKASSERT(*ap->a_runp >= 0);
3264 		}
3265 		error = 0;
3266 	}
3267 	return(error);
3268 }
3269 
3270 /*
3271  * Write to a regular file.   Because this is a strategy call the OS is
3272  * trying to actually get data onto the media.
3273  */
3274 static
3275 int
3276 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3277 {
3278 	hammer_record_t record;
3279 	hammer_mount_t hmp;
3280 	hammer_inode_t ip;
3281 	struct bio *bio;
3282 	struct buf *bp;
3283 	int blksize __debugvar;
3284 	int bytes;
3285 	int error;
3286 
3287 	bio = ap->a_bio;
3288 	bp = bio->bio_buf;
3289 	ip = ap->a_vp->v_data;
3290 	hmp = ip->hmp;
3291 
3292 	blksize = hammer_blocksize(bio->bio_offset);
3293 	KKASSERT(bp->b_bufsize == blksize);
3294 
3295 	if (ip->flags & HAMMER_INODE_RO) {
3296 		bp->b_error = EROFS;
3297 		bp->b_flags |= B_ERROR;
3298 		biodone(ap->a_bio);
3299 		return(EROFS);
3300 	}
3301 
3302 	lwkt_gettoken(&hmp->fs_token);
3303 
3304 	/*
3305 	 * Disallow swapcache operation on the vnode buffer if double
3306 	 * buffering is enabled, the swapcache will get the data via
3307 	 * the block device buffer.
3308 	 */
3309 	if (hammer_double_buffer)
3310 		bp->b_flags |= B_NOTMETA;
3311 
3312 	/*
3313 	 * Interlock with inode destruction (no in-kernel or directory
3314 	 * topology visibility).  If we queue new IO while trying to
3315 	 * destroy the inode we can deadlock the vtrunc call in
3316 	 * hammer_inode_unloadable_check().
3317 	 *
3318 	 * Besides, there's no point flushing a bp associated with an
3319 	 * inode that is being destroyed on-media and has no kernel
3320 	 * references.
3321 	 */
3322 	if ((ip->flags | ip->sync_flags) &
3323 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3324 		bp->b_resid = 0;
3325 		biodone(ap->a_bio);
3326 		lwkt_reltoken(&hmp->fs_token);
3327 		return(0);
3328 	}
3329 
3330 	/*
3331 	 * Reserve space and issue a direct-write from the front-end.
3332 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
3333 	 * allocations.
3334 	 *
3335 	 * An in-memory record will be installed to reference the storage
3336 	 * until the flusher can get to it.
3337 	 *
3338 	 * Since we own the high level bio the front-end will not try to
3339 	 * do a direct-read until the write completes.
3340 	 *
3341 	 * NOTE: The only time we do not reserve a full-sized buffers
3342 	 * NOTE: The only time we do not reserve a full-sized buffer's
3343 	 * allocate a fragment (from the small-data zone) at the end of
3344 	 * an otherwise large file as this can lead to wildly separated
3345 	 * data.
3346 	 */
3347 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3348 	KKASSERT(bio->bio_offset < ip->ino_data.size);
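	/*
	 * Small files (offset 0, size <= HAMMER_BUFSIZE / 2) reserve only
	 * the bytes actually used, rounded up to a 16-byte boundary;
	 * everything else reserves the full buffer.
	 */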
3349 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3350 		bytes = bp->b_bufsize;
3351 	else
3352 		bytes = ((int)ip->ino_data.size + 15) & ~15;
3353 
3354 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3355 				    bytes, &error);
3356 
3357 	/*
3358 	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3359 	 * in hammer_vop_write().  We must flag the record so the proper
3360 	 * REDO_TERM_WRITE entry is generated during the flush.
3361 	 */
3362 	if (record) {
3363 		if (bp->b_flags & B_VFSFLAG1) {
3364 			record->flags |= HAMMER_RECF_REDO;
3365 			bp->b_flags &= ~B_VFSFLAG1;
3366 		}
3367 		if (record->flags & HAMMER_RECF_DEDUPED) {
3368 			bp->b_resid = 0;
3369 			hammer_ip_replace_bulk(hmp, record);
3370 			biodone(ap->a_bio);
3371 		} else {
3372 			hammer_io_direct_write(hmp, bio, record);
3373 		}
3374 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3375 			hammer_flush_inode(ip, 0);
3376 	} else {
3377 		bp->b_bio2.bio_offset = NOOFFSET;
3378 		bp->b_error = error;
3379 		bp->b_flags |= B_ERROR;
3380 		biodone(ap->a_bio);
3381 	}
3382 	lwkt_reltoken(&hmp->fs_token);
3383 	return(error);
3384 }
3385 
3386 /*
3387  * dounlink - disconnect a directory entry
3388  *
3389  * XXX whiteout support not really in yet
3390  */
3391 static int
3392 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3393 		struct vnode *dvp, struct ucred *cred,
3394 		int flags, int isdir)
3395 {
3396 	struct namecache *ncp;
3397 	hammer_inode_t dip;
3398 	hammer_inode_t ip;
3399 	hammer_mount_t hmp;
3400 	struct hammer_cursor cursor;
3401 	int64_t namekey;
3402 	u_int32_t max_iterations;
3403 	int nlen, error;
3404 
3405 	/*
3406 	 * Calculate the namekey and setup the key range for the scan.  This
3407 	 * works kinda like a chained hash table where the lower 32 bits
3408 	 * of the namekey synthesize the chain.
3409 	 *
3410 	 * The key range is inclusive of both key_beg and key_end.
3411 	 */
3412 	dip = VTOI(dvp);
3413 	ncp = nch->ncp;
3414 	hmp = dip->hmp;
3415 
3416 	if (dip->flags & HAMMER_INODE_RO)
3417 		return (EROFS);
3418 
3419 	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3420 					   &max_iterations);
3421 retry:
3422 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3423 	cursor.key_beg.localization = dip->obj_localization +
3424 				      hammer_dir_localization(dip);
3425 	cursor.key_beg.obj_id = dip->obj_id;
3426 	cursor.key_beg.key = namekey;
3427 	cursor.key_beg.create_tid = 0;
3428 	cursor.key_beg.delete_tid = 0;
3429 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3430 	cursor.key_beg.obj_type = 0;
3431 
3432 	cursor.key_end = cursor.key_beg;
3433 	cursor.key_end.key += max_iterations;
3434 	cursor.asof = dip->obj_asof;
3435 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3436 
3437 	/*
3438 	 * Scan all matching records (the chain), locate the one matching
3439 	 * the requested path component.  The error code on search
3440 	 * termination could be 0, ENOENT, or something else.
3442 	 *
3443 	 * The hammer_ip_*() functions merge in-memory records with on-disk
3444 	 * records for the purposes of the search.
3445 	 */
3446 	error = hammer_ip_first(&cursor);
3447 
3448 	while (error == 0) {
3449 		error = hammer_ip_resolve_data(&cursor);
3450 		if (error)
3451 			break;
3452 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3453 		KKASSERT(nlen > 0);
3454 		if (ncp->nc_nlen == nlen &&
3455 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3456 			break;
3457 		}
3458 		error = hammer_ip_next(&cursor);
3459 	}
3460 
3461 	/*
3462 	 * If all is ok we have to get the inode so we can adjust nlinks.
3463 	 * To avoid a deadlock with the flusher we must release the inode
3464 	 * lock on the directory when acquiring the inode for the entry.
3465 	 *
3466 	 * If the target is a directory, it must be empty.
3467 	 */
3468 	if (error == 0) {
3469 		hammer_unlock(&cursor.ip->lock);
3470 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3471 				      hmp->asof,
3472 				      cursor.data->entry.localization,
3473 				      0, &error);
3474 		hammer_lock_sh(&cursor.ip->lock);
3475 		if (error == ENOENT) {
3476 			kprintf("HAMMER: WARNING: Removing "
3477 				"dirent w/missing inode \"%s\"\n"
3478 				"\tobj_id = %016llx\n",
3479 				ncp->nc_name,
3480 				(long long)cursor.data->entry.obj_id);
3481 			error = 0;
3482 		}
3483 
3484 		/*
3485 		 * If isdir >= 0 we validate that the entry is or is not a
3486 		 * directory.  If isdir < 0 we don't care.
3487 		 */
3488 		if (error == 0 && isdir >= 0 && ip) {
3489 			if (isdir &&
3490 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3491 				error = ENOTDIR;
3492 			} else if (isdir == 0 &&
3493 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3494 				error = EISDIR;
3495 			}
3496 		}
3497 
3498 		/*
3499 		 * If we are trying to remove a directory the directory must
3500 		 * be empty.
3501 		 *
3502 		 * The check directory code can loop and deadlock/retry.  Our
3503 		 * own cursor's node locks must be released to avoid a 3-way
3504 		 * deadlock with the flusher if the check directory code
3505 		 * blocks.
3506 		 *
3507 		 * If any changes whatsoever have been made to the cursor
3508 		 * set EDEADLK and retry.
3509 		 *
3510 		 * WARNING: See warnings in hammer_unlock_cursor()
3511 		 *	    function.
3512 		 */
3513 		if (error == 0 && ip && ip->ino_data.obj_type ==
3514 				        HAMMER_OBJTYPE_DIRECTORY) {
3515 			hammer_unlock_cursor(&cursor);
3516 			error = hammer_ip_check_directory_empty(trans, ip);
3517 			hammer_lock_cursor(&cursor);
3518 			if (cursor.flags & HAMMER_CURSOR_RETEST) {
3519 				kprintf("HAMMER: Warning: avoided deadlock "
3520 					"on rmdir '%s'\n",
3521 					ncp->nc_name);
3522 				error = EDEADLK;
3523 			}
3524 		}
3525 
3526 		/*
3527 		 * Delete the directory entry.
3528 		 *
3529 		 * WARNING: hammer_ip_del_directory() may have to terminate
3530 		 * the cursor to avoid a deadlock.  It is ok to call
3531 		 * hammer_done_cursor() twice.
3532 		 */
3533 		if (error == 0) {
3534 			error = hammer_ip_del_directory(trans, &cursor,
3535 							dip, ip);
3536 		}
3537 		hammer_done_cursor(&cursor);
3538 		if (error == 0) {
3539 			/*
3540 			 * Tell the namecache that we are now unlinked.
3541 			 */
3542 			cache_unlink(nch);
3543 
3544 			/*
3545 			 * NOTE: ip->vp, if non-NULL, cannot be directly
3546 			 *	 referenced without formally acquiring the
3547 			 *	 vp since the vp might have zero refs on it,
3548 			 *	 or in the middle of a reclaim, etc.
3549 			 *
3550 			 * NOTE: The cache_setunresolved() can rip the vp
3551 			 *	 out from under us since the vp may not have
3552 			 *	 any refs, in which case ip->vp will be NULL
3553 			 *	 from the outset.
3554 			 */
3555 			while (ip && ip->vp) {
3556 				struct vnode *vp;
3557 
3558 				error = hammer_get_vnode(ip, &vp);
3559 				if (error == 0 && vp) {
3560 					vn_unlock(vp);
3561 					hammer_knote(ip->vp, NOTE_DELETE);
3562 #if 0
3563 					/*
3564 					 * Don't do this, it can deadlock
3565 					 * on concurrent rm's of hardlinks.
3566 					 * Shouldn't be needed any more.
3567 					 */
3568 					cache_inval_vp(ip->vp, CINV_DESTROY);
3569 #endif
3570 					vrele(vp);
3571 					break;
3572 				}
3573 				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3574 			}
3575 		}
3576 		if (ip)
3577 			hammer_rel_inode(ip, 0);
3578 	} else {
3579 		hammer_done_cursor(&cursor);
3580 	}
3581 	if (error == EDEADLK)
3582 		goto retry;
3583 
3584 	return (error);
3585 }

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 */
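
/*
 * These handlers wrap fifofs: each forwards its call to fifo_vnode_vops
 * via VOCALL() and layers any HAMMER-specific handling (timestamp
 * updates, kq filter fallback) on top.
 */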
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

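/*
 * Let the fifo layer try the filter first.  If fifofs rejects it,
 * fall back to the regular HAMMER kqfilter handler so EVFILT_VNODE
 * registration still works on fifos.
 */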
static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	if (error)
		error = hammer_vop_kqfilter(ap);
	return(error);
}

/************************************************************************
 *			    KQFILTER OPS				*
 ************************************************************************
 *
 */
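
/*
 * Lifecycle: hammer_vop_kqfilter() attaches a knote to the vnode's
 * ki_note list, hammer_knote() activates it when the filesystem posts
 * an event (e.g. NOTE_DELETE on unlink above), and filt_hammerdetach()
 * unhooks it.
 */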
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

static struct filterops hammerread_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
	  NULL, filt_hammerdetach, filt_hammervnode };

static
int
hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &hammerread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &hammerwrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &hammervnode_filtops;
		break;
	default:
		return (EOPNOTSUPP);
	}

	kn->kn_hook = (caddr_t)vp;

	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);

	return(0);
}
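
/*
 * Illustrative userland sketch (not kernel code, not compiled here):
 * arming the EVFILT_VNODE path implemented above to watch a file for
 * deletion.  watch_delete() and its path argument are hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>

int
watch_delete(const char *path)
{
	struct kevent kev;
	int fd, kq;

	if ((fd = open(path, O_RDONLY)) < 0 || (kq = kqueue()) < 0)
		return (-1);
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	       NOTE_DELETE | NOTE_RENAME, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		return (-1);
	/* Block until the vnode reports an event */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) > 0 &&
	    (kev.fflags & NOTE_DELETE))
		printf("%s was deleted\n", path);
	return (0);
}
#endif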
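/*
 * Detach: unhook the knote from the vnode's note list, undoing the
 * knote_insert() performed by hammer_vop_kqfilter().
 */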
static void
filt_hammerdetach(struct knote *kn)
{
	struct vnode *vp = (void *)kn->kn_hook;

	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
}

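/*
 * EVFILT_READ: kn_data reports the number of bytes readable past the
 * current file position, clamped to INTPTR_MAX (e.g. a 100 byte file
 * with f_offset at 40 reports kn_data = 60).  The fs_token is held
 * while sampling the inode size.  NOTE_REVOKE forces EOF; NOTE_OLDAPI
 * callers always see the event as ready.
 */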
static int
filt_hammerread(struct knote *kn, long hint)
{
	struct vnode *vp = (void *)kn->kn_hook;
	hammer_inode_t ip = VTOI(vp);
	hammer_mount_t hmp = ip->hmp;
	off_t off;

	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return(1);
	}
	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
	off = ip->ino_data.size - kn->kn_fp->f_offset;
	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	lwkt_reltoken(&hmp->fs_token);
	if (kn->kn_sfflags & NOTE_OLDAPI)
		return(1);
	return (kn->kn_data != 0);
}

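/*
 * EVFILT_WRITE: regular HAMMER files are always considered writable,
 * so no byte count is reported and the filter always fires
 * (NOTE_REVOKE again forces EOF).
 */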
static int
filt_hammerwrite(struct knote *kn, long hint)
{
	if (hint == NOTE_REVOKE)
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
	kn->kn_data = 0;
	return (1);
}

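/*
 * EVFILT_VNODE: accumulate hinted events the caller subscribed to in
 * kn_fflags.  NOTE_REVOKE forces EOF/NODATA; otherwise the knote fires
 * once at least one subscribed event has been latched.
 */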
static int
filt_hammervnode(struct knote *kn, long hint)
{
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		return (1);
	}
	return (kn->kn_fflags != 0);
}