xref: /dragonfly/sys/vfs/hammer/hammer_vnops.c (revision 9f3fc534)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51 
52 /*
53  * USERFS VNOPS
54  */
55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
56 static int hammer_vop_fsync(struct vop_fsync_args *);
57 static int hammer_vop_read(struct vop_read_args *);
58 static int hammer_vop_write(struct vop_write_args *);
59 static int hammer_vop_access(struct vop_access_args *);
60 static int hammer_vop_advlock(struct vop_advlock_args *);
61 static int hammer_vop_close(struct vop_close_args *);
62 static int hammer_vop_ncreate(struct vop_ncreate_args *);
63 static int hammer_vop_getattr(struct vop_getattr_args *);
64 static int hammer_vop_nresolve(struct vop_nresolve_args *);
65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
66 static int hammer_vop_nlink(struct vop_nlink_args *);
67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
68 static int hammer_vop_nmknod(struct vop_nmknod_args *);
69 static int hammer_vop_open(struct vop_open_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_markatime(struct vop_markatime_args *);
77 static int hammer_vop_setattr(struct vop_setattr_args *);
78 static int hammer_vop_strategy(struct vop_strategy_args *);
79 static int hammer_vop_bmap(struct vop_bmap_args *ap);
80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
82 static int hammer_vop_ioctl(struct vop_ioctl_args *);
83 static int hammer_vop_mountctl(struct vop_mountctl_args *);
84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
85 
86 static int hammer_vop_fifoclose (struct vop_close_args *);
87 static int hammer_vop_fiforead (struct vop_read_args *);
88 static int hammer_vop_fifowrite (struct vop_write_args *);
89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
90 
91 static int hammer_vop_specclose (struct vop_close_args *);
92 static int hammer_vop_specread (struct vop_read_args *);
93 static int hammer_vop_specwrite (struct vop_write_args *);
94 static int hammer_vop_specgetattr (struct vop_getattr_args *);
95 
/*
 * Vnode operations vector for regular HAMMER vnodes (files, directories,
 * symlinks).  Unlisted operations fall through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime = 	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};
133 
/*
 * Vnode operations vector for device-special vnodes.  Data operations
 * (read/write/close) use the spec wrappers; attribute and lifecycle
 * operations are shared with the regular vector.  Unlisted operations
 * fall through to spec_vnoperate.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		spec_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_specread,
	.vop_write =		hammer_vop_specwrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_specclose,
	.vop_markatime = 	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_specgetattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};
147 
/*
 * Vnode operations vector for fifo vnodes.  Data operations use the
 * fifo wrappers; unlisted operations fall through to fifo_vnoperate.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime = 	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};
162 
163 static __inline
164 void
165 hammer_knote(struct vnode *vp, int flags)
166 {
167 	if (flags)
168 		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
169 }
170 
171 #ifdef DEBUG_TRUNCATE
172 struct hammer_inode *HammerTruncIp;
173 #endif
174 
175 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
176 			   struct vnode *dvp, struct ucred *cred,
177 			   int flags, int isdir);
178 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
179 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
180 
#if 0
/*
 * Pass any unhandled vnode operation through the HAMMER vnode
 * operations vector.  Currently disabled; .vop_default is wired
 * directly to vop_defaultop in hammer_vnode_vops instead.
 *
 * Fix: the parameter name "ap" was missing from the signature even
 * though the body references it, so this block would not compile if
 * the #if 0 were ever removed.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
189 
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	/* statistics counter */
	++hammer_count_fsyncs;
	/* Flush dirty buffers for the vnode, then queue the inode to the
	 * flusher backend. */
	vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT) {
		/*
		 * Synchronous fsync: drop the vnode lock while waiting
		 * for the flush to complete, then reacquire it before
		 * returning.  NOTE(review): presumably the unlock avoids
		 * deadlocking against the flusher -- confirm.
		 */
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	/* Report any error recorded on the inode during flushing. */
	return (ip->error);
}
213 
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file via the buffer cache, copying data into the
 * caller's uio one block at a time.  HAMMER's block size varies with
 * the file offset (see hammer_blocksize()).
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;		/* byte offset within the current block */
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;			/* bytes to copy out of the current block */
	int seqcount;
	int ioseqcount;
	int blksize;

	/* Only regular files are readable through this path. */
	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 * The upper 16 bits of a_ioflag carry the caller's sequential
	 * access estimate; the larger of the two estimates wins.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		/* Clip the copy to the uio residual and to the file EOF. */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}

	/*
	 * Update atime unless the inode is read-only (e.g. an as-of
	 * snapshot view) or the mount is noatime.
	 */
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_data.atime = trans.time;
		hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	}
	hammer_done_transaction(&trans);
	return (error);
}
309 
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file via the buffer cache.  The main loop applies
 * backpressure against both the buffer cache (bwillwrite) and HAMMER's
 * in-memory record reservation before instantiating each buffer, then
 * chooses one of several buffer acquisition strategies depending on
 * whether the write fully or partially overlaps the block.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;		/* byte offset within the current block */
	off_t base_offset;	/* block-aligned offset of the current block */
	struct buf *bp;
	int kflags;		/* accumulated kqueue notification flags */
	int error;
	int n;			/* bytes to copy into the current block */
	int flags;		/* hammer_modify_inode() flags */
	int delta;
	int seqcount;

	/* Only regular files are writable through this path. */
	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	/* Read-only inodes (snapshot views) cannot be written. */
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;	/* set when we grew the VM object size */
		int blksize;
		int blkmask;

		/* Abort the write if the volume is running out of space. */
		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.   Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster then the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			/* Extending the file: grow the VM object first. */
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		/* The copy succeeded; commit any file size extension. */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
555 
556 /*
557  * hammer_vop_access { vp, mode, cred }
558  */
559 static
560 int
561 hammer_vop_access(struct vop_access_args *ap)
562 {
563 	struct hammer_inode *ip = VTOI(ap->a_vp);
564 	uid_t uid;
565 	gid_t gid;
566 	int error;
567 
568 	++hammer_stats_file_iopsr;
569 	uid = hammer_to_unix_xid(&ip->ino_data.uid);
570 	gid = hammer_to_unix_xid(&ip->ino_data.gid);
571 
572 	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
573 				  ip->ino_data.uflags);
574 	return (error);
575 }
576 
577 /*
578  * hammer_vop_advlock { vp, id, op, fl, flags }
579  */
580 static
581 int
582 hammer_vop_advlock(struct vop_advlock_args *ap)
583 {
584 	hammer_inode_t ip = VTOI(ap->a_vp);
585 
586 	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
587 }
588 
/*
 * hammer_vop_close { vp, fflag }
 *
 * Nothing HAMMER-specific to do on close; use the standard close path.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	return (vop_stdclose(ap));
}
599 
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 *
 * On success *a_vpp is set to the new vnode and the namecache entry is
 * resolved; on failure *a_vpp is NULL and the error is returned.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/* Cannot create in a read-only (snapshot) directory. */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	/* Make sure there is enough free space before starting. */
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.  Either path drops the inode reference and closes
	 * the transaction.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			/* Resolve the namecache entry to the new vnode. */
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	return (error);
}
674 
/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			    vap->va_size = 26;
		    else
			    vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	/*
	 * Round the reported byte count up to the allocation granularity:
	 * big-blocks past the demarc, buffers for mid-size files, and
	 * 16-byte chunks for very small files.
	 */
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0; 	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	/* Device nodes additionally report their major/minor numbers. */
	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	return(0);
}
780 
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.  Also parses "@@<tid>" as-of
 * extensions embedded in the name, which select a historical view or
 * dive into a PFS.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;		/* name length up to any "@@" extension */
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan for a "@@" extension.  nc_name[i+1] is safe because
	 * namecache names are NUL terminated at nc_nlen.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			/* A historical view is implicitly read-only. */
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name, (long long)obj_id);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* Negative cache hit: remember the name does not exist. */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
975 
976 /*
977  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
978  *
979  * Locate the parent directory of a directory vnode.
980  *
981  * dvp is referenced but not locked.  *vpp must be returned referenced and
982  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
983  * at the root, instead it could indicate that the directory we were in was
984  * removed.
985  *
986  * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
988  * the same directory with the mount point's asof.   I'm not sure what this
989  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
990  * get confused, but it hasn't been tested.
991  */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	/*
	 * A parent_obj_id of 0 means we are either at the true root or the
	 * directory we were in was removed.  The root of an as-of access
	 * whose asof differs from the mount's is special-cased: reload the
	 * same directory with the mount's asof and synthesize a fake
	 * "0x%016llx" name component for the namecache.
	 */
	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			/* 18 characters for "0x%016llx" plus the NUL */
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Resolve the parent inode and return a referenced, locked vnode
	 * for it via *ap->a_vpp (hammer_get_vnode supplies the lock/ref).
	 */
	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
1046 
1047 /*
1048  * hammer_vop_nlink { nch, dvp, vp, cred }
1049  */
1050 static
1051 int
1052 hammer_vop_nlink(struct vop_nlink_args *ap)
1053 {
1054 	struct hammer_transaction trans;
1055 	struct hammer_inode *dip;
1056 	struct hammer_inode *ip;
1057 	struct nchandle *nch;
1058 	int error;
1059 
1060 	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1061 		return(EXDEV);
1062 
1063 	nch = ap->a_nch;
1064 	dip = VTOI(ap->a_dvp);
1065 	ip = VTOI(ap->a_vp);
1066 
1067 	if (dip->obj_localization != ip->obj_localization)
1068 		return(EXDEV);
1069 
1070 	if (dip->flags & HAMMER_INODE_RO)
1071 		return (EROFS);
1072 	if (ip->flags & HAMMER_INODE_RO)
1073 		return (EROFS);
1074 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1075 		return (error);
1076 
1077 	/*
1078 	 * Create a transaction to cover the operations we perform.
1079 	 */
1080 	hammer_start_transaction(&trans, dip->hmp);
1081 	++hammer_stats_file_iopsw;
1082 
1083 	/*
1084 	 * Add the filesystem object to the directory.  Note that neither
1085 	 * dip nor ip are referenced or locked, but their vnodes are
1086 	 * referenced.  This function will bump the inode's link count.
1087 	 */
1088 	error = hammer_ip_add_directory(&trans, dip,
1089 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1090 					ip);
1091 
1092 	/*
1093 	 * Finish up.
1094 	 */
1095 	if (error == 0) {
1096 		cache_setunresolved(nch);
1097 		cache_setvp(nch, ap->a_vp);
1098 	}
1099 	hammer_done_transaction(&trans);
1100 	hammer_knote(ap->a_vp, NOTE_LINK);
1101 	hammer_knote(ap->a_dvp, NOTE_WRITE);
1102 	return (error);
1103 }
1104 
1105 /*
1106  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1107  *
1108  * The operating system has already ensured that the directory entry
1109  * does not exist and done all appropriate namespace locking.
1110  */
1111 static
1112 int
1113 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1114 {
1115 	struct hammer_transaction trans;
1116 	struct hammer_inode *dip;
1117 	struct hammer_inode *nip;
1118 	struct nchandle *nch;
1119 	int error;
1120 
1121 	nch = ap->a_nch;
1122 	dip = VTOI(ap->a_dvp);
1123 
1124 	if (dip->flags & HAMMER_INODE_RO)
1125 		return (EROFS);
1126 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1127 		return (error);
1128 
1129 	/*
1130 	 * Create a transaction to cover the operations we perform.
1131 	 */
1132 	hammer_start_transaction(&trans, dip->hmp);
1133 	++hammer_stats_file_iopsw;
1134 
1135 	/*
1136 	 * Create a new filesystem object of the requested type.  The
1137 	 * returned inode will be referenced but not locked.
1138 	 */
1139 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1140 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1141 				    NULL, &nip);
1142 	if (error) {
1143 		hkprintf("hammer_mkdir error %d\n", error);
1144 		hammer_done_transaction(&trans);
1145 		*ap->a_vpp = NULL;
1146 		return (error);
1147 	}
1148 	/*
1149 	 * Add the new filesystem object to the directory.  This will also
1150 	 * bump the inode's link count.
1151 	 */
1152 	error = hammer_ip_add_directory(&trans, dip,
1153 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1154 					nip);
1155 	if (error)
1156 		hkprintf("hammer_mkdir (add) error %d\n", error);
1157 
1158 	/*
1159 	 * Finish up.
1160 	 */
1161 	if (error) {
1162 		hammer_rel_inode(nip, 0);
1163 		*ap->a_vpp = NULL;
1164 	} else {
1165 		error = hammer_get_vnode(nip, ap->a_vpp);
1166 		hammer_rel_inode(nip, 0);
1167 		if (error == 0) {
1168 			cache_setunresolved(ap->a_nch);
1169 			cache_setvp(ap->a_nch, *ap->a_vpp);
1170 		}
1171 	}
1172 	hammer_done_transaction(&trans);
1173 	if (error == 0)
1174 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1175 	return (error);
1176 }
1177 
1178 /*
1179  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1180  *
1181  * The operating system has already ensured that the directory entry
1182  * does not exist and done all appropriate namespace locking.
1183  */
1184 static
1185 int
1186 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1187 {
1188 	struct hammer_transaction trans;
1189 	struct hammer_inode *dip;
1190 	struct hammer_inode *nip;
1191 	struct nchandle *nch;
1192 	int error;
1193 
1194 	nch = ap->a_nch;
1195 	dip = VTOI(ap->a_dvp);
1196 
1197 	if (dip->flags & HAMMER_INODE_RO)
1198 		return (EROFS);
1199 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1200 		return (error);
1201 
1202 	/*
1203 	 * Create a transaction to cover the operations we perform.
1204 	 */
1205 	hammer_start_transaction(&trans, dip->hmp);
1206 	++hammer_stats_file_iopsw;
1207 
1208 	/*
1209 	 * Create a new filesystem object of the requested type.  The
1210 	 * returned inode will be referenced but not locked.
1211 	 *
1212 	 * If mknod specifies a directory a pseudo-fs is created.
1213 	 */
1214 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1215 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1216 				    NULL, &nip);
1217 	if (error) {
1218 		hammer_done_transaction(&trans);
1219 		*ap->a_vpp = NULL;
1220 		return (error);
1221 	}
1222 
1223 	/*
1224 	 * Add the new filesystem object to the directory.  This will also
1225 	 * bump the inode's link count.
1226 	 */
1227 	error = hammer_ip_add_directory(&trans, dip,
1228 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1229 					nip);
1230 
1231 	/*
1232 	 * Finish up.
1233 	 */
1234 	if (error) {
1235 		hammer_rel_inode(nip, 0);
1236 		*ap->a_vpp = NULL;
1237 	} else {
1238 		error = hammer_get_vnode(nip, ap->a_vpp);
1239 		hammer_rel_inode(nip, 0);
1240 		if (error == 0) {
1241 			cache_setunresolved(ap->a_nch);
1242 			cache_setvp(ap->a_nch, *ap->a_vpp);
1243 		}
1244 	}
1245 	hammer_done_transaction(&trans);
1246 	if (error == 0)
1247 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1248 	return (error);
1249 }
1250 
1251 /*
1252  * hammer_vop_open { vp, mode, cred, fp }
1253  */
1254 static
1255 int
1256 hammer_vop_open(struct vop_open_args *ap)
1257 {
1258 	hammer_inode_t ip;
1259 
1260 	++hammer_stats_file_iopsr;
1261 	ip = VTOI(ap->a_vp);
1262 
1263 	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1264 		return (EROFS);
1265 	return(vop_stdopen(ap));
1266 }
1267 
1268 /*
1269  * hammer_vop_print { vp }
1270  */
1271 static
1272 int
1273 hammer_vop_print(struct vop_print_args *ap)
1274 {
1275 	return EOPNOTSUPP;
1276 }
1277 
1278 /*
1279  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1280  */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;	/* next free slot in cookies[] */
	int ncookies;		/* capacity of cookies[], -1 if unused */
	off_t *cookies;
	off_t saveoff;		/* directory key acting as seek position */
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Optionally allocate a seek-cookie array for the caller (NFS).
	 * The sizing heuristic assumes roughly 16 bytes per entry and
	 * caps the array at 1024 cookies per call.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries ("." at offset 0, ".." at offset 1).
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less than that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* A zero parent_obj_id makes ".." point at ourselves */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		/* track the current key so uio_offset allows a restart */
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/* ENOENT from the scan simply means end-of-directory */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			/* ownership of cookies[] transfers to the caller */
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1432 
1433 /*
1434  * hammer_vop_readlink { vp, uio, cred }
1435  */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		/*
		 * "@@PFS%05d" is exactly 10 bytes.  Expansion is gated on
		 * a current (HAMMER_MAX_TID) access from localization 0.
		 */
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version: the target did not fit in the inode, so it lives
	 * in a separate HAMMER_RECTYPE_FIX record keyed by
	 * HAMMER_FIXKEY_SYMLINK.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			/* link data is not NUL-terminated; copy by length */
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
1542 
1543 /*
1544  * hammer_vop_nremove { nch, dvp, cred }
1545  */
1546 static
1547 int
1548 hammer_vop_nremove(struct vop_nremove_args *ap)
1549 {
1550 	struct hammer_transaction trans;
1551 	struct hammer_inode *dip;
1552 	int error;
1553 
1554 	dip = VTOI(ap->a_dvp);
1555 
1556 	if (hammer_nohistory(dip) == 0 &&
1557 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1558 		return (error);
1559 	}
1560 
1561 	hammer_start_transaction(&trans, dip->hmp);
1562 	++hammer_stats_file_iopsw;
1563 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1564 	hammer_done_transaction(&trans);
1565 	if (error == 0)
1566 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1567 	return (error);
1568 }
1569 
1570 /*
1571  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1572  */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;		/* source name */
	struct namecache *tncp;		/* target name */
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* inode being renamed */
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/* Renames may not cross mounts... */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	/* ...nor PFS localization domains */
	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
        hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		if (ip->vp)
			hammer_knote(ip->vp, NOTE_RENAME);
	}

failed:
	hammer_done_transaction(&trans);
	return (error);
}
1721 
1722 /*
1723  * hammer_vop_nrmdir { nch, dvp, cred }
1724  */
1725 static
1726 int
1727 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1728 {
1729 	struct hammer_transaction trans;
1730 	struct hammer_inode *dip;
1731 	int error;
1732 
1733 	dip = VTOI(ap->a_dvp);
1734 
1735 	if (hammer_nohistory(dip) == 0 &&
1736 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1737 		return (error);
1738 	}
1739 
1740 	hammer_start_transaction(&trans, dip->hmp);
1741 	++hammer_stats_file_iopsw;
1742 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1743 	hammer_done_transaction(&trans);
1744 	if (error == 0)
1745 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1746 	return (error);
1747 }
1748 
1749 /*
1750  * hammer_vop_markatime { vp, cred }
1751  */
1752 static
1753 int
1754 hammer_vop_markatime(struct vop_markatime_args *ap)
1755 {
1756 	struct hammer_transaction trans;
1757 	struct hammer_inode *ip;
1758 
1759 	ip = VTOI(ap->a_vp);
1760 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1761 		return (EROFS);
1762 	if (ip->flags & HAMMER_INODE_RO)
1763 		return (EROFS);
1764 	if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1765 		return (0);
1766 	hammer_start_transaction(&trans, ip->hmp);
1767 	++hammer_stats_file_iopsw;
1768 
1769 	ip->ino_data.atime = trans.time;
1770 	hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1771 	hammer_done_transaction(&trans);
1772 	hammer_knote(ap->a_vp, NOTE_ATTRIB);
1773 	return (0);
1774 }
1775 
1776 /*
1777  * hammer_vop_setattr { vp, vap, cred }
1778  */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* HAMMER_INODE_* dirty bits accumulated */
	int error;
	int truncating;
	int blksize;
	int kflags;		/* NOTE_* kqueue bits accumulated */
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * chflags.  A flags change is exclusive of the other attribute
	 * changes (note the unconditional goto done at the end).
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/* Any other change to an immutable/append-only file is refused */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/*
	 * chown/chgrp.  uid/gid are stored as uuids on-media; only dirty
	 * the inode when one of uid, gid or mode actually changed.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * truncate/extend.  The while is single-pass -- the break at the
	 * bottom always exits it -- and exists so case code can 'break'
	 * out of the switch without falling into the remaining logic.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.  Only the smallest
			 * pending trunc_off is retained.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n",
						(long long)vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	/*
	 * chmod
	 */
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
2000 
2001 /*
2002  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2003  */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 *
	 * Targets short enough to fit are embedded directly in the inode's
	 * ext.symlink area; longer targets get a separate
	 * HAMMER_RECTYPE_FIX record.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			/* record ownership passes to hammer_ip_add_record */
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up: return a vnode on success, drop the inode on failure.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}
2099 
2100 /*
2101  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2102  */
2103 static
2104 int
2105 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2106 {
2107 	struct hammer_transaction trans;
2108 	struct hammer_inode *dip;
2109 	int error;
2110 
2111 	dip = VTOI(ap->a_dvp);
2112 
2113 	if (hammer_nohistory(dip) == 0 &&
2114 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2115 		return (error);
2116 	}
2117 
2118 	hammer_start_transaction(&trans, dip->hmp);
2119 	++hammer_stats_file_iopsw;
2120 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2121 				ap->a_cred, ap->a_flags, -1);
2122 	hammer_done_transaction(&trans);
2123 
2124 	return (error);
2125 }
2126 
2127 /*
2128  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2129  */
2130 static
2131 int
2132 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2133 {
2134 	struct hammer_inode *ip = ap->a_vp->v_data;
2135 
2136 	++hammer_stats_file_iopsr;
2137 	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2138 			    ap->a_fflag, ap->a_cred));
2139 }
2140 
2141 static
2142 int
2143 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2144 {
2145 	struct mount *mp;
2146 	int error;
2147 
2148 	mp = ap->a_head.a_ops->head.vv_mount;
2149 
2150 	switch(ap->a_op) {
2151 	case MOUNTCTL_SET_EXPORT:
2152 		if (ap->a_ctllen != sizeof(struct export_args))
2153 			error = EINVAL;
2154 		else
2155 			error = hammer_vfs_export(mp, ap->a_op,
2156 				      (const struct export_args *)ap->a_ctl);
2157 		break;
2158 	default:
2159 		error = journal_mountctl(ap);
2160 		break;
2161 	}
2162 	return(error);
2163 }
2164 
2165 /*
2166  * hammer_vop_strategy { vp, bio }
2167  *
2168  * Strategy call, used for regular file read & write only.  Note that the
2169  * bp may represent a cluster.
2170  *
2171  * To simplify operation and allow better optimizations in the future,
2172  * this code does not make any assumptions with regards to buffer alignment
2173  * or size.
2174  */
2175 static
2176 int
2177 hammer_vop_strategy(struct vop_strategy_args *ap)
2178 {
2179 	struct buf *bp;
2180 	int error;
2181 
2182 	bp = ap->a_bio->bio_buf;
2183 
2184 	switch(bp->b_cmd) {
2185 	case BUF_CMD_READ:
2186 		error = hammer_vop_strategy_read(ap);
2187 		break;
2188 	case BUF_CMD_WRITE:
2189 		error = hammer_vop_strategy_write(ap);
2190 		break;
2191 	default:
2192 		bp->b_error = error = EINVAL;
2193 		bp->b_flags |= B_ERROR;
2194 		biodone(ap->a_bio);
2195 		break;
2196 	}
2197 	return (error);
2198 }
2199 
2200 /*
2201  * Read from a regular file.  Iterate the related records and fill in the
2202  * BIO/BUF.  Gaps are zero-filled.
2203  *
2204  * The support code in hammer_object.c should be used to deal with mixed
2205  * in-memory and on-disk records.
2206  *
2207  * NOTE: Can be called from the cluster code with an oversized buf.
2208  *
2209  * XXX atime update
2210  */
2211 static
2212 int
2213 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2214 {
2215 	struct hammer_transaction trans;
2216 	struct hammer_inode *ip;
2217 	struct hammer_inode *dip;
2218 	struct hammer_cursor cursor;
2219 	hammer_base_elm_t base;
2220 	hammer_off_t disk_offset;
2221 	struct bio *bio;
2222 	struct bio *nbio;
2223 	struct buf *bp;
2224 	int64_t rec_offset;
2225 	int64_t ran_end;
2226 	int64_t tmp64;
2227 	int error;
2228 	int boff;
2229 	int roff;
2230 	int n;
2231 
2232 	bio = ap->a_bio;
2233 	bp = bio->bio_buf;
2234 	ip = ap->a_vp->v_data;
2235 
2236 	/*
2237 	 * The zone-2 disk offset may have been set by the cluster code via
2238 	 * a BMAP operation, or else should be NOOFFSET.
2239 	 *
2240 	 * Checking the high bits for a match against zone-2 should suffice.
2241 	 */
2242 	nbio = push_bio(bio);
2243 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2244 	    HAMMER_ZONE_LARGE_DATA) {
2245 		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2246 		return (error);
2247 	}
2248 
2249 	/*
2250 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2251 	 * aligned we may still be able to issue a direct-read.
2252 	 */
2253 	hammer_simple_transaction(&trans, ip->hmp);
2254 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2255 
2256 	/*
2257 	 * Key range (begin and end inclusive) to scan.  Note that the key's
2258 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2259 	 * first record containing bio_offset will have a key > bio_offset.
2260 	 */
2261 	cursor.key_beg.localization = ip->obj_localization +
2262 				      HAMMER_LOCALIZE_MISC;
2263 	cursor.key_beg.obj_id = ip->obj_id;
2264 	cursor.key_beg.create_tid = 0;
2265 	cursor.key_beg.delete_tid = 0;
2266 	cursor.key_beg.obj_type = 0;
2267 	cursor.key_beg.key = bio->bio_offset + 1;
2268 	cursor.asof = ip->obj_asof;
2269 	cursor.flags |= HAMMER_CURSOR_ASOF;
2270 
2271 	cursor.key_end = cursor.key_beg;
2272 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2273 #if 0
2274 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2275 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2276 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2277 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2278 	} else
2279 #endif
2280 	{
2281 		ran_end = bio->bio_offset + bp->b_bufsize;
2282 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2283 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2284 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2285 		if (tmp64 < ran_end)
2286 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2287 		else
2288 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2289 	}
2290 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2291 
2292 	error = hammer_ip_first(&cursor);
2293 	boff = 0;
2294 
2295 	while (error == 0) {
2296 		/*
2297 		 * Get the base file offset of the record.  The key for
2298 		 * data records is (base + bytes) rather then (base).
2299 		 */
2300 		base = &cursor.leaf->base;
2301 		rec_offset = base->key - cursor.leaf->data_len;
2302 
2303 		/*
2304 		 * Calculate the gap, if any, and zero-fill it.
2305 		 *
2306 		 * n is the offset of the start of the record verses our
2307 		 * current seek offset in the bio.
2308 		 */
2309 		n = (int)(rec_offset - (bio->bio_offset + boff));
2310 		if (n > 0) {
2311 			if (n > bp->b_bufsize - boff)
2312 				n = bp->b_bufsize - boff;
2313 			bzero((char *)bp->b_data + boff, n);
2314 			boff += n;
2315 			n = 0;
2316 		}
2317 
2318 		/*
2319 		 * Calculate the data offset in the record and the number
2320 		 * of bytes we can copy.
2321 		 *
2322 		 * There are two degenerate cases.  First, boff may already
2323 		 * be at bp->b_bufsize.  Secondly, the data offset within
2324 		 * the record may exceed the record's size.
2325 		 */
2326 		roff = -n;
2327 		rec_offset += roff;
2328 		n = cursor.leaf->data_len - roff;
2329 		if (n <= 0) {
2330 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2331 			n = 0;
2332 		} else if (n > bp->b_bufsize - boff) {
2333 			n = bp->b_bufsize - boff;
2334 		}
2335 
2336 		/*
2337 		 * Deal with cached truncations.  This cool bit of code
2338 		 * allows truncate()/ftruncate() to avoid having to sync
2339 		 * the file.
2340 		 *
2341 		 * If the frontend is truncated then all backend records are
2342 		 * subject to the frontend's truncation.
2343 		 *
2344 		 * If the backend is truncated then backend records on-disk
2345 		 * (but not in-memory) are subject to the backend's
2346 		 * truncation.  In-memory records owned by the backend
2347 		 * represent data written after the truncation point on the
2348 		 * backend and must not be truncated.
2349 		 *
2350 		 * Truncate operations deal with frontend buffer cache
2351 		 * buffers and frontend-owned in-memory records synchronously.
2352 		 */
2353 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2354 			if (hammer_cursor_ondisk(&cursor) ||
2355 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2356 				if (ip->trunc_off <= rec_offset)
2357 					n = 0;
2358 				else if (ip->trunc_off < rec_offset + n)
2359 					n = (int)(ip->trunc_off - rec_offset);
2360 			}
2361 		}
2362 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2363 			if (hammer_cursor_ondisk(&cursor)) {
2364 				if (ip->sync_trunc_off <= rec_offset)
2365 					n = 0;
2366 				else if (ip->sync_trunc_off < rec_offset + n)
2367 					n = (int)(ip->sync_trunc_off - rec_offset);
2368 			}
2369 		}
2370 
2371 		/*
2372 		 * Try to issue a direct read into our bio if possible,
2373 		 * otherwise resolve the element data into a hammer_buffer
2374 		 * and copy.
2375 		 *
2376 		 * The buffer on-disk should be zerod past any real
2377 		 * truncation point, but may not be for any synthesized
2378 		 * truncation point from above.
2379 		 */
2380 		disk_offset = cursor.leaf->data_offset + roff;
2381 		if (boff == 0 && n == bp->b_bufsize &&
2382 		    hammer_cursor_ondisk(&cursor) &&
2383 		    (disk_offset & HAMMER_BUFMASK) == 0) {
2384 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2385 				 HAMMER_ZONE_LARGE_DATA);
2386 			nbio->bio_offset = disk_offset;
2387 			error = hammer_io_direct_read(trans.hmp, nbio,
2388 						      cursor.leaf);
2389 			goto done;
2390 		} else if (n) {
2391 			error = hammer_ip_resolve_data(&cursor);
2392 			if (error == 0) {
2393 				bcopy((char *)cursor.data + roff,
2394 				      (char *)bp->b_data + boff, n);
2395 			}
2396 		}
2397 		if (error)
2398 			break;
2399 
2400 		/*
2401 		 * Iterate until we have filled the request.
2402 		 */
2403 		boff += n;
2404 		if (boff == bp->b_bufsize)
2405 			break;
2406 		error = hammer_ip_next(&cursor);
2407 	}
2408 
2409 	/*
2410 	 * There may have been a gap after the last record
2411 	 */
2412 	if (error == ENOENT)
2413 		error = 0;
2414 	if (error == 0 && boff != bp->b_bufsize) {
2415 		KKASSERT(boff < bp->b_bufsize);
2416 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2417 		/* boff = bp->b_bufsize; */
2418 	}
2419 	bp->b_resid = 0;
2420 	bp->b_error = error;
2421 	if (error)
2422 		bp->b_flags |= B_ERROR;
2423 	biodone(ap->a_bio);
2424 
2425 done:
2426 	/*
2427 	 * Cache the b-tree node for the last data read in cache[1].
2428 	 *
2429 	 * If we hit the file EOF then also cache the node in the
2430 	 * governing director's cache[3], it will be used to initialize
2431 	 * the inode's cache[1] for any inodes looked up via the directory.
2432 	 *
2433 	 * This doesn't reduce disk accesses since the B-Tree chain is
2434 	 * likely cached, but it does reduce cpu overhead when looking
2435 	 * up file offsets for cpdup/tar/cpio style iterations.
2436 	 */
2437 	if (cursor.node)
2438 		hammer_cache_node(&ip->cache[1], cursor.node);
2439 	if (ran_end >= ip->ino_data.size) {
2440 		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2441 					ip->obj_asof, ip->obj_localization);
2442 		if (dip) {
2443 			hammer_cache_node(&dip->cache[3], cursor.node);
2444 			hammer_rel_inode(dip, 0);
2445 		}
2446 	}
2447 	hammer_done_cursor(&cursor);
2448 	hammer_done_transaction(&trans);
2449 	return(error);
2450 }
2451 
2452 /*
2453  * BMAP operation - used to support cluster_read() only.
2454  *
2455  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2456  *
2457  * This routine may return EOPNOTSUPP if the opration is not supported for
2458  * the specified offset.  The contents of the pointer arguments do not
2459  * need to be initialized in that case.
2460  *
2461  * If a disk address is available and properly aligned return 0 with
2462  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2463  * to the run-length relative to that offset.  Callers may assume that
2464  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2465  * large, so return EOPNOTSUPP if it is not sufficiently large.
2466  */
2467 static
2468 int
2469 hammer_vop_bmap(struct vop_bmap_args *ap)
2470 {
2471 	struct hammer_transaction trans;
2472 	struct hammer_inode *ip;
2473 	struct hammer_cursor cursor;
2474 	hammer_base_elm_t base;
2475 	int64_t rec_offset;
2476 	int64_t ran_end;
2477 	int64_t tmp64;
2478 	int64_t base_offset;
2479 	int64_t base_disk_offset;
2480 	int64_t last_offset;
2481 	hammer_off_t last_disk_offset;
2482 	hammer_off_t disk_offset;
2483 	int	rec_len;
2484 	int	error;
2485 	int	blksize;
2486 
2487 	++hammer_stats_file_iopsr;
2488 	ip = ap->a_vp->v_data;
2489 
2490 	/*
2491 	 * We can only BMAP regular files.  We can't BMAP database files,
2492 	 * directories, etc.
2493 	 */
2494 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2495 		return(EOPNOTSUPP);
2496 
2497 	/*
2498 	 * bmap is typically called with runp/runb both NULL when used
2499 	 * for writing.  We do not support BMAP for writing atm.
2500 	 */
2501 	if (ap->a_cmd != BUF_CMD_READ)
2502 		return(EOPNOTSUPP);
2503 
2504 	/*
2505 	 * Scan the B-Tree to acquire blockmap addresses, then translate
2506 	 * to raw addresses.
2507 	 */
2508 	hammer_simple_transaction(&trans, ip->hmp);
2509 #if 0
2510 	kprintf("bmap_beg %016llx ip->cache %p\n",
2511 		(long long)ap->a_loffset, ip->cache[1]);
2512 #endif
2513 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2514 
2515 	/*
2516 	 * Key range (begin and end inclusive) to scan.  Note that the key's
2517 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2518 	 * first record containing bio_offset will have a key > bio_offset.
2519 	 */
2520 	cursor.key_beg.localization = ip->obj_localization +
2521 				      HAMMER_LOCALIZE_MISC;
2522 	cursor.key_beg.obj_id = ip->obj_id;
2523 	cursor.key_beg.create_tid = 0;
2524 	cursor.key_beg.delete_tid = 0;
2525 	cursor.key_beg.obj_type = 0;
2526 	if (ap->a_runb)
2527 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2528 	else
2529 		cursor.key_beg.key = ap->a_loffset + 1;
2530 	if (cursor.key_beg.key < 0)
2531 		cursor.key_beg.key = 0;
2532 	cursor.asof = ip->obj_asof;
2533 	cursor.flags |= HAMMER_CURSOR_ASOF;
2534 
2535 	cursor.key_end = cursor.key_beg;
2536 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2537 
2538 	ran_end = ap->a_loffset + MAXPHYS;
2539 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2540 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2541 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2542 	if (tmp64 < ran_end)
2543 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2544 	else
2545 		cursor.key_end.key = ran_end + MAXPHYS + 1;
2546 
2547 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2548 
2549 	error = hammer_ip_first(&cursor);
2550 	base_offset = last_offset = 0;
2551 	base_disk_offset = last_disk_offset = 0;
2552 
2553 	while (error == 0) {
2554 		/*
2555 		 * Get the base file offset of the record.  The key for
2556 		 * data records is (base + bytes) rather then (base).
2557 		 *
2558 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2559 		 * The extra bytes should be zero on-disk and the BMAP op
2560 		 * should still be ok.
2561 		 */
2562 		base = &cursor.leaf->base;
2563 		rec_offset = base->key - cursor.leaf->data_len;
2564 		rec_len    = cursor.leaf->data_len;
2565 
2566 		/*
2567 		 * Incorporate any cached truncation.
2568 		 *
2569 		 * NOTE: Modifications to rec_len based on synthesized
2570 		 * truncation points remove the guarantee that any extended
2571 		 * data on disk is zero (since the truncations may not have
2572 		 * taken place on-media yet).
2573 		 */
2574 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2575 			if (hammer_cursor_ondisk(&cursor) ||
2576 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2577 				if (ip->trunc_off <= rec_offset)
2578 					rec_len = 0;
2579 				else if (ip->trunc_off < rec_offset + rec_len)
2580 					rec_len = (int)(ip->trunc_off - rec_offset);
2581 			}
2582 		}
2583 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2584 			if (hammer_cursor_ondisk(&cursor)) {
2585 				if (ip->sync_trunc_off <= rec_offset)
2586 					rec_len = 0;
2587 				else if (ip->sync_trunc_off < rec_offset + rec_len)
2588 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
2589 			}
2590 		}
2591 
2592 		/*
2593 		 * Accumulate information.  If we have hit a discontiguous
2594 		 * block reset base_offset unless we are already beyond the
2595 		 * requested offset.  If we are, that's it, we stop.
2596 		 */
2597 		if (error)
2598 			break;
2599 		if (hammer_cursor_ondisk(&cursor)) {
2600 			disk_offset = cursor.leaf->data_offset;
2601 			if (rec_offset != last_offset ||
2602 			    disk_offset != last_disk_offset) {
2603 				if (rec_offset > ap->a_loffset)
2604 					break;
2605 				base_offset = rec_offset;
2606 				base_disk_offset = disk_offset;
2607 			}
2608 			last_offset = rec_offset + rec_len;
2609 			last_disk_offset = disk_offset + rec_len;
2610 		}
2611 		error = hammer_ip_next(&cursor);
2612 	}
2613 
2614 #if 0
2615 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
2616 		(long long)ap->a_loffset,
2617 		(long long)base_offset,
2618 		(long long)last_offset);
2619 	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
2620 		(long long)base_disk_offset,
2621 		(long long)last_disk_offset);
2622 #endif
2623 
2624 	if (cursor.node) {
2625 		hammer_cache_node(&ip->cache[1], cursor.node);
2626 #if 0
2627 		kprintf("bmap_end2 %016llx ip->cache %p\n",
2628 			(long long)ap->a_loffset, ip->cache[1]);
2629 #endif
2630 	}
2631 	hammer_done_cursor(&cursor);
2632 	hammer_done_transaction(&trans);
2633 
2634 	/*
2635 	 * If we couldn't find any records or the records we did find were
2636 	 * all behind the requested offset, return failure.  A forward
2637 	 * truncation can leave a hole w/ no on-disk records.
2638 	 */
2639 	if (last_offset == 0 || last_offset < ap->a_loffset)
2640 		return (EOPNOTSUPP);
2641 
2642 	/*
2643 	 * Figure out the block size at the requested offset and adjust
2644 	 * our limits so the cluster_read() does not create inappropriately
2645 	 * sized buffer cache buffers.
2646 	 */
2647 	blksize = hammer_blocksize(ap->a_loffset);
2648 	if (hammer_blocksize(base_offset) != blksize) {
2649 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2650 	}
2651 	if (last_offset != ap->a_loffset &&
2652 	    hammer_blocksize(last_offset - 1) != blksize) {
2653 		last_offset = hammer_blockdemarc(ap->a_loffset,
2654 						 last_offset - 1);
2655 	}
2656 
2657 	/*
2658 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2659 	 * from occuring.
2660 	 */
2661 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2662 
2663 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2664 		/*
2665 		 * Only large-data zones can be direct-IOd
2666 		 */
2667 		error = EOPNOTSUPP;
2668 	} else if ((disk_offset & HAMMER_BUFMASK) ||
2669 		   (last_offset - ap->a_loffset) < blksize) {
2670 		/*
2671 		 * doffsetp is not aligned or the forward run size does
2672 		 * not cover a whole buffer, disallow the direct I/O.
2673 		 */
2674 		error = EOPNOTSUPP;
2675 	} else {
2676 		/*
2677 		 * We're good.
2678 		 */
2679 		*ap->a_doffsetp = disk_offset;
2680 		if (ap->a_runb) {
2681 			*ap->a_runb = ap->a_loffset - base_offset;
2682 			KKASSERT(*ap->a_runb >= 0);
2683 		}
2684 		if (ap->a_runp) {
2685 			*ap->a_runp = last_offset - ap->a_loffset;
2686 			KKASSERT(*ap->a_runp >= 0);
2687 		}
2688 		error = 0;
2689 	}
2690 	return(error);
2691 }
2692 
2693 /*
2694  * Write to a regular file.   Because this is a strategy call the OS is
2695  * trying to actually get data onto the media.
2696  */
2697 static
2698 int
2699 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2700 {
2701 	hammer_record_t record;
2702 	hammer_mount_t hmp;
2703 	hammer_inode_t ip;
2704 	struct bio *bio;
2705 	struct buf *bp;
2706 	int blksize;
2707 	int bytes;
2708 	int error;
2709 
2710 	bio = ap->a_bio;
2711 	bp = bio->bio_buf;
2712 	ip = ap->a_vp->v_data;
2713 	hmp = ip->hmp;
2714 
2715 	blksize = hammer_blocksize(bio->bio_offset);
2716 	KKASSERT(bp->b_bufsize == blksize);
2717 
2718 	if (ip->flags & HAMMER_INODE_RO) {
2719 		bp->b_error = EROFS;
2720 		bp->b_flags |= B_ERROR;
2721 		biodone(ap->a_bio);
2722 		return(EROFS);
2723 	}
2724 
2725 	/*
2726 	 * Interlock with inode destruction (no in-kernel or directory
2727 	 * topology visibility).  If we queue new IO while trying to
2728 	 * destroy the inode we can deadlock the vtrunc call in
2729 	 * hammer_inode_unloadable_check().
2730 	 *
2731 	 * Besides, there's no point flushing a bp associated with an
2732 	 * inode that is being destroyed on-media and has no kernel
2733 	 * references.
2734 	 */
2735 	if ((ip->flags | ip->sync_flags) &
2736 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2737 		bp->b_resid = 0;
2738 		biodone(ap->a_bio);
2739 		return(0);
2740 	}
2741 
2742 	/*
2743 	 * Reserve space and issue a direct-write from the front-end.
2744 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2745 	 * allocations.
2746 	 *
2747 	 * An in-memory record will be installed to reference the storage
2748 	 * until the flusher can get to it.
2749 	 *
2750 	 * Since we own the high level bio the front-end will not try to
2751 	 * do a direct-read until the write completes.
2752 	 *
2753 	 * NOTE: The only time we do not reserve a full-sized buffers
2754 	 * worth of data is if the file is small.  We do not try to
2755 	 * allocate a fragment (from the small-data zone) at the end of
2756 	 * an otherwise large file as this can lead to wildly separated
2757 	 * data.
2758 	 */
2759 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2760 	KKASSERT(bio->bio_offset < ip->ino_data.size);
2761 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2762 		bytes = bp->b_bufsize;
2763 	else
2764 		bytes = ((int)ip->ino_data.size + 15) & ~15;
2765 
2766 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2767 				    bytes, &error);
2768 	if (record) {
2769 		hammer_io_direct_write(hmp, record, bio);
2770 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2771 			hammer_flush_inode(ip, 0);
2772 	} else {
2773 		bp->b_bio2.bio_offset = NOOFFSET;
2774 		bp->b_error = error;
2775 		bp->b_flags |= B_ERROR;
2776 		biodone(ap->a_bio);
2777 	}
2778 	return(error);
2779 }
2780 
2781 /*
2782  * dounlink - disconnect a directory entry
2783  *
2784  * XXX whiteout support not really in yet
2785  */
2786 static int
2787 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2788 		struct vnode *dvp, struct ucred *cred,
2789 		int flags, int isdir)
2790 {
2791 	struct namecache *ncp;
2792 	hammer_inode_t dip;
2793 	hammer_inode_t ip;
2794 	struct hammer_cursor cursor;
2795 	int64_t namekey;
2796 	u_int32_t max_iterations;
2797 	int nlen, error;
2798 
2799 	/*
2800 	 * Calculate the namekey and setup the key range for the scan.  This
2801 	 * works kinda like a chained hash table where the lower 32 bits
2802 	 * of the namekey synthesize the chain.
2803 	 *
2804 	 * The key range is inclusive of both key_beg and key_end.
2805 	 */
2806 	dip = VTOI(dvp);
2807 	ncp = nch->ncp;
2808 
2809 	if (dip->flags & HAMMER_INODE_RO)
2810 		return (EROFS);
2811 
2812 	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
2813 					   &max_iterations);
2814 retry:
2815 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2816 	cursor.key_beg.localization = dip->obj_localization +
2817 				      HAMMER_LOCALIZE_MISC;
2818         cursor.key_beg.obj_id = dip->obj_id;
2819 	cursor.key_beg.key = namekey;
2820         cursor.key_beg.create_tid = 0;
2821         cursor.key_beg.delete_tid = 0;
2822         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2823         cursor.key_beg.obj_type = 0;
2824 
2825 	cursor.key_end = cursor.key_beg;
2826 	cursor.key_end.key += max_iterations;
2827 	cursor.asof = dip->obj_asof;
2828 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2829 
2830 	/*
2831 	 * Scan all matching records (the chain), locate the one matching
2832 	 * the requested path component.  info->last_error contains the
2833 	 * error code on search termination and could be 0, ENOENT, or
2834 	 * something else.
2835 	 *
2836 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2837 	 * records for the purposes of the search.
2838 	 */
2839 	error = hammer_ip_first(&cursor);
2840 
2841 	while (error == 0) {
2842 		error = hammer_ip_resolve_data(&cursor);
2843 		if (error)
2844 			break;
2845 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2846 		KKASSERT(nlen > 0);
2847 		if (ncp->nc_nlen == nlen &&
2848 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2849 			break;
2850 		}
2851 		error = hammer_ip_next(&cursor);
2852 	}
2853 
2854 	/*
2855 	 * If all is ok we have to get the inode so we can adjust nlinks.
2856 	 * To avoid a deadlock with the flusher we must release the inode
2857 	 * lock on the directory when acquiring the inode for the entry.
2858 	 *
2859 	 * If the target is a directory, it must be empty.
2860 	 */
2861 	if (error == 0) {
2862 		hammer_unlock(&cursor.ip->lock);
2863 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2864 				      dip->hmp->asof,
2865 				      cursor.data->entry.localization,
2866 				      0, &error);
2867 		hammer_lock_sh(&cursor.ip->lock);
2868 		if (error == ENOENT) {
2869 			kprintf("HAMMER: WARNING: Removing "
2870 				"dirent w/missing inode \"%s\"\n"
2871 				"\tobj_id = %016llx\n",
2872 				ncp->nc_name,
2873 				(long long)cursor.data->entry.obj_id);
2874 			error = 0;
2875 		}
2876 
2877 		/*
2878 		 * If isdir >= 0 we validate that the entry is or is not a
2879 		 * directory.  If isdir < 0 we don't care.
2880 		 */
2881 		if (error == 0 && isdir >= 0 && ip) {
2882 			if (isdir &&
2883 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
2884 				error = ENOTDIR;
2885 			} else if (isdir == 0 &&
2886 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
2887 				error = EISDIR;
2888 			}
2889 		}
2890 
2891 		/*
2892 		 * If we are trying to remove a directory the directory must
2893 		 * be empty.
2894 		 *
2895 		 * The check directory code can loop and deadlock/retry.  Our
2896 		 * own cursor's node locks must be released to avoid a 3-way
2897 		 * deadlock with the flusher if the check directory code
2898 		 * blocks.
2899 		 *
2900 		 * If any changes whatsoever have been made to the cursor
2901 		 * set EDEADLK and retry.
2902 		 */
2903 		if (error == 0 && ip && ip->ino_data.obj_type ==
2904 				        HAMMER_OBJTYPE_DIRECTORY) {
2905 			hammer_unlock_cursor(&cursor);
2906 			error = hammer_ip_check_directory_empty(trans, ip);
2907 			hammer_lock_cursor(&cursor);
2908 			if (cursor.flags & HAMMER_CURSOR_RETEST) {
2909 				kprintf("HAMMER: Warning: avoided deadlock "
2910 					"on rmdir '%s'\n",
2911 					ncp->nc_name);
2912 				error = EDEADLK;
2913 			}
2914 		}
2915 
2916 		/*
2917 		 * Delete the directory entry.
2918 		 *
2919 		 * WARNING: hammer_ip_del_directory() may have to terminate
2920 		 * the cursor to avoid a deadlock.  It is ok to call
2921 		 * hammer_done_cursor() twice.
2922 		 */
2923 		if (error == 0) {
2924 			error = hammer_ip_del_directory(trans, &cursor,
2925 							dip, ip);
2926 		}
2927 		hammer_done_cursor(&cursor);
2928 		if (error == 0) {
2929 			cache_setunresolved(nch);
2930 			cache_setvp(nch, NULL);
2931 			/* XXX locking */
2932 			if (ip && ip->vp) {
2933 				hammer_knote(ip->vp, NOTE_DELETE);
2934 				cache_inval_vp(ip->vp, CINV_DESTROY);
2935 			}
2936 		}
2937 		if (ip)
2938 			hammer_rel_inode(ip, 0);
2939 	} else {
2940 		hammer_done_cursor(&cursor);
2941 	}
2942 	if (error == EDEADLK)
2943 		goto retry;
2944 
2945 	return (error);
2946 }
2947 
2948 /************************************************************************
2949  *			    FIFO AND SPECFS OPS				*
2950  ************************************************************************
2951  *
2952  */
2953 
2954 static int
2955 hammer_vop_fifoclose (struct vop_close_args *ap)
2956 {
2957 	/* XXX update itimes */
2958 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2959 }
2960 
2961 static int
2962 hammer_vop_fiforead (struct vop_read_args *ap)
2963 {
2964 	int error;
2965 
2966 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2967 	/* XXX update access time */
2968 	return (error);
2969 }
2970 
2971 static int
2972 hammer_vop_fifowrite (struct vop_write_args *ap)
2973 {
2974 	int error;
2975 
2976 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2977 	/* XXX update access time */
2978 	return (error);
2979 }
2980 
2981 static
2982 int
2983 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2984 {
2985 	int error;
2986 
2987 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2988 	if (error)
2989 		error = hammer_vop_kqfilter(ap);
2990 	return(error);
2991 }
2992 
2993 static int
2994 hammer_vop_specclose (struct vop_close_args *ap)
2995 {
2996 	/* XXX update itimes */
2997 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2998 }
2999 
3000 static int
3001 hammer_vop_specread (struct vop_read_args *ap)
3002 {
3003 	/* XXX update access time */
3004 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
3005 }
3006 
3007 static int
3008 hammer_vop_specwrite (struct vop_write_args *ap)
3009 {
3010 	/* XXX update last change time */
3011 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
3012 }
3013 
3014 /*
3015  * SPECFS's getattr will override fields as necessary, but does not fill
3016  *          stuff in from scratch.
3017  */
3018 static
3019 int
3020 hammer_vop_specgetattr (struct vop_getattr_args *ap)
3021 {
3022 	int error;
3023 
3024 	error = hammer_vop_getattr(ap);
3025 	if (error == 0)
3026 		VOCALL(&spec_vnode_vops, &ap->a_head);
3027 	return (error);
3028 }
3029 
3030 
3031 /************************************************************************
3032  *			    KQFILTER OPS				*
3033  ************************************************************************
3034  *
3035  */
3036 static void filt_hammerdetach(struct knote *kn);
3037 static int filt_hammerread(struct knote *kn, long hint);
3038 static int filt_hammerwrite(struct knote *kn, long hint);
3039 static int filt_hammervnode(struct knote *kn, long hint);
3040 
3041 static struct filterops hammerread_filtops =
3042 	{ 1, NULL, filt_hammerdetach, filt_hammerread };
3043 static struct filterops hammerwrite_filtops =
3044 	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
3045 static struct filterops hammervnode_filtops =
3046 	{ 1, NULL, filt_hammerdetach, filt_hammervnode };
3047 
3048 static
3049 int
3050 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3051 {
3052 	struct vnode *vp = ap->a_vp;
3053 	struct knote *kn = ap->a_kn;
3054 	lwkt_tokref ilock;
3055 
3056 	switch (kn->kn_filter) {
3057 	case EVFILT_READ:
3058 		kn->kn_fop = &hammerread_filtops;
3059 		break;
3060 	case EVFILT_WRITE:
3061 		kn->kn_fop = &hammerwrite_filtops;
3062 		break;
3063 	case EVFILT_VNODE:
3064 		kn->kn_fop = &hammervnode_filtops;
3065 		break;
3066 	default:
3067 		return (1);
3068 	}
3069 
3070 	kn->kn_hook = (caddr_t)vp;
3071 
3072 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
3073 	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
3074 	lwkt_reltoken(&ilock);
3075 
3076 	return(0);
3077 }
3078 
3079 static void
3080 filt_hammerdetach(struct knote *kn)
3081 {
3082 	struct vnode *vp = (void *)kn->kn_hook;
3083 	lwkt_tokref ilock;
3084 
3085 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
3086 	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3087 		     kn, knote, kn_selnext);
3088 	lwkt_reltoken(&ilock);
3089 }
3090 
3091 static int
3092 filt_hammerread(struct knote *kn, long hint)
3093 {
3094 	struct vnode *vp = (void *)kn->kn_hook;
3095 	hammer_inode_t ip = VTOI(vp);
3096 
3097 	if (hint == NOTE_REVOKE) {
3098 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3099 		return(1);
3100 	}
3101 	kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3102 	return (kn->kn_data != 0);
3103 }
3104 
3105 static int
3106 filt_hammerwrite(struct knote *kn, long hint)
3107 {
3108 	if (hint == NOTE_REVOKE)
3109 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3110 	kn->kn_data = 0;
3111 	return (1);
3112 }
3113 
3114 static int
3115 filt_hammervnode(struct knote *kn, long hint)
3116 {
3117 	if (kn->kn_sfflags & hint)
3118 		kn->kn_fflags |= hint;
3119 	if (hint == NOTE_REVOKE) {
3120 		kn->kn_flags |= EV_EOF;
3121 		return (1);
3122 	}
3123 	return (kn->kn_fflags != 0);
3124 }
3125 
3126