xref: /dragonfly/sys/vfs/hammer/hammer_vnops.c (revision a615f06f)
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
	.vop_default =		spec_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_specread,
	.vop_write =		hammer_vop_specwrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_specclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};
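
/*
 * Orientation sketch: these vop_ops templates are registered when a
 * HAMMER filesystem is mounted.  Assuming the usual DragonFly
 * vfs_add_vnodeops() API (the actual wiring lives in hammer_vfsops.c,
 * not here), the registration looks roughly like:
 *
 *	vfs_add_vnodeops(mp, &hammer_vnode_vops, &mp->mnt_vn_norm_ops);
 *	vfs_add_vnodeops(mp, &hammer_spec_vops, &mp->mnt_vn_spec_ops);
 *	vfs_add_vnodeops(mp, &hammer_fifo_vops, &mp->mnt_vn_fifo_ops);
 */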

static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred,
			   int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	++hammer_count_fsyncs;
	vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT) {
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;
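
	/*
	 * Worked example (a sketch, assuming the common 16KB
	 * HAMMER_BUFSIZE): a 64KB uio_resid yields seqcount
	 * (65536 + 16383) / 16384 = 4, while the upper 16 bits of
	 * ap->a_ioflag carry the kernel's own sequential-access
	 * heuristic; the larger of the two drives the read-ahead
	 * below.
	 */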

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
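	/*
	 * Sketch of the hammer_blocksize() policy (constants are the
	 * usual ones from hammer_disk.h, stated here for orientation
	 * only): offsets below the 1GB HAMMER_XDEMARC use 16KB
	 * buffers, offsets at or past it use 64KB HAMMER_XBUFSIZE
	 * buffers, which is why blksize is recomputed each iteration.
	 */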
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_data.atime = trans.time;
		hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int delta;
	int seqcount;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_offset assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
365 	/*
366 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
367 	 * buffer cache, but HAMMER may use a variable block size based
368 	 * on the offset.
369 	 */
370 	while (uio->uio_resid > 0) {
371 		int fixsize = 0;
372 		int blksize;
373 		int blkmask;
374 
375 		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
376 			break;
377 
378 		blksize = hammer_blocksize(uio->uio_offset);
379 
380 		/*
381 		 * Do not allow HAMMER to blow out the buffer cache.  Very
382 		 * large UIOs can lockout other processes due to bwillwrite()
383 		 * mechanics.
384 		 *
385 		 * The hammer inode is not locked during these operations.
386 		 * The vnode is locked which can interfere with the pageout
387 		 * daemon for non-UIO_NOCOPY writes but should not interfere
388 		 * with the buffer cache.  Even so, we cannot afford to
389 		 * allow the pageout daemon to build up too many dirty buffer
390 		 * cache buffers.
391 		 *
392 		 * Only call this if we aren't being recursively called from
393 		 * a virtual disk device (vn), else we may deadlock.
394 		 */
395 		if ((ap->a_ioflag & IO_RECURSE) == 0)
396 			bwillwrite(blksize);
397 
		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind, start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
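
		/*
		 * Worked example for the slowdown above (a sketch,
		 * assuming hz = 100): with rsv_recs 10% over
		 * hammer_limit_recs the delta computes to ~10 ticks
		 * (~100ms) of sleep, growing linearly the further the
		 * frontend gets ahead of the flusher.
		 */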

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up, we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting of the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	/*hammer_inode_t ip = VTOI(ap->a_vp);*/
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		vap->va_size = 26;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}
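
	/*
	 * Worked example for the rounding above (a sketch, assuming
	 * the usual 16KB HAMMER_BUFSIZE): a 100 byte file reports
	 * va_bytes (100 + 15) & ~15 = 112; a 10000 byte file rounds
	 * up to 16384; sizes at or past HAMMER_XDEMARC round up to
	 * 64KB HAMMER_XBUFSIZE multiples.
	 */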

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0; 	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component, the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
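	/*
	 * Sketch: hammer_directory_namekey() hashes the name into the
	 * upper bits of the key and reports, via max_iterations, how
	 * far the low bits may iterate on collisions, so the scan
	 * below walks [namekey, namekey + max_iterations] comparing
	 * each candidate's name, like probing one bucket's chain.
	 */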
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root; instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				   dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip is referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory, a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
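	/*
	 * Sketch: offsets 0 and 1 were consumed above by the synthetic
	 * "." and ".." entries; past that, uio_offset (and every cookie
	 * handed back to the kernel) is simply the directory record's
	 * B-Tree key, so a later readdir resumes by seeding
	 * key_beg.key with the saved offset.
	 */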
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
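	/*
	 * Expansion example (a sketch): a link body of "@@PFS00005"
	 * (10 bytes) selects localization 5 << 16 and is rewritten
	 * below as "@@0x%016llx:00005", using HAMMER_MAX_TID for a
	 * master and the PFS's sync_end_tid for a slave.
	 */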
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		if (ip->vp)
			hammer_knote(ip->vp, NOTE_RENAME);
	}

failed:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
1800 			blksize = hammer_blocksize(vap->va_size);
1801 			if (vap->va_size < ip->ino_data.size) {
1802 				vtruncbuf(ap->a_vp, vap->va_size, blksize);
1803 				truncating = 1;
1804 				kflags |= NOTE_WRITE;
1805 			} else {
1806 				vnode_pager_setsize(ap->a_vp, vap->va_size);
1807 				truncating = 0;
1808 				kflags |= NOTE_WRITE | NOTE_EXTEND;
1809 			}
1810 			ip->ino_data.size = vap->va_size;
1811 			modflags |= HAMMER_INODE_DDIRTY;
1812 
1813 			/*
1814 			 * on-media truncation is cached in the inode until
1815 			 * the inode is synchronized.
1816 			 */
1817 			if (truncating) {
1818 				hammer_ip_frontend_trunc(ip, vap->va_size);
1819 #ifdef DEBUG_TRUNCATE
1820 				if (HammerTruncIp == NULL)
1821 					HammerTruncIp = ip;
1822 #endif
1823 				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1824 					ip->flags |= HAMMER_INODE_TRUNCATED;
1825 					ip->trunc_off = vap->va_size;
1826 #ifdef DEBUG_TRUNCATE
1827 					if (ip == HammerTruncIp)
1828 					kprintf("truncate1 %016llx\n", ip->trunc_off);
1829 #endif
1830 				} else if (ip->trunc_off > vap->va_size) {
1831 					ip->trunc_off = vap->va_size;
1832 #ifdef DEBUG_TRUNCATE
1833 					if (ip == HammerTruncIp)
1834 					kprintf("truncate2 %016llx\n", ip->trunc_off);
1835 #endif
1836 				} else {
1837 #ifdef DEBUG_TRUNCATE
1838 					if (ip == HammerTruncIp)
1839 					kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1840 #endif
1841 				}
1842 			}
1843 
1844 			/*
1845 			 * If truncating we have to clean out a portion of
1846 			 * the last block on-disk.  We do this in the
1847 			 * front-end buffer cache.
1848 			 */
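			/*
			 * aligned_size rounds the new EOF up to a block
			 * boundary, e.g. with a 16K blksize a va_size of
			 * 100001 rounds up to 114688 (7 * 16384).
			 */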
1849 			aligned_size = (vap->va_size + (blksize - 1)) &
1850 				       ~(int64_t)(blksize - 1);
1851 			if (truncating && vap->va_size < aligned_size) {
1852 				struct buf *bp;
1853 				int offset;
1854 
1855 				aligned_size -= blksize;
1856 
1857 				offset = (int)vap->va_size & (blksize - 1);
1858 				error = bread(ap->a_vp, aligned_size,
1859 					      blksize, &bp);
1860 				hammer_ip_frontend_trunc(ip, aligned_size);
1861 				if (error == 0) {
1862 					bzero(bp->b_data + offset,
1863 					      blksize - offset);
1864 					/* must de-cache direct-io offset */
1865 					bp->b_bio2.bio_offset = NOOFFSET;
1866 					bdwrite(bp);
1867 				} else {
1868 					kprintf("ERROR %d\n", error);
1869 					brelse(bp);
1870 				}
1871 			}
1872 			break;
1873 		case VDATABASE:
1874 			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1875 				ip->flags |= HAMMER_INODE_TRUNCATED;
1876 				ip->trunc_off = vap->va_size;
1877 			} else if (ip->trunc_off > vap->va_size) {
1878 				ip->trunc_off = vap->va_size;
1879 			}
1880 			hammer_ip_frontend_trunc(ip, vap->va_size);
1881 			ip->ino_data.size = vap->va_size;
1882 			modflags |= HAMMER_INODE_DDIRTY;
1883 			kflags |= NOTE_ATTRIB;
1884 			break;
1885 		default:
1886 			error = EINVAL;
1887 			goto done;
1888 		}
1889 		break;
1890 	}
1891 	if (vap->va_atime.tv_sec != VNOVAL) {
1892 		ip->ino_data.atime =
1893 			hammer_timespec_to_time(&vap->va_atime);
1894 		modflags |= HAMMER_INODE_ATIME;
1895 		kflags |= NOTE_ATTRIB;
1896 	}
1897 	if (vap->va_mtime.tv_sec != VNOVAL) {
1898 		ip->ino_data.mtime =
1899 			hammer_timespec_to_time(&vap->va_mtime);
1900 		modflags |= HAMMER_INODE_MTIME;
1901 		kflags |= NOTE_ATTRIB;
1902 	}
1903 	if (vap->va_mode != (mode_t)VNOVAL) {
1904 		mode_t   cur_mode = ip->ino_data.mode;
1905 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1906 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1907 
1908 		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1909 					 cur_uid, cur_gid, &cur_mode);
1910 		if (error == 0 && ip->ino_data.mode != cur_mode) {
1911 			ip->ino_data.mode = cur_mode;
1912 			modflags |= HAMMER_INODE_DDIRTY;
1913 			kflags |= NOTE_ATTRIB;
1914 		}
1915 	}
1916 done:
1917 	if (error == 0)
1918 		hammer_modify_inode(ip, modflags);
1919 	hammer_done_transaction(&trans);
1920 	hammer_knote(ap->a_vp, kflags);
1921 	return (error);
1922 }
1923 
1924 /*
1925  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1926  */
1927 static
1928 int
1929 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1930 {
1931 	struct hammer_transaction trans;
1932 	struct hammer_inode *dip;
1933 	struct hammer_inode *nip;
1934 	struct nchandle *nch;
1935 	hammer_record_t record;
1936 	int error;
1937 	int bytes;
1938 
1939 	ap->a_vap->va_type = VLNK;
1940 
1941 	nch = ap->a_nch;
1942 	dip = VTOI(ap->a_dvp);
1943 
1944 	if (dip->flags & HAMMER_INODE_RO)
1945 		return (EROFS);
1946 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1947 		return (error);
1948 
1949 	/*
1950 	 * Create a transaction to cover the operations we perform.
1951 	 */
1952 	hammer_start_transaction(&trans, dip->hmp);
1953 	++hammer_stats_file_iopsw;
1954 
1955 	/*
1956 	 * Create a new filesystem object of the requested type.  The
1957 	 * returned inode will be referenced but not locked.
1958 	 */
1959 
1960 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1961 				    dip, NULL, &nip);
1962 	if (error) {
1963 		hammer_done_transaction(&trans);
1964 		*ap->a_vpp = NULL;
1965 		return (error);
1966 	}
1967 
1968 	/*
1969 	 * Add a record representing the symlink.  The symlink is stored
1970 	 * as pure data, not a string, and is not \0-terminated.
1971 	 */
1972 	if (error == 0) {
1973 		bytes = strlen(ap->a_target);
1974 
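		/*
		 * Short targets fit in the inode's inline extension area;
		 * longer targets get a dedicated HAMMER_RECTYPE_FIX record
		 * keyed at HAMMER_FIXKEY_SYMLINK.
		 */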
1975 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
1976 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1977 		} else {
1978 			record = hammer_alloc_mem_record(nip, bytes);
1979 			record->type = HAMMER_MEM_RECORD_GENERAL;
1980 
1981 			record->leaf.base.localization = nip->obj_localization +
1982 							 HAMMER_LOCALIZE_MISC;
1983 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1984 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1985 			record->leaf.data_len = bytes;
1986 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1987 			bcopy(ap->a_target, record->data->symlink.name, bytes);
1988 			error = hammer_ip_add_record(&trans, record);
1989 		}
1990 
1991 		/*
1992 		 * Set the file size to the length of the link.
1993 		 */
1994 		if (error == 0) {
1995 			nip->ino_data.size = bytes;
1996 			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1997 		}
1998 	}
1999 	if (error == 0)
2000 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2001 						nch->ncp->nc_nlen, nip);
2002 
2003 	/*
2004 	 * Finish up.
2005 	 */
2006 	if (error) {
2007 		hammer_rel_inode(nip, 0);
2008 		*ap->a_vpp = NULL;
2009 	} else {
2010 		error = hammer_get_vnode(nip, ap->a_vpp);
2011 		hammer_rel_inode(nip, 0);
2012 		if (error == 0) {
2013 			cache_setunresolved(ap->a_nch);
2014 			cache_setvp(ap->a_nch, *ap->a_vpp);
2015 			hammer_knote(ap->a_dvp, NOTE_WRITE);
2016 		}
2017 	}
2018 	hammer_done_transaction(&trans);
2019 	return (error);
2020 }
2021 
2022 /*
2023  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2024  */
2025 static
2026 int
2027 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2028 {
2029 	struct hammer_transaction trans;
2030 	struct hammer_inode *dip;
2031 	int error;
2032 
2033 	dip = VTOI(ap->a_dvp);
2034 
2035 	if (hammer_nohistory(dip) == 0 &&
2036 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2037 		return (error);
2038 	}
2039 
2040 	hammer_start_transaction(&trans, dip->hmp);
2041 	++hammer_stats_file_iopsw;
2042 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2043 				ap->a_cred, ap->a_flags, -1);
2044 	hammer_done_transaction(&trans);
2045 
2046 	return (error);
2047 }
2048 
2049 /*
2050  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2051  */
2052 static
2053 int
2054 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2055 {
2056 	struct hammer_inode *ip = ap->a_vp->v_data;
2057 
2058 	++hammer_stats_file_iopsr;
2059 	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2060 			    ap->a_fflag, ap->a_cred));
2061 }
2062 
2063 static
2064 int
2065 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2066 {
2067 	struct mount *mp;
2068 	int error;
2069 
2070 	mp = ap->a_head.a_ops->head.vv_mount;
2071 
2072 	switch(ap->a_op) {
2073 	case MOUNTCTL_SET_EXPORT:
2074 		if (ap->a_ctllen != sizeof(struct export_args))
2075 			error = EINVAL;
2076 		else
2077 			error = hammer_vfs_export(mp, ap->a_op,
2078 				      (const struct export_args *)ap->a_ctl);
2079 		break;
2080 	default:
2081 		error = journal_mountctl(ap);
2082 		break;
2083 	}
2084 	return(error);
2085 }
2086 
2087 /*
2088  * hammer_vop_strategy { vp, bio }
2089  *
2090  * Strategy call, used for regular file read & write only.  Note that the
2091  * bp may represent a cluster.
2092  *
2093  * To simplify operation and allow better optimizations in the future,
2094  * this code does not make any assumptions with regard to buffer alignment
2095  * or size.
2096  */
2097 static
2098 int
2099 hammer_vop_strategy(struct vop_strategy_args *ap)
2100 {
2101 	struct buf *bp;
2102 	int error;
2103 
2104 	bp = ap->a_bio->bio_buf;
2105 
2106 	switch(bp->b_cmd) {
2107 	case BUF_CMD_READ:
2108 		error = hammer_vop_strategy_read(ap);
2109 		break;
2110 	case BUF_CMD_WRITE:
2111 		error = hammer_vop_strategy_write(ap);
2112 		break;
2113 	default:
2114 		bp->b_error = error = EINVAL;
2115 		bp->b_flags |= B_ERROR;
2116 		biodone(ap->a_bio);
2117 		break;
2118 	}
2119 	return (error);
2120 }
2121 
2122 /*
2123  * Read from a regular file.  Iterate the related records and fill in the
2124  * BIO/BUF.  Gaps are zero-filled.
2125  *
2126  * The support code in hammer_object.c should be used to deal with mixed
2127  * in-memory and on-disk records.
2128  *
2129  * NOTE: Can be called from the cluster code with an oversized buf.
2130  *
2131  * XXX atime update
2132  */
2133 static
2134 int
2135 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2136 {
2137 	struct hammer_transaction trans;
2138 	struct hammer_inode *ip;
2139 	struct hammer_cursor cursor;
2140 	hammer_base_elm_t base;
2141 	hammer_off_t disk_offset;
2142 	struct bio *bio;
2143 	struct bio *nbio;
2144 	struct buf *bp;
2145 	int64_t rec_offset;
2146 	int64_t ran_end;
2147 	int64_t tmp64;
2148 	int error;
2149 	int boff;
2150 	int roff;
2151 	int n;
2152 
2153 	bio = ap->a_bio;
2154 	bp = bio->bio_buf;
2155 	ip = ap->a_vp->v_data;
2156 
2157 	/*
2158 	 * The zone-2 disk offset may have been set by the cluster code via
2159 	 * a BMAP operation, or else should be NOOFFSET.
2160 	 *
2161 	 * Checking the high bits for a match against zone-2 should suffice.
2162 	 */
2163 	nbio = push_bio(bio);
2164 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2165 	    HAMMER_ZONE_LARGE_DATA) {
2166 		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2167 		return (error);
2168 	}
2169 
2170 	/*
2171 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2172 	 * aligned we may still be able to issue a direct-read.
2173 	 */
2174 	hammer_simple_transaction(&trans, ip->hmp);
2175 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2176 
2177 	/*
2178 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2179 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2180 	 * first record containing bio_offset will have a key > bio_offset.
2181 	 */
2182 	cursor.key_beg.localization = ip->obj_localization +
2183 				      HAMMER_LOCALIZE_MISC;
2184 	cursor.key_beg.obj_id = ip->obj_id;
2185 	cursor.key_beg.create_tid = 0;
2186 	cursor.key_beg.delete_tid = 0;
2187 	cursor.key_beg.obj_type = 0;
2188 	cursor.key_beg.key = bio->bio_offset + 1;
2189 	cursor.asof = ip->obj_asof;
2190 	cursor.flags |= HAMMER_CURSOR_ASOF;
2191 
2192 	cursor.key_end = cursor.key_beg;
2193 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2194 #if 0
2195 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2196 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2197 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2198 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2199 	} else
2200 #endif
2201 	{
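		/*
		 * The end key is padded well past the request; the record
		 * covering the final byte is keyed at BASE+LEN and must
		 * fall inside the inclusive range.  tmp64 catches signed
		 * 64-bit wrap of the addition.
		 */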
2202 		ran_end = bio->bio_offset + bp->b_bufsize;
2203 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2204 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2205 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2206 		if (tmp64 < ran_end)
2207 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2208 		else
2209 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2210 	}
2211 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2212 
2213 	error = hammer_ip_first(&cursor);
2214 	boff = 0;
2215 
2216 	while (error == 0) {
2217 		/*
2218 		 * Get the base file offset of the record.  The key for
2219 		 * data records is (base + bytes) rather than (base).
2220 		 */
2221 		base = &cursor.leaf->base;
2222 		rec_offset = base->key - cursor.leaf->data_len;
2223 
2224 		/*
2225 		 * Calculate the gap, if any, and zero-fill it.
2226 		 *
2227 		 * n is the offset of the start of the record versus our
2228 		 * current seek offset in the bio.
2229 		 */
2230 		n = (int)(rec_offset - (bio->bio_offset + boff));
2231 		if (n > 0) {
2232 			if (n > bp->b_bufsize - boff)
2233 				n = bp->b_bufsize - boff;
2234 			bzero((char *)bp->b_data + boff, n);
2235 			boff += n;
2236 			n = 0;
2237 		}
2238 
2239 		/*
2240 		 * Calculate the data offset in the record and the number
2241 		 * of bytes we can copy.
2242 		 *
2243 		 * There are two degenerate cases.  First, boff may already
2244 		 * be at bp->b_bufsize.  Second, the data offset within
2245 		 * the record may exceed the record's size.
2246 		 */
2247 		roff = -n;
2248 		rec_offset += roff;
2249 		n = cursor.leaf->data_len - roff;
2250 		if (n <= 0) {
2251 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2252 			n = 0;
2253 		} else if (n > bp->b_bufsize - boff) {
2254 			n = bp->b_bufsize - boff;
2255 		}
2256 
2257 		/*
2258 		 * Deal with cached truncations.  This cool bit of code
2259 		 * allows truncate()/ftruncate() to avoid having to sync
2260 		 * the file.
2261 		 *
2262 		 * If the frontend is truncated then all backend records are
2263 		 * subject to the frontend's truncation.
2264 		 *
2265 		 * If the backend is truncated then backend records on-disk
2266 		 * (but not in-memory) are subject to the backend's
2267 		 * truncation.  In-memory records owned by the backend
2268 		 * represent data written after the truncation point on the
2269 		 * backend and must not be truncated.
2270 		 *
2271 		 * Truncate operations deal with frontend buffer cache
2272 		 * buffers and frontend-owned in-memory records synchronously.
2273 		 */
2274 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2275 			if (hammer_cursor_ondisk(&cursor) ||
2276 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2277 				if (ip->trunc_off <= rec_offset)
2278 					n = 0;
2279 				else if (ip->trunc_off < rec_offset + n)
2280 					n = (int)(ip->trunc_off - rec_offset);
2281 			}
2282 		}
2283 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2284 			if (hammer_cursor_ondisk(&cursor)) {
2285 				if (ip->sync_trunc_off <= rec_offset)
2286 					n = 0;
2287 				else if (ip->sync_trunc_off < rec_offset + n)
2288 					n = (int)(ip->sync_trunc_off - rec_offset);
2289 			}
2290 		}
2291 
2292 		/*
2293 		 * Try to issue a direct read into our bio if possible,
2294 		 * otherwise resolve the element data into a hammer_buffer
2295 		 * and copy.
2296 		 *
2297 		 * The buffer on-disk should be zeroed past any real
2298 		 * truncation point, but may not be for any synthesized
2299 		 * truncation point from above.
2300 		 */
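		/*
		 * The direct read requires that this single on-disk record
		 * satisfy the entire buffer (boff 0, n == b_bufsize) and
		 * that the media offset be HAMMER_BUFSIZE aligned.
		 */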
2301 		disk_offset = cursor.leaf->data_offset + roff;
2302 		if (boff == 0 && n == bp->b_bufsize &&
2303 		    hammer_cursor_ondisk(&cursor) &&
2304 		    (disk_offset & HAMMER_BUFMASK) == 0) {
2305 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2306 				 HAMMER_ZONE_LARGE_DATA);
2307 			nbio->bio_offset = disk_offset;
2308 			error = hammer_io_direct_read(trans.hmp, nbio,
2309 						      cursor.leaf);
2310 			goto done;
2311 		} else if (n) {
2312 			error = hammer_ip_resolve_data(&cursor);
2313 			if (error == 0) {
2314 				bcopy((char *)cursor.data + roff,
2315 				      (char *)bp->b_data + boff, n);
2316 			}
2317 		}
2318 		if (error)
2319 			break;
2320 
2321 		/*
2322 		 * Iterate until we have filled the request.
2323 		 */
2324 		boff += n;
2325 		if (boff == bp->b_bufsize)
2326 			break;
2327 		error = hammer_ip_next(&cursor);
2328 	}
2329 
2330 	/*
2331 	 * There may have been a gap after the last record.
2332 	 */
2333 	if (error == ENOENT)
2334 		error = 0;
2335 	if (error == 0 && boff != bp->b_bufsize) {
2336 		KKASSERT(boff < bp->b_bufsize);
2337 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2338 		/* boff = bp->b_bufsize; */
2339 	}
2340 	bp->b_resid = 0;
2341 	bp->b_error = error;
2342 	if (error)
2343 		bp->b_flags |= B_ERROR;
2344 	biodone(ap->a_bio);
2345 
2346 done:
2347 	if (cursor.node)
2348 		hammer_cache_node(&ip->cache[1], cursor.node);
2349 	hammer_done_cursor(&cursor);
2350 	hammer_done_transaction(&trans);
2351 	return(error);
2352 }
2353 
2354 /*
2355  * BMAP operation - used to support cluster_read() only.
2356  *
2357  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2358  *
2359  * This routine may return EOPNOTSUPP if the operation is not supported for
2360  * the specified offset.  The contents of the pointer arguments do not
2361  * need to be initialized in that case.
2362  *
2363  * If a disk address is available and properly aligned return 0 with
2364  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2365  * to the run-length relative to that offset.  Callers may assume
2366  * *doffsetp is valid whenever 0 is returned, so when the run is too
2367  * small to be useful we must return EOPNOTSUPP rather than 0.
2368  */
2369 static
2370 int
2371 hammer_vop_bmap(struct vop_bmap_args *ap)
2372 {
2373 	struct hammer_transaction trans;
2374 	struct hammer_inode *ip;
2375 	struct hammer_cursor cursor;
2376 	hammer_base_elm_t base;
2377 	int64_t rec_offset;
2378 	int64_t ran_end;
2379 	int64_t tmp64;
2380 	int64_t base_offset;
2381 	int64_t base_disk_offset;
2382 	int64_t last_offset;
2383 	hammer_off_t last_disk_offset;
2384 	hammer_off_t disk_offset;
2385 	int	rec_len;
2386 	int	error;
2387 	int	blksize;
2388 
2389 	++hammer_stats_file_iopsr;
2390 	ip = ap->a_vp->v_data;
2391 
2392 	/*
2393 	 * We can only BMAP regular files.  We can't BMAP database files,
2394 	 * directories, etc.
2395 	 */
2396 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2397 		return(EOPNOTSUPP);
2398 
2399 	/*
2400 	 * bmap is typically called with runp/runb both NULL when used
2401 	 * for writing.  We do not support BMAP for writing at this time.
2402 	 */
2403 	if (ap->a_cmd != BUF_CMD_READ)
2404 		return(EOPNOTSUPP);
2405 
2406 	/*
2407 	 * Scan the B-Tree to acquire blockmap addresses, then translate
2408 	 * to raw addresses.
2409 	 */
2410 	hammer_simple_transaction(&trans, ip->hmp);
2411 #if 0
2412 	kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2413 #endif
2414 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2415 
2416 	/*
2417 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2418 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2419 	 * first record containing bio_offset will have a key > bio_offset.
2420 	 */
2421 	cursor.key_beg.localization = ip->obj_localization +
2422 				      HAMMER_LOCALIZE_MISC;
2423 	cursor.key_beg.obj_id = ip->obj_id;
2424 	cursor.key_beg.create_tid = 0;
2425 	cursor.key_beg.delete_tid = 0;
2426 	cursor.key_beg.obj_type = 0;
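	/*
	 * If a backward run (*runb) is wanted, start the scan up to
	 * MAXPHYS before the requested offset so preceding contiguous
	 * records can be accumulated.
	 */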
2427 	if (ap->a_runb)
2428 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2429 	else
2430 		cursor.key_beg.key = ap->a_loffset + 1;
2431 	if (cursor.key_beg.key < 0)
2432 		cursor.key_beg.key = 0;
2433 	cursor.asof = ip->obj_asof;
2434 	cursor.flags |= HAMMER_CURSOR_ASOF;
2435 
2436 	cursor.key_end = cursor.key_beg;
2437 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2438 
2439 	ran_end = ap->a_loffset + MAXPHYS;
2440 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2441 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2442 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2443 	if (tmp64 < ran_end)
2444 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2445 	else
2446 		cursor.key_end.key = ran_end + MAXPHYS + 1;
2447 
2448 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2449 
2450 	error = hammer_ip_first(&cursor);
2451 	base_offset = last_offset = 0;
2452 	base_disk_offset = last_disk_offset = 0;
2453 
2454 	while (error == 0) {
2455 		/*
2456 		 * Get the base file offset of the record.  The key for
2457 		 * data records is (base + bytes) rather than (base).
2458 		 *
2459 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2460 		 * The extra bytes should be zero on-disk and the BMAP op
2461 		 * should still be ok.
2462 		 */
2463 		base = &cursor.leaf->base;
2464 		rec_offset = base->key - cursor.leaf->data_len;
2465 		rec_len    = cursor.leaf->data_len;
2466 
2467 		/*
2468 		 * Incorporate any cached truncation.
2469 		 *
2470 		 * NOTE: Modifications to rec_len based on synthesized
2471 		 * truncation points remove the guarantee that any extended
2472 		 * data on disk is zero (since the truncations may not have
2473 		 * taken place on-media yet).
2474 		 */
2475 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2476 			if (hammer_cursor_ondisk(&cursor) ||
2477 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2478 				if (ip->trunc_off <= rec_offset)
2479 					rec_len = 0;
2480 				else if (ip->trunc_off < rec_offset + rec_len)
2481 					rec_len = (int)(ip->trunc_off - rec_offset);
2482 			}
2483 		}
2484 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2485 			if (hammer_cursor_ondisk(&cursor)) {
2486 				if (ip->sync_trunc_off <= rec_offset)
2487 					rec_len = 0;
2488 				else if (ip->sync_trunc_off < rec_offset + rec_len)
2489 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
2490 			}
2491 		}
2492 
2493 		/*
2494 		 * Accumulate information.  If we have hit a discontiguous
2495 		 * block, reset base_offset unless we are already beyond the
2496 		 * requested offset.  If we are, that's it, we stop.
2497 		 */
2498 		if (error)
2499 			break;
2500 		if (hammer_cursor_ondisk(&cursor)) {
2501 			disk_offset = cursor.leaf->data_offset;
2502 			if (rec_offset != last_offset ||
2503 			    disk_offset != last_disk_offset) {
2504 				if (rec_offset > ap->a_loffset)
2505 					break;
2506 				base_offset = rec_offset;
2507 				base_disk_offset = disk_offset;
2508 			}
2509 			last_offset = rec_offset + rec_len;
2510 			last_disk_offset = disk_offset + rec_len;
2511 		}
2512 		error = hammer_ip_next(&cursor);
2513 	}
2514 
2515 #if 0
2516 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
2517 		ap->a_loffset, base_offset, last_offset);
2518 	kprintf("BMAP %16s:  %016llx - %016llx\n",
2519 		"", base_disk_offset, last_disk_offset);
2520 #endif
2521 
2522 	if (cursor.node) {
2523 		hammer_cache_node(&ip->cache[1], cursor.node);
2524 #if 0
2525 		kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2526 #endif
2527 	}
2528 	hammer_done_cursor(&cursor);
2529 	hammer_done_transaction(&trans);
2530 
2531 	/*
2532 	 * If we couldn't find any records or the records we did find were
2533 	 * all behind the requested offset, return failure.  A forward
2534 	 * truncation can leave a hole w/ no on-disk records.
2535 	 */
2536 	if (last_offset == 0 || last_offset < ap->a_loffset)
2537 		return (EOPNOTSUPP);
2538 
2539 	/*
2540 	 * Figure out the block size at the requested offset and adjust
2541 	 * our limits so the cluster_read() does not create inappropriately
2542 	 * sized buffer cache buffers.
2543 	 */
2544 	blksize = hammer_blocksize(ap->a_loffset);
2545 	if (hammer_blocksize(base_offset) != blksize) {
2546 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2547 	}
2548 	if (last_offset != ap->a_loffset &&
2549 	    hammer_blocksize(last_offset - 1) != blksize) {
2550 		last_offset = hammer_blockdemarc(ap->a_loffset,
2551 						 last_offset - 1);
2552 	}
2553 
2554 	/*
2555 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2556 	 * from occuring.
2557 	 */
2558 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2559 
2560 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2561 		/*
2562 		 * Only large-data zones can be direct-IOd
2563 		 */
2564 		error = EOPNOTSUPP;
2565 	} else if ((disk_offset & HAMMER_BUFMASK) ||
2566 		   (last_offset - ap->a_loffset) < blksize) {
2567 		/*
2568 		 * doffsetp is not aligned or the forward run size does
2569 		 * not cover a whole buffer, disallow the direct I/O.
2570 		 */
2571 		error = EOPNOTSUPP;
2572 	} else {
2573 		/*
2574 		 * We're good.
2575 		 */
2576 		*ap->a_doffsetp = disk_offset;
2577 		if (ap->a_runb) {
2578 			*ap->a_runb = ap->a_loffset - base_offset;
2579 			KKASSERT(*ap->a_runb >= 0);
2580 		}
2581 		if (ap->a_runp) {
2582 			*ap->a_runp = last_offset - ap->a_loffset;
2583 			KKASSERT(*ap->a_runp >= 0);
2584 		}
2585 		error = 0;
2586 	}
2587 	return(error);
2588 }
2589 
2590 /*
2591  * Write to a regular file.   Because this is a strategy call the OS is
2592  * trying to actually get data onto the media.
2593  */
2594 static
2595 int
2596 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2597 {
2598 	hammer_record_t record;
2599 	hammer_mount_t hmp;
2600 	hammer_inode_t ip;
2601 	struct bio *bio;
2602 	struct buf *bp;
2603 	int blksize;
2604 	int bytes;
2605 	int error;
2606 
2607 	bio = ap->a_bio;
2608 	bp = bio->bio_buf;
2609 	ip = ap->a_vp->v_data;
2610 	hmp = ip->hmp;
2611 
2612 	blksize = hammer_blocksize(bio->bio_offset);
2613 	KKASSERT(bp->b_bufsize == blksize);
2614 
2615 	if (ip->flags & HAMMER_INODE_RO) {
2616 		bp->b_error = EROFS;
2617 		bp->b_flags |= B_ERROR;
2618 		biodone(ap->a_bio);
2619 		return(EROFS);
2620 	}
2621 
2622 	/*
2623 	 * Interlock with inode destruction (no in-kernel or directory
2624 	 * topology visibility).  If we queue new IO while trying to
2625 	 * destroy the inode we can deadlock the vtrunc call in
2626 	 * hammer_inode_unloadable_check().
2627 	 *
2628 	 * Besides, there's no point flushing a bp associated with an
2629 	 * inode that is being destroyed on-media and has no kernel
2630 	 * references.
2631 	 */
2632 	if ((ip->flags | ip->sync_flags) &
2633 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2634 		bp->b_resid = 0;
2635 		biodone(ap->a_bio);
2636 		return(0);
2637 	}
2638 
2639 	/*
2640 	 * Reserve space and issue a direct-write from the front-end.
2641 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2642 	 * allocations.
2643 	 *
2644 	 * An in-memory record will be installed to reference the storage
2645 	 * until the flusher can get to it.
2646 	 *
2647 	 * Since we own the high level bio the front-end will not try to
2648 	 * do a direct-read until the write completes.
2649 	 *
2650 	 * NOTE: The only time we do not reserve a full-sized buffer's
2651 	 * worth of data is when the file is small.  We do not try to
2652 	 * allocate a fragment (from the small-data zone) at the end of
2653 	 * an otherwise large file as this can lead to wildly separated
2654 	 * data.
2655 	 */
2656 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2657 	KKASSERT(bio->bio_offset < ip->ino_data.size);
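	/*
	 * Small-file reservations are rounded up to 16 bytes, e.g. a
	 * 100 byte file reserves 112 bytes rather than a full buffer.
	 */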
2658 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2659 		bytes = bp->b_bufsize;
2660 	else
2661 		bytes = ((int)ip->ino_data.size + 15) & ~15;
2662 
2663 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2664 				    bytes, &error);
2665 	if (record) {
2666 		hammer_io_direct_write(hmp, record, bio);
2667 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2668 			hammer_flush_inode(ip, 0);
2669 	} else {
2670 		bp->b_bio2.bio_offset = NOOFFSET;
2671 		bp->b_error = error;
2672 		bp->b_flags |= B_ERROR;
2673 		biodone(ap->a_bio);
2674 	}
2675 	return(error);
2676 }
2677 
2678 /*
2679  * dounlink - disconnect a directory entry
2680  *
2681  * XXX whiteout support not really in yet
2682  */
2683 static int
2684 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2685 		struct vnode *dvp, struct ucred *cred,
2686 		int flags, int isdir)
2687 {
2688 	struct namecache *ncp;
2689 	hammer_inode_t dip;
2690 	hammer_inode_t ip;
2691 	struct hammer_cursor cursor;
2692 	int64_t namekey;
2693 	u_int32_t max_iterations;
2694 	int nlen, error;
2695 
2696 	/*
2697 	 * Calculate the namekey and setup the key range for the scan.  This
2698 	 * works kinda like a chained hash table where the lower 32 bits
2699 	 * of the namekey synthesize the chain.
2700 	 *
2701 	 * The key range is inclusive of both key_beg and key_end.
2702 	 */
2703 	dip = VTOI(dvp);
2704 	ncp = nch->ncp;
2705 
2706 	if (dip->flags & HAMMER_INODE_RO)
2707 		return (EROFS);
2708 
2709 	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
2710 					   &max_iterations);
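	/*
	 * Collisions are resolved by iterating keys in the inclusive
	 * range [namekey, namekey + max_iterations] and comparing each
	 * stored name against the path component in the scan below.
	 */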
2711 retry:
2712 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2713 	cursor.key_beg.localization = dip->obj_localization +
2714 				      HAMMER_LOCALIZE_MISC;
2715 	cursor.key_beg.obj_id = dip->obj_id;
2716 	cursor.key_beg.key = namekey;
2717 	cursor.key_beg.create_tid = 0;
2718 	cursor.key_beg.delete_tid = 0;
2719 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2720 	cursor.key_beg.obj_type = 0;
2721 
2722 	cursor.key_end = cursor.key_beg;
2723 	cursor.key_end.key += max_iterations;
2724 	cursor.asof = dip->obj_asof;
2725 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2726 
2727 	/*
2728 	 * Scan all matching records (the chain), locate the one matching
2729 	 * the requested path component.  The error code left in 'error'
2730 	 * when the search terminates could be 0, ENOENT, or something
2731 	 * else.
2732 	 *
2733 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2734 	 * records for the purposes of the search.
2735 	 */
2736 	error = hammer_ip_first(&cursor);
2737 
2738 	while (error == 0) {
2739 		error = hammer_ip_resolve_data(&cursor);
2740 		if (error)
2741 			break;
2742 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2743 		KKASSERT(nlen > 0);
2744 		if (ncp->nc_nlen == nlen &&
2745 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2746 			break;
2747 		}
2748 		error = hammer_ip_next(&cursor);
2749 	}
2750 
2751 	/*
2752 	 * If all is ok we have to get the inode so we can adjust nlinks.
2753 	 * To avoid a deadlock with the flusher we must release the inode
2754 	 * lock on the directory when acquiring the inode for the entry.
2755 	 *
2756 	 * If the target is a directory, it must be empty.
2757 	 */
2758 	if (error == 0) {
2759 		hammer_unlock(&cursor.ip->lock);
2760 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2761 				      dip->hmp->asof,
2762 				      cursor.data->entry.localization,
2763 				      0, &error);
2764 		hammer_lock_sh(&cursor.ip->lock);
2765 		if (error == ENOENT) {
2766 			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2767 			Debugger("ENOENT unlinking object that should exist");
2768 		}
2769 
2770 		/*
2771 		 * If isdir >= 0 we validate that the entry is or is not a
2772 		 * directory.  If isdir < 0 we don't care.
2773 		 */
2774 		if (error == 0 && isdir >= 0) {
2775 			if (isdir &&
2776 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
2777 				error = ENOTDIR;
2778 			} else if (isdir == 0 &&
2779 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
2780 				error = EISDIR;
2781 			}
2782 		}
2783 
2784 		/*
2785 		 * If we are trying to remove a directory the directory must
2786 		 * be empty.
2787 		 *
2788 		 * WARNING: hammer_ip_check_directory_empty() may have to
2789 		 * terminate the cursor to avoid a deadlock.  It is ok to
2790 		 * call hammer_done_cursor() twice.
2791 		 */
2792 		if (error == 0 && ip->ino_data.obj_type ==
2793 				  HAMMER_OBJTYPE_DIRECTORY) {
2794 			error = hammer_ip_check_directory_empty(trans, ip);
2795 		}
2796 
2797 		/*
2798 		 * Delete the directory entry.
2799 		 *
2800 		 * WARNING: hammer_ip_del_directory() may have to terminate
2801 		 * the cursor to avoid a deadlock.  It is ok to call
2802 		 * hammer_done_cursor() twice.
2803 		 */
2804 		if (error == 0) {
2805 			error = hammer_ip_del_directory(trans, &cursor,
2806 							dip, ip);
2807 		}
2808 		hammer_done_cursor(&cursor);
2809 		if (error == 0) {
2810 			cache_setunresolved(nch);
2811 			cache_setvp(nch, NULL);
2812 			/* XXX locking */
2813 			if (ip->vp) {
2814 				hammer_knote(ip->vp, NOTE_DELETE);
2815 				cache_inval_vp(ip->vp, CINV_DESTROY);
2816 			}
2817 		}
2818 		if (ip)
2819 			hammer_rel_inode(ip, 0);
2820 	} else {
2821 		hammer_done_cursor(&cursor);
2822 	}
2823 	if (error == EDEADLK)
2824 		goto retry;
2825 
2826 	return (error);
2827 }
2828 
2829 /************************************************************************
2830  *			    FIFO AND SPECFS OPS				*
2831  ************************************************************************
2832  *
2833  */
2834 
2835 static int
2836 hammer_vop_fifoclose (struct vop_close_args *ap)
2837 {
2838 	/* XXX update itimes */
2839 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2840 }
2841 
2842 static int
2843 hammer_vop_fiforead (struct vop_read_args *ap)
2844 {
2845 	int error;
2846 
2847 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2848 	/* XXX update access time */
2849 	return (error);
2850 }
2851 
2852 static int
2853 hammer_vop_fifowrite (struct vop_write_args *ap)
2854 {
2855 	int error;
2856 
2857 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2858 	/* XXX update access time */
2859 	return (error);
2860 }
2861 
2862 static
2863 int
2864 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2865 {
2866 	int error;
2867 
2868 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2869 	if (error)
2870 		error = hammer_vop_kqfilter(ap);
2871 	return(error);
2872 }
2873 
2874 static int
2875 hammer_vop_specclose (struct vop_close_args *ap)
2876 {
2877 	/* XXX update itimes */
2878 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2879 }
2880 
2881 static int
2882 hammer_vop_specread (struct vop_read_args *ap)
2883 {
2884 	/* XXX update access time */
2885 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2886 }
2887 
2888 static int
2889 hammer_vop_specwrite (struct vop_write_args *ap)
2890 {
2891 	/* XXX update last change time */
2892 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2893 }
2894 
2895 /************************************************************************
2896  *			    KQFILTER OPS				*
2897  ************************************************************************
2898  *
2899  */
2900 static void filt_hammerdetach(struct knote *kn);
2901 static int filt_hammerread(struct knote *kn, long hint);
2902 static int filt_hammerwrite(struct knote *kn, long hint);
2903 static int filt_hammervnode(struct knote *kn, long hint);
2904 
2905 static struct filterops hammerread_filtops =
2906 	{ 1, NULL, filt_hammerdetach, filt_hammerread };
2907 static struct filterops hammerwrite_filtops =
2908 	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
2909 static struct filterops hammervnode_filtops =
2910 	{ 1, NULL, filt_hammerdetach, filt_hammervnode };
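/*
 * In the traditional filterops layout the leading 1 is f_isfd (the
 * filter attaches via a file descriptor) and the NULL is the unused
 * f_attach hook.
 */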
2911 
2912 static
2913 int
2914 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
2915 {
2916 	struct vnode *vp = ap->a_vp;
2917 	struct knote *kn = ap->a_kn;
2918 	lwkt_tokref ilock;
2919 
2920 	switch (kn->kn_filter) {
2921 	case EVFILT_READ:
2922 		kn->kn_fop = &hammerread_filtops;
2923 		break;
2924 	case EVFILT_WRITE:
2925 		kn->kn_fop = &hammerwrite_filtops;
2926 		break;
2927 	case EVFILT_VNODE:
2928 		kn->kn_fop = &hammervnode_filtops;
2929 		break;
2930 	default:
2931 		return (1);
2932 	}
2933 
2934 	kn->kn_hook = (caddr_t)vp;
2935 
2936 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2937 	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
2938 	lwkt_reltoken(&ilock);
2939 
2940 	return(0);
2941 }
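
/*
 * Illustrative userland usage, not part of the kernel build: a
 * hypothetical monitor consuming the notes posted via hammer_knote().
 * 'fd' is an assumed open descriptor on a HAMMER file.
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	       NOTE_WRITE | NOTE_EXTEND | NOTE_ATTRIB | NOTE_DELETE,
 *	       0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	(register)
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	(wait for a note)
 */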
2942 
2943 static void
2944 filt_hammerdetach(struct knote *kn)
2945 {
2946 	struct vnode *vp = (void *)kn->kn_hook;
2947 	lwkt_tokref ilock;
2948 
2949 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2950 	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
2951 		     kn, knote, kn_selnext);
2952 	lwkt_reltoken(&ilock);
2953 }
2954 
2955 static int
2956 filt_hammerread(struct knote *kn, long hint)
2957 {
2958 	struct vnode *vp = (void *)kn->kn_hook;
2959 	hammer_inode_t ip = VTOI(vp);
2960 
2961 	if (hint == NOTE_REVOKE) {
2962 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2963 		return(1);
2964 	}
2965 	kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
2966 	return (kn->kn_data != 0);
2967 }
2968 
2969 static int
2970 filt_hammerwrite(struct knote *kn, long hint)
2971 {
2972 	if (hint == NOTE_REVOKE)
2973 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2974 	kn->kn_data = 0;
2975 	return (1);
2976 }
2977 
2978 static int
2979 filt_hammervnode(struct knote *kn, long hint)
2980 {
2981 	if (kn->kn_sfflags & hint)
2982 		kn->kn_fflags |= hint;
2983 	if (hint == NOTE_REVOKE) {
2984 		kn->kn_flags |= EV_EOF;
2985 		return (1);
2986 	}
2987 	return (kn->kn_fflags != 0);
2988 }
2989 
2990