/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.84 2008/07/09 10:29:20 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		hammer_vop_pathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
	.vop_default =		spec_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_specread,
	.vop_write =		hammer_vop_specwrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_specclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT)
		hammer_wait_inode(ip);
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;
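	/*
	 * (Illustrative numbers, not from the original source: a 64KB
	 * read serviced by 16KB buffers yields a size-based seqcount
	 * of 4; the hint passed in the high 16 bits of a_ioflag is
	 * used instead only when it is larger.)
	 */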

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
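	/*
	 * (Sketch of the layout this loop assumes: hammer_blocksize()
	 * returns small HAMMER_BUFSIZE buffers for offsets below the
	 * HAMMER_XDEMARC demarcation and large HAMMER_XBUFSIZE buffers
	 * at or beyond it, so a single uio can be serviced with two
	 * different buffer sizes along the way.)
	 */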
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (hammer_debug_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
	}
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_data.atime = trans.time;
		hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int error;
	int n;
	int flags;
	int delta;
	int seqcount;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_offset assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
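	/*
	 * (Worked example of the check above: with uio_offset at
	 * 2^63 - 1 and any positive uio_resid, the signed sum wraps
	 * negative, so the base_offset <= 0 test trips and the write
	 * is rejected with EFBIG instead of wrapping around.)
	 */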

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHECKSPACE_SLOP_WRITE)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lock out other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 */
		/*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
		bwillwrite(blksize);

		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.   Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind, start slowing
			 * down the writers.
			 */
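			/*
			 * (For scale, assuming hz = 100: rsv_recs 10%
			 * over hammer_limit_recs computes delta = 10
			 * ticks, i.e. roughly a 100ms pause per loop
			 * iteration until the backend catches up.)
			 */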
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
		hammer_inode_waitreclaims(ip->hmp);
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}
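	/*
	 * (Example of the rounding above: a 100 byte file reports
	 * va_bytes = 112, since (100 + 15) & ~15 rounds up to the next
	 * 16 byte boundary; larger files round up to a whole small or
	 * large buffer instead.)
	 */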
	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0; 	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			asof = hammer_str_to_tid(ncp->nc_name + i + 2,
						 &ispfs, &localization);
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;
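	/*
	 * (Example, with a hypothetical tid: looking up
	 * "src@@0x00000001061a8ba0" truncates nlen to 3 ("src"),
	 * parses the transaction id after the "@@", and marks the
	 * result read-only because the tid is not HAMMER_MAX_TID.)
	 */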

	/*
	 * If this is a PFS softlink, we dive into the PFS.
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component, the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(ncp->nc_name, nlen);
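	/*
	 * (The low 32 bits of the namekey act as the hash chain: key_beg
	 * uses the base key and key_end ORs in 0xFFFFFFFF below, so the
	 * scan visits exactly the records whose name hashed to the same
	 * upper 32 bits.)
	 */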

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				   dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory, a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}
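	/*
	 * (The /16 divisor above presumably assumes a minimal dirent of
	 * roughly 16 bytes, so the cookie array is sized generously and
	 * then capped at 1024 entries per call.)
	 */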

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
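	/*
	 * (Offsets 0 and 1 were consumed by the synthetic "." and ".."
	 * entries above; from here on uio_offset is simply the B-Tree
	 * directory key at which to resume the scan.)
	 */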
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Special softlink for PFS access, created by hammer pfs-create
	 */
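	/*
	 * (The generated target looks like "@@0x00000001061a8ba0:0x0003",
	 * i.e. the PFS's sync_end_tid as an as-of stamp plus the upper
	 * 16 bits of the localization; the tid shown is illustrative.)
	 */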

	if (ip->obj_id == HAMMER_OBJID_ROOT && ip->obj_localization &&
	    ip->obj_asof == HAMMER_MAX_TID) {
		ksnprintf(buf, sizeof(buf), "@@0x%016llx:0x%04x",
			ip->pfsm->pfsd.sync_end_tid,
			ip->obj_localization >> 16);
		error = uiomove(buf, strlen(buf), ap->a_uio);
		return(error);
	}

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special @@PFSxxxxx softlinks.
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 && strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				ksnprintf(buf, sizeof(buf),
					 "@@0x%016llx:%05d",
					 pfsm->pfsd.sync_end_tid,
					 localization >> 16);
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  The symlink
	 * body is stored as a single FIX record keyed by
	 * HAMMER_FIXKEY_SYMLINK.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	int nlen, error;

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);

failed:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX breaks atomicity: we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
			}
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
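			/*
			 * (Example, assuming a 16KB block size: truncating
			 * to 100000 bytes leaves the block at 98304 live;
			 * offset computes to 1696 and the bzero below
			 * clears bytes 1696-16383 of that buffer.)
			 */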
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime =
			hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime =
			hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

1864 	/*
	 * Add a record representing the symlink.  The link is stored as
	 * pure data, not a string, and is not \0-terminated.  Short
	 * targets are copied inline into the inode's extended data;
	 * longer targets are stored in a separate fixed-key record.
1867 	 */
1868 	if (error == 0) {
1869 		bytes = strlen(ap->a_target);
1870 
1871 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
1872 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1873 		} else {
1874 			record = hammer_alloc_mem_record(nip, bytes);
1875 			record->type = HAMMER_MEM_RECORD_GENERAL;
1876 
1877 			record->leaf.base.localization = nip->obj_localization +
1878 							 HAMMER_LOCALIZE_MISC;
1879 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1880 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1881 			record->leaf.data_len = bytes;
1882 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1883 			bcopy(ap->a_target, record->data->symlink.name, bytes);
1884 			error = hammer_ip_add_record(&trans, record);
1885 		}
1886 
1887 		/*
1888 		 * Set the file size to the length of the link.
1889 		 */
1890 		if (error == 0) {
1891 			nip->ino_data.size = bytes;
1892 			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1893 		}
1894 	}
1895 	if (error == 0)
1896 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1897 						nch->ncp->nc_nlen, nip);
1898 
1899 	/*
1900 	 * Finish up.
1901 	 */
1902 	if (error) {
1903 		hammer_rel_inode(nip, 0);
1904 		*ap->a_vpp = NULL;
1905 	} else {
1906 		error = hammer_get_vnode(nip, ap->a_vpp);
1907 		hammer_rel_inode(nip, 0);
1908 		if (error == 0) {
1909 			cache_setunresolved(ap->a_nch);
1910 			cache_setvp(ap->a_nch, *ap->a_vpp);
1911 		}
1912 	}
1913 	hammer_done_transaction(&trans);
1914 	return (error);
1915 }
1916 
1917 /*
1918  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1919  */
1920 static
1921 int
1922 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1923 {
1924 	struct hammer_transaction trans;
1925 	struct hammer_inode *dip;
1926 	int error;
1927 
1928 	dip = VTOI(ap->a_dvp);
1929 
1930 	if (hammer_nohistory(dip) == 0 &&
1931 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0) {
1932 		return (error);
1933 	}
1934 
1935 	hammer_start_transaction(&trans, dip->hmp);
1936 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1937 				ap->a_cred, ap->a_flags);
1938 	hammer_done_transaction(&trans);
1939 
1940 	return (error);
1941 }
1942 
1943 /*
1944  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1945  */
1946 static
1947 int
1948 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1949 {
1950 	struct hammer_inode *ip = ap->a_vp->v_data;
1951 
1952 	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1953 			    ap->a_fflag, ap->a_cred));
1954 }
1955 
1956 static
1957 int
1958 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1959 {
1960 	struct mount *mp;
1961 	int error;
1962 
1963 	mp = ap->a_head.a_ops->head.vv_mount;
1964 
1965 	switch(ap->a_op) {
1966 	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
1971 		break;
1972 	default:
1973 		error = journal_mountctl(ap);
1974 		break;
1975 	}
1976 	return(error);
1977 }
1978 
1979 /*
1980  * hammer_vop_strategy { vp, bio }
1981  *
1982  * Strategy call, used for regular file read & write only.  Note that the
1983  * bp may represent a cluster.
1984  *
1985  * To simplify operation and allow better optimizations in the future,
 * this code does not make any assumptions with regard to buffer alignment
1987  * or size.
1988  */
1989 static
1990 int
1991 hammer_vop_strategy(struct vop_strategy_args *ap)
1992 {
1993 	struct buf *bp;
1994 	int error;
1995 
1996 	bp = ap->a_bio->bio_buf;
1997 
1998 	switch(bp->b_cmd) {
1999 	case BUF_CMD_READ:
2000 		error = hammer_vop_strategy_read(ap);
2001 		break;
2002 	case BUF_CMD_WRITE:
2003 		error = hammer_vop_strategy_write(ap);
2004 		break;
2005 	default:
2006 		bp->b_error = error = EINVAL;
2007 		bp->b_flags |= B_ERROR;
2008 		biodone(ap->a_bio);
2009 		break;
2010 	}
2011 	return (error);
2012 }
2013 
2014 /*
2015  * Read from a regular file.  Iterate the related records and fill in the
2016  * BIO/BUF.  Gaps are zero-filled.
2017  *
2018  * The support code in hammer_object.c should be used to deal with mixed
2019  * in-memory and on-disk records.
2020  *
2021  * NOTE: Can be called from the cluster code with an oversized buf.
2022  *
2023  * XXX atime update
2024  */
2025 static
2026 int
2027 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2028 {
2029 	struct hammer_transaction trans;
2030 	struct hammer_inode *ip;
2031 	struct hammer_cursor cursor;
2032 	hammer_base_elm_t base;
2033 	hammer_off_t disk_offset;
2034 	struct bio *bio;
2035 	struct bio *nbio;
2036 	struct buf *bp;
2037 	int64_t rec_offset;
2038 	int64_t ran_end;
2039 	int64_t tmp64;
2040 	int error;
2041 	int boff;
2042 	int roff;
2043 	int n;
2044 
2045 	bio = ap->a_bio;
2046 	bp = bio->bio_buf;
2047 	ip = ap->a_vp->v_data;
2048 
2049 	/*
2050 	 * The zone-2 disk offset may have been set by the cluster code via
2051 	 * a BMAP operation, or else should be NOOFFSET.
2052 	 *
2053 	 * Checking the high bits for a match against zone-2 should suffice.
2054 	 */
2055 	nbio = push_bio(bio);
2056 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2057 	    HAMMER_ZONE_RAW_BUFFER) {
2058 		error = hammer_io_direct_read(ip->hmp, nbio);
2059 		return (error);
2060 	}
2061 
2062 	/*
2063 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2064 	 * aligned we may still be able to issue a direct-read.
2065 	 */
2066 	hammer_simple_transaction(&trans, ip->hmp);
2067 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2068 
2069 	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
2071 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2072 	 * first record containing bio_offset will have a key > bio_offset.
2073 	 */
2074 	cursor.key_beg.localization = ip->obj_localization +
2075 				      HAMMER_LOCALIZE_MISC;
2076 	cursor.key_beg.obj_id = ip->obj_id;
2077 	cursor.key_beg.create_tid = 0;
2078 	cursor.key_beg.delete_tid = 0;
2079 	cursor.key_beg.obj_type = 0;
2080 	cursor.key_beg.key = bio->bio_offset + 1;
2081 	cursor.asof = ip->obj_asof;
2082 	cursor.flags |= HAMMER_CURSOR_ASOF;
2083 
2084 	cursor.key_end = cursor.key_beg;
2085 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2086 #if 0
2087 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2088 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2089 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2090 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2091 	} else
2092 #endif
2093 	{
2094 		ran_end = bio->bio_offset + bp->b_bufsize;
2095 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2096 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* overflow check; tmp64 works around a GCC-4 bug */
2098 		if (tmp64 < ran_end)
2099 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2100 		else
2101 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2102 	}
2103 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2104 
2105 	error = hammer_ip_first(&cursor);
2106 	boff = 0;
2107 
2108 	while (error == 0) {
2109 		/*
2110 		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
2112 		 */
2113 		base = &cursor.leaf->base;
2114 		rec_offset = base->key - cursor.leaf->data_len;
2115 
2116 		/*
2117 		 * Calculate the gap, if any, and zero-fill it.
2118 		 *
		 * n is the offset of the start of the record versus our
2120 		 * current seek offset in the bio.
2121 		 */
2122 		n = (int)(rec_offset - (bio->bio_offset + boff));
2123 		if (n > 0) {
2124 			if (n > bp->b_bufsize - boff)
2125 				n = bp->b_bufsize - boff;
2126 			bzero((char *)bp->b_data + boff, n);
2127 			boff += n;
2128 			n = 0;
2129 		}
2130 
2131 		/*
2132 		 * Calculate the data offset in the record and the number
2133 		 * of bytes we can copy.
2134 		 *
2135 		 * There are two degenerate cases.  First, boff may already
2136 		 * be at bp->b_bufsize.  Secondly, the data offset within
2137 		 * the record may exceed the record's size.
2138 		 */
2139 		roff = -n;
2140 		rec_offset += roff;
2141 		n = cursor.leaf->data_len - roff;
2142 		if (n <= 0) {
2143 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2144 			n = 0;
2145 		} else if (n > bp->b_bufsize - boff) {
2146 			n = bp->b_bufsize - boff;
2147 		}
2148 
2149 		/*
2150 		 * Deal with cached truncations.  This cool bit of code
2151 		 * allows truncate()/ftruncate() to avoid having to sync
2152 		 * the file.
2153 		 *
2154 		 * If the frontend is truncated then all backend records are
2155 		 * subject to the frontend's truncation.
2156 		 *
2157 		 * If the backend is truncated then backend records on-disk
2158 		 * (but not in-memory) are subject to the backend's
2159 		 * truncation.  In-memory records owned by the backend
2160 		 * represent data written after the truncation point on the
2161 		 * backend and must not be truncated.
2162 		 *
2163 		 * Truncate operations deal with frontend buffer cache
2164 		 * buffers and frontend-owned in-memory records synchronously.
2165 		 */
2166 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2167 			if (hammer_cursor_ondisk(&cursor) ||
2168 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2169 				if (ip->trunc_off <= rec_offset)
2170 					n = 0;
2171 				else if (ip->trunc_off < rec_offset + n)
2172 					n = (int)(ip->trunc_off - rec_offset);
2173 			}
2174 		}
2175 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2176 			if (hammer_cursor_ondisk(&cursor)) {
2177 				if (ip->sync_trunc_off <= rec_offset)
2178 					n = 0;
2179 				else if (ip->sync_trunc_off < rec_offset + n)
2180 					n = (int)(ip->sync_trunc_off - rec_offset);
2181 			}
2182 		}
2183 
2184 		/*
2185 		 * Try to issue a direct read into our bio if possible,
2186 		 * otherwise resolve the element data into a hammer_buffer
2187 		 * and copy.
2188 		 *
		 * The buffer on-disk should be zeroed past any real
2190 		 * truncation point, but may not be for any synthesized
2191 		 * truncation point from above.
2192 		 */
2193 		if (boff == 0 && n == bp->b_bufsize &&
2194 		    ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
2195 			disk_offset = hammer_blockmap_lookup(
2196 						trans.hmp,
2197 						cursor.leaf->data_offset + roff,
2198 						&error);
2199 			if (error)
2200 				break;
2201 			nbio->bio_offset = disk_offset;
2202 			error = hammer_io_direct_read(trans.hmp, nbio);
2203 			goto done;
2204 		} else if (n) {
2205 			error = hammer_ip_resolve_data(&cursor);
2206 			if (error == 0) {
2207 				bcopy((char *)cursor.data + roff,
2208 				      (char *)bp->b_data + boff, n);
2209 			}
2210 		}
2211 		if (error)
2212 			break;
2213 
2214 		/*
2215 		 * Iterate until we have filled the request.
2216 		 */
2217 		boff += n;
2218 		if (boff == bp->b_bufsize)
2219 			break;
2220 		error = hammer_ip_next(&cursor);
2221 	}
2222 
2223 	/*
	 * There may have been a gap after the last record.
2225 	 */
2226 	if (error == ENOENT)
2227 		error = 0;
2228 	if (error == 0 && boff != bp->b_bufsize) {
2229 		KKASSERT(boff < bp->b_bufsize);
2230 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2231 		/* boff = bp->b_bufsize; */
2232 	}
2233 	bp->b_resid = 0;
2234 	bp->b_error = error;
2235 	if (error)
2236 		bp->b_flags |= B_ERROR;
2237 	biodone(ap->a_bio);
2238 
2239 done:
2240 	if (cursor.node)
2241 		hammer_cache_node(&ip->cache[1], cursor.node);
2242 	hammer_done_cursor(&cursor);
2243 	hammer_done_transaction(&trans);
2244 	return(error);
2245 }
2246 
2247 /*
2248  * BMAP operation - used to support cluster_read() only.
2249  *
2250  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2251  *
 * This routine may return EOPNOTSUPP if the operation is not supported for
2253  * the specified offset.  The contents of the pointer arguments do not
2254  * need to be initialized in that case.
2255  *
2256  * If a disk address is available and properly aligned return 0 with
2257  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers will assume that
 * *doffsetp is valid whenever 0 is returned, even if *runp does not
 * cover a whole buffer, so return EOPNOTSUPP when the run is too small.
2261  */
2262 static
2263 int
2264 hammer_vop_bmap(struct vop_bmap_args *ap)
2265 {
2266 	struct hammer_transaction trans;
2267 	struct hammer_inode *ip;
2268 	struct hammer_cursor cursor;
2269 	hammer_base_elm_t base;
2270 	int64_t rec_offset;
2271 	int64_t ran_end;
2272 	int64_t tmp64;
2273 	int64_t base_offset;
2274 	int64_t base_disk_offset;
2275 	int64_t last_offset;
2276 	hammer_off_t last_disk_offset;
2277 	hammer_off_t disk_offset;
2278 	int	rec_len;
2279 	int	error;
2280 	int	blksize;
2281 
2282 	ip = ap->a_vp->v_data;
2283 
2284 	/*
2285 	 * We can only BMAP regular files.  We can't BMAP database files,
2286 	 * directories, etc.
2287 	 */
2288 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2289 		return(EOPNOTSUPP);
2290 
2291 	/*
2292 	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing at the moment.
2294 	 */
2295 	if (ap->a_cmd != BUF_CMD_READ)
2296 		return(EOPNOTSUPP);
2297 
2298 	/*
2299 	 * Scan the B-Tree to acquire blockmap addresses, then translate
2300 	 * to raw addresses.
2301 	 */
2302 	hammer_simple_transaction(&trans, ip->hmp);
2303 #if 0
2304 	kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2305 #endif
2306 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2307 
2308 	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing the requested offset will have a key
	 * greater than that offset.
2312 	 */
2313 	cursor.key_beg.localization = ip->obj_localization +
2314 				      HAMMER_LOCALIZE_MISC;
2315 	cursor.key_beg.obj_id = ip->obj_id;
2316 	cursor.key_beg.create_tid = 0;
2317 	cursor.key_beg.delete_tid = 0;
2318 	cursor.key_beg.obj_type = 0;
2319 	if (ap->a_runb)
2320 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2321 	else
2322 		cursor.key_beg.key = ap->a_loffset + 1;
2323 	if (cursor.key_beg.key < 0)
2324 		cursor.key_beg.key = 0;
2325 	cursor.asof = ip->obj_asof;
2326 	cursor.flags |= HAMMER_CURSOR_ASOF;
2327 
2328 	cursor.key_end = cursor.key_beg;
2329 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2330 
2331 	ran_end = ap->a_loffset + MAXPHYS;
2332 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2333 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* overflow check; tmp64 works around a GCC-4 bug */
2335 	if (tmp64 < ran_end)
2336 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2337 	else
2338 		cursor.key_end.key = ran_end + MAXPHYS + 1;
2339 
2340 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2341 
2342 	error = hammer_ip_first(&cursor);
2343 	base_offset = last_offset = 0;
2344 	base_disk_offset = last_disk_offset = 0;
2345 
2346 	while (error == 0) {
2347 		/*
2348 		 * Get the base file offset of the record.  The key for
2349 		 * data records is (base + bytes) rather then (base).
2350 		 *
2351 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2352 		 * The extra bytes should be zero on-disk and the BMAP op
2353 		 * should still be ok.
2354 		 */
2355 		base = &cursor.leaf->base;
2356 		rec_offset = base->key - cursor.leaf->data_len;
2357 		rec_len    = cursor.leaf->data_len;
2358 
2359 		/*
2360 		 * Incorporate any cached truncation.
2361 		 *
2362 		 * NOTE: Modifications to rec_len based on synthesized
2363 		 * truncation points remove the guarantee that any extended
2364 		 * data on disk is zero (since the truncations may not have
2365 		 * taken place on-media yet).
2366 		 */
2367 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2368 			if (hammer_cursor_ondisk(&cursor) ||
2369 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2370 				if (ip->trunc_off <= rec_offset)
2371 					rec_len = 0;
2372 				else if (ip->trunc_off < rec_offset + rec_len)
2373 					rec_len = (int)(ip->trunc_off - rec_offset);
2374 			}
2375 		}
2376 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2377 			if (hammer_cursor_ondisk(&cursor)) {
2378 				if (ip->sync_trunc_off <= rec_offset)
2379 					rec_len = 0;
2380 				else if (ip->sync_trunc_off < rec_offset + rec_len)
2381 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
2382 			}
2383 		}
2384 
2385 		/*
		 * Accumulate information.  If we hit a discontiguous
		 * block, reset base_offset unless we are already beyond
		 * the requested offset, in which case we stop.
2389 		 */
2390 		disk_offset = hammer_blockmap_lookup(trans.hmp,
2391 						     cursor.leaf->data_offset,
2392 						     &error);
2393 		if (error)
2394 			break;
2395 		if (rec_offset != last_offset ||
2396 		    disk_offset != last_disk_offset) {
2397 			if (rec_offset > ap->a_loffset)
2398 				break;
2399 			base_offset = rec_offset;
2400 			base_disk_offset = disk_offset;
2401 		}
2402 		last_offset = rec_offset + rec_len;
2403 		last_disk_offset = disk_offset + rec_len;
2404 
2405 		error = hammer_ip_next(&cursor);
2406 	}
2407 
2408 #if 0
2409 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
2410 		ap->a_loffset, base_offset, last_offset);
2411 	kprintf("BMAP %16s:  %016llx - %016llx\n",
2412 		"", base_disk_offset, last_disk_offset);
2413 #endif
2414 
2415 	if (cursor.node) {
2416 		hammer_cache_node(&ip->cache[1], cursor.node);
2417 #if 0
2418 		kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2419 #endif
2420 	}
2421 	hammer_done_cursor(&cursor);
2422 	hammer_done_transaction(&trans);
2423 
2424 	/*
2425 	 * If we couldn't find any records or the records we did find were
2426 	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole with no on-disk records.
2428 	 */
2429 	if (last_offset == 0 || last_offset < ap->a_loffset)
2430 		return (EOPNOTSUPP);
2431 
2432 	/*
2433 	 * Figure out the block size at the requested offset and adjust
2434 	 * our limits so the cluster_read() does not create inappropriately
2435 	 * sized buffer cache buffers.
2436 	 */
2437 	blksize = hammer_blocksize(ap->a_loffset);
2438 	if (hammer_blocksize(base_offset) != blksize) {
2439 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2440 	}
2441 	if (last_offset != ap->a_loffset &&
2442 	    hammer_blocksize(last_offset - 1) != blksize) {
2443 		last_offset = hammer_blockdemarc(ap->a_loffset,
2444 						 last_offset - 1);
2445 	}
2446 
2447 	/*
2448 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
2450 	 */
2451 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2452 
2453 	/*
2454 	 * If doffsetp is not aligned or the forward run size does
2455 	 * not cover a whole buffer, disallow the direct I/O.
2456 	 */
2457 	if ((disk_offset & HAMMER_BUFMASK) ||
2458 	    (last_offset - ap->a_loffset) < blksize) {
2459 		error = EOPNOTSUPP;
2460 	} else {
2461 		*ap->a_doffsetp = disk_offset;
2462 		if (ap->a_runb) {
2463 			*ap->a_runb = ap->a_loffset - base_offset;
2464 			KKASSERT(*ap->a_runb >= 0);
2465 		}
2466 		if (ap->a_runp) {
2467 			*ap->a_runp = last_offset - ap->a_loffset;
2468 			KKASSERT(*ap->a_runp >= 0);
2469 		}
2470 		error = 0;
2471 	}
2472 	return(error);
2473 }
2474 
2475 /*
 * Write to a regular file.  Because this is a strategy call the OS is
2477  * trying to actually get data onto the media.
2478  */
2479 static
2480 int
2481 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2482 {
2483 	hammer_record_t record;
2484 	hammer_mount_t hmp;
2485 	hammer_inode_t ip;
2486 	struct bio *bio;
2487 	struct buf *bp;
2488 	int blksize;
2489 	int bytes;
2490 	int error;
2491 
2492 	bio = ap->a_bio;
2493 	bp = bio->bio_buf;
2494 	ip = ap->a_vp->v_data;
2495 	hmp = ip->hmp;
2496 
2497 	blksize = hammer_blocksize(bio->bio_offset);
2498 	KKASSERT(bp->b_bufsize == blksize);
2499 
2500 	if (ip->flags & HAMMER_INODE_RO) {
2501 		bp->b_error = EROFS;
2502 		bp->b_flags |= B_ERROR;
2503 		biodone(ap->a_bio);
2504 		return(EROFS);
2505 	}
2506 
2507 	/*
2508 	 * Interlock with inode destruction (no in-kernel or directory
2509 	 * topology visibility).  If we queue new IO while trying to
2510 	 * destroy the inode we can deadlock the vtrunc call in
2511 	 * hammer_inode_unloadable_check().
2512 	 */
2513 	if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2514 		bp->b_resid = 0;
2515 		biodone(ap->a_bio);
2516 		return(0);
2517 	}
2518 
2519 	/*
2520 	 * Reserve space and issue a direct-write from the front-end.
2521 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2522 	 * allocations.
2523 	 *
2524 	 * An in-memory record will be installed to reference the storage
2525 	 * until the flusher can get to it.
2526 	 *
2527 	 * Since we own the high level bio the front-end will not try to
2528 	 * do a direct-read until the write completes.
2529 	 *
	 * NOTE: The only time we do not reserve a full-sized buffer's
2531 	 * worth of data is if the file is small.  We do not try to
2532 	 * allocate a fragment (from the small-data zone) at the end of
2533 	 * an otherwise large file as this can lead to wildly separated
2534 	 * data.
2535 	 */
2536 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2537 	KKASSERT(bio->bio_offset < ip->ino_data.size);
2538 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2539 		bytes = bp->b_bufsize;
2540 	else
2541 		bytes = ((int)ip->ino_data.size + 15) & ~15;
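	/*
	 * e.g. (a sketch) a 100-byte file at offset 0 reserves only
	 * (100 + 15) & ~15 = 112 bytes from the small-data zone rather
	 * than a full buffer.
	 */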
2542 
2543 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2544 				    bytes, &error);
2545 	if (record) {
2546 		hammer_io_direct_write(hmp, &record->leaf, bio);
2547 		hammer_rel_mem_record(record);
2548 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2549 			hammer_flush_inode(ip, 0);
2550 	} else {
2551 		bp->b_bio2.bio_offset = NOOFFSET;
2552 		bp->b_error = error;
2553 		bp->b_flags |= B_ERROR;
2554 		biodone(ap->a_bio);
2555 	}
2556 	return(error);
2557 }
2558 
2559 /*
2560  * dounlink - disconnect a directory entry
2561  *
2562  * XXX whiteout support not really in yet
2563  */
2564 static int
2565 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2566 		struct vnode *dvp, struct ucred *cred, int flags)
2567 {
2568 	struct namecache *ncp;
2569 	hammer_inode_t dip;
2570 	hammer_inode_t ip;
2571 	struct hammer_cursor cursor;
2572 	int64_t namekey;
2573 	int nlen, error;
2574 
2575 	/*
2576 	 * Calculate the namekey and setup the key range for the scan.  This
2577 	 * works kinda like a chained hash table where the lower 32 bits
2578 	 * of the namekey synthesize the chain.
2579 	 *
2580 	 * The key range is inclusive of both key_beg and key_end.
2581 	 */
2582 	dip = VTOI(dvp);
2583 	ncp = nch->ncp;
2584 
2585 	if (dip->flags & HAMMER_INODE_RO)
2586 		return (EROFS);
2587 
2588 	namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2589 retry:
2590 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2591 	cursor.key_beg.localization = dip->obj_localization +
2592 				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
2599 
2600 	cursor.key_end = cursor.key_beg;
2601 	cursor.key_end.key |= 0xFFFFFFFFULL;
2602 	cursor.asof = dip->obj_asof;
2603 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2604 
2605 	/*
2606 	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  On search termination the error
	 * code may be 0, ENOENT, or something else.
2610 	 *
2611 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2612 	 * records for the purposes of the search.
2613 	 */
2614 	error = hammer_ip_first(&cursor);
2615 
2616 	while (error == 0) {
2617 		error = hammer_ip_resolve_data(&cursor);
2618 		if (error)
2619 			break;
2620 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2621 		KKASSERT(nlen > 0);
2622 		if (ncp->nc_nlen == nlen &&
2623 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2624 			break;
2625 		}
2626 		error = hammer_ip_next(&cursor);
2627 	}
2628 
2629 	/*
2630 	 * If all is ok we have to get the inode so we can adjust nlinks.
2631 	 * To avoid a deadlock with the flusher we must release the inode
2632 	 * lock on the directory when acquiring the inode for the entry.
2633 	 *
2634 	 * If the target is a directory, it must be empty.
2635 	 */
2636 	if (error == 0) {
2637 		hammer_unlock(&cursor.ip->lock);
2638 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2639 				      dip->hmp->asof,
2640 				      cursor.data->entry.localization,
2641 				      0, &error);
2642 		hammer_lock_sh(&cursor.ip->lock);
2643 		if (error == ENOENT) {
2644 			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2645 			Debugger("ENOENT unlinking object that should exist");
2646 		}
2647 
2648 		/*
2649 		 * If we are trying to remove a directory the directory must
2650 		 * be empty.
2651 		 *
2652 		 * WARNING: hammer_ip_check_directory_empty() may have to
2653 		 * terminate the cursor to avoid a deadlock.  It is ok to
2654 		 * call hammer_done_cursor() twice.
2655 		 */
2656 		if (error == 0 && ip->ino_data.obj_type ==
2657 				  HAMMER_OBJTYPE_DIRECTORY) {
2658 			error = hammer_ip_check_directory_empty(trans, ip);
2659 		}
2660 
2661 		/*
2662 		 * Delete the directory entry.
2663 		 *
2664 		 * WARNING: hammer_ip_del_directory() may have to terminate
2665 		 * the cursor to avoid a deadlock.  It is ok to call
2666 		 * hammer_done_cursor() twice.
2667 		 */
2668 		if (error == 0) {
2669 			error = hammer_ip_del_directory(trans, &cursor,
2670 							dip, ip);
2671 		}
2672 		hammer_done_cursor(&cursor);
2673 		if (error == 0) {
2674 			cache_setunresolved(nch);
2675 			cache_setvp(nch, NULL);
2676 			/* XXX locking */
2677 			if (ip->vp)
2678 				cache_inval_vp(ip->vp, CINV_DESTROY);
2679 		}
2680 		if (ip)
2681 			hammer_rel_inode(ip, 0);
2682 	} else {
2683 		hammer_done_cursor(&cursor);
2684 	}
2685 	if (error == EDEADLK)
2686 		goto retry;
2687 
2688 	return (error);
2689 }
2690 
2691 /************************************************************************
2692  *			    FIFO AND SPECFS OPS				*
2693  ************************************************************************
2694  *
2695  */
2696 
2697 static int
2698 hammer_vop_fifoclose (struct vop_close_args *ap)
2699 {
2700 	/* XXX update itimes */
2701 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2702 }
2703 
2704 static int
2705 hammer_vop_fiforead (struct vop_read_args *ap)
2706 {
2707 	int error;
2708 
2709 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2710 	/* XXX update access time */
2711 	return (error);
2712 }
2713 
2714 static int
2715 hammer_vop_fifowrite (struct vop_write_args *ap)
2716 {
2717 	int error;
2718 
2719 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2720 	/* XXX update access time */
2721 	return (error);
2722 }
2723 
2724 static int
2725 hammer_vop_specclose (struct vop_close_args *ap)
2726 {
2727 	/* XXX update itimes */
2728 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2729 }
2730 
2731 static int
2732 hammer_vop_specread (struct vop_read_args *ap)
2733 {
2734 	/* XXX update access time */
2735 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2736 }
2737 
2738 static int
2739 hammer_vop_specwrite (struct vop_write_args *ap)
2740 {
2741 	/* XXX update last change time */
2742 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2743 }
2744 
2745