/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.96 2008/08/09 07:04:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		hammer_vop_pathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
	.vop_default =		spec_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_specread,
	.vop_write =		hammer_vop_specwrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_specclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred,
			   int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	++hammer_count_fsyncs;
	vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT)
		hammer_wait_inode(ip);
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;
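	/*
	 * (A large read implies sequential access even if the descriptor's
	 * heuristic, carried in the upper bits of a_ioflag, has not warmed
	 * up yet, so the larger of the two estimates wins.)
	 */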

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;
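		/*
		 * (Worked example: with a 16K (0x4000) block, uio_offset
		 * 0x4800 yields offset 0x0800 and base_offset 0x4000, the
		 * buffer-aligned position handed to bread()/cluster_read()
		 * below.)
		 */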

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_data.atime = trans.time;
		hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int error;
	int n;
	int flags;
	int delta;
	int seqcount;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_offset assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
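	/*
	 * (base_offset here is the exclusive end of the write; if the
	 * write would wrap past 2^63-1 the sum goes non-positive while
	 * uio_resid is still positive, which is what the check catches.)
	 */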

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lock out other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 */
		/*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
		bwillwrite(blksize);

		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind, start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
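		/*
		 * (The backpressure above is three-staged: queue the inode
		 * to the flusher, kick a filesystem-wide sync if records
		 * keep accumulating, and finally tsleep() the writer in
		 * proportion to how far past the limit we are.)
		 */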

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
		}
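		/*
		 * (fixsize notes that the VM object was extended so the
		 * extension can be backed out with vtruncbuf() if the
		 * copy below fails.)
		 */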

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
		hammer_inode_waitreclaims(ip->hmp);
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		vap->va_size = 26;
	}
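	/*
	 * (26 = 2 for "@@" + 18 for "0x%016llx" + 1 for ':' + 5 for the
	 * PFS id, matching the expansion performed by
	 * hammer_vop_readlink().)
	 */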

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}
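	/*
	 * (i.e. va_bytes is the file size rounded up to the large-buffer
	 * size past the demarc, to the small-buffer size for mid-sized
	 * files, and to 16 bytes for very small files, presumably the
	 * granularity of in-band data records.)
	 */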
	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			asof = hammer_str_to_tid(ncp->nc_name + i + 2,
						 &ispfs, &localization);
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;
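	/*
	 * (e.g. "foo@@0x00000001061a8ba0" clips nlen so only "foo" is
	 * matched below, with the lookup performed as-of the given
	 * transaction id.)
	 */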

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(ncp->nc_name, nlen);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
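	/*
	 * (key_beg..key_end spans namekey through namekey | 0xFFFFFFFF,
	 * i.e. every entry whose upper 32 hash bits match; collisions
	 * within the chain are resolved by the bcmp() on the name below.)
	 */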

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		    asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}
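	/*
	 * (The estimate assumes roughly 16 bytes per directory entry and
	 * caps the cookie array at 1024 entries per call.)
	 */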

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}
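	/*
	 * (Note on the @@PFS expansion above: a slave PFS is pinned at its
	 * last synchronized TID (sync_end_tid) so readers see a consistent
	 * snapshot, while a master expands to HAMMER_MAX_TID, the live
	 * filesystem.  The PFS id occupies the upper 16 bits of the
	 * localization field.)
	 */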

	/*
	 * Long version
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	int nlen, error;

	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}
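	/*
	 * (EDEADLK forces a full cursor teardown and a rescan from the
	 * retry label; the entry must be looked up again because the
	 * B-Tree may have shifted while we were blocked.)
	 */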

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);

failed:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
			}
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
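	/*
	 * (The while/break construct above executes at most once; it
	 * exists so the size-change cases can 'break' out of the switch
	 * and then fall out of the loop.)
	 */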
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime =
			hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime =
			hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	return (error);
}
1878 
1879 /*
1880  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1881  */
1882 static
1883 int
1884 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1885 {
1886 	struct hammer_transaction trans;
1887 	struct hammer_inode *dip;
1888 	struct hammer_inode *nip;
1889 	struct nchandle *nch;
1890 	hammer_record_t record;
1891 	int error;
1892 	int bytes;
1893 
1894 	ap->a_vap->va_type = VLNK;
1895 
1896 	nch = ap->a_nch;
1897 	dip = VTOI(ap->a_dvp);
1898 
1899 	if (dip->flags & HAMMER_INODE_RO)
1900 		return (EROFS);
1901 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1902 		return (error);
1903 
1904 	/*
1905 	 * Create a transaction to cover the operations we perform.
1906 	 */
1907 	hammer_start_transaction(&trans, dip->hmp);
1908 	++hammer_stats_file_iopsw;
1909 
1910 	/*
1911 	 * Create a new filesystem object of the requested type.  The
1912 	 * returned inode will be referenced but not locked.
1913 	 */
1914 
1915 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1916 				    dip, NULL, &nip);
1917 	if (error) {
1918 		hammer_done_transaction(&trans);
1919 		*ap->a_vpp = NULL;
1920 		return (error);
1921 	}
1922 
1923 	/*
1924 	 * Add a record representing the symlink.  The symlink is stored
1925 	 * as pure data, not a string, and is not \0-terminated.
1926 	 */
1927 	if (error == 0) {
1928 		bytes = strlen(ap->a_target);
1929 
1930 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
1931 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1932 		} else {
1933 			record = hammer_alloc_mem_record(nip, bytes);
1934 			record->type = HAMMER_MEM_RECORD_GENERAL;
1935 
1936 			record->leaf.base.localization = nip->obj_localization +
1937 							 HAMMER_LOCALIZE_MISC;
1938 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1939 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1940 			record->leaf.data_len = bytes;
1941 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1942 			bcopy(ap->a_target, record->data->symlink.name, bytes);
1943 			error = hammer_ip_add_record(&trans, record);
1944 		}
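		/*
		 * For illustration (target strings are hypothetical): a
		 * short target such as "/tmp" fits within
		 * HAMMER_INODE_BASESYMLEN and is simply bcopy'd into the
		 * inode's inline symlink area, while a longer target
		 * becomes a HAMMER_RECTYPE_FIX record keyed at
		 * HAMMER_FIXKEY_SYMLINK holding the raw, unterminated
		 * name bytes.
		 */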
1945 
1946 		/*
1947 		 * Set the file size to the length of the link.
1948 		 */
1949 		if (error == 0) {
1950 			nip->ino_data.size = bytes;
1951 			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1952 		}
1953 	}
1954 	if (error == 0)
1955 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1956 						nch->ncp->nc_nlen, nip);
1957 
1958 	/*
1959 	 * Finish up.
1960 	 */
1961 	if (error) {
1962 		hammer_rel_inode(nip, 0);
1963 		*ap->a_vpp = NULL;
1964 	} else {
1965 		error = hammer_get_vnode(nip, ap->a_vpp);
1966 		hammer_rel_inode(nip, 0);
1967 		if (error == 0) {
1968 			cache_setunresolved(ap->a_nch);
1969 			cache_setvp(ap->a_nch, *ap->a_vpp);
1970 		}
1971 	}
1972 	hammer_done_transaction(&trans);
1973 	return (error);
1974 }
1975 
1976 /*
1977  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1978  */
1979 static
1980 int
1981 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1982 {
1983 	struct hammer_transaction trans;
1984 	struct hammer_inode *dip;
1985 	int error;
1986 
1987 	dip = VTOI(ap->a_dvp);
1988 
1989 	if (hammer_nohistory(dip) == 0 &&
1990 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
1991 		return (error);
1992 	}
1993 
1994 	hammer_start_transaction(&trans, dip->hmp);
1995 	++hammer_stats_file_iopsw;
1996 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1997 				ap->a_cred, ap->a_flags, -1);
1998 	hammer_done_transaction(&trans);
1999 
2000 	return (error);
2001 }
2002 
2003 /*
2004  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2005  */
2006 static
2007 int
2008 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2009 {
2010 	struct hammer_inode *ip = ap->a_vp->v_data;
2011 
2012 	++hammer_stats_file_iopsr;
2013 	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2014 			    ap->a_fflag, ap->a_cred));
2015 }
2016 
2017 static
2018 int
2019 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2020 {
2021 	struct mount *mp;
2022 	int error;
2023 
2024 	mp = ap->a_head.a_ops->head.vv_mount;
2025 
2026 	switch(ap->a_op) {
2027 	case MOUNTCTL_SET_EXPORT:
2028 		if (ap->a_ctllen != sizeof(struct export_args))
2029 			error = EINVAL;
		else
2030 			error = hammer_vfs_export(mp, ap->a_op,
2031 					      (const struct export_args *)ap->a_ctl);
2032 		break;
2033 	default:
2034 		error = journal_mountctl(ap);
2035 		break;
2036 	}
2037 	return(error);
2038 }
2039 
2040 /*
2041  * hammer_vop_strategy { vp, bio }
2042  *
2043  * Strategy call, used for regular file read & write only.  Note that the
2044  * bp may represent a cluster.
2045  *
2046  * To simplify operation and allow better optimizations in the future,
2047  * this code does not make any assumptions with regard to buffer alignment
2048  * or size.
2049  */
2050 static
2051 int
2052 hammer_vop_strategy(struct vop_strategy_args *ap)
2053 {
2054 	struct buf *bp;
2055 	int error;
2056 
2057 	bp = ap->a_bio->bio_buf;
2058 
2059 	switch(bp->b_cmd) {
2060 	case BUF_CMD_READ:
2061 		error = hammer_vop_strategy_read(ap);
2062 		break;
2063 	case BUF_CMD_WRITE:
2064 		error = hammer_vop_strategy_write(ap);
2065 		break;
2066 	default:
2067 		bp->b_error = error = EINVAL;
2068 		bp->b_flags |= B_ERROR;
2069 		biodone(ap->a_bio);
2070 		break;
2071 	}
2072 	return (error);
2073 }
2074 
2075 /*
2076  * Read from a regular file.  Iterate the related records and fill in the
2077  * BIO/BUF.  Gaps are zero-filled.
2078  *
2079  * The support code in hammer_object.c should be used to deal with mixed
2080  * in-memory and on-disk records.
2081  *
2082  * NOTE: Can be called from the cluster code with an oversized buf.
2083  *
2084  * XXX atime update
2085  */
2086 static
2087 int
2088 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2089 {
2090 	struct hammer_transaction trans;
2091 	struct hammer_inode *ip;
2092 	struct hammer_cursor cursor;
2093 	hammer_base_elm_t base;
2094 	hammer_off_t disk_offset;
2095 	struct bio *bio;
2096 	struct bio *nbio;
2097 	struct buf *bp;
2098 	int64_t rec_offset;
2099 	int64_t ran_end;
2100 	int64_t tmp64;
2101 	int error;
2102 	int boff;
2103 	int roff;
2104 	int n;
2105 
2106 	bio = ap->a_bio;
2107 	bp = bio->bio_buf;
2108 	ip = ap->a_vp->v_data;
2109 
2110 	/*
2111 	 * The zone-2 disk offset may have been set by the cluster code via
2112 	 * a BMAP operation, or else should be NOOFFSET.
2113 	 *
2114 	 * Checking the high bits for a match against zone-2 should suffice.
2115 	 */
2116 	nbio = push_bio(bio);
2117 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2118 	    HAMMER_ZONE_LARGE_DATA) {
2119 		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2120 		return (error);
2121 	}
2122 
2123 	/*
2124 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2125 	 * aligned we may still be able to issue a direct-read.
2126 	 */
2127 	hammer_simple_transaction(&trans, ip->hmp);
2128 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2129 
2130 	/*
2131 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2132 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2133 	 * first record containing bio_offset will have a key > bio_offset.
2134 	 */
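	/*
	 * Example with illustrative numbers: a 16KB data record covering
	 * file offsets [32768, 49152) is keyed at 49152 (BASE+LEN), so
	 * for bio_offset = 32768 the scan starts at key 32769 and that
	 * record is the first one returned.
	 */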
2135 	cursor.key_beg.localization = ip->obj_localization +
2136 				      HAMMER_LOCALIZE_MISC;
2137 	cursor.key_beg.obj_id = ip->obj_id;
2138 	cursor.key_beg.create_tid = 0;
2139 	cursor.key_beg.delete_tid = 0;
2140 	cursor.key_beg.obj_type = 0;
2141 	cursor.key_beg.key = bio->bio_offset + 1;
2142 	cursor.asof = ip->obj_asof;
2143 	cursor.flags |= HAMMER_CURSOR_ASOF;
2144 
2145 	cursor.key_end = cursor.key_beg;
2146 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2147 #if 0
2148 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2149 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2150 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2151 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2152 	} else
2153 #endif
2154 	{
2155 		ran_end = bio->bio_offset + bp->b_bufsize;
2156 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2157 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2158 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2159 		if (tmp64 < ran_end)
2160 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2161 		else
2162 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2163 	}
2164 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2165 
2166 	error = hammer_ip_first(&cursor);
2167 	boff = 0;
2168 
2169 	while (error == 0) {
2170 		/*
2171 		 * Get the base file offset of the record.  The key for
2172 		 * data records is (base + bytes) rather than (base).
2173 		 */
2174 		base = &cursor.leaf->base;
2175 		rec_offset = base->key - cursor.leaf->data_len;
2176 
2177 		/*
2178 		 * Calculate the gap, if any, and zero-fill it.
2179 		 *
2180 		 * n is the offset of the start of the record versus our
2181 		 * current seek offset in the bio.
2182 		 */
2183 		n = (int)(rec_offset - (bio->bio_offset + boff));
2184 		if (n > 0) {
2185 			if (n > bp->b_bufsize - boff)
2186 				n = bp->b_bufsize - boff;
2187 			bzero((char *)bp->b_data + boff, n);
2188 			boff += n;
2189 			n = 0;
2190 		}
2191 
2192 		/*
2193 		 * Calculate the data offset in the record and the number
2194 		 * of bytes we can copy.
2195 		 *
2196 		 * There are two degenerate cases.  First, boff may already
2197 		 * be at bp->b_bufsize.  Second, the data offset within
2198 		 * the record may exceed the record's size.
2199 		 */
2200 		roff = -n;
2201 		rec_offset += roff;
2202 		n = cursor.leaf->data_len - roff;
2203 		if (n <= 0) {
2204 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2205 			n = 0;
2206 		} else if (n > bp->b_bufsize - boff) {
2207 			n = bp->b_bufsize - boff;
2208 		}
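		/*
		 * Worked example (illustrative numbers): if the seek
		 * point bio_offset + boff is 40960 and a 16KB record
		 * covers [32768, 49152), then n above starts at -8192,
		 * roff becomes 8192, and n is recomputed to 8192 bytes
		 * copyable from that point in the record, clipped to
		 * the space remaining in the buffer.
		 */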
2209 
2210 		/*
2211 		 * Deal with cached truncations.  This cool bit of code
2212 		 * allows truncate()/ftruncate() to avoid having to sync
2213 		 * the file.
2214 		 *
2215 		 * If the frontend is truncated then all backend records are
2216 		 * subject to the frontend's truncation.
2217 		 *
2218 		 * If the backend is truncated then backend records on-disk
2219 		 * (but not in-memory) are subject to the backend's
2220 		 * truncation.  In-memory records owned by the backend
2221 		 * represent data written after the truncation point on the
2222 		 * backend and must not be truncated.
2223 		 *
2224 		 * Truncate operations deal with frontend buffer cache
2225 		 * buffers and frontend-owned in-memory records synchronously.
2226 		 */
2227 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2228 			if (hammer_cursor_ondisk(&cursor) ||
2229 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2230 				if (ip->trunc_off <= rec_offset)
2231 					n = 0;
2232 				else if (ip->trunc_off < rec_offset + n)
2233 					n = (int)(ip->trunc_off - rec_offset);
2234 			}
2235 		}
2236 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2237 			if (hammer_cursor_ondisk(&cursor)) {
2238 				if (ip->sync_trunc_off <= rec_offset)
2239 					n = 0;
2240 				else if (ip->sync_trunc_off < rec_offset + n)
2241 					n = (int)(ip->sync_trunc_off - rec_offset);
2242 			}
2243 		}
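		/*
		 * Clipping example (illustrative numbers): with a cached
		 * trunc_off of 36864, a record at rec_offset 32768 with
		 * n = 8192 is clipped to n = 4096, and a record entirely
		 * past the truncation point yields n = 0 so its range is
		 * left to the final zero-fill below.
		 */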
2244 
2245 		/*
2246 		 * Try to issue a direct read into our bio if possible,
2247 		 * otherwise resolve the element data into a hammer_buffer
2248 		 * and copy.
2249 		 *
2250 		 * The buffer on-disk should be zeroed past any real
2251 		 * truncation point, but may not be for any synthesized
2252 		 * truncation point from above.
2253 		 */
2254 		disk_offset = cursor.leaf->data_offset + roff;
2255 		if (boff == 0 && n == bp->b_bufsize &&
2256 		    hammer_cursor_ondisk(&cursor) &&
2257 		    (disk_offset & HAMMER_BUFMASK) == 0) {
2258 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2259 				 HAMMER_ZONE_LARGE_DATA);
2260 			nbio->bio_offset = disk_offset;
2261 			error = hammer_io_direct_read(trans.hmp, nbio,
2262 						      cursor.leaf);
2263 			goto done;
2264 		} else if (n) {
2265 			error = hammer_ip_resolve_data(&cursor);
2266 			if (error == 0) {
2267 				bcopy((char *)cursor.data + roff,
2268 				      (char *)bp->b_data + boff, n);
2269 			}
2270 		}
2271 		if (error)
2272 			break;
2273 
2274 		/*
2275 		 * Iterate until we have filled the request.
2276 		 */
2277 		boff += n;
2278 		if (boff == bp->b_bufsize)
2279 			break;
2280 		error = hammer_ip_next(&cursor);
2281 	}
2282 
2283 	/*
2284 	 * There may have been a gap after the last record
2285 	 * There may have been a gap after the last record.
2286 	if (error == ENOENT)
2287 		error = 0;
2288 	if (error == 0 && boff != bp->b_bufsize) {
2289 		KKASSERT(boff < bp->b_bufsize);
2290 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2291 		/* boff = bp->b_bufsize; */
2292 	}
2293 	bp->b_resid = 0;
2294 	bp->b_error = error;
2295 	if (error)
2296 		bp->b_flags |= B_ERROR;
2297 	biodone(ap->a_bio);
2298 
2299 done:
2300 	if (cursor.node)
2301 		hammer_cache_node(&ip->cache[1], cursor.node);
2302 	hammer_done_cursor(&cursor);
2303 	hammer_done_transaction(&trans);
2304 	return(error);
2305 }
2306 
2307 /*
2308  * BMAP operation - used to support cluster_read() only.
2309  *
2310  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2311  *
2312  * This routine may return EOPNOTSUPP if the operation is not supported for
2313  * the specified offset.  The contents of the pointer arguments do not
2314  * need to be initialized in that case.
2315  *
2316  * If a disk address is available and properly aligned return 0 with
2317  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2318  * to the run-length relative to that offset.  Because callers may assume
2319  * that *doffsetp is valid whenever 0 is returned, we instead return
2320  * EOPNOTSUPP when the forward run is not sufficiently large.
2321  */
2322 static
2323 int
2324 hammer_vop_bmap(struct vop_bmap_args *ap)
2325 {
2326 	struct hammer_transaction trans;
2327 	struct hammer_inode *ip;
2328 	struct hammer_cursor cursor;
2329 	hammer_base_elm_t base;
2330 	int64_t rec_offset;
2331 	int64_t ran_end;
2332 	int64_t tmp64;
2333 	int64_t base_offset;
2334 	int64_t base_disk_offset;
2335 	int64_t last_offset;
2336 	hammer_off_t last_disk_offset;
2337 	hammer_off_t disk_offset;
2338 	int	rec_len;
2339 	int	error;
2340 	int	blksize;
2341 
2342 	++hammer_stats_file_iopsr;
2343 	ip = ap->a_vp->v_data;
2344 
2345 	/*
2346 	 * We can only BMAP regular files.  We can't BMAP database files,
2347 	 * directories, etc.
2348 	 */
2349 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2350 		return(EOPNOTSUPP);
2351 
2352 	/*
2353 	 * bmap is typically called with runp/runb both NULL when used
2354 	 * for writing.  We do not support BMAP for writing at the moment.
2355 	 */
2356 	if (ap->a_cmd != BUF_CMD_READ)
2357 		return(EOPNOTSUPP);
2358 
2359 	/*
2360 	 * Scan the B-Tree to acquire blockmap addresses, then translate
2361 	 * to raw addresses.
2362 	 */
2363 	hammer_simple_transaction(&trans, ip->hmp);
2364 #if 0
2365 	kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2366 #endif
2367 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2368 
2369 	/*
2370 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2371 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2372 	 * first record containing bio_offset will have a key > bio_offset.
2373 	 */
2374 	cursor.key_beg.localization = ip->obj_localization +
2375 				      HAMMER_LOCALIZE_MISC;
2376 	cursor.key_beg.obj_id = ip->obj_id;
2377 	cursor.key_beg.create_tid = 0;
2378 	cursor.key_beg.delete_tid = 0;
2379 	cursor.key_beg.obj_type = 0;
2380 	if (ap->a_runb)
2381 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2382 	else
2383 		cursor.key_beg.key = ap->a_loffset + 1;
2384 	if (cursor.key_beg.key < 0)
2385 		cursor.key_beg.key = 0;
2386 	cursor.asof = ip->obj_asof;
2387 	cursor.flags |= HAMMER_CURSOR_ASOF;
2388 
2389 	cursor.key_end = cursor.key_beg;
2390 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2391 
2392 	ran_end = ap->a_loffset + MAXPHYS;
2393 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2394 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2395 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2396 	if (tmp64 < ran_end)
2397 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2398 	else
2399 		cursor.key_end.key = ran_end + MAXPHYS + 1;
2400 
2401 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2402 
2403 	error = hammer_ip_first(&cursor);
2404 	base_offset = last_offset = 0;
2405 	base_disk_offset = last_disk_offset = 0;
2406 
2407 	while (error == 0) {
2408 		/*
2409 		 * Get the base file offset of the record.  The key for
2410 		 * data records is (base + bytes) rather than (base).
2411 		 *
2412 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2413 		 * The extra bytes should be zero on-disk and the BMAP op
2414 		 * should still be ok.
2415 		 */
2416 		base = &cursor.leaf->base;
2417 		rec_offset = base->key - cursor.leaf->data_len;
2418 		rec_len    = cursor.leaf->data_len;
2419 
2420 		/*
2421 		 * Incorporate any cached truncation.
2422 		 *
2423 		 * NOTE: Modifications to rec_len based on synthesized
2424 		 * truncation points remove the guarantee that any extended
2425 		 * data on disk is zero (since the truncations may not have
2426 		 * taken place on-media yet).
2427 		 */
2428 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2429 			if (hammer_cursor_ondisk(&cursor) ||
2430 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2431 				if (ip->trunc_off <= rec_offset)
2432 					rec_len = 0;
2433 				else if (ip->trunc_off < rec_offset + rec_len)
2434 					rec_len = (int)(ip->trunc_off - rec_offset);
2435 			}
2436 		}
2437 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2438 			if (hammer_cursor_ondisk(&cursor)) {
2439 				if (ip->sync_trunc_off <= rec_offset)
2440 					rec_len = 0;
2441 				else if (ip->sync_trunc_off < rec_offset + rec_len)
2442 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
2443 			}
2444 		}
2445 
2446 		/*
2447 		 * Accumulate information.  If we have hit a discontiguous
2448 		 * block reset base_offset unless we are already beyond the
2449 		 * requested offset.  If we are, that's it, we stop.
2450 		 */
2451 		if (error)
2452 			break;
2453 		if (hammer_cursor_ondisk(&cursor)) {
2454 			disk_offset = cursor.leaf->data_offset;
2455 			if (rec_offset != last_offset ||
2456 			    disk_offset != last_disk_offset) {
2457 				if (rec_offset > ap->a_loffset)
2458 					break;
2459 				base_offset = rec_offset;
2460 				base_disk_offset = disk_offset;
2461 			}
2462 			last_offset = rec_offset + rec_len;
2463 			last_disk_offset = disk_offset + rec_len;
2464 		}
2465 		error = hammer_ip_next(&cursor);
2466 	}
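	/*
	 * Illustrative example: two 16KB records covering file offsets
	 * [0, 16384) and [16384, 32768) whose zone-2 data offsets are
	 * also adjacent form a single run; a discontinuity in either
	 * the file offset or the disk offset restarts the run at the
	 * new record, or ends the scan if we are already past
	 * ap->a_loffset.
	 */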
2467 
2468 #if 0
2469 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
2470 		ap->a_loffset, base_offset, last_offset);
2471 	kprintf("BMAP %16s:  %016llx - %016llx\n",
2472 		"", base_disk_offset, last_disk_offset);
2473 #endif
2474 
2475 	if (cursor.node) {
2476 		hammer_cache_node(&ip->cache[1], cursor.node);
2477 #if 0
2478 		kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2479 #endif
2480 	}
2481 	hammer_done_cursor(&cursor);
2482 	hammer_done_transaction(&trans);
2483 
2484 	/*
2485 	 * If we couldn't find any records or the records we did find were
2486 	 * all behind the requested offset, return failure.  A forward
2487 	 * truncation can leave a hole with no on-disk records.
2488 	 */
2489 	if (last_offset == 0 || last_offset < ap->a_loffset)
2490 		return (EOPNOTSUPP);
2491 
2492 	/*
2493 	 * Figure out the block size at the requested offset and adjust
2494 	 * our limits so the cluster_read() does not create inappropriately
2495 	 * sized buffer cache buffers.
2496 	 */
2497 	blksize = hammer_blocksize(ap->a_loffset);
2498 	if (hammer_blocksize(base_offset) != blksize) {
2499 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2500 	}
2501 	if (last_offset != ap->a_loffset &&
2502 	    hammer_blocksize(last_offset - 1) != blksize) {
2503 		last_offset = hammer_blockdemarc(ap->a_loffset,
2504 						 last_offset - 1);
2505 	}
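	/*
	 * Hedged example, assuming HAMMER's two buffer sizes: if
	 * a_loffset sits in the small-block region of the file but the
	 * run crosses the demarcation into large blocks,
	 * hammer_blockdemarc() trims the run so cluster_read() never
	 * mixes buffer sizes within a single run.
	 */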
2506 
2507 	/*
2508 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2509 	 * from occuring.
2510 	 * from occurring.
2511 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2512 
2513 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2514 		/*
2515 		 * Only large-data zones can be direct-IOd
2516 		 */
2517 		error = EOPNOTSUPP;
2518 	} else if ((disk_offset & HAMMER_BUFMASK) ||
2519 		   (last_offset - ap->a_loffset) < blksize) {
2520 		/*
2521 		 * *doffsetp is not aligned or the forward run size does
2522 		 * not cover a whole buffer, disallow the direct I/O.
2523 		 */
2524 		error = EOPNOTSUPP;
2525 	} else {
2526 		/*
2527 		 * We're good.
2528 		 */
2529 		*ap->a_doffsetp = disk_offset;
2530 		if (ap->a_runb) {
2531 			*ap->a_runb = ap->a_loffset - base_offset;
2532 			KKASSERT(*ap->a_runb >= 0);
2533 		}
2534 		if (ap->a_runp) {
2535 			*ap->a_runp = last_offset - ap->a_loffset;
2536 			KKASSERT(*ap->a_runp >= 0);
2537 		}
2538 		error = 0;
2539 	}
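	/*
	 * Illustrative success case: for a_loffset = 16384 inside a
	 * contiguous run spanning [0, 65536), *ap->a_doffsetp is the
	 * zone-2 address of that offset, *ap->a_runb = 16384 (bytes
	 * available behind) and *ap->a_runp = 49152 (bytes available
	 * ahead).
	 */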
2540 	return(error);
2541 }
2542 
2543 /*
2544  * Write to a regular file.   Because this is a strategy call the OS is
2545  * trying to actually get data onto the media.
2546  */
2547 static
2548 int
2549 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2550 {
2551 	hammer_record_t record;
2552 	hammer_mount_t hmp;
2553 	hammer_inode_t ip;
2554 	struct bio *bio;
2555 	struct buf *bp;
2556 	int blksize;
2557 	int bytes;
2558 	int error;
2559 
2560 	bio = ap->a_bio;
2561 	bp = bio->bio_buf;
2562 	ip = ap->a_vp->v_data;
2563 	hmp = ip->hmp;
2564 
2565 	blksize = hammer_blocksize(bio->bio_offset);
2566 	KKASSERT(bp->b_bufsize == blksize);
2567 
2568 	if (ip->flags & HAMMER_INODE_RO) {
2569 		bp->b_error = EROFS;
2570 		bp->b_flags |= B_ERROR;
2571 		biodone(ap->a_bio);
2572 		return(EROFS);
2573 	}
2574 
2575 	/*
2576 	 * Interlock with inode destruction (no in-kernel or directory
2577 	 * topology visibility).  If we queue new IO while trying to
2578 	 * destroy the inode we can deadlock the vtrunc call in
2579 	 * hammer_inode_unloadable_check().
2580 	 */
2581 	if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2582 		bp->b_resid = 0;
2583 		biodone(ap->a_bio);
2584 		return(0);
2585 	}
2586 
2587 	/*
2588 	 * Reserve space and issue a direct-write from the front-end.
2589 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2590 	 * allocations.
2591 	 *
2592 	 * An in-memory record will be installed to reference the storage
2593 	 * until the flusher can get to it.
2594 	 *
2595 	 * Since we own the high level bio the front-end will not try to
2596 	 * do a direct-read until the write completes.
2597 	 *
2598 	 * NOTE: The only time we do not reserve a full-sized buffer's
2599 	 * worth of data is if the file is small.  We do not try to
2600 	 * allocate a fragment (from the small-data zone) at the end of
2601 	 * an otherwise large file as this can lead to wildly separated
2602 	 * data.
2603 	 */
2604 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2605 	KKASSERT(bio->bio_offset < ip->ino_data.size);
2606 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2607 		bytes = bp->b_bufsize;
2608 	else
2609 		bytes = ((int)ip->ino_data.size + 15) & ~15;
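	/*
	 * Worked example of the rounding above (sizes are illustrative):
	 * a 100 byte file written at offset 0 reserves 112 bytes
	 * (rounded up to a 16 byte boundary) rather than a full-sized
	 * buffer.
	 */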
2610 
2611 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2612 				    bytes, &error);
2613 	if (record) {
2614 		hammer_io_direct_write(hmp, record, bio);
2615 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2616 			hammer_flush_inode(ip, 0);
2617 	} else {
2618 		bp->b_bio2.bio_offset = NOOFFSET;
2619 		bp->b_error = error;
2620 		bp->b_flags |= B_ERROR;
2621 		biodone(ap->a_bio);
2622 	}
2623 	return(error);
2624 }
2625 
2626 /*
2627  * dounlink - disconnect a directory entry
2628  *
2629  * XXX whiteout support not really in yet
2630  */
2631 static int
2632 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2633 		struct vnode *dvp, struct ucred *cred,
2634 		int flags, int isdir)
2635 {
2636 	struct namecache *ncp;
2637 	hammer_inode_t dip;
2638 	hammer_inode_t ip;
2639 	struct hammer_cursor cursor;
2640 	int64_t namekey;
2641 	int nlen, error;
2642 
2643 	/*
2644 	 * Calculate the namekey and setup the key range for the scan.  This
2645 	 * works kinda like a chained hash table where the lower 32 bits
2646 	 * of the namekey synthesize the chain.
2647 	 *
2648 	 * The key range is inclusive of both key_beg and key_end.
2649 	 */
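	/*
	 * For example (hash value is illustrative): a name hashing to
	 * 0x1234ABCD00000000 produces the inclusive scan range
	 * 0x1234ABCD00000000..0x1234ABCDFFFFFFFF; entries colliding in
	 * the same chain are disambiguated by the bcmp() of the actual
	 * name below.
	 */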
2650 	dip = VTOI(dvp);
2651 	ncp = nch->ncp;
2652 
2653 	if (dip->flags & HAMMER_INODE_RO)
2654 		return (EROFS);
2655 
2656 	namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2657 retry:
2658 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2659 	cursor.key_beg.localization = dip->obj_localization +
2660 				      HAMMER_LOCALIZE_MISC;
2661 	cursor.key_beg.obj_id = dip->obj_id;
2662 	cursor.key_beg.key = namekey;
2663 	cursor.key_beg.create_tid = 0;
2664 	cursor.key_beg.delete_tid = 0;
2665 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2666 	cursor.key_beg.obj_type = 0;
2667 
2668 	cursor.key_end = cursor.key_beg;
2669 	cursor.key_end.key |= 0xFFFFFFFFULL;
2670 	cursor.asof = dip->obj_asof;
2671 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2672 
2673 	/*
2674 	 * Scan all matching records (the chain), locate the one matching
2675 	 * the requested path component.  The error code on search
2676 	 * termination is left in error and could be 0, ENOENT, or
2677 	 * something else.
2678 	 *
2679 	 * The hammer_ip_*() functions merge in-memory records with on-disk
2680 	 * records for the purposes of the search.
2681 	 */
2682 	error = hammer_ip_first(&cursor);
2683 
2684 	while (error == 0) {
2685 		error = hammer_ip_resolve_data(&cursor);
2686 		if (error)
2687 			break;
2688 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2689 		KKASSERT(nlen > 0);
2690 		if (ncp->nc_nlen == nlen &&
2691 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2692 			break;
2693 		}
2694 		error = hammer_ip_next(&cursor);
2695 	}
2696 
2697 	/*
2698 	 * If all is ok we have to get the inode so we can adjust nlinks.
2699 	 * To avoid a deadlock with the flusher we must release the inode
2700 	 * lock on the directory when acquiring the inode for the entry.
2701 	 *
2702 	 * If the target is a directory, it must be empty.
2703 	 */
2704 	if (error == 0) {
2705 		hammer_unlock(&cursor.ip->lock);
2706 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2707 				      dip->hmp->asof,
2708 				      cursor.data->entry.localization,
2709 				      0, &error);
2710 		hammer_lock_sh(&cursor.ip->lock);
2711 		if (error == ENOENT) {
2712 			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2713 			Debugger("ENOENT unlinking object that should exist");
2714 		}
2715 
2716 		/*
2717 		 * If isdir >= 0 we validate that the entry is or is not a
2718 		 * directory.  If isdir < 0 we don't care.
2719 		 */
2720 		if (error == 0 && isdir >= 0) {
2721 			if (isdir &&
2722 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
2723 				error = ENOTDIR;
2724 			} else if (isdir == 0 &&
2725 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
2726 				error = EISDIR;
2727 			}
2728 		}
2729 
2730 		/*
2731 		 * If we are trying to remove a directory the directory must
2732 		 * be empty.
2733 		 *
2734 		 * WARNING: hammer_ip_check_directory_empty() may have to
2735 		 * terminate the cursor to avoid a deadlock.  It is ok to
2736 		 * call hammer_done_cursor() twice.
2737 		 */
2738 		if (error == 0 && ip->ino_data.obj_type ==
2739 				  HAMMER_OBJTYPE_DIRECTORY) {
2740 			error = hammer_ip_check_directory_empty(trans, ip);
2741 		}
2742 
2743 		/*
2744 		 * Delete the directory entry.
2745 		 *
2746 		 * WARNING: hammer_ip_del_directory() may have to terminate
2747 		 * the cursor to avoid a deadlock.  It is ok to call
2748 		 * hammer_done_cursor() twice.
2749 		 */
2750 		if (error == 0) {
2751 			error = hammer_ip_del_directory(trans, &cursor,
2752 							dip, ip);
2753 		}
2754 		hammer_done_cursor(&cursor);
2755 		if (error == 0) {
2756 			cache_setunresolved(nch);
2757 			cache_setvp(nch, NULL);
2758 			/* XXX locking */
2759 			if (ip->vp)
2760 				cache_inval_vp(ip->vp, CINV_DESTROY);
2761 		}
2762 		if (ip)
2763 			hammer_rel_inode(ip, 0);
2764 	} else {
2765 		hammer_done_cursor(&cursor);
2766 	}
2767 	hammer_inode_waitreclaims(dip->hmp);
2768 	if (error == EDEADLK)
2769 		goto retry;
2770 
2771 	return (error);
2772 }
2773 
2774 /************************************************************************
2775  *			    FIFO AND SPECFS OPS				*
2776  ************************************************************************
2777  *
2778  */
2779 
2780 static int
2781 hammer_vop_fifoclose (struct vop_close_args *ap)
2782 {
2783 	/* XXX update itimes */
2784 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2785 }
2786 
2787 static int
2788 hammer_vop_fiforead (struct vop_read_args *ap)
2789 {
2790 	int error;
2791 
2792 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2793 	/* XXX update access time */
2794 	return (error);
2795 }
2796 
2797 static int
2798 hammer_vop_fifowrite (struct vop_write_args *ap)
2799 {
2800 	int error;
2801 
2802 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2803 	/* XXX update access time */
2804 	return (error);
2805 }
2806 
2807 static int
2808 hammer_vop_specclose (struct vop_close_args *ap)
2809 {
2810 	/* XXX update itimes */
2811 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2812 }
2813 
2814 static int
2815 hammer_vop_specread (struct vop_read_args *ap)
2816 {
2817 	/* XXX update access time */
2818 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2819 }
2820 
2821 static int
2822 hammer_vop_specwrite (struct vop_write_args *ap)
2823 {
2824 	/* XXX update last change time */
2825 	return (VOCALL(&spec_vnode_vops, &ap->a_head));
2826 }
2827 
2828