1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51 
52 /*
53  * USERFS VNOPS
54  */
55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
56 static int hammer_vop_fsync(struct vop_fsync_args *);
57 static int hammer_vop_read(struct vop_read_args *);
58 static int hammer_vop_write(struct vop_write_args *);
59 static int hammer_vop_access(struct vop_access_args *);
60 static int hammer_vop_advlock(struct vop_advlock_args *);
61 static int hammer_vop_close(struct vop_close_args *);
62 static int hammer_vop_ncreate(struct vop_ncreate_args *);
63 static int hammer_vop_getattr(struct vop_getattr_args *);
64 static int hammer_vop_nresolve(struct vop_nresolve_args *);
65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
66 static int hammer_vop_nlink(struct vop_nlink_args *);
67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
68 static int hammer_vop_nmknod(struct vop_nmknod_args *);
69 static int hammer_vop_open(struct vop_open_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_markatime(struct vop_markatime_args *);
77 static int hammer_vop_setattr(struct vop_setattr_args *);
78 static int hammer_vop_strategy(struct vop_strategy_args *);
79 static int hammer_vop_bmap(struct vop_bmap_args *ap);
80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
82 static int hammer_vop_ioctl(struct vop_ioctl_args *);
83 static int hammer_vop_mountctl(struct vop_mountctl_args *);
84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
85 
86 static int hammer_vop_fifoclose (struct vop_close_args *);
87 static int hammer_vop_fiforead (struct vop_read_args *);
88 static int hammer_vop_fifowrite (struct vop_write_args *);
89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
90 
91 struct vop_ops hammer_vnode_vops = {
92 	.vop_default =		vop_defaultop,
93 	.vop_fsync =		hammer_vop_fsync,
94 	.vop_getpages =		vop_stdgetpages,
95 	.vop_putpages =		vop_stdputpages,
96 	.vop_read =		hammer_vop_read,
97 	.vop_write =		hammer_vop_write,
98 	.vop_access =		hammer_vop_access,
99 	.vop_advlock =		hammer_vop_advlock,
100 	.vop_close =		hammer_vop_close,
101 	.vop_ncreate =		hammer_vop_ncreate,
102 	.vop_getattr =		hammer_vop_getattr,
103 	.vop_inactive =		hammer_vop_inactive,
104 	.vop_reclaim =		hammer_vop_reclaim,
105 	.vop_nresolve =		hammer_vop_nresolve,
106 	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
107 	.vop_nlink =		hammer_vop_nlink,
108 	.vop_nmkdir =		hammer_vop_nmkdir,
109 	.vop_nmknod =		hammer_vop_nmknod,
110 	.vop_open =		hammer_vop_open,
111 	.vop_pathconf =		vop_stdpathconf,
112 	.vop_print =		hammer_vop_print,
113 	.vop_readdir =		hammer_vop_readdir,
114 	.vop_readlink =		hammer_vop_readlink,
115 	.vop_nremove =		hammer_vop_nremove,
116 	.vop_nrename =		hammer_vop_nrename,
117 	.vop_nrmdir =		hammer_vop_nrmdir,
118 	.vop_markatime = 	hammer_vop_markatime,
119 	.vop_setattr =		hammer_vop_setattr,
120 	.vop_bmap =		hammer_vop_bmap,
121 	.vop_strategy =		hammer_vop_strategy,
122 	.vop_nsymlink =		hammer_vop_nsymlink,
123 	.vop_nwhiteout =	hammer_vop_nwhiteout,
124 	.vop_ioctl =		hammer_vop_ioctl,
125 	.vop_mountctl =		hammer_vop_mountctl,
126 	.vop_kqfilter =		hammer_vop_kqfilter
127 };
128 
129 struct vop_ops hammer_spec_vops = {
130 	.vop_default =		vop_defaultop,
131 	.vop_fsync =		hammer_vop_fsync,
132 	.vop_read =		vop_stdnoread,
133 	.vop_write =		vop_stdnowrite,
134 	.vop_access =		hammer_vop_access,
135 	.vop_close =		hammer_vop_close,
136 	.vop_markatime = 	hammer_vop_markatime,
137 	.vop_getattr =		hammer_vop_getattr,
138 	.vop_inactive =		hammer_vop_inactive,
139 	.vop_reclaim =		hammer_vop_reclaim,
140 	.vop_setattr =		hammer_vop_setattr
141 };
142 
143 struct vop_ops hammer_fifo_vops = {
144 	.vop_default =		fifo_vnoperate,
145 	.vop_fsync =		hammer_vop_fsync,
146 	.vop_read =		hammer_vop_fiforead,
147 	.vop_write =		hammer_vop_fifowrite,
148 	.vop_access =		hammer_vop_access,
149 	.vop_close =		hammer_vop_fifoclose,
150 	.vop_markatime = 	hammer_vop_markatime,
151 	.vop_getattr =		hammer_vop_getattr,
152 	.vop_inactive =		hammer_vop_inactive,
153 	.vop_reclaim =		hammer_vop_reclaim,
154 	.vop_setattr =		hammer_vop_setattr,
155 	.vop_kqfilter =		hammer_vop_fifokqfilter
156 };
157 
158 static __inline
159 void
160 hammer_knote(struct vnode *vp, int flags)
161 {
162 	if (flags)
163 		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
164 }
165 
166 #ifdef DEBUG_TRUNCATE
167 struct hammer_inode *HammerTruncIp;
168 #endif
169 
170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
171 			   struct vnode *dvp, struct ucred *cred,
172 			   int flags, int isdir);
173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
175 
176 #if 0
177 static
178 int
179 hammer_vop_vnoperate(struct vop_generic_args *ap)
180 {
181 	return (VOCALL(&hammer_vnode_vops, ap));
182 }
183 #endif
184 
185 /*
186  * hammer_vop_fsync { vp, waitfor }
187  *
188  * fsync() an inode to disk and wait for it to be completely committed
189  * such that the information would not be undone if a crash occurred after
190  * return.
191  *
192  * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
193  *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
194  *	 operation.
195  *
196  *	 Ultimately the combination of a REDO log and use of fast storage
197  *	 to front-end cluster caches will make fsync fast, but it isn't
198  *	 here yet.  And, in any case, we need real transactional
199  *	 all-or-nothing features which are not restricted to a single file.
200  */
201 static
202 int
203 hammer_vop_fsync(struct vop_fsync_args *ap)
204 {
205 	hammer_inode_t ip = VTOI(ap->a_vp);
206 	int waitfor = ap->a_waitfor;
207 
208 	/*
209 	 * Fsync rule relaxation (default disabled)
210 	 */
211 	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
212 		switch(hammer_fsync_mode) {
213 		case 0:
214 			/* full semantics */
215 			break;
216 		case 1:
217 			/* asynchronous */
218 			if (waitfor == MNT_WAIT)
219 				waitfor = MNT_NOWAIT;
220 			break;
221 		case 2:
222 			/* synchronous fsync on close */
223 			ip->flags |= HAMMER_INODE_CLOSESYNC;
224 			return(0);
225 		case 3:
226 			/* asynchronous fsync on close */
227 			ip->flags |= HAMMER_INODE_CLOSEASYNC;
228 			return(0);
229 		default:
230 			/* ignore the fsync() system call */
231 			return(0);
232 		}
233 	}
234 
235 	/*
236 	 * Go do it
237 	 */
238 	++hammer_count_fsyncs;
239 	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
240 	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
241 	if (waitfor == MNT_WAIT) {
242 		vn_unlock(ap->a_vp);
243 		hammer_wait_inode(ip);
244 		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
245 	}
246 	return (ip->error);
247 }
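
/*
 * Usage note (illustrative): the relaxation modes above are selected at
 * run-time via the fsync-mode sysctl mentioned in the function comment,
 * presumably exported as vfs.hammer.fsync_mode.  A hypothetical userland
 * sketch using the standard sysctlbyname(3) interface:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static void
set_fsync_mode_example(void)
{
	int mode = 3;		/* asynchronous fsync on close */

	/* illustrative only; error handling omitted */
	(void)sysctlbyname("vfs.hammer.fsync_mode", NULL, NULL,
			   &mode, sizeof(mode));
}
#endif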
248 
249 /*
250  * hammer_vop_read { vp, uio, ioflag, cred }
251  *
252  * MPALMOSTSAFE
253  */
254 static
255 int
256 hammer_vop_read(struct vop_read_args *ap)
257 {
258 	struct hammer_transaction trans;
259 	hammer_inode_t ip;
260 	off_t offset;
261 	struct buf *bp;
262 	struct uio *uio;
263 	int error;
264 	int n;
265 	int seqcount;
266 	int ioseqcount;
267 	int blksize;
268 	int got_mplock;
269 	int bigread;
270 
271 	if (ap->a_vp->v_type != VREG)
272 		return (EINVAL);
273 	ip = VTOI(ap->a_vp);
274 	error = 0;
275 	uio = ap->a_uio;
276 
277 	/*
278 	 * Allow the UIO's size to override the sequential heuristic.
279 	 */
280 	blksize = hammer_blocksize(uio->uio_offset);
281 	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
282 	ioseqcount = ap->a_ioflag >> 16;
283 	if (seqcount < ioseqcount)
284 		seqcount = ioseqcount;
285 
286 	/*
287 	 * Temporary hack until more of HAMMER can be made MPSAFE.
288 	 */
289 #ifdef SMP
290 	if (curthread->td_mpcount) {
291 		got_mplock = -1;
292 		hammer_start_transaction(&trans, ip->hmp);
293 	} else {
294 		got_mplock = 0;
295 	}
296 #else
297 	hammer_start_transaction(&trans, ip->hmp);
298 	got_mplock = -1;
299 #endif
300 
301 	/*
302 	 * If reading or writing a huge amount of data we have to break
303 	 * atomicity and allow the operation to be interrupted by a signal
304 	 * or it can DoS the machine.
305 	 */
306 	bigread = (uio->uio_resid > 100 * 1024 * 1024);
307 
308 	/*
309 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
310 	 * buffer cache, but HAMMER may use a variable block size based
311 	 * on the offset.
312 	 *
313 	 * XXX Temporary hack, delay the start transaction while we remain
314 	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
315 	 *     locked-shared.
316 	 */
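	/*
	 * Hedged illustration: hammer_blocksize() effectively selects a
	 * small buffer below the 1MB demarcation and a large buffer above
	 * it.  The disabled sketch below is a simplified expansion of the
	 * calls made in the loop; hammer.h has the authoritative inline
	 * and the constants come from the on-disk format headers.
	 */
#if 0
	if (uio->uio_offset < HAMMER_XDEMARC)
		blksize = HAMMER_BUFSIZE;	/* 16KB buffers */
	else
		blksize = HAMMER_XBUFSIZE;	/* 64KB buffers */
#endif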
317 	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
318 		int64_t base_offset;
319 		int64_t file_limit;
320 
321 		blksize = hammer_blocksize(uio->uio_offset);
322 		offset = (int)uio->uio_offset & (blksize - 1);
323 		base_offset = uio->uio_offset - offset;
324 
325 		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
326 			break;
327 
328 		/*
329 		 * MPSAFE
330 		 */
331 		bp = getcacheblk(ap->a_vp, base_offset);
332 		if (bp) {
333 			error = 0;
334 			goto skip;
335 		}
336 
337 		/*
338 		 * MPUNSAFE
339 		 */
340 		if (got_mplock == 0) {
341 			got_mplock = 1;
342 			get_mplock();
343 			hammer_start_transaction(&trans, ip->hmp);
344 		}
345 
346 		if (hammer_cluster_enable) {
347 			/*
348 			 * Use file_limit to prevent cluster_read() from
349 			 * creating buffers of the wrong block size past
350 			 * the demarc.
351 			 */
352 			file_limit = ip->ino_data.size;
353 			if (base_offset < HAMMER_XDEMARC &&
354 			    file_limit > HAMMER_XDEMARC) {
355 				file_limit = HAMMER_XDEMARC;
356 			}
357 			error = cluster_read(ap->a_vp,
358 					     file_limit, base_offset,
359 					     blksize, MAXPHYS,
360 					     seqcount, &bp);
361 		} else {
362 			error = bread(ap->a_vp, base_offset, blksize, &bp);
363 		}
364 		if (error) {
365 			kprintf("hammer_vop_read: read error %d\n", error);
366 			brelse(bp);
367 			break;
368 		}
369 skip:
370 
371 		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
372 		n = blksize - offset;
373 		if (n > uio->uio_resid)
374 			n = uio->uio_resid;
375 		if (n > ip->ino_data.size - uio->uio_offset)
376 			n = (int)(ip->ino_data.size - uio->uio_offset);
377 		error = uiomove((char *)bp->b_data + offset, n, uio);
378 
379 		/* data has a lower priority than meta-data */
380 		bp->b_flags |= B_AGE;
381 		bqrelse(bp);
382 		if (error)
383 			break;
384 		hammer_stats_file_read += n;
385 	}
386 
387 	/*
388 	 * XXX only update the atime if we had to get the MP lock.
389 	 * XXX hack hack hack, fixme.
390 	 */
391 	if (got_mplock) {
392 		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
393 		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
394 			ip->ino_data.atime = trans.time;
395 			hammer_modify_inode(ip, HAMMER_INODE_ATIME);
396 		}
397 		hammer_done_transaction(&trans);
398 		if (got_mplock > 0)
399 			rel_mplock();
400 	}
401 	return (error);
402 }
403 
404 /*
405  * hammer_vop_write { vp, uio, ioflag, cred }
406  */
407 static
408 int
409 hammer_vop_write(struct vop_write_args *ap)
410 {
411 	struct hammer_transaction trans;
412 	struct hammer_inode *ip;
413 	hammer_mount_t hmp;
414 	struct uio *uio;
415 	int offset;
416 	off_t base_offset;
417 	struct buf *bp;
418 	int kflags;
419 	int error;
420 	int n;
421 	int flags;
422 	int seqcount;
423 	int bigwrite;
424 
425 	if (ap->a_vp->v_type != VREG)
426 		return (EINVAL);
427 	ip = VTOI(ap->a_vp);
428 	hmp = ip->hmp;
429 	error = 0;
430 	kflags = 0;
431 	seqcount = ap->a_ioflag >> 16;
432 
433 	if (ip->flags & HAMMER_INODE_RO)
434 		return (EROFS);
435 
436 	/*
437 	 * Create a transaction to cover the operations we perform.
438 	 */
439 	hammer_start_transaction(&trans, hmp);
440 	uio = ap->a_uio;
441 
442 	/*
443 	 * Check append mode
444 	 */
445 	if (ap->a_ioflag & IO_APPEND)
446 		uio->uio_offset = ip->ino_data.size;
447 
448 	/*
449 	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
450 	 *
451 	 * NOTE: the base_offset assignment is required to work around what
452 	 * I consider to be a GCC-4 optimization bug.
453 	 */
454 	if (uio->uio_offset < 0) {
455 		hammer_done_transaction(&trans);
456 		return (EFBIG);
457 	}
458 	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
459 	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
460 		hammer_done_transaction(&trans);
461 		return (EFBIG);
462 	}
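
	/*
	 * Worked example of the wrap check above (illustrative): with
	 * uio_offset = 2^63 - 1 and uio_resid = 2, the 64 bit sum wraps
	 * negative, so base_offset <= uio_offset and the write is
	 * rejected with EFBIG instead of silently wrapping around.
	 */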
463 
464 	/*
465 	 * If reading or writing a huge amount of data we have to break
466 	 * atomicity and allow the operation to be interrupted by a signal
467 	 * or it can DoS the machine.
468 	 */
469 	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
470 
471 	/*
472 	 * Access the data typically in HAMMER_BUFSIZE blocks via the
473 	 * buffer cache, but HAMMER may use a variable block size based
474 	 * on the offset.
475 	 */
476 	while (uio->uio_resid > 0) {
477 		int fixsize = 0;
478 		int blksize;
479 		int blkmask;
480 
481 		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
482 			break;
483 		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
484 			break;
485 
486 		blksize = hammer_blocksize(uio->uio_offset);
487 
488 		/*
489 		 * Do not allow HAMMER to blow out the buffer cache.  Very
490 		 * large UIOs can lock out other processes due to bwillwrite()
491 		 * mechanics.
492 		 *
493 		 * The hammer inode is not locked during these operations.
494 		 * The vnode is locked which can interfere with the pageout
495 		 * daemon for non-UIO_NOCOPY writes but should not interfere
496 		 * with the buffer cache.  Even so, we cannot afford to
497 		 * allow the pageout daemon to build up too many dirty buffer
498 		 * cache buffers.
499 		 *
500 		 * Only call this if we aren't being recursively called from
501 		 * a virtual disk device (vn), else we may deadlock.
502 		 */
503 		if ((ap->a_ioflag & IO_RECURSE) == 0)
504 			bwillwrite(blksize);
505 
506 		/*
507 		 * Control the number of pending records associated with
508 		 * this inode.  If too many have accumulated start a
509 		 * flush.  Try to maintain a pipeline with the flusher.
510 		 */
511 		if (ip->rsv_recs >= hammer_limit_inode_recs) {
512 			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
513 		}
514 		if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
515 			while (ip->rsv_recs >= hammer_limit_inode_recs) {
516 				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
517 			}
518 			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
519 		}
520 
521 #if 0
522 		/*
523 		 * Do not allow HAMMER to blow out system memory by
524 		 * accumulating too many records.   Records are so well
525 		 * decoupled from the buffer cache that it is possible
526 		 * for userland to push data out to the media via
527 		 * direct-write, but build up the records queued to the
528 		 * backend faster than the backend can flush them out.
529 		 * HAMMER has hit its write limit but the frontend has
530 		 * no pushback to slow it down.
531 		 */
532 		if (hmp->rsv_recs > hammer_limit_recs / 2) {
533 			/*
534 			 * Get the inode on the flush list
535 			 */
536 			if (ip->rsv_recs >= 64)
537 				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
538 			else if (ip->rsv_recs >= 16)
539 				hammer_flush_inode(ip, 0);
540 
541 			/*
542 			 * Keep the flusher going if the system keeps
543 			 * queueing records.
544 			 */
545 			delta = hmp->count_newrecords -
546 				hmp->last_newrecords;
547 			if (delta < 0 || delta > hammer_limit_recs / 2) {
548 				hmp->last_newrecords = hmp->count_newrecords;
549 				hammer_sync_hmp(hmp, MNT_NOWAIT);
550 			}
551 
552 			/*
553 			 * If we have gotten behind start slowing
554 			 * down the writers.
555 			 */
556 			delta = (hmp->rsv_recs - hammer_limit_recs) *
557 				hz / hammer_limit_recs;
558 			if (delta > 0)
559 				tsleep(&trans, 0, "hmrslo", delta);
560 		}
561 #endif
562 
563 		/*
564 		 * Calculate the blocksize at the current offset and figure
565 		 * out how much we can actually write.
566 		 */
567 		blkmask = blksize - 1;
568 		offset = (int)uio->uio_offset & blkmask;
569 		base_offset = uio->uio_offset & ~(int64_t)blkmask;
570 		n = blksize - offset;
571 		if (n > uio->uio_resid)
572 			n = uio->uio_resid;
573 		if (uio->uio_offset + n > ip->ino_data.size) {
574 			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
575 			fixsize = 1;
576 			kflags |= NOTE_EXTEND;
577 		}
578 
579 		if (uio->uio_segflg == UIO_NOCOPY) {
580 			/*
581 			 * Issuing a write with the same data backing the
582 			 * buffer.  Instantiate the buffer to collect the
583 			 * backing vm pages, then read-in any missing bits.
584 			 *
585 			 * This case is used by vop_stdputpages().
586 			 */
587 			bp = getblk(ap->a_vp, base_offset,
588 				    blksize, GETBLK_BHEAVY, 0);
589 			if ((bp->b_flags & B_CACHE) == 0) {
590 				bqrelse(bp);
591 				error = bread(ap->a_vp, base_offset,
592 					      blksize, &bp);
593 			}
594 		} else if (offset == 0 && uio->uio_resid >= blksize) {
595 			/*
596 			 * Even though we are entirely overwriting the buffer
597 			 * we may still have to zero it out to avoid a
598 			 * mmap/write visibility issue.
599 			 */
600 			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
601 			if ((bp->b_flags & B_CACHE) == 0)
602 				vfs_bio_clrbuf(bp);
603 		} else if (base_offset >= ip->ino_data.size) {
604 			/*
605 			 * If the base offset of the buffer is beyond the
606 			 * file EOF, we don't have to issue a read.
607 			 */
608 			bp = getblk(ap->a_vp, base_offset,
609 				    blksize, GETBLK_BHEAVY, 0);
610 			vfs_bio_clrbuf(bp);
611 		} else {
612 			/*
613 			 * Partial overwrite, read in any missing bits then
614 			 * replace the portion being written.
615 			 */
616 			error = bread(ap->a_vp, base_offset, blksize, &bp);
617 			if (error == 0)
618 				bheavy(bp);
619 		}
620 		if (error == 0) {
621 			error = uiomove((char *)bp->b_data + offset,
622 					n, uio);
623 		}
624 
625 		/*
626 		 * If we screwed up we have to undo any VM size changes we
627 		 * made.
628 		 */
629 		if (error) {
630 			brelse(bp);
631 			if (fixsize) {
632 				vtruncbuf(ap->a_vp, ip->ino_data.size,
633 					  hammer_blocksize(ip->ino_data.size));
634 			}
635 			break;
636 		}
637 		kflags |= NOTE_WRITE;
638 		hammer_stats_file_write += n;
639 		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
640 		if (ip->ino_data.size < uio->uio_offset) {
641 			ip->ino_data.size = uio->uio_offset;
642 			flags = HAMMER_INODE_DDIRTY;
643 			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
644 		} else {
645 			flags = 0;
646 		}
647 		ip->ino_data.mtime = trans.time;
648 		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
649 		hammer_modify_inode(ip, flags);
650 
651 		/*
652 		 * Once we dirty the buffer any cached zone-X offset
653 		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
654 		 * allow overwriting over the same data sector unless
655 		 * we provide UNDOs for the old data, which we don't.
656 		 */
657 		bp->b_bio2.bio_offset = NOOFFSET;
658 
659 		/*
660 		 * Final buffer disposition.
661 		 *
662 		 * Because meta-data updates are deferred, HAMMER is
663 		 * especially sensitive to excessive bdwrite()s because
664 		 * the I/O stream is not broken up by disk reads.  So the
665 		 * buffer cache simply cannot keep up.
666 		 *
667 		 * WARNING!  blksize is variable.  cluster_write() is
668 		 * expected to not blow up if it encounters buffers that
669 		 * do not match the passed blksize.
670 		 *
671 		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
672 		 *	  The ip->rsv_recs check should burst-flush the data.
673 		 *	  If we queue it immediately the buf could be left
674 		 *	  locked on the device queue for a very long time.
675 		 */
676 		bp->b_flags |= B_AGE;
677 		if (ap->a_ioflag & IO_SYNC) {
678 			bwrite(bp);
679 		} else if (ap->a_ioflag & IO_DIRECT) {
680 			bawrite(bp);
681 		} else {
682 #if 0
683 		if (offset + n == blksize) {
684 			if (hammer_cluster_enable == 0 ||
685 			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
686 				bawrite(bp);
687 			} else {
688 				cluster_write(bp, ip->ino_data.size,
689 					      blksize, seqcount);
690 			}
691 		} else {
692 #endif
693 			bdwrite(bp);
694 		}
695 	}
696 	hammer_done_transaction(&trans);
697 	hammer_knote(ap->a_vp, kflags);
698 	return (error);
699 }
700 
701 /*
702  * hammer_vop_access { vp, mode, cred }
703  */
704 static
705 int
706 hammer_vop_access(struct vop_access_args *ap)
707 {
708 	struct hammer_inode *ip = VTOI(ap->a_vp);
709 	uid_t uid;
710 	gid_t gid;
711 	int error;
712 
713 	++hammer_stats_file_iopsr;
714 	uid = hammer_to_unix_xid(&ip->ino_data.uid);
715 	gid = hammer_to_unix_xid(&ip->ino_data.gid);
716 
717 	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
718 				  ip->ino_data.uflags);
719 	return (error);
720 }
721 
722 /*
723  * hammer_vop_advlock { vp, id, op, fl, flags }
724  */
725 static
726 int
727 hammer_vop_advlock(struct vop_advlock_args *ap)
728 {
729 	hammer_inode_t ip = VTOI(ap->a_vp);
730 
731 	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
732 }
733 
734 /*
735  * hammer_vop_close { vp, fflag }
736  *
737  * We can only sync-on-close for normal closes.
738  */
739 static
740 int
741 hammer_vop_close(struct vop_close_args *ap)
742 {
743 	struct vnode *vp = ap->a_vp;
744 	hammer_inode_t ip = VTOI(vp);
745 	int waitfor;
746 
747 	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
748 		if (vn_islocked(vp) == LK_EXCLUSIVE &&
749 		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
750 			if (ip->flags & HAMMER_INODE_CLOSESYNC)
751 				waitfor = MNT_WAIT;
752 			else
753 				waitfor = MNT_NOWAIT;
754 			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
755 				       HAMMER_INODE_CLOSEASYNC);
756 			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
757 		}
758 	}
759 	return (vop_stdclose(ap));
760 }
761 
762 /*
763  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
764  *
765  * The operating system has already ensured that the directory entry
766  * does not exist and done all appropriate namespace locking.
767  */
768 static
769 int
770 hammer_vop_ncreate(struct vop_ncreate_args *ap)
771 {
772 	struct hammer_transaction trans;
773 	struct hammer_inode *dip;
774 	struct hammer_inode *nip;
775 	struct nchandle *nch;
776 	int error;
777 
778 	nch = ap->a_nch;
779 	dip = VTOI(ap->a_dvp);
780 
781 	if (dip->flags & HAMMER_INODE_RO)
782 		return (EROFS);
783 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
784 		return (error);
785 
786 	/*
787 	 * Create a transaction to cover the operations we perform.
788 	 */
789 	hammer_start_transaction(&trans, dip->hmp);
790 	++hammer_stats_file_iopsw;
791 
792 	/*
793 	 * Create a new filesystem object of the requested type.  The
794 	 * returned inode will be referenced and shared-locked to prevent
795 	 * it from being moved to the flusher.
796 	 */
797 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
798 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
799 				    NULL, &nip);
800 	if (error) {
801 		hkprintf("hammer_create_inode error %d\n", error);
802 		hammer_done_transaction(&trans);
803 		*ap->a_vpp = NULL;
804 		return (error);
805 	}
806 
807 	/*
808 	 * Add the new filesystem object to the directory.  This will also
809 	 * bump the inode's link count.
810 	 */
811 	error = hammer_ip_add_directory(&trans, dip,
812 					nch->ncp->nc_name, nch->ncp->nc_nlen,
813 					nip);
814 	if (error)
815 		hkprintf("hammer_ip_add_directory error %d\n", error);
816 
817 	/*
818 	 * Finish up.
819 	 */
820 	if (error) {
821 		hammer_rel_inode(nip, 0);
822 		hammer_done_transaction(&trans);
823 		*ap->a_vpp = NULL;
824 	} else {
825 		error = hammer_get_vnode(nip, ap->a_vpp);
826 		hammer_done_transaction(&trans);
827 		hammer_rel_inode(nip, 0);
828 		if (error == 0) {
829 			cache_setunresolved(ap->a_nch);
830 			cache_setvp(ap->a_nch, *ap->a_vpp);
831 		}
832 		hammer_knote(ap->a_dvp, NOTE_WRITE);
833 	}
834 	return (error);
835 }
836 
837 /*
838  * hammer_vop_getattr { vp, vap }
839  *
840  * Retrieve an inode's attribute information.  When accessing inodes
841  * historically we fake the atime field to ensure consistent results.
842  * The atime field is stored in the B-Tree element and allowed to be
843  * updated without cycling the element.
844  *
845  * MPSAFE
846  */
847 static
848 int
849 hammer_vop_getattr(struct vop_getattr_args *ap)
850 {
851 	struct hammer_inode *ip = VTOI(ap->a_vp);
852 	struct vattr *vap = ap->a_vap;
853 
854 	/*
855 	 * We want the fsid to be different when accessing a filesystem
856 	 * with different as-of's so programs like diff don't think
857 	 * the files are the same.
858 	 *
859 	 * We also want the fsid to be the same when comparing snapshots,
860 	 * or when comparing mirrors (which might be backed by different
861 	 * physical devices).  HAMMER fsids are based on the PFS's
862 	 * shared_uuid field.
863 	 *
864 	 * XXX there is a chance of collision here.  The va_fsid reported
865 	 * by stat is different from the more involved fsid used in the
866 	 * mount structure.
867 	 */
868 	++hammer_stats_file_iopsr;
869 	hammer_lock_sh(&ip->lock);
870 	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
871 		       (u_int32_t)(ip->obj_asof >> 32);
872 
873 	vap->va_fileid = ip->ino_leaf.base.obj_id;
874 	vap->va_mode = ip->ino_data.mode;
875 	vap->va_nlink = ip->ino_data.nlinks;
876 	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
877 	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
878 	vap->va_rmajor = 0;
879 	vap->va_rminor = 0;
880 	vap->va_size = ip->ino_data.size;
881 
882 	/*
883 	 * Special case for @@PFS softlinks.  The actual size of the
884 	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
885 	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
886 	 */
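	/*
	 * For example (values hypothetical), a slave PFS link expands
	 * to the 26 byte form "@@0x00000001061a8ba0:00005" while a
	 * master expands to the 10 byte form "@@-1:00005".
	 */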
887 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
888 	    ip->ino_data.size == 10 &&
889 	    ip->obj_asof == HAMMER_MAX_TID &&
890 	    ip->obj_localization == 0 &&
891 	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
892 		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
893 			    vap->va_size = 26;
894 		    else
895 			    vap->va_size = 10;
896 	}
897 
898 	/*
899 	 * We must provide a consistent atime and mtime for snapshots
900 	 * so people can do a 'tar cf - ... | md5' on them and get
901 	 * consistent results.
902 	 */
903 	if (ip->flags & HAMMER_INODE_RO) {
904 		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
905 		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
906 	} else {
907 		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
908 		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
909 	}
910 	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
911 	vap->va_flags = ip->ino_data.uflags;
912 	vap->va_gen = 1;	/* hammer inums are unique for all time */
913 	vap->va_blocksize = HAMMER_BUFSIZE;
914 	if (ip->ino_data.size >= HAMMER_XDEMARC) {
915 		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
916 				~HAMMER_XBUFMASK64;
917 	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
918 		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
919 				~HAMMER_BUFMASK64;
920 	} else {
921 		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
922 	}
923 
924 	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
925 	vap->va_filerev = 0; 	/* XXX */
926 	/* mtime uniquely identifies any adjustments made to the file XXX */
927 	vap->va_fsmid = ip->ino_data.mtime;
928 	vap->va_uid_uuid = ip->ino_data.uid;
929 	vap->va_gid_uuid = ip->ino_data.gid;
930 	vap->va_fsid_uuid = ip->hmp->fsid;
931 	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
932 			  VA_FSID_UUID_VALID;
933 
934 	switch (ip->ino_data.obj_type) {
935 	case HAMMER_OBJTYPE_CDEV:
936 	case HAMMER_OBJTYPE_BDEV:
937 		vap->va_rmajor = ip->ino_data.rmajor;
938 		vap->va_rminor = ip->ino_data.rminor;
939 		break;
940 	default:
941 		break;
942 	}
943 	hammer_unlock(&ip->lock);
944 	return(0);
945 }
946 
947 /*
948  * hammer_vop_nresolve { nch, dvp, cred }
949  *
950  * Locate the requested directory entry.
951  */
952 static
953 int
954 hammer_vop_nresolve(struct vop_nresolve_args *ap)
955 {
956 	struct hammer_transaction trans;
957 	struct namecache *ncp;
958 	hammer_inode_t dip;
959 	hammer_inode_t ip;
960 	hammer_tid_t asof;
961 	struct hammer_cursor cursor;
962 	struct vnode *vp;
963 	int64_t namekey;
964 	int error;
965 	int i;
966 	int nlen;
967 	int flags;
968 	int ispfs;
969 	int64_t obj_id;
970 	u_int32_t localization;
971 	u_int32_t max_iterations;
972 
973 	/*
974 	 * Misc initialization, plus handle as-of name extensions.  Look for
975 	 * the '@@' extension.  Note that as-of files and directories cannot
976 	 * be modified.
977 	 */
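	/*
	 * Illustrative examples (TID/PFS numbers hypothetical): a lookup
	 * of "README@@0x00000001061a8ba0" resolves README as-of that
	 * transaction id and the result is read-only; a PFS softlink
	 * component such as "@@-1:00005" parses to HAMMER_MAX_TID plus a
	 * localization and dives into the pseudo-filesystem below.
	 */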
978 	dip = VTOI(ap->a_dvp);
979 	ncp = ap->a_nch->ncp;
980 	asof = dip->obj_asof;
981 	localization = dip->obj_localization;	/* for code consistency */
982 	nlen = ncp->nc_nlen;
983 	flags = dip->flags & HAMMER_INODE_RO;
984 	ispfs = 0;
985 
986 	hammer_simple_transaction(&trans, dip->hmp);
987 	++hammer_stats_file_iopsr;
988 
989 	for (i = 0; i < nlen; ++i) {
990 		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
991 			error = hammer_str_to_tid(ncp->nc_name + i + 2,
992 						  &ispfs, &asof, &localization);
993 			if (error != 0) {
994 				i = nlen;
995 				break;
996 			}
997 			if (asof != HAMMER_MAX_TID)
998 				flags |= HAMMER_INODE_RO;
999 			break;
1000 		}
1001 	}
1002 	nlen = i;
1003 
1004 	/*
1005 	 * If this is a PFS softlink we dive into the PFS
1006 	 */
1007 	if (ispfs && nlen == 0) {
1008 		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1009 				      asof, localization,
1010 				      flags, &error);
1011 		if (error == 0) {
1012 			error = hammer_get_vnode(ip, &vp);
1013 			hammer_rel_inode(ip, 0);
1014 		} else {
1015 			vp = NULL;
1016 		}
1017 		if (error == 0) {
1018 			vn_unlock(vp);
1019 			cache_setvp(ap->a_nch, vp);
1020 			vrele(vp);
1021 		}
1022 		goto done;
1023 	}
1024 
1025 	/*
1026 	 * If there is no path component the time extension is relative to dip.
1027 	 * e.g. "fubar/@@<snapshot>"
1028 	 *
1029 	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
1030 	 * e.g. "fubar/.@@<snapshot>"
1031 	 *
1032 	 * ".." is handled by the kernel.  We do not currently handle
1033 	 * "..@@<snapshot>".
1034 	 */
1035 	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
1036 		ip = hammer_get_inode(&trans, dip, dip->obj_id,
1037 				      asof, dip->obj_localization,
1038 				      flags, &error);
1039 		if (error == 0) {
1040 			error = hammer_get_vnode(ip, &vp);
1041 			hammer_rel_inode(ip, 0);
1042 		} else {
1043 			vp = NULL;
1044 		}
1045 		if (error == 0) {
1046 			vn_unlock(vp);
1047 			cache_setvp(ap->a_nch, vp);
1048 			vrele(vp);
1049 		}
1050 		goto done;
1051 	}
1052 
1053 	/*
1054 	 * Calculate the namekey and setup the key range for the scan.  This
1055 	 * works kinda like a chained hash table where the lower 32 bits
1056 	 * of the namekey synthesize the chain.
1057 	 *
1058 	 * The key range is inclusive of both key_beg and key_end.
1059 	 */
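
	/*
	 * Conceptual sketch (disabled, illustrative): the upper bits of
	 * the 64 bit namekey are derived from a hash of the name and the
	 * low 32 bits form the iteration space used to step past hash
	 * collisions, giving the chained-hash behavior described above.
	 * hammer_directory_namekey() is the authoritative code.
	 */
#if 0
	namekey = (int64_t)(crc32(ncp->nc_name, nlen) & 0x7FFFFFFFU) << 32;
	max_iterations = 0xFFFFFFFFU;	/* scan the whole low-32 chain */
#endif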
1060 	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1061 					   &max_iterations);
1062 
1063 	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
1064 	cursor.key_beg.localization = dip->obj_localization +
1065 				      hammer_dir_localization(dip);
1066 	cursor.key_beg.obj_id = dip->obj_id;
1067 	cursor.key_beg.key = namekey;
1068 	cursor.key_beg.create_tid = 0;
1069 	cursor.key_beg.delete_tid = 0;
1070 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1071 	cursor.key_beg.obj_type = 0;
1072 
1073 	cursor.key_end = cursor.key_beg;
1074 	cursor.key_end.key += max_iterations;
1075 	cursor.asof = asof;
1076 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1077 
1078 	/*
1079 	 * Scan all matching records (the chain), locate the one matching
1080 	 * the requested path component.
1081 	 *
1082 	 * The hammer_ip_*() functions merge in-memory records with on-disk
1083 	 * records for the purposes of the search.
1084 	 */
1085 	obj_id = 0;
1086 	localization = HAMMER_DEF_LOCALIZATION;
1087 
1088 	if (error == 0) {
1089 		error = hammer_ip_first(&cursor);
1090 		while (error == 0) {
1091 			error = hammer_ip_resolve_data(&cursor);
1092 			if (error)
1093 				break;
1094 			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1095 			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1096 				obj_id = cursor.data->entry.obj_id;
1097 				localization = cursor.data->entry.localization;
1098 				break;
1099 			}
1100 			error = hammer_ip_next(&cursor);
1101 		}
1102 	}
1103 	hammer_done_cursor(&cursor);
1104 
1105 	/*
1106 	 * Lookup the obj_id.  This should always succeed.  If it does not
1107 	 * the filesystem may be damaged and we return a dummy inode.
1108 	 */
1109 	if (error == 0) {
1110 		ip = hammer_get_inode(&trans, dip, obj_id,
1111 				      asof, localization,
1112 				      flags, &error);
1113 		if (error == ENOENT) {
1114 			kprintf("HAMMER: WARNING: Missing "
1115 				"inode for dirent \"%s\"\n"
1116 				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1117 				ncp->nc_name,
1118 				(long long)obj_id, (long long)asof,
1119 				localization);
1120 			error = 0;
1121 			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1122 						    asof, localization,
1123 						    flags, &error);
1124 		}
1125 		if (error == 0) {
1126 			error = hammer_get_vnode(ip, &vp);
1127 			hammer_rel_inode(ip, 0);
1128 		} else {
1129 			vp = NULL;
1130 		}
1131 		if (error == 0) {
1132 			vn_unlock(vp);
1133 			cache_setvp(ap->a_nch, vp);
1134 			vrele(vp);
1135 		}
1136 	} else if (error == ENOENT) {
1137 		cache_setvp(ap->a_nch, NULL);
1138 	}
1139 done:
1140 	hammer_done_transaction(&trans);
1141 	return (error);
1142 }
1143 
1144 /*
1145  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1146  *
1147  * Locate the parent directory of a directory vnode.
1148  *
1149  * dvp is referenced but not locked.  *vpp must be returned referenced and
1150  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1151  * at the root; instead it could indicate that the directory we were in was
1152  * removed.
1153  *
1154  * NOTE: as-of sequences are not linked into the directory structure.  If
1155  * we are at the root with a different asof than the mount point, reload
1156  * the same directory with the mount point's asof.  I'm not sure what this
1157  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1158  * get confused, but it hasn't been tested.
1159  */
1160 static
1161 int
1162 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1163 {
1164 	struct hammer_transaction trans;
1165 	struct hammer_inode *dip;
1166 	struct hammer_inode *ip;
1167 	int64_t parent_obj_id;
1168 	u_int32_t parent_obj_localization;
1169 	hammer_tid_t asof;
1170 	int error;
1171 
1172 	dip = VTOI(ap->a_dvp);
1173 	asof = dip->obj_asof;
1174 
1175 	/*
1176 	 * Who is our parent?  This could be the root of a pseudo-filesystem
1177 	 * whose parent is in another localization domain.
1178 	 */
1179 	parent_obj_id = dip->ino_data.parent_obj_id;
1180 	if (dip->obj_id == HAMMER_OBJID_ROOT)
1181 		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1182 	else
1183 		parent_obj_localization = dip->obj_localization;
1184 
1185 	if (parent_obj_id == 0) {
1186 		if (dip->obj_id == HAMMER_OBJID_ROOT &&
1187 		   asof != dip->hmp->asof) {
1188 			parent_obj_id = dip->obj_id;
1189 			asof = dip->hmp->asof;
1190 			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1191 			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1192 				  (long long)dip->obj_asof);
1193 		} else {
1194 			*ap->a_vpp = NULL;
1195 			return ENOENT;
1196 		}
1197 	}
1198 
1199 	hammer_simple_transaction(&trans, dip->hmp);
1200 	++hammer_stats_file_iopsr;
1201 
1202 	ip = hammer_get_inode(&trans, dip, parent_obj_id,
1203 			      asof, parent_obj_localization,
1204 			      dip->flags, &error);
1205 	if (ip) {
1206 		error = hammer_get_vnode(ip, ap->a_vpp);
1207 		hammer_rel_inode(ip, 0);
1208 	} else {
1209 		*ap->a_vpp = NULL;
1210 	}
1211 	hammer_done_transaction(&trans);
1212 	return (error);
1213 }
1214 
1215 /*
1216  * hammer_vop_nlink { nch, dvp, vp, cred }
1217  */
1218 static
1219 int
1220 hammer_vop_nlink(struct vop_nlink_args *ap)
1221 {
1222 	struct hammer_transaction trans;
1223 	struct hammer_inode *dip;
1224 	struct hammer_inode *ip;
1225 	struct nchandle *nch;
1226 	int error;
1227 
1228 	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1229 		return(EXDEV);
1230 
1231 	nch = ap->a_nch;
1232 	dip = VTOI(ap->a_dvp);
1233 	ip = VTOI(ap->a_vp);
1234 
1235 	if (dip->obj_localization != ip->obj_localization)
1236 		return(EXDEV);
1237 
1238 	if (dip->flags & HAMMER_INODE_RO)
1239 		return (EROFS);
1240 	if (ip->flags & HAMMER_INODE_RO)
1241 		return (EROFS);
1242 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1243 		return (error);
1244 
1245 	/*
1246 	 * Create a transaction to cover the operations we perform.
1247 	 */
1248 	hammer_start_transaction(&trans, dip->hmp);
1249 	++hammer_stats_file_iopsw;
1250 
1251 	/*
1252 	 * Add the filesystem object to the directory.  Note that neither
1253 	 * dip nor ip are referenced or locked, but their vnodes are
1254 	 * referenced.  This function will bump the inode's link count.
1255 	 */
1256 	error = hammer_ip_add_directory(&trans, dip,
1257 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1258 					ip);
1259 
1260 	/*
1261 	 * Finish up.
1262 	 */
1263 	if (error == 0) {
1264 		cache_setunresolved(nch);
1265 		cache_setvp(nch, ap->a_vp);
1266 	}
1267 	hammer_done_transaction(&trans);
1268 	hammer_knote(ap->a_vp, NOTE_LINK);
1269 	hammer_knote(ap->a_dvp, NOTE_WRITE);
1270 	return (error);
1271 }
1272 
1273 /*
1274  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1275  *
1276  * The operating system has already ensured that the directory entry
1277  * does not exist and done all appropriate namespace locking.
1278  */
1279 static
1280 int
1281 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1282 {
1283 	struct hammer_transaction trans;
1284 	struct hammer_inode *dip;
1285 	struct hammer_inode *nip;
1286 	struct nchandle *nch;
1287 	int error;
1288 
1289 	nch = ap->a_nch;
1290 	dip = VTOI(ap->a_dvp);
1291 
1292 	if (dip->flags & HAMMER_INODE_RO)
1293 		return (EROFS);
1294 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1295 		return (error);
1296 
1297 	/*
1298 	 * Create a transaction to cover the operations we perform.
1299 	 */
1300 	hammer_start_transaction(&trans, dip->hmp);
1301 	++hammer_stats_file_iopsw;
1302 
1303 	/*
1304 	 * Create a new filesystem object of the requested type.  The
1305 	 * returned inode will be referenced but not locked.
1306 	 */
1307 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1308 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1309 				    NULL, &nip);
1310 	if (error) {
1311 		hkprintf("hammer_mkdir error %d\n", error);
1312 		hammer_done_transaction(&trans);
1313 		*ap->a_vpp = NULL;
1314 		return (error);
1315 	}
1316 	/*
1317 	 * Add the new filesystem object to the directory.  This will also
1318 	 * bump the inode's link count.
1319 	 */
1320 	error = hammer_ip_add_directory(&trans, dip,
1321 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1322 					nip);
1323 	if (error)
1324 		hkprintf("hammer_mkdir (add) error %d\n", error);
1325 
1326 	/*
1327 	 * Finish up.
1328 	 */
1329 	if (error) {
1330 		hammer_rel_inode(nip, 0);
1331 		*ap->a_vpp = NULL;
1332 	} else {
1333 		error = hammer_get_vnode(nip, ap->a_vpp);
1334 		hammer_rel_inode(nip, 0);
1335 		if (error == 0) {
1336 			cache_setunresolved(ap->a_nch);
1337 			cache_setvp(ap->a_nch, *ap->a_vpp);
1338 		}
1339 	}
1340 	hammer_done_transaction(&trans);
1341 	if (error == 0)
1342 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1343 	return (error);
1344 }
1345 
1346 /*
1347  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1348  *
1349  * The operating system has already ensured that the directory entry
1350  * does not exist and done all appropriate namespace locking.
1351  */
1352 static
1353 int
1354 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1355 {
1356 	struct hammer_transaction trans;
1357 	struct hammer_inode *dip;
1358 	struct hammer_inode *nip;
1359 	struct nchandle *nch;
1360 	int error;
1361 
1362 	nch = ap->a_nch;
1363 	dip = VTOI(ap->a_dvp);
1364 
1365 	if (dip->flags & HAMMER_INODE_RO)
1366 		return (EROFS);
1367 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1368 		return (error);
1369 
1370 	/*
1371 	 * Create a transaction to cover the operations we perform.
1372 	 */
1373 	hammer_start_transaction(&trans, dip->hmp);
1374 	++hammer_stats_file_iopsw;
1375 
1376 	/*
1377 	 * Create a new filesystem object of the requested type.  The
1378 	 * returned inode will be referenced but not locked.
1379 	 *
1380 	 * If mknod specifies a directory a pseudo-fs is created.
1381 	 */
1382 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1383 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1384 				    NULL, &nip);
1385 	if (error) {
1386 		hammer_done_transaction(&trans);
1387 		*ap->a_vpp = NULL;
1388 		return (error);
1389 	}
1390 
1391 	/*
1392 	 * Add the new filesystem object to the directory.  This will also
1393 	 * bump the inode's link count.
1394 	 */
1395 	error = hammer_ip_add_directory(&trans, dip,
1396 					nch->ncp->nc_name, nch->ncp->nc_nlen,
1397 					nip);
1398 
1399 	/*
1400 	 * Finish up.
1401 	 */
1402 	if (error) {
1403 		hammer_rel_inode(nip, 0);
1404 		*ap->a_vpp = NULL;
1405 	} else {
1406 		error = hammer_get_vnode(nip, ap->a_vpp);
1407 		hammer_rel_inode(nip, 0);
1408 		if (error == 0) {
1409 			cache_setunresolved(ap->a_nch);
1410 			cache_setvp(ap->a_nch, *ap->a_vpp);
1411 		}
1412 	}
1413 	hammer_done_transaction(&trans);
1414 	if (error == 0)
1415 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1416 	return (error);
1417 }
1418 
1419 /*
1420  * hammer_vop_open { vp, mode, cred, fp }
1421  */
1422 static
1423 int
1424 hammer_vop_open(struct vop_open_args *ap)
1425 {
1426 	hammer_inode_t ip;
1427 
1428 	++hammer_stats_file_iopsr;
1429 	ip = VTOI(ap->a_vp);
1430 
1431 	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1432 		return (EROFS);
1433 	return(vop_stdopen(ap));
1434 }
1435 
1436 /*
1437  * hammer_vop_print { vp }
1438  */
1439 static
1440 int
1441 hammer_vop_print(struct vop_print_args *ap)
1442 {
1443 	return EOPNOTSUPP;
1444 }
1445 
1446 /*
1447  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1448  */
1449 static
1450 int
1451 hammer_vop_readdir(struct vop_readdir_args *ap)
1452 {
1453 	struct hammer_transaction trans;
1454 	struct hammer_cursor cursor;
1455 	struct hammer_inode *ip;
1456 	struct uio *uio;
1457 	hammer_base_elm_t base;
1458 	int error;
1459 	int cookie_index;
1460 	int ncookies;
1461 	off_t *cookies;
1462 	off_t saveoff;
1463 	int r;
1464 	int dtype;
1465 
1466 	++hammer_stats_file_iopsr;
1467 	ip = VTOI(ap->a_vp);
1468 	uio = ap->a_uio;
1469 	saveoff = uio->uio_offset;
1470 
1471 	if (ap->a_ncookies) {
1472 		ncookies = uio->uio_resid / 16 + 1;
1473 		if (ncookies > 1024)
1474 			ncookies = 1024;
1475 		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1476 		cookie_index = 0;
1477 	} else {
1478 		ncookies = -1;
1479 		cookies = NULL;
1480 		cookie_index = 0;
1481 	}
1482 
1483 	hammer_simple_transaction(&trans, ip->hmp);
1484 
1485 	/*
1486 	 * Handle artificial entries
1487 	 *
1488 	 * It should be noted that the minimum value for a directory
1489 	 * hash key on-media is 0x0000000100000000, so we can use anything
1490 	 * less than that to represent our 'special' key space.
1491 	 */
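	/*
	 * Illustratively: saveoff 0 emits ".", saveoff 1 emits "..", and
	 * real directory entries resume at their on-media hash keys, all
	 * of which are >= 0x0000000100000000.
	 */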
1492 	error = 0;
1493 	if (saveoff == 0) {
1494 		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1495 		if (r)
1496 			goto done;
1497 		if (cookies)
1498 			cookies[cookie_index] = saveoff;
1499 		++saveoff;
1500 		++cookie_index;
1501 		if (cookie_index == ncookies)
1502 			goto done;
1503 	}
1504 	if (saveoff == 1) {
1505 		if (ip->ino_data.parent_obj_id) {
1506 			r = vop_write_dirent(&error, uio,
1507 					     ip->ino_data.parent_obj_id,
1508 					     DT_DIR, 2, "..");
1509 		} else {
1510 			r = vop_write_dirent(&error, uio,
1511 					     ip->obj_id, DT_DIR, 2, "..");
1512 		}
1513 		if (r)
1514 			goto done;
1515 		if (cookies)
1516 			cookies[cookie_index] = saveoff;
1517 		++saveoff;
1518 		++cookie_index;
1519 		if (cookie_index == ncookies)
1520 			goto done;
1521 	}
1522 
1523 	/*
1524 	 * Key range (begin and end inclusive) to scan.  Directory keys
1525 	 * directly translate to a 64 bit 'seek' position.
1526 	 */
1527 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1528 	cursor.key_beg.localization = ip->obj_localization +
1529 				      hammer_dir_localization(ip);
1530 	cursor.key_beg.obj_id = ip->obj_id;
1531 	cursor.key_beg.create_tid = 0;
1532 	cursor.key_beg.delete_tid = 0;
1533 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1534 	cursor.key_beg.obj_type = 0;
1535 	cursor.key_beg.key = saveoff;
1536 
1537 	cursor.key_end = cursor.key_beg;
1538 	cursor.key_end.key = HAMMER_MAX_KEY;
1539 	cursor.asof = ip->obj_asof;
1540 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1541 
1542 	error = hammer_ip_first(&cursor);
1543 
1544 	while (error == 0) {
1545 		error = hammer_ip_resolve_data(&cursor);
1546 		if (error)
1547 			break;
1548 		base = &cursor.leaf->base;
1549 		saveoff = base->key;
1550 		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1551 
1552 		if (base->obj_id != ip->obj_id)
1553 			panic("readdir: bad record at %p", cursor.node);
1554 
1555 		/*
1556 		 * Convert pseudo-filesystems into softlinks
1557 		 */
1558 		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1559 		r = vop_write_dirent(
1560 			     &error, uio, cursor.data->entry.obj_id,
1561 			     dtype,
1562 			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1563 			     (void *)cursor.data->entry.name);
1564 		if (r)
1565 			break;
1566 		++saveoff;
1567 		if (cookies)
1568 			cookies[cookie_index] = base->key;
1569 		++cookie_index;
1570 		if (cookie_index == ncookies)
1571 			break;
1572 		error = hammer_ip_next(&cursor);
1573 	}
1574 	hammer_done_cursor(&cursor);
1575 
1576 done:
1577 	hammer_done_transaction(&trans);
1578 
1579 	if (ap->a_eofflag)
1580 		*ap->a_eofflag = (error == ENOENT);
1581 	uio->uio_offset = saveoff;
1582 	if (error && cookie_index == 0) {
1583 		if (error == ENOENT)
1584 			error = 0;
1585 		if (cookies) {
1586 			kfree(cookies, M_TEMP);
1587 			*ap->a_ncookies = 0;
1588 			*ap->a_cookies = NULL;
1589 		}
1590 	} else {
1591 		if (error == ENOENT)
1592 			error = 0;
1593 		if (cookies) {
1594 			*ap->a_ncookies = cookie_index;
1595 			*ap->a_cookies = cookies;
1596 		}
1597 	}
1598 	return(error);
1599 }
1600 
1601 /*
1602  * hammer_vop_readlink { vp, uio, cred }
1603  */
1604 static
1605 int
1606 hammer_vop_readlink(struct vop_readlink_args *ap)
1607 {
1608 	struct hammer_transaction trans;
1609 	struct hammer_cursor cursor;
1610 	struct hammer_inode *ip;
1611 	char buf[32];
1612 	u_int32_t localization;
1613 	hammer_pseudofs_inmem_t pfsm;
1614 	int error;
1615 
1616 	ip = VTOI(ap->a_vp);
1617 
1618 	/*
1619 	 * Shortcut if the symlink data was stuffed into ino_data.
1620 	 *
1621 	 * Also expand special "@@PFS%05d" softlinks (expansion only
1622 	 * occurs for non-historical (current) accesses made from the
1623 	 * primary filesystem).
1624 	 */
1625 	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1626 		char *ptr;
1627 		int bytes;
1628 
1629 		ptr = ip->ino_data.ext.symlink;
1630 		bytes = (int)ip->ino_data.size;
1631 		if (bytes == 10 &&
1632 		    ip->obj_asof == HAMMER_MAX_TID &&
1633 		    ip->obj_localization == 0 &&
1634 		    strncmp(ptr, "@@PFS", 5) == 0) {
1635 			hammer_simple_transaction(&trans, ip->hmp);
1636 			bcopy(ptr + 5, buf, 5);
1637 			buf[5] = 0;
1638 			localization = strtoul(buf, NULL, 10) << 16;
1639 			pfsm = hammer_load_pseudofs(&trans, localization,
1640 						    &error);
1641 			if (error == 0) {
1642 				if (pfsm->pfsd.mirror_flags &
1643 				    HAMMER_PFSD_SLAVE) {
1644 					/* vap->va_size == 26 */
1645 					ksnprintf(buf, sizeof(buf),
1646 						  "@@0x%016llx:%05d",
1647 						  (long long)pfsm->pfsd.sync_end_tid,
1648 						  localization >> 16);
1649 				} else {
1650 					/* vap->va_size == 10 */
1651 					ksnprintf(buf, sizeof(buf),
1652 						  "@@-1:%05d",
1653 						  localization >> 16);
1654 #if 0
1655 					ksnprintf(buf, sizeof(buf),
1656 						  "@@0x%016llx:%05d",
1657 						  (long long)HAMMER_MAX_TID,
1658 						  localization >> 16);
1659 #endif
1660 				}
1661 				ptr = buf;
1662 				bytes = strlen(buf);
1663 			}
1664 			if (pfsm)
1665 				hammer_rel_pseudofs(trans.hmp, pfsm);
1666 			hammer_done_transaction(&trans);
1667 		}
1668 		error = uiomove(ptr, bytes, ap->a_uio);
1669 		return(error);
1670 	}
1671 
1672 	/*
1673 	 * Long version
1674 	 */
1675 	hammer_simple_transaction(&trans, ip->hmp);
1676 	++hammer_stats_file_iopsr;
1677 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1678 
1679 	/*
1680 	 * Key range (begin and end inclusive) to scan.  Directory keys
1681 	 * directly translate to a 64 bit 'seek' position.
1682 	 */
1683 	cursor.key_beg.localization = ip->obj_localization +
1684 				      HAMMER_LOCALIZE_MISC;
1685 	cursor.key_beg.obj_id = ip->obj_id;
1686 	cursor.key_beg.create_tid = 0;
1687 	cursor.key_beg.delete_tid = 0;
1688 	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1689 	cursor.key_beg.obj_type = 0;
1690 	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1691 	cursor.asof = ip->obj_asof;
1692 	cursor.flags |= HAMMER_CURSOR_ASOF;
1693 
1694 	error = hammer_ip_lookup(&cursor);
1695 	if (error == 0) {
1696 		error = hammer_ip_resolve_data(&cursor);
1697 		if (error == 0) {
1698 			KKASSERT(cursor.leaf->data_len >=
1699 				 HAMMER_SYMLINK_NAME_OFF);
1700 			error = uiomove(cursor.data->symlink.name,
1701 					cursor.leaf->data_len -
1702 						HAMMER_SYMLINK_NAME_OFF,
1703 					ap->a_uio);
1704 		}
1705 	}
1706 	hammer_done_cursor(&cursor);
1707 	hammer_done_transaction(&trans);
1708 	return(error);
1709 }
1710 
1711 /*
1712  * hammer_vop_nremove { nch, dvp, cred }
1713  */
1714 static
1715 int
1716 hammer_vop_nremove(struct vop_nremove_args *ap)
1717 {
1718 	struct hammer_transaction trans;
1719 	struct hammer_inode *dip;
1720 	int error;
1721 
1722 	dip = VTOI(ap->a_dvp);
1723 
1724 	if (hammer_nohistory(dip) == 0 &&
1725 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1726 		return (error);
1727 	}
1728 
1729 	hammer_start_transaction(&trans, dip->hmp);
1730 	++hammer_stats_file_iopsw;
1731 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1732 	hammer_done_transaction(&trans);
1733 	if (error == 0)
1734 		hammer_knote(ap->a_dvp, NOTE_WRITE);
1735 	return (error);
1736 }
1737 
1738 /*
1739  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1740  */
1741 static
1742 int
1743 hammer_vop_nrename(struct vop_nrename_args *ap)
1744 {
1745 	struct hammer_transaction trans;
1746 	struct namecache *fncp;
1747 	struct namecache *tncp;
1748 	struct hammer_inode *fdip;
1749 	struct hammer_inode *tdip;
1750 	struct hammer_inode *ip;
1751 	struct hammer_cursor cursor;
1752 	int64_t namekey;
1753 	u_int32_t max_iterations;
1754 	int nlen, error;
1755 
1756 	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1757 		return(EXDEV);
1758 	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1759 		return(EXDEV);
1760 
1761 	fdip = VTOI(ap->a_fdvp);
1762 	tdip = VTOI(ap->a_tdvp);
1763 	fncp = ap->a_fnch->ncp;
1764 	tncp = ap->a_tnch->ncp;
1765 	ip = VTOI(fncp->nc_vp);
1766 	KKASSERT(ip != NULL);
1767 
1768 	if (fdip->obj_localization != tdip->obj_localization)
1769 		return(EXDEV);
1770 	if (fdip->obj_localization != ip->obj_localization)
1771 		return(EXDEV);
1772 
1773 	if (fdip->flags & HAMMER_INODE_RO)
1774 		return (EROFS);
1775 	if (tdip->flags & HAMMER_INODE_RO)
1776 		return (EROFS);
1777 	if (ip->flags & HAMMER_INODE_RO)
1778 		return (EROFS);
1779 	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1780 		return (error);
1781 
1782 	hammer_start_transaction(&trans, fdip->hmp);
1783 	++hammer_stats_file_iopsw;
1784 
1785 	/*
1786 	 * Remove tncp from the target directory and then link ip as
1787 	 * tncp. XXX pass trans to dounlink
1788 	 *
1789 	 * Force the inode sync-time to match the transaction so it is
1790 	 * in-sync with the creation of the target directory entry.
1791 	 */
1792 	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1793 				ap->a_cred, 0, -1);
1794 	if (error == 0 || error == ENOENT) {
1795 		error = hammer_ip_add_directory(&trans, tdip,
1796 						tncp->nc_name, tncp->nc_nlen,
1797 						ip);
1798 		if (error == 0) {
1799 			ip->ino_data.parent_obj_id = tdip->obj_id;
1800 			ip->ino_data.ctime = trans.time;
1801 			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1802 		}
1803 	}
1804 	if (error)
1805 		goto failed; /* XXX */
1806 
1807 	/*
1808 	 * Locate the record in the originating directory and remove it.
1809 	 *
1810 	 * Calculate the namekey and setup the key range for the scan.  This
1811 	 * works kinda like a chained hash table where the lower 32 bits
1812 	 * of the namekey synthesize the chain.
1813 	 *
1814 	 * The key range is inclusive of both key_beg and key_end.
1815 	 */
1816 	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1817 					   &max_iterations);
1818 retry:
1819 	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1820 	cursor.key_beg.localization = fdip->obj_localization +
1821 				      hammer_dir_localization(fdip);
1822 	cursor.key_beg.obj_id = fdip->obj_id;
1823 	cursor.key_beg.key = namekey;
1824 	cursor.key_beg.create_tid = 0;
1825 	cursor.key_beg.delete_tid = 0;
1826 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1827 	cursor.key_beg.obj_type = 0;
1828 
1829 	cursor.key_end = cursor.key_beg;
1830 	cursor.key_end.key += max_iterations;
1831 	cursor.asof = fdip->obj_asof;
1832 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1833 
1834 	/*
1835 	 * Scan all matching records (the chain), locate the one matching
1836 	 * the requested path component.
1837 	 *
1838 	 * The hammer_ip_*() functions merge in-memory records with on-disk
1839 	 * records for the purposes of the search.
1840 	 */
1841 	error = hammer_ip_first(&cursor);
1842 	while (error == 0) {
1843 		if (hammer_ip_resolve_data(&cursor) != 0)
1844 			break;
1845 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1846 		KKASSERT(nlen > 0);
1847 		if (fncp->nc_nlen == nlen &&
1848 		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1849 			break;
1850 		}
1851 		error = hammer_ip_next(&cursor);
1852 	}
1853 
1854 	/*
1855 	 * If all is ok we have to get the inode so we can adjust nlinks.
1856 	 *
1857 	 * WARNING: hammer_ip_del_directory() may have to terminate the
1858 	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1859 	 * twice.
1860 	 */
1861 	if (error == 0)
1862 		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1863 
1864 	/*
1865 	 * XXX A deadlock here will break rename's atomicity for the purposes
1866 	 * of crash recovery.
1867 	 */
1868 	if (error == EDEADLK) {
1869 		hammer_done_cursor(&cursor);
1870 		goto retry;
1871 	}
1872 
1873 	/*
1874 	 * Cleanup and tell the kernel that the rename succeeded.
1875 	 */
1876 	hammer_done_cursor(&cursor);
1877 	if (error == 0) {
1878 		cache_rename(ap->a_fnch, ap->a_tnch);
1879 		hammer_knote(ap->a_fdvp, NOTE_WRITE);
1880 		hammer_knote(ap->a_tdvp, NOTE_WRITE);
1881 		if (ip->vp)
1882 			hammer_knote(ip->vp, NOTE_RENAME);
1883 	}
1884 
1885 failed:
1886 	hammer_done_transaction(&trans);
1887 	return (error);
1888 }
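
/*
 * Illustrative sketch (not part of the build): hammer_vop_nrename() above
 * and hammer_dounlink() below share the same directory-entry lookup
 * pattern.  A distilled version, assuming a referenced directory inode
 * *dip and a name/len pair, would look roughly like this; the helper name
 * hammer_dirent_lookup_sketch is hypothetical.
 */
#if 0
static int
hammer_dirent_lookup_sketch(hammer_transaction_t trans, hammer_inode_t dip,
			    const char *name, int len,
			    struct hammer_cursor *cursor)
{
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen;
	int error;

	/*
	 * Hash the name.  The low 32 bits of the key synthesize the
	 * hash chain; max_iterations bounds the chain walk.
	 */
	namekey = hammer_directory_namekey(dip, name, len, &max_iterations);

	hammer_init_cursor(trans, cursor, &dip->cache[1], dip);
	cursor->key_beg.localization = dip->obj_localization +
				       hammer_dir_localization(dip);
	cursor->key_beg.obj_id = dip->obj_id;
	cursor->key_beg.key = namekey;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor->key_beg.obj_type = 0;
	cursor->key_end = cursor->key_beg;
	cursor->key_end.key += max_iterations;
	cursor->asof = dip->obj_asof;
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Walk the chain until the stored name matches exactly.  On
	 * success the cursor is left positioned on the entry.
	 */
	error = hammer_ip_first(cursor);
	while (error == 0) {
		if ((error = hammer_ip_resolve_data(cursor)) != 0)
			break;
		nlen = cursor->leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		if (len == nlen &&
		    bcmp(name, cursor->data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(cursor);
	}
	return (error);
}
#endif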
1889 
1890 /*
1891  * hammer_vop_nrmdir { nch, dvp, cred }
1892  */
1893 static
1894 int
1895 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1896 {
1897 	struct hammer_transaction trans;
1898 	struct hammer_inode *dip;
1899 	int error;
1900 
1901 	dip = VTOI(ap->a_dvp);
1902 
1903 	if (hammer_nohistory(dip) == 0 &&
1904 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1905 		return (error);
1906 	}
1907 
1908 	hammer_start_transaction(&trans, dip->hmp);
1909 	++hammer_stats_file_iopsw;
1910 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1911 	hammer_done_transaction(&trans);
1912 	if (error == 0)
1913 		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1914 	return (error);
1915 }
1916 
1917 /*
1918  * hammer_vop_markatime { vp, cred }
1919  */
1920 static
1921 int
1922 hammer_vop_markatime(struct vop_markatime_args *ap)
1923 {
1924 	struct hammer_transaction trans;
1925 	struct hammer_inode *ip;
1926 
1927 	ip = VTOI(ap->a_vp);
1928 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1929 		return (EROFS);
1930 	if (ip->flags & HAMMER_INODE_RO)
1931 		return (EROFS);
1932 	if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1933 		return (0);
1934 	hammer_start_transaction(&trans, ip->hmp);
1935 	++hammer_stats_file_iopsw;
1936 
1937 	ip->ino_data.atime = trans.time;
1938 	hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1939 	hammer_done_transaction(&trans);
1940 	hammer_knote(ap->a_vp, NOTE_ATTRIB);
1941 	return (0);
1942 }
1943 
1944 /*
1945  * hammer_vop_setattr { vp, vap, cred }
1946  */
1947 static
1948 int
1949 hammer_vop_setattr(struct vop_setattr_args *ap)
1950 {
1951 	struct hammer_transaction trans;
1952 	struct vattr *vap;
1953 	struct hammer_inode *ip;
1954 	int modflags;
1955 	int error;
1956 	int truncating;
1957 	int blksize;
1958 	int kflags;
1959 	int64_t aligned_size;
1960 	u_int32_t flags;
1961 
1962 	vap = ap->a_vap;
1963 	ip = ap->a_vp->v_data;
1964 	modflags = 0;
1965 	kflags = 0;
1966 
1967 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1968 		return(EROFS);
1969 	if (ip->flags & HAMMER_INODE_RO)
1970 		return (EROFS);
1971 	if (hammer_nohistory(ip) == 0 &&
1972 	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1973 		return (error);
1974 	}
1975 
1976 	hammer_start_transaction(&trans, ip->hmp);
1977 	++hammer_stats_file_iopsw;
1978 	error = 0;
1979 
1980 	if (vap->va_flags != VNOVAL) {
1981 		flags = ip->ino_data.uflags;
1982 		error = vop_helper_setattr_flags(&flags, vap->va_flags,
1983 					 hammer_to_unix_xid(&ip->ino_data.uid),
1984 					 ap->a_cred);
1985 		if (error == 0) {
1986 			if (ip->ino_data.uflags != flags) {
1987 				ip->ino_data.uflags = flags;
1988 				ip->ino_data.ctime = trans.time;
1989 				modflags |= HAMMER_INODE_DDIRTY;
1990 				kflags |= NOTE_ATTRIB;
1991 			}
1992 			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1993 				error = 0;
1994 				goto done;
1995 			}
1996 		}
1997 		goto done;
1998 	}
1999 	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2000 		error = EPERM;
2001 		goto done;
2002 	}
2003 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2004 		mode_t cur_mode = ip->ino_data.mode;
2005 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2006 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2007 		uuid_t uuid_uid;
2008 		uuid_t uuid_gid;
2009 
2010 		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2011 					 ap->a_cred,
2012 					 &cur_uid, &cur_gid, &cur_mode);
2013 		if (error == 0) {
2014 			hammer_guid_to_uuid(&uuid_uid, cur_uid);
2015 			hammer_guid_to_uuid(&uuid_gid, cur_gid);
2016 			if (bcmp(&uuid_uid, &ip->ino_data.uid,
2017 				 sizeof(uuid_uid)) ||
2018 			    bcmp(&uuid_gid, &ip->ino_data.gid,
2019 				 sizeof(uuid_gid)) ||
2020 			    ip->ino_data.mode != cur_mode
2021 			) {
2022 				ip->ino_data.uid = uuid_uid;
2023 				ip->ino_data.gid = uuid_gid;
2024 				ip->ino_data.mode = cur_mode;
2025 				ip->ino_data.ctime = trans.time;
2026 				modflags |= HAMMER_INODE_DDIRTY;
2027 			}
2028 			kflags |= NOTE_ATTRIB;
2029 		}
2030 	}
2031 	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2032 		switch(ap->a_vp->v_type) {
2033 		case VREG:
2034 			if (vap->va_size == ip->ino_data.size)
2035 				break;
2036 			/*
2037 			 * XXX break atomicity, we can deadlock the backend
2038 			 * if we do not release the lock.  Probably not a
2039 			 * big deal here.
2040 			 */
2041 			blksize = hammer_blocksize(vap->va_size);
2042 			if (vap->va_size < ip->ino_data.size) {
2043 				vtruncbuf(ap->a_vp, vap->va_size, blksize);
2044 				truncating = 1;
2045 				kflags |= NOTE_WRITE;
2046 			} else {
2047 				vnode_pager_setsize(ap->a_vp, vap->va_size);
2048 				truncating = 0;
2049 				kflags |= NOTE_WRITE | NOTE_EXTEND;
2050 			}
2051 			ip->ino_data.size = vap->va_size;
2052 			ip->ino_data.mtime = trans.time;
2053 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2054 
2055 			/*
2056 			 * On-media truncation is cached in the inode until
2057 			 * the inode is synchronized.
2058 			 */
2059 			if (truncating) {
2060 				hammer_ip_frontend_trunc(ip, vap->va_size);
2061 #ifdef DEBUG_TRUNCATE
2062 				if (HammerTruncIp == NULL)
2063 					HammerTruncIp = ip;
2064 #endif
2065 				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2066 					ip->flags |= HAMMER_INODE_TRUNCATED;
2067 					ip->trunc_off = vap->va_size;
2068 #ifdef DEBUG_TRUNCATE
2069 					if (ip == HammerTruncIp)
2070 					kprintf("truncate1 %016llx\n",
2071 						(long long)ip->trunc_off);
2072 #endif
2073 				} else if (ip->trunc_off > vap->va_size) {
2074 					ip->trunc_off = vap->va_size;
2075 #ifdef DEBUG_TRUNCATE
2076 					if (ip == HammerTruncIp)
2077 					kprintf("truncate2 %016llx\n",
2078 						(long long)ip->trunc_off);
2079 #endif
2080 				} else {
2081 #ifdef DEBUG_TRUNCATE
2082 					if (ip == HammerTruncIp)
2083 					kprintf("truncate3 %016llx (ignored)\n",
2084 						(long long)vap->va_size);
2085 #endif
2086 				}
2087 			}
2088 
2089 			/*
2090 			 * If truncating we have to clean out a portion of
2091 			 * the last block on-disk.  We do this in the
2092 			 * front-end buffer cache.
2093 			 */
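			/*
			 * Worked example (illustrative): with blksize
			 * 16384 and va_size 20000, aligned_size first
			 * computes to 32768, the adjustment below drops
			 * it to 16384, offset = 20000 & 16383 = 3616,
			 * and bytes [3616, 16384) of the final block
			 * are zeroed.
			 */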
2094 			aligned_size = (vap->va_size + (blksize - 1)) &
2095 				       ~(int64_t)(blksize - 1);
2096 			if (truncating && vap->va_size < aligned_size) {
2097 				struct buf *bp;
2098 				int offset;
2099 
2100 				aligned_size -= blksize;
2101 
2102 				offset = (int)vap->va_size & (blksize - 1);
2103 				error = bread(ap->a_vp, aligned_size,
2104 					      blksize, &bp);
2105 				hammer_ip_frontend_trunc(ip, aligned_size);
2106 				if (error == 0) {
2107 					bzero(bp->b_data + offset,
2108 					      blksize - offset);
2109 					/* must de-cache direct-io offset */
2110 					bp->b_bio2.bio_offset = NOOFFSET;
2111 					bdwrite(bp);
2112 				} else {
2113 					kprintf("ERROR %d\n", error);
2114 					brelse(bp);
2115 				}
2116 			}
2117 			break;
2118 		case VDATABASE:
2119 			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2120 				ip->flags |= HAMMER_INODE_TRUNCATED;
2121 				ip->trunc_off = vap->va_size;
2122 			} else if (ip->trunc_off > vap->va_size) {
2123 				ip->trunc_off = vap->va_size;
2124 			}
2125 			hammer_ip_frontend_trunc(ip, vap->va_size);
2126 			ip->ino_data.size = vap->va_size;
2127 			ip->ino_data.mtime = trans.time;
2128 			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2129 			kflags |= NOTE_ATTRIB;
2130 			break;
2131 		default:
2132 			error = EINVAL;
2133 			goto done;
2134 		}
2135 		break;
2136 	}
2137 	if (vap->va_atime.tv_sec != VNOVAL) {
2138 		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2139 		modflags |= HAMMER_INODE_ATIME;
2140 		kflags |= NOTE_ATTRIB;
2141 	}
2142 	if (vap->va_mtime.tv_sec != VNOVAL) {
2143 		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2144 		modflags |= HAMMER_INODE_MTIME;
2145 		kflags |= NOTE_ATTRIB;
2146 	}
2147 	if (vap->va_mode != (mode_t)VNOVAL) {
2148 		mode_t   cur_mode = ip->ino_data.mode;
2149 		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2150 		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2151 
2152 		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2153 					 cur_uid, cur_gid, &cur_mode);
2154 		if (error == 0 && ip->ino_data.mode != cur_mode) {
2155 			ip->ino_data.mode = cur_mode;
2156 			ip->ino_data.ctime = trans.time;
2157 			modflags |= HAMMER_INODE_DDIRTY;
2158 			kflags |= NOTE_ATTRIB;
2159 		}
2160 	}
2161 done:
2162 	if (error == 0)
2163 		hammer_modify_inode(ip, modflags);
2164 	hammer_done_transaction(&trans);
2165 	hammer_knote(ap->a_vp, kflags);
2166 	return (error);
2167 }
2168 
2169 /*
2170  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2171  */
2172 static
2173 int
2174 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2175 {
2176 	struct hammer_transaction trans;
2177 	struct hammer_inode *dip;
2178 	struct hammer_inode *nip;
2179 	struct nchandle *nch;
2180 	hammer_record_t record;
2181 	int error;
2182 	int bytes;
2183 
2184 	ap->a_vap->va_type = VLNK;
2185 
2186 	nch = ap->a_nch;
2187 	dip = VTOI(ap->a_dvp);
2188 
2189 	if (dip->flags & HAMMER_INODE_RO)
2190 		return (EROFS);
2191 	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
2192 		return (error);
2193 
2194 	/*
2195 	 * Create a transaction to cover the operations we perform.
2196 	 */
2197 	hammer_start_transaction(&trans, dip->hmp);
2198 	++hammer_stats_file_iopsw;
2199 
2200 	/*
2201 	 * Create a new filesystem object of the requested type.  The
2202 	 * returned inode will be referenced but not locked.
2203 	 */
2204 
2205 	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2206 				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2207 				    NULL, &nip);
2208 	if (error) {
2209 		hammer_done_transaction(&trans);
2210 		*ap->a_vpp = NULL;
2211 		return (error);
2212 	}
2213 
2214 	/*
2215 	 * Add a record representing the symlink.  The symlink target is
2216 	 * stored as pure data, not a string, and is not \0-terminated.
2217 	 */
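	/*
	 * For example (illustrative): a short target such as "subdir"
	 * fits in the HAMMER_INODE_BASESYMLEN inline area of the inode,
	 * while a longer target gets a HAMMER_RECTYPE_FIX record keyed
	 * by HAMMER_FIXKEY_SYMLINK, as set up below.
	 */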
2218 	if (error == 0) {
2219 		bytes = strlen(ap->a_target);
2220 
2221 		if (bytes <= HAMMER_INODE_BASESYMLEN) {
2222 			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2223 		} else {
2224 			record = hammer_alloc_mem_record(nip, bytes);
2225 			record->type = HAMMER_MEM_RECORD_GENERAL;
2226 
2227 			record->leaf.base.localization = nip->obj_localization +
2228 							 HAMMER_LOCALIZE_MISC;
2229 			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2230 			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2231 			record->leaf.data_len = bytes;
2232 			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2233 			bcopy(ap->a_target, record->data->symlink.name, bytes);
2234 			error = hammer_ip_add_record(&trans, record);
2235 		}
2236 
2237 		/*
2238 		 * Set the file size to the length of the link.
2239 		 */
2240 		if (error == 0) {
2241 			nip->ino_data.size = bytes;
2242 			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
2243 		}
2244 	}
2245 	if (error == 0)
2246 		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2247 						nch->ncp->nc_nlen, nip);
2248 
2249 	/*
2250 	 * Finish up.
2251 	 */
2252 	if (error) {
2253 		hammer_rel_inode(nip, 0);
2254 		*ap->a_vpp = NULL;
2255 	} else {
2256 		error = hammer_get_vnode(nip, ap->a_vpp);
2257 		hammer_rel_inode(nip, 0);
2258 		if (error == 0) {
2259 			cache_setunresolved(ap->a_nch);
2260 			cache_setvp(ap->a_nch, *ap->a_vpp);
2261 			hammer_knote(ap->a_dvp, NOTE_WRITE);
2262 		}
2263 	}
2264 	hammer_done_transaction(&trans);
2265 	return (error);
2266 }
2267 
2268 /*
2269  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2270  */
2271 static
2272 int
2273 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2274 {
2275 	struct hammer_transaction trans;
2276 	struct hammer_inode *dip;
2277 	int error;
2278 
2279 	dip = VTOI(ap->a_dvp);
2280 
2281 	if (hammer_nohistory(dip) == 0 &&
2282 	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2283 		return (error);
2284 	}
2285 
2286 	hammer_start_transaction(&trans, dip->hmp);
2287 	++hammer_stats_file_iopsw;
2288 	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2289 				ap->a_cred, ap->a_flags, -1);
2290 	hammer_done_transaction(&trans);
2291 
2292 	return (error);
2293 }
2294 
2295 /*
2296  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2297  */
2298 static
2299 int
2300 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2301 {
2302 	struct hammer_inode *ip = ap->a_vp->v_data;
2303 
2304 	++hammer_stats_file_iopsr;
2305 	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2306 			    ap->a_fflag, ap->a_cred));
2307 }
2308 
2309 static
2310 int
2311 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2312 {
2313 	static const struct mountctl_opt extraopt[] = {
2314 		{ HMNT_NOHISTORY, 	"nohistory" },
2315 		{ HMNT_MASTERID,	"master" },
2316 		{ 0, NULL}
2318 	};
2319 	struct hammer_mount *hmp;
2320 	struct mount *mp;
2321 	int usedbytes;
2322 	int error;
2323 
2324 	error = 0;
2325 	usedbytes = 0;
2326 	mp = ap->a_head.a_ops->head.vv_mount;
2327 	KKASSERT(mp->mnt_data != NULL);
2328 	hmp = (struct hammer_mount *)mp->mnt_data;
2329 
2330 	switch(ap->a_op) {
2331 
2332 	case MOUNTCTL_SET_EXPORT:
2333 		if (ap->a_ctllen != sizeof(struct export_args))
2334 			error = EINVAL;
2335 		else
2336 			error = hammer_vfs_export(mp, ap->a_op,
2337 				      (const struct export_args *)ap->a_ctl);
2338 		break;
2339 	case MOUNTCTL_MOUNTFLAGS:
2340 	{
2341 		/*
2342 		 * Call standard mountctl VOP function
2343 		 * so we get user mount flags.
2344 		 */
2345 		error = vop_stdmountctl(ap);
2346 		if (error)
2347 			break;
2348 
2349 		usedbytes = *ap->a_res;
2350 
2351 		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2352 			usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
2353 						    ap->a_buflen - usedbytes,
2354 						    &error);
2355 		}
2356 
2357 		*ap->a_res += usedbytes;
2358 		break;
2359 	}
2360 	default:
2361 		error = vop_stdmountctl(ap);
2362 		break;
2363 	}
2364 	return(error);
2365 }
2366 
2367 /*
2368  * hammer_vop_strategy { vp, bio }
2369  *
2370  * Strategy call, used for regular file read & write only.  Note that the
2371  * bp may represent a cluster.
2372  *
2373  * To simplify operation and allow better optimizations in the future,
2374  * this code does not make any assumptions with regards to buffer alignment
2375  * or size.
2376  */
2377 static
2378 int
2379 hammer_vop_strategy(struct vop_strategy_args *ap)
2380 {
2381 	struct buf *bp;
2382 	int error;
2383 
2384 	bp = ap->a_bio->bio_buf;
2385 
2386 	switch(bp->b_cmd) {
2387 	case BUF_CMD_READ:
2388 		error = hammer_vop_strategy_read(ap);
2389 		break;
2390 	case BUF_CMD_WRITE:
2391 		error = hammer_vop_strategy_write(ap);
2392 		break;
2393 	default:
2394 		bp->b_error = error = EINVAL;
2395 		bp->b_flags |= B_ERROR;
2396 		biodone(ap->a_bio);
2397 		break;
2398 	}
2399 	return (error);
2400 }
2401 
2402 /*
2403  * Read from a regular file.  Iterate the related records and fill in the
2404  * BIO/BUF.  Gaps are zero-filled.
2405  *
2406  * The support code in hammer_object.c should be used to deal with mixed
2407  * in-memory and on-disk records.
2408  *
2409  * NOTE: Can be called from the cluster code with an oversized buf.
2410  *
2411  * XXX atime update
2412  */
2413 static
2414 int
2415 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2416 {
2417 	struct hammer_transaction trans;
2418 	struct hammer_inode *ip;
2419 	struct hammer_inode *dip;
2420 	struct hammer_cursor cursor;
2421 	hammer_base_elm_t base;
2422 	hammer_off_t disk_offset;
2423 	struct bio *bio;
2424 	struct bio *nbio;
2425 	struct buf *bp;
2426 	int64_t rec_offset;
2427 	int64_t ran_end;
2428 	int64_t tmp64;
2429 	int error;
2430 	int boff;
2431 	int roff;
2432 	int n;
2433 
2434 	bio = ap->a_bio;
2435 	bp = bio->bio_buf;
2436 	ip = ap->a_vp->v_data;
2437 
2438 	/*
2439 	 * The zone-2 disk offset may have been set by the cluster code via
2440 	 * a BMAP operation, or else should be NOOFFSET.
2441 	 *
2442 	 * Checking the high bits for a match against zone-2 should suffice.
2443 	 */
2444 	nbio = push_bio(bio);
2445 	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2446 	    HAMMER_ZONE_LARGE_DATA) {
2447 		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2448 		return (error);
2449 	}
2450 
2451 	/*
2452 	 * Well, that sucked.  Do it the hard way.  If all the stars are
2453 	 * aligned we may still be able to issue a direct-read.
2454 	 */
2455 	hammer_simple_transaction(&trans, ip->hmp);
2456 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2457 
2458 	/*
2459 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2460 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2461 	 * first record containing bio_offset will have a key > bio_offset.
2462 	 */
2463 	cursor.key_beg.localization = ip->obj_localization +
2464 				      HAMMER_LOCALIZE_MISC;
2465 	cursor.key_beg.obj_id = ip->obj_id;
2466 	cursor.key_beg.create_tid = 0;
2467 	cursor.key_beg.delete_tid = 0;
2468 	cursor.key_beg.obj_type = 0;
2469 	cursor.key_beg.key = bio->bio_offset + 1;
2470 	cursor.asof = ip->obj_asof;
2471 	cursor.flags |= HAMMER_CURSOR_ASOF;
2472 
2473 	cursor.key_end = cursor.key_beg;
2474 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2475 #if 0
2476 	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2477 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2478 		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2479 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2480 	} else
2481 #endif
2482 	{
2483 		ran_end = bio->bio_offset + bp->b_bufsize;
2484 		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2485 		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2486 		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2487 		if (tmp64 < ran_end)
2488 			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2489 		else
2490 			cursor.key_end.key = ran_end + MAXPHYS + 1;
2491 	}
2492 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2493 
2494 	error = hammer_ip_first(&cursor);
2495 	boff = 0;
2496 
2497 	while (error == 0) {
2498 		/*
2499 		 * Get the base file offset of the record.  The key for
2500 		 * data records is (base + bytes) rather than (base).
2501 		 */
2502 		base = &cursor.leaf->base;
2503 		rec_offset = base->key - cursor.leaf->data_len;
2504 
2505 		/*
2506 		 * Calculate the gap, if any, and zero-fill it.
2507 		 *
2508 		 * n is the offset of the start of the record versus our
2509 		 * current seek offset in the bio.
2510 		 */
2511 		n = (int)(rec_offset - (bio->bio_offset + boff));
2512 		if (n > 0) {
2513 			if (n > bp->b_bufsize - boff)
2514 				n = bp->b_bufsize - boff;
2515 			bzero((char *)bp->b_data + boff, n);
2516 			boff += n;
2517 			n = 0;
2518 		}
2519 
2520 		/*
2521 		 * Calculate the data offset in the record and the number
2522 		 * of bytes we can copy.
2523 		 *
2524 		 * There are two degenerate cases.  First, boff may already
2525 		 * be at bp->b_bufsize.  Secondly, the data offset within
2526 		 * the record may exceed the record's size.
2527 		 */
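		/*
		 * Worked example (illustrative): a 16384-byte record
		 * based at file offset 32768 carries key 49152.  If the
		 * current seek point bio_offset + boff is 40960, n above
		 * came out to -8192, so roff = 8192, rec_offset advances
		 * to 40960, and n = 16384 - 8192 = 8192 bytes are
		 * copyable (clipped below to the space left in the bp).
		 */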
2528 		roff = -n;
2529 		rec_offset += roff;
2530 		n = cursor.leaf->data_len - roff;
2531 		if (n <= 0) {
2532 			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2533 			n = 0;
2534 		} else if (n > bp->b_bufsize - boff) {
2535 			n = bp->b_bufsize - boff;
2536 		}
2537 
2538 		/*
2539 		 * Deal with cached truncations.  This cool bit of code
2540 		 * allows truncate()/ftruncate() to avoid having to sync
2541 		 * the file.
2542 		 *
2543 		 * If the frontend is truncated then all backend records are
2544 		 * subject to the frontend's truncation.
2545 		 *
2546 		 * If the backend is truncated then backend records on-disk
2547 		 * (but not in-memory) are subject to the backend's
2548 		 * truncation.  In-memory records owned by the backend
2549 		 * represent data written after the truncation point on the
2550 		 * backend and must not be truncated.
2551 		 *
2552 		 * Truncate operations deal with frontend buffer cache
2553 		 * buffers and frontend-owned in-memory records synchronously.
2554 		 */
2555 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2556 			if (hammer_cursor_ondisk(&cursor) ||
2557 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2558 				if (ip->trunc_off <= rec_offset)
2559 					n = 0;
2560 				else if (ip->trunc_off < rec_offset + n)
2561 					n = (int)(ip->trunc_off - rec_offset);
2562 			}
2563 		}
2564 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2565 			if (hammer_cursor_ondisk(&cursor)) {
2566 				if (ip->sync_trunc_off <= rec_offset)
2567 					n = 0;
2568 				else if (ip->sync_trunc_off < rec_offset + n)
2569 					n = (int)(ip->sync_trunc_off - rec_offset);
2570 			}
2571 		}
2572 
2573 		/*
2574 		 * Try to issue a direct read into our bio if possible,
2575 		 * otherwise resolve the element data into a hammer_buffer
2576 		 * and copy.
2577 		 *
2578 		 * The buffer on-disk should be zeroed past any real
2579 		 * truncation point, but may not be for any synthesized
2580 		 * truncation point from above.
2581 		 */
2582 		disk_offset = cursor.leaf->data_offset + roff;
2583 		if (boff == 0 && n == bp->b_bufsize &&
2584 		    hammer_cursor_ondisk(&cursor) &&
2585 		    (disk_offset & HAMMER_BUFMASK) == 0) {
2586 			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2587 				 HAMMER_ZONE_LARGE_DATA);
2588 			nbio->bio_offset = disk_offset;
2589 			error = hammer_io_direct_read(trans.hmp, nbio,
2590 						      cursor.leaf);
2591 			goto done;
2592 		} else if (n) {
2593 			error = hammer_ip_resolve_data(&cursor);
2594 			if (error == 0) {
2595 				bcopy((char *)cursor.data + roff,
2596 				      (char *)bp->b_data + boff, n);
2597 			}
2598 		}
2599 		if (error)
2600 			break;
2601 
2602 		/*
2603 		 * Iterate until we have filled the request.
2604 		 */
2605 		boff += n;
2606 		if (boff == bp->b_bufsize)
2607 			break;
2608 		error = hammer_ip_next(&cursor);
2609 	}
2610 
2611 	/*
2612 	 * There may have been a gap after the last record.
2613 	 */
2614 	if (error == ENOENT)
2615 		error = 0;
2616 	if (error == 0 && boff != bp->b_bufsize) {
2617 		KKASSERT(boff < bp->b_bufsize);
2618 		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2619 		/* boff = bp->b_bufsize; */
2620 	}
2621 	bp->b_resid = 0;
2622 	bp->b_error = error;
2623 	if (error)
2624 		bp->b_flags |= B_ERROR;
2625 	biodone(ap->a_bio);
2626 
2627 done:
2628 	/*
2629 	 * Cache the b-tree node for the last data read in cache[1].
2630 	 *
2631 	 * If we hit the file EOF then also cache the node in the
2632 	 * governing directory's cache[3]; it will be used to initialize
2633 	 * the inode's cache[1] for any inodes looked up via the directory.
2634 	 *
2635 	 * This doesn't reduce disk accesses since the B-Tree chain is
2636 	 * likely cached, but it does reduce cpu overhead when looking
2637 	 * up file offsets for cpdup/tar/cpio style iterations.
2638 	 */
2639 	if (cursor.node)
2640 		hammer_cache_node(&ip->cache[1], cursor.node);
2641 	if (ran_end >= ip->ino_data.size) {
2642 		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2643 					ip->obj_asof, ip->obj_localization);
2644 		if (dip) {
2645 			hammer_cache_node(&dip->cache[3], cursor.node);
2646 			hammer_rel_inode(dip, 0);
2647 		}
2648 	}
2649 	hammer_done_cursor(&cursor);
2650 	hammer_done_transaction(&trans);
2651 	return(error);
2652 }
2653 
2654 /*
2655  * BMAP operation - used to support cluster_read() only.
2656  *
2657  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2658  *
2659  * This routine may return EOPNOTSUPP if the operation is not supported for
2660  * the specified offset.  The contents of the pointer arguments do not
2661  * need to be initialized in that case.
2662  *
2663  * If a disk address is available and properly aligned return 0 with
2664  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2665  * to the run-length relative to that offset.  Callers may assume that
2666  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2667  * large, so return EOPNOTSUPP if it is not sufficiently large.
2668  */
2669 static
2670 int
2671 hammer_vop_bmap(struct vop_bmap_args *ap)
2672 {
2673 	struct hammer_transaction trans;
2674 	struct hammer_inode *ip;
2675 	struct hammer_cursor cursor;
2676 	hammer_base_elm_t base;
2677 	int64_t rec_offset;
2678 	int64_t ran_end;
2679 	int64_t tmp64;
2680 	int64_t base_offset;
2681 	int64_t base_disk_offset;
2682 	int64_t last_offset;
2683 	hammer_off_t last_disk_offset;
2684 	hammer_off_t disk_offset;
2685 	int	rec_len;
2686 	int	error;
2687 	int	blksize;
2688 
2689 	++hammer_stats_file_iopsr;
2690 	ip = ap->a_vp->v_data;
2691 
2692 	/*
2693 	 * We can only BMAP regular files.  We can't BMAP database files,
2694 	 * directories, etc.
2695 	 */
2696 	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2697 		return(EOPNOTSUPP);
2698 
2699 	/*
2700 	 * bmap is typically called with runp/runb both NULL when used
2701 	 * for writing.  We do not support BMAP for writing atm.
2702 	 */
2703 	if (ap->a_cmd != BUF_CMD_READ)
2704 		return(EOPNOTSUPP);
2705 
2706 	/*
2707 	 * Scan the B-Tree to acquire blockmap addresses, then translate
2708 	 * to raw addresses.
2709 	 */
2710 	hammer_simple_transaction(&trans, ip->hmp);
2711 #if 0
2712 	kprintf("bmap_beg %016llx ip->cache %p\n",
2713 		(long long)ap->a_loffset, ip->cache[1]);
2714 #endif
2715 	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2716 
2717 	/*
2718 	 * Key range (begin and end inclusive) to scan.  Note that the keys
2719 	 * stored in the actual records represent BASE+LEN, not BASE.  The
2720 	 * first record containing bio_offset will have a key > bio_offset.
2721 	 */
2722 	cursor.key_beg.localization = ip->obj_localization +
2723 				      HAMMER_LOCALIZE_MISC;
2724 	cursor.key_beg.obj_id = ip->obj_id;
2725 	cursor.key_beg.create_tid = 0;
2726 	cursor.key_beg.delete_tid = 0;
2727 	cursor.key_beg.obj_type = 0;
2728 	if (ap->a_runb)
2729 		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2730 	else
2731 		cursor.key_beg.key = ap->a_loffset + 1;
2732 	if (cursor.key_beg.key < 0)
2733 		cursor.key_beg.key = 0;
2734 	cursor.asof = ip->obj_asof;
2735 	cursor.flags |= HAMMER_CURSOR_ASOF;
2736 
2737 	cursor.key_end = cursor.key_beg;
2738 	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2739 
2740 	ran_end = ap->a_loffset + MAXPHYS;
2741 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2742 	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2743 	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
2744 	if (tmp64 < ran_end)
2745 		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2746 	else
2747 		cursor.key_end.key = ran_end + MAXPHYS + 1;
2748 
2749 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2750 
2751 	error = hammer_ip_first(&cursor);
2752 	base_offset = last_offset = 0;
2753 	base_disk_offset = last_disk_offset = 0;
2754 
2755 	while (error == 0) {
2756 		/*
2757 		 * Get the base file offset of the record.  The key for
2758 		 * data records is (base + bytes) rather than (base).
2759 		 *
2760 		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2761 		 * The extra bytes should be zero on-disk and the BMAP op
2762 		 * should still be ok.
2763 		 */
2764 		base = &cursor.leaf->base;
2765 		rec_offset = base->key - cursor.leaf->data_len;
2766 		rec_len    = cursor.leaf->data_len;
2767 
2768 		/*
2769 		 * Incorporate any cached truncation.
2770 		 *
2771 		 * NOTE: Modifications to rec_len based on synthesized
2772 		 * truncation points remove the guarantee that any extended
2773 		 * data on disk is zero (since the truncations may not have
2774 		 * taken place on-media yet).
2775 		 */
2776 		if (ip->flags & HAMMER_INODE_TRUNCATED) {
2777 			if (hammer_cursor_ondisk(&cursor) ||
2778 			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2779 				if (ip->trunc_off <= rec_offset)
2780 					rec_len = 0;
2781 				else if (ip->trunc_off < rec_offset + rec_len)
2782 					rec_len = (int)(ip->trunc_off - rec_offset);
2783 			}
2784 		}
2785 		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2786 			if (hammer_cursor_ondisk(&cursor)) {
2787 				if (ip->sync_trunc_off <= rec_offset)
2788 					rec_len = 0;
2789 				else if (ip->sync_trunc_off < rec_offset + rec_len)
2790 					rec_len = (int)(ip->sync_trunc_off - rec_offset);
2791 			}
2792 		}
2793 
2794 		/*
2795 		 * Accumulate information.  If we have hit a discontiguous
2796 		 * block, reset base_offset unless we are already beyond the
2797 		 * requested offset.  If we are, that's it, we stop.
2798 		 */
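		/*
		 * Illustrative example: two records covering file ranges
		 * [0,16384) and [16384,32768) whose disk data areas are
		 * also adjacent keep extending last_offset and
		 * last_disk_offset, forming one run; a record whose
		 * disk_offset does not equal last_disk_offset restarts
		 * the run at base_offset = rec_offset instead.
		 */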
2799 		if (error)
2800 			break;
2801 		if (hammer_cursor_ondisk(&cursor)) {
2802 			disk_offset = cursor.leaf->data_offset;
2803 			if (rec_offset != last_offset ||
2804 			    disk_offset != last_disk_offset) {
2805 				if (rec_offset > ap->a_loffset)
2806 					break;
2807 				base_offset = rec_offset;
2808 				base_disk_offset = disk_offset;
2809 			}
2810 			last_offset = rec_offset + rec_len;
2811 			last_disk_offset = disk_offset + rec_len;
2812 		}
2813 		error = hammer_ip_next(&cursor);
2814 	}
2815 
2816 #if 0
2817 	kprintf("BMAP %016llx:  %016llx - %016llx\n",
2818 		(long long)ap->a_loffset,
2819 		(long long)base_offset,
2820 		(long long)last_offset);
2821 	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
2822 		(long long)base_disk_offset,
2823 		(long long)last_disk_offset);
2824 #endif
2825 
2826 	if (cursor.node) {
2827 		hammer_cache_node(&ip->cache[1], cursor.node);
2828 #if 0
2829 		kprintf("bmap_end2 %016llx ip->cache %p\n",
2830 			(long long)ap->a_loffset, ip->cache[1]);
2831 #endif
2832 	}
2833 	hammer_done_cursor(&cursor);
2834 	hammer_done_transaction(&trans);
2835 
2836 	/*
2837 	 * If we couldn't find any records or the records we did find were
2838 	 * all behind the requested offset, return failure.  A forward
2839 	 * truncation can leave a hole w/ no on-disk records.
2840 	 */
2841 	if (last_offset == 0 || last_offset < ap->a_loffset)
2842 		return (EOPNOTSUPP);
2843 
2844 	/*
2845 	 * Figure out the block size at the requested offset and adjust
2846 	 * our limits so the cluster_read() does not create inappropriately
2847 	 * sized buffer cache buffers.
2848 	 */
2849 	blksize = hammer_blocksize(ap->a_loffset);
2850 	if (hammer_blocksize(base_offset) != blksize) {
2851 		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2852 	}
2853 	if (last_offset != ap->a_loffset &&
2854 	    hammer_blocksize(last_offset - 1) != blksize) {
2855 		last_offset = hammer_blockdemarc(ap->a_loffset,
2856 						 last_offset - 1);
2857 	}
2858 
2859 	/*
2860 	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2861 	 * from occurring.
2862 	 */
2863 	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2864 
2865 	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2866 		/*
2867 		 * Only large-data zones can be direct-IOd
2868 		 */
2869 		error = EOPNOTSUPP;
2870 	} else if ((disk_offset & HAMMER_BUFMASK) ||
2871 		   (last_offset - ap->a_loffset) < blksize) {
2872 		/*
2873 		 * doffsetp is not aligned or the forward run size does
2874 		 * not cover a whole buffer, disallow the direct I/O.
2875 		 */
2876 		error = EOPNOTSUPP;
2877 	} else {
2878 		/*
2879 		 * We're good.
2880 		 */
2881 		*ap->a_doffsetp = disk_offset;
2882 		if (ap->a_runb) {
2883 			*ap->a_runb = ap->a_loffset - base_offset;
2884 			KKASSERT(*ap->a_runb >= 0);
2885 		}
2886 		if (ap->a_runp) {
2887 			*ap->a_runp = last_offset - ap->a_loffset;
2888 			KKASSERT(*ap->a_runp >= 0);
2889 		}
2890 		error = 0;
2891 	}
2892 	return(error);
2893 }
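
/*
 * Illustrative usage note: on success cluster_read() treats *doffsetp as
 * the zone-2 translation of a_loffset and may cluster up to *runp bytes
 * forward and *runb bytes backward from that point; on EOPNOTSUPP it
 * simply falls back to ordinary buffered strategy reads.
 */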
2894 
2895 /*
2896  * Write to a regular file.   Because this is a strategy call the OS is
2897  * trying to actually get data onto the media.
2898  */
2899 static
2900 int
2901 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2902 {
2903 	hammer_record_t record;
2904 	hammer_mount_t hmp;
2905 	hammer_inode_t ip;
2906 	struct bio *bio;
2907 	struct buf *bp;
2908 	int blksize;
2909 	int bytes;
2910 	int error;
2911 
2912 	bio = ap->a_bio;
2913 	bp = bio->bio_buf;
2914 	ip = ap->a_vp->v_data;
2915 	hmp = ip->hmp;
2916 
2917 	blksize = hammer_blocksize(bio->bio_offset);
2918 	KKASSERT(bp->b_bufsize == blksize);
2919 
2920 	if (ip->flags & HAMMER_INODE_RO) {
2921 		bp->b_error = EROFS;
2922 		bp->b_flags |= B_ERROR;
2923 		biodone(ap->a_bio);
2924 		return(EROFS);
2925 	}
2926 
2927 	/*
2928 	 * Interlock with inode destruction (no in-kernel or directory
2929 	 * topology visibility).  If we queue new IO while trying to
2930 	 * destroy the inode we can deadlock the vtrunc call in
2931 	 * hammer_inode_unloadable_check().
2932 	 *
2933 	 * Besides, there's no point flushing a bp associated with an
2934 	 * inode that is being destroyed on-media and has no kernel
2935 	 * references.
2936 	 */
2937 	if ((ip->flags | ip->sync_flags) &
2938 	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2939 		bp->b_resid = 0;
2940 		biodone(ap->a_bio);
2941 		return(0);
2942 	}
2943 
2944 	/*
2945 	 * Reserve space and issue a direct-write from the front-end.
2946 	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2947 	 * allocations.
2948 	 *
2949 	 * An in-memory record will be installed to reference the storage
2950 	 * until the flusher can get to it.
2951 	 *
2952 	 * Since we own the high level bio the front-end will not try to
2953 	 * do a direct-read until the write completes.
2954 	 *
2955 	 * NOTE: The only time we do not reserve a full-sized buffer's
2956 	 * worth of data is if the file is small.  We do not try to
2957 	 * allocate a fragment (from the small-data zone) at the end of
2958 	 * an otherwise large file as this can lead to wildly separated
2959 	 * data.
2960 	 */
2961 	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2962 	KKASSERT(bio->bio_offset < ip->ino_data.size);
2963 	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2964 		bytes = bp->b_bufsize;
2965 	else
2966 		bytes = ((int)ip->ino_data.size + 15) & ~15;
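	/*
	 * e.g. (illustrative) a 100-byte file reserves
	 * (100 + 15) & ~15 = 112 bytes here.
	 */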
2967 
2968 	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2969 				    bytes, &error);
2970 	if (record) {
2971 		hammer_io_direct_write(hmp, record, bio);
2972 		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2973 			hammer_flush_inode(ip, 0);
2974 	} else {
2975 		bp->b_bio2.bio_offset = NOOFFSET;
2976 		bp->b_error = error;
2977 		bp->b_flags |= B_ERROR;
2978 		biodone(ap->a_bio);
2979 	}
2980 	return(error);
2981 }
2982 
2983 /*
2984  * dounlink - disconnect a directory entry
2985  *
2986  * XXX whiteout support not really in yet
2987  */
2988 static int
2989 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2990 		struct vnode *dvp, struct ucred *cred,
2991 		int flags, int isdir)
2992 {
2993 	struct namecache *ncp;
2994 	hammer_inode_t dip;
2995 	hammer_inode_t ip;
2996 	struct hammer_cursor cursor;
2997 	int64_t namekey;
2998 	u_int32_t max_iterations;
2999 	int nlen, error;
3000 
3001 	/*
3002 	 * Calculate the namekey and set up the key range for the scan.  This
3003 	 * works kinda like a chained hash table where the lower 32 bits
3004 	 * of the namekey synthesize the chain.
3005 	 *
3006 	 * The key range is inclusive of both key_beg and key_end.
3007 	 */
3008 	dip = VTOI(dvp);
3009 	ncp = nch->ncp;
3010 
3011 	if (dip->flags & HAMMER_INODE_RO)
3012 		return (EROFS);
3013 
3014 	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3015 					   &max_iterations);
3016 retry:
3017 	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3018 	cursor.key_beg.localization = dip->obj_localization +
3019 				      hammer_dir_localization(dip);
3020 	cursor.key_beg.obj_id = dip->obj_id;
3021 	cursor.key_beg.key = namekey;
3022 	cursor.key_beg.create_tid = 0;
3023 	cursor.key_beg.delete_tid = 0;
3024 	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3025 	cursor.key_beg.obj_type = 0;
3026 
3027 	cursor.key_end = cursor.key_beg;
3028 	cursor.key_end.key += max_iterations;
3029 	cursor.asof = dip->obj_asof;
3030 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3031 
3032 	/*
3033 	 * the requested path component.  The error code on search
3034 	 * termination may be 0, ENOENT, or something else, depending
3035 	 * on how the scan ended.
3036 	 * something else.
3037 	 *
3038 	 * The hammer_ip_*() functions merge in-memory records with on-disk
3039 	 * records for the purposes of the search.
3040 	 */
3041 	error = hammer_ip_first(&cursor);
3042 
3043 	while (error == 0) {
3044 		error = hammer_ip_resolve_data(&cursor);
3045 		if (error)
3046 			break;
3047 		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3048 		KKASSERT(nlen > 0);
3049 		if (ncp->nc_nlen == nlen &&
3050 		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3051 			break;
3052 		}
3053 		error = hammer_ip_next(&cursor);
3054 	}
3055 
3056 	/*
3057 	 * If all is ok we have to get the inode so we can adjust nlinks.
3058 	 * To avoid a deadlock with the flusher we must release the inode
3059 	 * lock on the directory when acquiring the inode for the entry.
3060 	 *
3061 	 * If the target is a directory, it must be empty.
3062 	 */
3063 	if (error == 0) {
3064 		hammer_unlock(&cursor.ip->lock);
3065 		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3066 				      dip->hmp->asof,
3067 				      cursor.data->entry.localization,
3068 				      0, &error);
3069 		hammer_lock_sh(&cursor.ip->lock);
3070 		if (error == ENOENT) {
3071 			kprintf("HAMMER: WARNING: Removing "
3072 				"dirent w/missing inode \"%s\"\n"
3073 				"\tobj_id = %016llx\n",
3074 				ncp->nc_name,
3075 				(long long)cursor.data->entry.obj_id);
3076 			error = 0;
3077 		}
3078 
3079 		/*
3080 		 * If isdir >= 0 we validate that the entry is or is not a
3081 		 * directory.  If isdir < 0 we don't care.
3082 		 */
3083 		if (error == 0 && isdir >= 0 && ip) {
3084 			if (isdir &&
3085 			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3086 				error = ENOTDIR;
3087 			} else if (isdir == 0 &&
3088 			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3089 				error = EISDIR;
3090 			}
3091 		}
3092 
3093 		/*
3094 		 * If we are trying to remove a directory the directory must
3095 		 * be empty.
3096 		 *
3097 		 * The check directory code can loop and deadlock/retry.  Our
3098 		 * own cursor's node locks must be released to avoid a 3-way
3099 		 * deadlock with the flusher if the check directory code
3100 		 * blocks.
3101 		 *
3102 		 * If any changes whatsoever have been made to the cursor
3103 		 * set EDEADLK and retry.
3104 		 *
3105 		 * WARNING: See warnings in hammer_unlock_cursor()
3106 		 *	    function.
3107 		 */
3108 		if (error == 0 && ip && ip->ino_data.obj_type ==
3109 				        HAMMER_OBJTYPE_DIRECTORY) {
3110 			hammer_unlock_cursor(&cursor);
3111 			error = hammer_ip_check_directory_empty(trans, ip);
3112 			hammer_lock_cursor(&cursor);
3113 			if (cursor.flags & HAMMER_CURSOR_RETEST) {
3114 				kprintf("HAMMER: Warning: avoided deadlock "
3115 					"on rmdir '%s'\n",
3116 					ncp->nc_name);
3117 				error = EDEADLK;
3118 			}
3119 		}
3120 
3121 		/*
3122 		 * Delete the directory entry.
3123 		 *
3124 		 * WARNING: hammer_ip_del_directory() may have to terminate
3125 		 * the cursor to avoid a deadlock.  It is ok to call
3126 		 * hammer_done_cursor() twice.
3127 		 */
3128 		if (error == 0) {
3129 			error = hammer_ip_del_directory(trans, &cursor,
3130 							dip, ip);
3131 		}
3132 		hammer_done_cursor(&cursor);
3133 		if (error == 0) {
3134 			cache_setunresolved(nch);
3135 			cache_setvp(nch, NULL);
3136 			/* XXX locking */
3137 			if (ip && ip->vp) {
3138 				hammer_knote(ip->vp, NOTE_DELETE);
3139 				cache_inval_vp(ip->vp, CINV_DESTROY);
3140 			}
3141 		}
3142 		if (ip)
3143 			hammer_rel_inode(ip, 0);
3144 	} else {
3145 		hammer_done_cursor(&cursor);
3146 	}
3147 	if (error == EDEADLK)
3148 		goto retry;
3149 
3150 	return (error);
3151 }
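
/*
 * Summary of the isdir argument as used by the callers above:
 * hammer_vop_nremove() passes 0 (the entry must not be a directory),
 * hammer_vop_nrmdir() passes 1 (it must be a directory), and
 * hammer_vop_nrename()/hammer_vop_nwhiteout() pass -1 (don't care).
 */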
3152 
3153 /************************************************************************
3154  *			    FIFO AND SPECFS OPS				*
3155  ************************************************************************
3156  *
3157  */
3158 
3159 static int
3160 hammer_vop_fifoclose (struct vop_close_args *ap)
3161 {
3162 	/* XXX update itimes */
3163 	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3164 }
3165 
3166 static int
3167 hammer_vop_fiforead (struct vop_read_args *ap)
3168 {
3169 	int error;
3170 
3171 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3172 	/* XXX update access time */
3173 	return (error);
3174 }
3175 
3176 static int
3177 hammer_vop_fifowrite (struct vop_write_args *ap)
3178 {
3179 	int error;
3180 
3181 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3182 	/* XXX update access time */
3183 	return (error);
3184 }
3185 
3186 static
3187 int
3188 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3189 {
3190 	int error;
3191 
3192 	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3193 	if (error)
3194 		error = hammer_vop_kqfilter(ap);
3195 	return(error);
3196 }
3197 
3198 /************************************************************************
3199  *			    KQFILTER OPS				*
3200  ************************************************************************
3201  *
3202  */
3203 static void filt_hammerdetach(struct knote *kn);
3204 static int filt_hammerread(struct knote *kn, long hint);
3205 static int filt_hammerwrite(struct knote *kn, long hint);
3206 static int filt_hammervnode(struct knote *kn, long hint);
3207 
3208 static struct filterops hammerread_filtops =
3209 	{ 1, NULL, filt_hammerdetach, filt_hammerread };
3210 static struct filterops hammerwrite_filtops =
3211 	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
3212 static struct filterops hammervnode_filtops =
3213 	{ 1, NULL, filt_hammerdetach, filt_hammervnode };
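
/*
 * Note (illustrative): the leading 1 in each initializer above fills the
 * filterops f_isfd field, marking these as fd-based filters; no attach
 * callback is supplied, so registration is done directly by
 * hammer_vop_kqfilter() below.
 */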
3214 
3215 static
3216 int
3217 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3218 {
3219 	struct vnode *vp = ap->a_vp;
3220 	struct knote *kn = ap->a_kn;
3221 	lwkt_tokref vlock;
3222 
3223 	switch (kn->kn_filter) {
3224 	case EVFILT_READ:
3225 		kn->kn_fop = &hammerread_filtops;
3226 		break;
3227 	case EVFILT_WRITE:
3228 		kn->kn_fop = &hammerwrite_filtops;
3229 		break;
3230 	case EVFILT_VNODE:
3231 		kn->kn_fop = &hammervnode_filtops;
3232 		break;
3233 	default:
3234 		return (1);
3235 	}
3236 
3237 	kn->kn_hook = (caddr_t)vp;
3238 
3239 	lwkt_gettoken(&vlock, &vp->v_token);
3240 	SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
3241 	lwkt_reltoken(&vlock);
3242 
3243 	return(0);
3244 }
3245 
3246 static void
3247 filt_hammerdetach(struct knote *kn)
3248 {
3249 	struct vnode *vp = (void *)kn->kn_hook;
3250 	lwkt_tokref vlock;
3251 
3252 	lwkt_gettoken(&vlock, &vp->v_token);
3253 	SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3254 		     kn, knote, kn_selnext);
3255 	lwkt_reltoken(&vlock);
3256 }
3257 
3258 static int
3259 filt_hammerread(struct knote *kn, long hint)
3260 {
3261 	struct vnode *vp = (void *)kn->kn_hook;
3262 	hammer_inode_t ip = VTOI(vp);
3263 
3264 	if (hint == NOTE_REVOKE) {
3265 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3266 		return(1);
3267 	}
3268 	kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3269 	return (kn->kn_data != 0);
3270 }
3271 
3272 static int
3273 filt_hammerwrite(struct knote *kn, long hint)
3274 {
3275 	if (hint == NOTE_REVOKE)
3276 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3277 	kn->kn_data = 0;
3278 	return (1);
3279 }
3280 
3281 static int
3282 filt_hammervnode(struct knote *kn, long hint)
3283 {
3284 	if (kn->kn_sfflags & hint)
3285 		kn->kn_fflags |= hint;
3286 	if (hint == NOTE_REVOKE) {
3287 		kn->kn_flags |= EV_EOF;
3288 		return (1);
3289 	}
3290 	return (kn->kn_fflags != 0);
3291 }
3292 
3293