xref: /dragonfly/sys/vfs/tmpfs/tmpfs_vnops.c (revision 99bd8089)
/*-
 * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
 * 2005 program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $
 */

/*
 * tmpfs vnode interface.
 */

#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/vfsops.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

#include <sys/buf2.h>
#include <vm/vm_page2.h>

#include <vfs/fifofs/fifo.h>
#include <vfs/tmpfs/tmpfs_vnops.h>
#include "tmpfs.h"

static void tmpfs_strategy_done(struct bio *bio);
static void tmpfs_move_pages(vm_object_t src, vm_object_t dst, int movflags);

/*
 * bufcache_mode:
 *	0	Normal page queue operation on flush.  Try to keep in memory.
 *	1	Try to cache on flush to swap (default).
 *	2	Always page to swap (not recommended).
 */
__read_mostly static int tmpfs_cluster_rd_enable = 1;
__read_mostly static int tmpfs_cluster_wr_enable = 1;
__read_mostly int tmpfs_bufcache_mode = 1;
SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "TMPFS filesystem");
SYSCTL_INT(_vfs_tmpfs, OID_AUTO, cluster_rd_enable, CTLFLAG_RW,
		&tmpfs_cluster_rd_enable, 0, "");
SYSCTL_INT(_vfs_tmpfs, OID_AUTO, cluster_wr_enable, CTLFLAG_RW,
		&tmpfs_cluster_wr_enable, 0, "");
SYSCTL_INT(_vfs_tmpfs, OID_AUTO, bufcache_mode, CTLFLAG_RW,
		&tmpfs_bufcache_mode, 0, "");
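
/*
 * All three knobs can be tuned at runtime from userland (illustrative
 * usage only; the sysctl names come from the definitions above):
 *
 *	sysctl vfs.tmpfs.bufcache_mode=2	(always page to swap)
 *	sysctl vfs.tmpfs.cluster_rd_enable=0	(disable read clustering)
 */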

#define TMPFS_MOVF_FROMBACKING	0x0001
#define TMPFS_MOVF_DEACTIVATE	0x0002


static __inline
void
tmpfs_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}


/* --------------------------------------------------------------------- */

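/*
 * Resolve a name in a directory: translate (dvp, ncp) into a referenced
 * vnode and record the result in the namecache.  The directory is only
 * locked shared; tmpfs_alloc_vp() returns EAGAIN when its deadlock
 * handling had to drop locks, in which case the lookup simply retries.
 */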
static int
tmpfs_nresolve(struct vop_nresolve_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode *vp = NULL;
	struct namecache *ncp = ap->a_nch->ncp;
	struct tmpfs_node *tnode;
	struct tmpfs_dirent *de;
	struct tmpfs_node *dnode;
	int error;

	dnode = VP_TO_TMPFS_DIR(dvp);

	TMPFS_NODE_LOCK_SH(dnode);
loop:
	de = tmpfs_dir_lookup(dnode, NULL, ncp);
	if (de == NULL) {
		error = ENOENT;
	} else {
		/*
		 * Allocate a vnode for the node we found.  Use
		 * tmpfs_alloc_vp()'s deadlock handling mode.
		 */
		tnode = de->td_node;
		error = tmpfs_alloc_vp(dvp->v_mount, dnode, tnode,
				       LK_EXCLUSIVE | LK_RETRY, &vp);
		if (error == EAGAIN)
			goto loop;
		if (error)
			goto out;
		KKASSERT(vp);
	}

out:
	TMPFS_NODE_UNLOCK(dnode);

	if ((dnode->tn_status & TMPFS_NODE_ACCESSED) == 0) {
		TMPFS_NODE_LOCK(dnode);
		dnode->tn_status |= TMPFS_NODE_ACCESSED;
		TMPFS_NODE_UNLOCK(dnode);
	}

	/*
	 * Store the result of this lookup in the cache.  Avoid this if the
	 * request was for creation, as it does not improve timings in
	 * empirical tests.
	 */
	if (vp) {
		vn_unlock(vp);
		cache_setvp(ap->a_nch, vp);
		vrele(vp);
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
	return (error);
}

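/*
 * Resolve "..": return the parent of dvp.  Only VEXEC permission on dvp
 * is required; a NULL parent pointer (e.g. a directory that has already
 * been removed) results in ENOENT.
 */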
static int
tmpfs_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct tmpfs_node *dnode = VP_TO_TMPFS_NODE(dvp);
	struct ucred *cred = ap->a_cred;
	int error;

	*vpp = NULL;

	/* Check accessibility of requested node as a first step. */
	error = VOP_ACCESS(dvp, VEXEC, cred);
	if (error != 0)
		return error;

	if (dnode->tn_dir.tn_parent != NULL) {
		/* Allocate a new vnode on the matching entry. */
		error = tmpfs_alloc_vp(dvp->v_mount,
				       NULL, dnode->tn_dir.tn_parent,
				       LK_EXCLUSIVE | LK_RETRY, vpp);

		if (*vpp)
			vn_unlock(*vpp);
	}
	return (*vpp == NULL) ? ENOENT : 0;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_ncreate(struct vop_ncreate_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct vattr *vap = ap->a_vap;
	struct ucred *cred = ap->a_cred;
	int error;

	KKASSERT(vap->va_type == VREG || vap->va_type == VSOCK);

	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *vpp);
		tmpfs_knote(dvp, NOTE_WRITE);
	}
	return (error);
}

/* --------------------------------------------------------------------- */

static int
tmpfs_nmknod(struct vop_nmknod_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct vattr *vap = ap->a_vap;
	struct ucred *cred = ap->a_cred;
	int error;

	if (vap->va_type != VBLK && vap->va_type != VCHR &&
	    vap->va_type != VFIFO) {
		return (EINVAL);
	}

	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *vpp);
		tmpfs_knote(dvp, NOTE_WRITE);
	}
	return error;
}

/* --------------------------------------------------------------------- */

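/*
 * Open a file.  Append-only files reject plain write opens (FWRITE
 * without O_APPEND).  If a regular file's pages were previously moved
 * into the backing anonymous object by tmpfs_inactive(), they are moved
 * back to the vnode's VM object before the open completes.
 */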
static int
tmpfs_open(struct vop_open_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int mode = ap->a_mode;
	struct tmpfs_node *node;
	int error;

	node = VP_TO_TMPFS_NODE(vp);

#if 0
	/* The file is still active but all its names have been removed
	 * (e.g. by a "rmdir $(pwd)").  It cannot be opened any more as
	 * it is about to die. */
	if (node->tn_links < 1)
		return (ENOENT);
#endif

	/* If the file is marked append-only, deny write requests. */
	if ((node->tn_flags & APPEND) &&
	    (mode & (FWRITE | O_APPEND)) == FWRITE) {
		error = EPERM;
	} else {
		if (node->tn_reg.tn_pages_in_aobj) {
			TMPFS_NODE_LOCK(node);
			if (node->tn_reg.tn_pages_in_aobj) {
				tmpfs_move_pages(node->tn_reg.tn_aobj,
						 vp->v_object,
						 TMPFS_MOVF_FROMBACKING);
				node->tn_reg.tn_pages_in_aobj = 0;
			}
			TMPFS_NODE_UNLOCK(node);
		}
		error = vop_stdopen(ap);
	}

	return (error);
}

/* --------------------------------------------------------------------- */

static int
tmpfs_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct tmpfs_node *node;
	int error;

	node = VP_TO_TMPFS_NODE(vp);

	if (node->tn_links > 0) {
		/*
		 * Update node times.  No need to do it if the node has
		 * been deleted, because it will vanish after we return.
		 */
		tmpfs_update(vp);
	}

	error = vop_stdclose(ap);

	return (error);
}

/* --------------------------------------------------------------------- */

int
tmpfs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int error;
	struct tmpfs_node *node;

	node = VP_TO_TMPFS_NODE(vp);

	switch (vp->v_type) {
	case VDIR:
		/* FALLTHROUGH */
	case VLNK:
		/* FALLTHROUGH */
	case VREG:
		if ((ap->a_mode & VWRITE) &&
		    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
			error = EROFS;
			goto out;
		}
		break;

	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* FALLTHROUGH */
	case VSOCK:
		/* FALLTHROUGH */
	case VFIFO:
		break;

	default:
		error = EINVAL;
		goto out;
	}

	if ((ap->a_mode & VWRITE) && (node->tn_flags & IMMUTABLE)) {
		error = EPERM;
		goto out;
	}

	error = vop_helper_access(ap, node->tn_uid, node->tn_gid,
				  node->tn_mode, 0);
out:
	return error;
}

/* --------------------------------------------------------------------- */

int
tmpfs_getattr(struct vop_getattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	struct tmpfs_node *node;

	node = VP_TO_TMPFS_NODE(vp);

	tmpfs_update(vp);

	vap->va_type = vp->v_type;
	vap->va_mode = node->tn_mode;
	vap->va_nlink = node->tn_links;
	vap->va_uid = node->tn_uid;
	vap->va_gid = node->tn_gid;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_fileid = node->tn_id;
	vap->va_size = node->tn_size;
	vap->va_blocksize = PAGE_SIZE;
	vap->va_atime.tv_sec = node->tn_atime;
	vap->va_atime.tv_nsec = node->tn_atimensec;
	vap->va_mtime.tv_sec = node->tn_mtime;
	vap->va_mtime.tv_nsec = node->tn_mtimensec;
	vap->va_ctime.tv_sec = node->tn_ctime;
	vap->va_ctime.tv_nsec = node->tn_ctimensec;
	vap->va_gen = node->tn_gen;
	vap->va_flags = node->tn_flags;
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		vap->va_rmajor = umajor(node->tn_rdev);
		vap->va_rminor = uminor(node->tn_rdev);
	}
	vap->va_bytes = round_page(node->tn_size);
	vap->va_filerev = 0;

	return 0;
}

/* --------------------------------------------------------------------- */

int
tmpfs_getattr_quick(struct vop_getattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	struct tmpfs_node *node;

	node = VP_TO_TMPFS_NODE(vp);

	tmpfs_update(vp);

	vap->va_type = vp->v_type;
	vap->va_mode = node->tn_mode;
	vap->va_nlink = node->tn_links;
	vap->va_uid = node->tn_uid;
	vap->va_gid = node->tn_gid;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_fileid = node->tn_id;
	vap->va_size = node->tn_size;
	vap->va_blocksize = PAGE_SIZE;
	vap->va_gen = node->tn_gen;
	vap->va_flags = node->tn_flags;
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		vap->va_rmajor = umajor(node->tn_rdev);
		vap->va_rminor = uminor(node->tn_rdev);
	}
	vap->va_bytes = -1;
	vap->va_filerev = 0;

	return 0;
}


/* --------------------------------------------------------------------- */

int
tmpfs_setattr(struct vop_setattr_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	struct ucred *cred = ap->a_cred;
	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
	int error = 0;
	int kflags = 0;

	TMPFS_NODE_LOCK(node);
	if (error == 0 && (vap->va_flags != VNOVAL)) {
		error = tmpfs_chflags(vp, vap->va_flags, cred);
		kflags |= NOTE_ATTRIB;
	}

	if (error == 0 && (vap->va_size != VNOVAL)) {
		/* restore any saved pages before proceeding */
		if (node->tn_reg.tn_pages_in_aobj) {
			tmpfs_move_pages(node->tn_reg.tn_aobj, vp->v_object,
					 TMPFS_MOVF_FROMBACKING |
					 TMPFS_MOVF_DEACTIVATE);
			node->tn_reg.tn_pages_in_aobj = 0;
		}
		if (vap->va_size > node->tn_size)
			kflags |= NOTE_WRITE | NOTE_EXTEND;
		else
			kflags |= NOTE_WRITE;
		error = tmpfs_chsize(vp, vap->va_size, cred);
	}

	if (error == 0 && (vap->va_uid != (uid_t)VNOVAL ||
			   vap->va_gid != (gid_t)VNOVAL)) {
		error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred);
		kflags |= NOTE_ATTRIB;
	}

	if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) {
		error = tmpfs_chmod(vp, vap->va_mode, cred);
		kflags |= NOTE_ATTRIB;
	}

	if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL &&
	    vap->va_atime.tv_nsec != VNOVAL) ||
	    (vap->va_mtime.tv_sec != VNOVAL &&
	    vap->va_mtime.tv_nsec != VNOVAL) )) {
		error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime,
				      vap->va_vaflags, cred);
		kflags |= NOTE_ATTRIB;
	}

	/*
	 * Update the node times.  We give preference to the error codes
	 * generated by this function rather than the ones that may arise
	 * from tmpfs_update.
	 */
	tmpfs_update(vp);
	TMPFS_NODE_UNLOCK(node);
	tmpfs_knote(vp, kflags);

	return (error);
}

/* --------------------------------------------------------------------- */

/*
 * fsync is usually a NOP, but we must take action when unmounting or
 * when recycling.
 */
static int
tmpfs_fsync(struct vop_fsync_args *ap)
{
	struct tmpfs_node *node;
	struct vnode *vp = ap->a_vp;

	node = VP_TO_TMPFS_NODE(vp);

	/*
	 * tmpfs vnodes typically remain dirty, so avoid long syncer scans
	 * by forcing removal from the syncer list.
	 */
	vn_syncer_remove(vp, 1);

	tmpfs_update(vp);
	if (vp->v_type == VREG) {
		if (vp->v_flag & VRECLAIMED) {
			if (node->tn_links == 0)
				tmpfs_truncate(vp, 0);
			else
				vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
		}
	}

	return 0;
}

/* --------------------------------------------------------------------- */

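/*
 * Read a regular file.  The fast path tries to copy data directly out
 * of the VM page cache via vop_helper_read_shortcut(); anything left
 * over falls through to normal buffer cache I/O, which is ultimately
 * backed by tmpfs_strategy().
 */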
static int
tmpfs_read(struct vop_read_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct tmpfs_node *node;
	off_t base_offset;
	size_t offset;
	size_t len;
	size_t resid;
	int error;
	int seqcount;

	/*
	 * Check the basics
	 */
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (vp->v_type != VREG)
		return (EINVAL);

	/*
	 * Extract node, try to shortcut the operation through
	 * the VM page cache, allowing us to avoid buffer cache
	 * overheads.
	 */
	node = VP_TO_TMPFS_NODE(vp);
	resid = uio->uio_resid;
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	error = vop_helper_read_shortcut(ap);
	if (error)
		return error;
	if (uio->uio_resid == 0) {
		if (resid)
			goto finished;
		return error;
	}

	/*
	 * restore any saved pages before proceeding
	 */
	if (node->tn_reg.tn_pages_in_aobj) {
		TMPFS_NODE_LOCK(node);
		if (node->tn_reg.tn_pages_in_aobj) {
			tmpfs_move_pages(node->tn_reg.tn_aobj, vp->v_object,
					 TMPFS_MOVF_FROMBACKING);
			node->tn_reg.tn_pages_in_aobj = 0;
		}
		TMPFS_NODE_UNLOCK(node);
	}

	/*
	 * Fall-through to our normal read code.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < node->tn_size) {
		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 */
		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
		base_offset = (off_t)uio->uio_offset - offset;
		bp = getcacheblk(vp, base_offset,
				 node->tn_blksize, GETBLK_KVABIO);
		if (bp == NULL) {
			if (tmpfs_cluster_rd_enable) {
				error = cluster_readx(vp, node->tn_size,
						     base_offset,
						     node->tn_blksize,
						     B_NOTMETA | B_KVABIO,
						     uio->uio_resid,
						     seqcount * MAXBSIZE,
						     &bp);
			} else {
				error = bread_kvabio(vp, base_offset,
						     node->tn_blksize, &bp);
			}
			if (error) {
				brelse(bp);
				kprintf("tmpfs_read bread error %d\n", error);
				break;
			}

			/*
			 * tmpfs pretty much fiddles directly with the VM
			 * system, don't let it exhaust it or we won't play
			 * nice with other processes.
			 *
			 * Only do this if the VOP is coming from a normal
			 * read/write.  The VM system handles the case for
			 * UIO_NOCOPY.
			 */
			if (uio->uio_segflg != UIO_NOCOPY)
				vm_wait_nominal();
		}
		bp->b_flags |= B_CLUSTEROK;
		bkvasync(bp);

		/*
		 * Figure out how many bytes we can actually copy this loop.
		 */
		len = node->tn_blksize - offset;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		if (len > node->tn_size - uio->uio_offset)
			len = (size_t)(node->tn_size - uio->uio_offset);

		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		bqrelse(bp);
		if (error) {
			kprintf("tmpfs_read uiomove error %d\n", error);
			break;
		}
	}

finished:
	if ((node->tn_status & TMPFS_NODE_ACCESSED) == 0) {
		TMPFS_NODE_LOCK(node);
		node->tn_status |= TMPFS_NODE_ACCESSED;
		TMPFS_NODE_UNLOCK(node);
	}
	return (error);
}

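/*
 * Write to a regular file.  The interesting part is how the dirty data
 * is disposed of, which depends on where the write came from and on
 * current memory pressure; see the per-case comments in the body.
 */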
static int
tmpfs_write(struct vop_write_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct tmpfs_node *node;
	boolean_t extended;
	off_t oldsize;
	int error;
	off_t base_offset;
	size_t offset;
	size_t len;
	struct rlimit limit;
	int trivial = 0;
	int kflags = 0;
	int seqcount;

	error = 0;
	if (uio->uio_resid == 0) {
		return error;
	}

	node = VP_TO_TMPFS_NODE(vp);

	if (vp->v_type != VREG)
		return (EINVAL);
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;

	TMPFS_NODE_LOCK(node);

	/*
	 * restore any saved pages before proceeding
	 */
	if (node->tn_reg.tn_pages_in_aobj) {
		tmpfs_move_pages(node->tn_reg.tn_aobj, vp->v_object,
				 TMPFS_MOVF_FROMBACKING);
		node->tn_reg.tn_pages_in_aobj = 0;
	}

	oldsize = node->tn_size;
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = node->tn_size;

	/*
	 * Check for illegal write offsets.
	 */
	if (uio->uio_offset + uio->uio_resid >
	    VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
		error = EFBIG;
		goto done;
	}

	/*
	 * NOTE: Ignore if UIO does not come from a user thread (e.g. VN).
	 */
	if (vp->v_type == VREG && td != NULL && td->td_lwp != NULL) {
		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
		if (error)
			goto done;
		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
			ksignal(td->td_proc, SIGXFSZ);
			error = EFBIG;
			goto done;
		}
	}

	/*
	 * Extend the file's size if necessary
	 */
	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);

	while (uio->uio_resid > 0) {
		/*
		 * Don't completely blow out running buffer I/O
		 * when being hit from the pageout daemon.
		 */
		if (uio->uio_segflg == UIO_NOCOPY &&
		    (ap->a_ioflag & IO_RECURSE) == 0) {
			bwillwrite(node->tn_blksize);
		}

		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 *
		 * Calculate the maximum bytes we can write to the buffer at
		 * this offset (after resizing).
		 */
		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
		base_offset = (off_t)uio->uio_offset - offset;
		len = uio->uio_resid;
		if (len > TMPFS_BLKSIZE - offset)
			len = TMPFS_BLKSIZE - offset;

		if ((uio->uio_offset + len) > node->tn_size) {
			trivial = (uio->uio_offset <= node->tn_size);
			error = tmpfs_reg_resize(vp, uio->uio_offset + len,
						 trivial);
			if (error)
				break;
		}

		/*
		 * Read to fill in any gaps.  Theoretically we could
		 * optimize this if the write covers the entire buffer
		 * and is not a UIO_NOCOPY write, however this can lead
		 * to a security violation exposing random kernel memory
		 * (whatever junk was in the backing VM pages before).
		 *
		 * So just use bread() to do the right thing.
		 */
		error = bread_kvabio(vp, base_offset, node->tn_blksize, &bp);
		bkvasync(bp);
		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		if (error) {
			kprintf("tmpfs_write uiomove error %d\n", error);
			brelse(bp);
			break;
		}

		if (uio->uio_offset > node->tn_size) {
			node->tn_size = uio->uio_offset;
			kflags |= NOTE_EXTEND;
		}
		kflags |= NOTE_WRITE;

		/*
		 * UIO_NOCOPY is a sensitive state due to potentially being
		 * issued from the pageout daemon while in a low-memory
		 * situation.  However, in order to cluster the I/O nicely
		 * (e.g. 64KB+ writes instead of 16KB writes), we still try
		 * to follow the same semantics that any other filesystem
		 * might use.
		 *
		 * For the normal case we buwrite(), dirtying the underlying
		 * VM pages instead of dirtying the buffer and releasing the
		 * buffer as a clean buffer.  This allows tmpfs to use
		 * essentially all available memory to cache file data.
		 * If we used bdwrite() the buffer cache would wind up
		 * flushing the data to swap too quickly.
		 *
		 * But because tmpfs can seriously load the VM system we
		 * fall-back to using bdwrite() when free memory starts
		 * to get low.  This shifts the load away from the VM system
		 * and makes tmpfs act more like a normal filesystem with
		 * regards to disk activity.
		 *
		 * tmpfs pretty much fiddles directly with the VM
		 * system, don't let it exhaust it or we won't play
		 * nice with other processes.  Only do this if the
		 * VOP is coming from a normal read/write.  The VM system
		 * handles the case for UIO_NOCOPY.
		 */
		bp->b_flags |= B_CLUSTEROK;
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Flush from the pageout daemon, deal with
			 * potentially very heavy tmpfs write activity
			 * causing long stalls in the pageout daemon
			 * before pages get to free/cache.
			 *
			 * (a) Under severe pressure setting B_DIRECT will
			 *     cause a buffer release to try to free the
			 *     underlying pages.
			 *
			 * (b) Under modest memory pressure the B_RELBUF
			 *     alone is sufficient to get the pages moved
			 *     to the cache.  We could also force this by
			 *     setting B_NOTMETA but that might have other
			 *     unintended side-effects (e.g. setting
			 *     PG_NOTMETA on the VM page).
			 *
			 * (c) For the pageout->putpages->generic_putpages->
			 *     UIO_NOCOPY-write (here), issuing an immediate
			 *     write prevents any real clustering from
			 *     happening because the buffers probably aren't
			 *     (yet) marked dirty, or lost due to prior use
			 *     of buwrite().  Try to use the normal
			 *     cluster_write() mechanism for performance.
			 *
			 * Hopefully this will unblock the VM system more
			 * quickly under extreme tmpfs write load.
			 */
			if (vm_page_count_min(vm_page_free_hysteresis))
				bp->b_flags |= B_DIRECT;
			bp->b_flags |= B_AGE | B_RELBUF | B_TTC;
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			if (tmpfs_cluster_wr_enable &&
			    (ap->a_ioflag & (IO_SYNC | IO_DIRECT)) == 0) {
				cluster_write(bp, node->tn_size,
					      node->tn_blksize, seqcount);
			} else {
				cluster_awrite(bp);
			}
		} else if (vm_pages_needed || vm_paging_needed(0) ||
			   tmpfs_bufcache_mode >= 2) {
			/*
			 * If the pageout daemon is running we cycle the
			 * write through the buffer cache normally to
			 * pipeline the flush, thus avoiding adding any
			 * more memory pressure to the pageout daemon.
			 */
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			if (tmpfs_cluster_wr_enable) {
				cluster_write(bp, node->tn_size,
					      node->tn_blksize, seqcount);
			} else {
				bdwrite(bp);
			}
		} else {
			/*
			 * Otherwise run the buffer directly through to the
			 * backing VM store, leaving the buffer clean so
			 * buffer limits do not force early flushes to swap.
			 */
			buwrite(bp);
			/*vm_wait_nominal();*/
		}

		if (bp->b_error) {
			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
			break;
		}
	}

	if (error) {
		if (extended) {
			(void)tmpfs_reg_resize(vp, oldsize, trivial);
			kflags &= ~NOTE_EXTEND;
		}
		goto done;
	}

	/*
	 * Currently we don't set the mtime on files modified via mmap()
	 * because we can't tell the difference between those modifications
	 * and an attempt by the pageout daemon to flush tmpfs pages to
	 * swap.
	 *
	 * This is because in order to defer flushes as long as possible
	 * buwrite() works by marking the underlying VM pages dirty in
	 * order to be able to dispose of the buffer cache buffer without
	 * flushing it.
	 */
	if (uio->uio_segflg == UIO_NOCOPY) {
		if (vp->v_flag & VLASTWRITETS) {
			node->tn_mtime = vp->v_lastwrite_ts.tv_sec;
			node->tn_mtimensec = vp->v_lastwrite_ts.tv_nsec;
		}
	} else {
		node->tn_status |= TMPFS_NODE_MODIFIED;
		vclrflags(vp, VLASTWRITETS);
	}

	if (extended)
		node->tn_status |= TMPFS_NODE_CHANGED;

	if (node->tn_mode & (S_ISUID | S_ISGID)) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
			node->tn_mode &= ~(S_ISUID | S_ISGID);
	}
done:
	TMPFS_NODE_UNLOCK(node);
	if (kflags)
		tmpfs_knote(vp, kflags);

	return(error);
}

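/*
 * Advisory byte-range locks (fcntl F_SETLK/F_GETLK and flock) are
 * delegated wholesale to the generic lf_advlock() code, keyed by the
 * node's tn_advlock state and bounded by the current file size.
 */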
static int
tmpfs_advlock(struct vop_advlock_args *ap)
{
	struct tmpfs_node *node;
	struct vnode *vp = ap->a_vp;
	int error;

	node = VP_TO_TMPFS_NODE(vp);
	error = (lf_advlock(ap, &node->tn_advlock, node->tn_size));

	return (error);
}

/*
 * The strategy function is typically only called when memory pressure
 * forces the system to attempt to pageout pages.  It can also be called
 * by [n]vtruncbuf() when a truncation cuts a page in half.  Normal write
 * operations are usually disposed of via buwrite(), which dirties the
 * underlying VM pages and releases the buffer without initiating I/O
 * through this function.
 *
 * We set VKVABIO for VREG files so bp->b_data may not be synchronized to
 * our cpu.  swap_pager_strategy() is all we really use, and it directly
 * supports this.
 */
static int
tmpfs_strategy(struct vop_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;
	struct bio *nbio;
	struct buf *bp = bio->bio_buf;
	struct vnode *vp = ap->a_vp;
	struct tmpfs_node *node;
	vm_object_t uobj;
	vm_page_t m;
	int i;

	if (vp->v_type != VREG) {
		bp->b_resid = bp->b_bcount;
		bp->b_flags |= B_ERROR | B_INVAL;
		bp->b_error = EINVAL;
		biodone(bio);
		return(0);
	}

	node = VP_TO_TMPFS_NODE(vp);

	uobj = node->tn_reg.tn_aobj;

	/*
	 * Don't bother flushing to swap if there is no swap, just
	 * ensure that the pages are marked as needing a commit (still).
	 */
	if (bp->b_cmd == BUF_CMD_WRITE && vm_swap_size == 0) {
		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
			m = bp->b_xio.xio_pages[i];
			vm_page_need_commit(m);
		}
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(bio);
	} else {
		/*
		 * Tell the buffer cache to try to recycle the pages
		 * to PQ_CACHE on release.
		 */
		if (tmpfs_bufcache_mode >= 2 ||
		    (tmpfs_bufcache_mode == 1 && vm_paging_needed(0))) {
			bp->b_flags |= B_TTC;
		}
		nbio = push_bio(bio);
		nbio->bio_done = tmpfs_strategy_done;
		nbio->bio_offset = bio->bio_offset;
		swap_pager_strategy(uobj, nbio);
	}
	return 0;
}

/*
 * If we were unable to commit the pages to swap, make sure they are
 * marked as needing a commit (again).  If we succeeded, clear the flag
 * so the pages can be freed.
 *
 * Do not error-out the buffer.  In particular, vinvalbuf() needs to
 * always work.
 */
static void
tmpfs_strategy_done(struct bio *bio)
{
	struct buf *bp;
	vm_page_t m;
	int i;

	bp = bio->bio_buf;

	if (bp->b_flags & B_ERROR) {
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
		bp->b_resid = 0;
		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
			m = bp->b_xio.xio_pages[i];
			vm_page_need_commit(m);
		}
	} else {
		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
			m = bp->b_xio.xio_pages[i];
			vm_page_clear_commit(m);
		}
	}
	bio = pop_bio(bio);
	biodone(bio);
}

/*
 * To make write clustering work well make the backing store look
 * contiguous to the cluster_*() code.  The swap_strategy() function
 * will take it from there.
 *
 * Use MAXBSIZE-sized chunks as a micro-optimization to make random
 * flushes leave full-sized gaps.
 */
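/*
 * Worked example (assuming MAXBSIZE is 64KB): a logical offset of
 * 0x12345 maps to doffset 0x12345 with *runp = 0xdcbb (bytes remaining
 * in the 64KB chunk) and *runb = 0x2345 (bytes preceding it in the
 * chunk).
 */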
static int
tmpfs_bmap(struct vop_bmap_args *ap)
{
	if (ap->a_doffsetp != NULL)
		*ap->a_doffsetp = ap->a_loffset;
	if (ap->a_runp != NULL)
		*ap->a_runp = MAXBSIZE - (ap->a_loffset & (MAXBSIZE - 1));
	if (ap->a_runb != NULL)
		*ap->a_runb = ap->a_loffset & (MAXBSIZE - 1);

	return 0;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_nremove(struct vop_nremove_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct vnode *vp;
	int error;
	struct tmpfs_dirent *de;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *dnode;
	struct tmpfs_node *node;

	/*
	 * We have to acquire the vp from ap->a_nch because we will likely
	 * unresolve the namecache entry, and a vrele/vput is needed to
	 * trigger the tmpfs_inactive/tmpfs_reclaim sequence.
	 *
	 * We have to use vget to clear any inactive state on the vnode,
	 * otherwise the vnode may remain inactive and thus tmpfs_inactive
	 * will not get called when we release it.
	 */
	error = cache_vget(ap->a_nch, ap->a_cred, LK_SHARED, &vp);
	KKASSERT(vp->v_mount == dvp->v_mount);
	KKASSERT(error == 0);
	vn_unlock(vp);

	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out2;
	}

	dnode = VP_TO_TMPFS_DIR(dvp);
	node = VP_TO_TMPFS_NODE(vp);
	tmp = VFS_TO_TMPFS(vp->v_mount);

	TMPFS_NODE_LOCK(dnode);
	TMPFS_NODE_LOCK(node);
	de = tmpfs_dir_lookup(dnode, node, ncp);
	if (de == NULL) {
		error = ENOENT;
		TMPFS_NODE_UNLOCK(node);
		TMPFS_NODE_UNLOCK(dnode);
		goto out;
	}

	/* Files marked as immutable or append-only cannot be deleted. */
	if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
	    (dnode->tn_flags & APPEND)) {
		error = EPERM;
		TMPFS_NODE_UNLOCK(node);
		TMPFS_NODE_UNLOCK(dnode);
		goto out;
	}

	/* Remove the entry from the directory; as it is a file, we do not
	 * have to change the number of hard links of the directory. */
	tmpfs_dir_detach(dnode, de);
	TMPFS_NODE_UNLOCK(dnode);

	/* Free the directory entry we just deleted.  Note that the node
	 * referred by it will not be removed until the vnode is really
	 * reclaimed. */
	tmpfs_free_dirent(tmp, de);

	if (node->tn_links > 0)
		node->tn_status |= TMPFS_NODE_CHANGED;
	TMPFS_NODE_UNLOCK(node);

	cache_unlink(ap->a_nch);
	tmpfs_knote(vp, NOTE_DELETE);
	error = 0;

out:
	if (error == 0)
		tmpfs_knote(dvp, NOTE_WRITE);
out2:
	vrele(vp);

	return error;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_nlink(struct vop_nlink_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode *vp = ap->a_vp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct tmpfs_dirent *de;
	struct tmpfs_node *node;
	struct tmpfs_node *dnode;
	int error;

	KKASSERT(dvp != vp); /* XXX When can this be false? */

	node = VP_TO_TMPFS_NODE(vp);
	dnode = VP_TO_TMPFS_NODE(dvp);
	TMPFS_NODE_LOCK(dnode);

	/* XXX: Why aren't the following two tests done by the caller? */

	/* Hard links of directories are forbidden. */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	/* Cannot create cross-device links. */
	if (dvp->v_mount != vp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* Ensure that we do not overflow the maximum number of links imposed
	 * by the system. */
	KKASSERT(node->tn_links <= LINK_MAX);
	if (node->tn_links >= LINK_MAX) {
		error = EMLINK;
		goto out;
	}

	/* We cannot create links of files marked immutable or append-only. */
	if (node->tn_flags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto out;
	}

	/* Allocate a new directory entry to represent the node. */
	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node,
				   ncp->nc_name, ncp->nc_nlen, &de);
	if (error != 0)
		goto out;

	/* Insert the new directory entry into the appropriate directory. */
	tmpfs_dir_attach(dnode, de);

	/* vp link count has changed, so update node times. */

	TMPFS_NODE_LOCK(node);
	node->tn_status |= TMPFS_NODE_CHANGED;
	TMPFS_NODE_UNLOCK(node);
	tmpfs_update(vp);

	tmpfs_knote(vp, NOTE_LINK);
	cache_setunresolved(ap->a_nch);
	cache_setvp(ap->a_nch, vp);
	error = 0;

out:
	TMPFS_NODE_UNLOCK(dnode);
	if (error == 0)
		tmpfs_knote(dvp, NOTE_WRITE);
	return error;
}

/* --------------------------------------------------------------------- */

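/*
 * Rename.  All four involved nodes (source dir, target dir, source
 * node, and any target node being replaced) are locked together via
 * tmpfs_lock4() so the directory topology stays stable while entries
 * are detached and reattached.
 */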
static int
tmpfs_nrename(struct vop_nrename_args *ap)
{
	struct vnode *fdvp = ap->a_fdvp;
	struct namecache *fncp = ap->a_fnch->ncp;
	struct vnode *fvp = fncp->nc_vp;
	struct vnode *tdvp = ap->a_tdvp;
	struct namecache *tncp = ap->a_tnch->ncp;
	struct vnode *tvp;
	struct tmpfs_dirent *de, *tde;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *fdnode;
	struct tmpfs_node *tdnode;
	struct tmpfs_node *fnode;
	struct tmpfs_node *tnode;
	char *newname;
	char *oldname;
	int error;

	KKASSERT(fdvp->v_mount == fvp->v_mount);

	/*
	 * Because tvp can get overwritten we have to vget it instead of
	 * just vref or use it, otherwise its VINACTIVE flag may not get
	 * cleared and the node won't get destroyed.
	 */
	error = cache_vget(ap->a_tnch, ap->a_cred, LK_SHARED, &tvp);
	if (error == 0) {
		tnode = VP_TO_TMPFS_NODE(tvp);
		vn_unlock(tvp);
	} else {
		tnode = NULL;
	}

	/* Disallow cross-device renames.
	 * XXX Why isn't this done by the caller? */
	if (fvp->v_mount != tdvp->v_mount ||
	    (tvp != NULL && fvp->v_mount != tvp->v_mount)) {
		error = EXDEV;
		goto out;
	}

	tmp = VFS_TO_TMPFS(tdvp->v_mount);
	tdnode = VP_TO_TMPFS_DIR(tdvp);

	/* If source and target are the same file, there is nothing to do. */
	if (fvp == tvp) {
		error = 0;
		goto out;
	}

	fdnode = VP_TO_TMPFS_DIR(fdvp);
	fnode = VP_TO_TMPFS_NODE(fvp);

	tmpfs_lock4(fdnode, tdnode, fnode, tnode);

	de = tmpfs_dir_lookup(fdnode, fnode, fncp);

	/* Avoid manipulating '.' and '..' entries. */
	if (de == NULL) {
		error = ENOENT;
		goto out_locked;
	}
	KKASSERT(de->td_node == fnode);

	/*
	 * If replacing an entry in the target directory and that entry
	 * is a directory, it must be empty.
	 *
	 * kern_rename guarantees that the destination is a directory
	 * if the source is one (it does?).
	 */
	if (tvp != NULL) {
		KKASSERT(tnode != NULL);

		if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
		    (tdnode->tn_flags & (APPEND | IMMUTABLE))) {
			error = EPERM;
			goto out_locked;
		}

		if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) {
			if (tnode->tn_size > 0) {
				error = ENOTEMPTY;
				goto out_locked;
			}
		} else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) {
			error = ENOTDIR;
			goto out_locked;
		} else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) {
			error = EISDIR;
			goto out_locked;
		} else {
			KKASSERT(fnode->tn_type != VDIR &&
				tnode->tn_type != VDIR);
		}
	}

	if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
	    (fdnode->tn_flags & (APPEND | IMMUTABLE))) {
		error = EPERM;
		goto out_locked;
	}

	/*
	 * Ensure that we have enough memory to hold the new name, if it
	 * has to be changed.
	 */
	if (fncp->nc_nlen != tncp->nc_nlen ||
	    bcmp(fncp->nc_name, tncp->nc_name, fncp->nc_nlen) != 0) {
		newname = kmalloc(tncp->nc_nlen + 1, tmp->tm_name_zone,
				  M_WAITOK | M_NULLOK);
		if (newname == NULL) {
			error = ENOSPC;
			goto out_locked;
		}
		bcopy(tncp->nc_name, newname, tncp->nc_nlen);
		newname[tncp->nc_nlen] = '\0';
	} else {
		newname = NULL;
	}

	/*
	 * Unlink entry from source directory.  Note that the kernel has
	 * already checked for illegal recursion cases (renaming a directory
	 * into a subdirectory of itself).
	 */
	if (fdnode != tdnode) {
		tmpfs_dir_detach(fdnode, de);
	} else {
		/* XXX depend on namecache lock */
		KKASSERT(de == tmpfs_dir_lookup(fdnode, fnode, fncp));
		RB_REMOVE(tmpfs_dirtree, &fdnode->tn_dir.tn_dirtree, de);
		RB_REMOVE(tmpfs_dirtree_cookie,
			  &fdnode->tn_dir.tn_cookietree, de);
	}

	/*
	 * Handle any name change.  Swap with newname, we will
	 * deallocate it at the end.
	 */
	if (newname != NULL) {
		oldname = de->td_name;
		de->td_name = newname;
		de->td_namelen = (uint16_t)tncp->nc_nlen;
		newname = oldname;
	}

	/*
	 * If we are overwriting an entry, we have to remove the old one
	 * from the target directory.
	 */
	if (tvp != NULL) {
		/* Remove the old entry from the target directory. */
		tde = tmpfs_dir_lookup(tdnode, tnode, tncp);
		tmpfs_dir_detach(tdnode, tde);
		tmpfs_knote(tdnode->tn_vnode, NOTE_DELETE);

		/*
		 * Free the directory entry we just deleted.  Note that the
		 * node referred by it will not be removed until the vnode is
		 * really reclaimed.
		 */
		tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde);
		/*cache_inval_vp(tvp, CINV_DESTROY);*/
	}

	/*
	 * Link entry to target directory.  If the entry
	 * represents a directory move the parent linkage
	 * as well.
	 */
	if (fdnode != tdnode) {
		if (de->td_node->tn_type == VDIR) {
			TMPFS_VALIDATE_DIR(fnode);
		}
		tmpfs_dir_attach(tdnode, de);
	} else {
		tdnode->tn_status |= TMPFS_NODE_MODIFIED;
		RB_INSERT(tmpfs_dirtree, &tdnode->tn_dir.tn_dirtree, de);
		RB_INSERT(tmpfs_dirtree_cookie,
			  &tdnode->tn_dir.tn_cookietree, de);
	}
	tmpfs_unlock4(fdnode, tdnode, fnode, tnode);

	/*
	 * Finish up
	 */
	if (newname) {
		kfree(newname, tmp->tm_name_zone);
		newname = NULL;
	}
	cache_rename(ap->a_fnch, ap->a_tnch);
	tmpfs_knote(ap->a_fdvp, NOTE_WRITE);
	tmpfs_knote(ap->a_tdvp, NOTE_WRITE);
	if (fnode->tn_vnode)
		tmpfs_knote(fnode->tn_vnode, NOTE_RENAME);
	if (tvp)
		vrele(tvp);
	return 0;

out_locked:
	tmpfs_unlock4(fdnode, tdnode, fnode, tnode);
out:
	if (tvp)
		vrele(tvp);
	return error;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_nmkdir(struct vop_nmkdir_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct vattr *vap = ap->a_vap;
	struct ucred *cred = ap->a_cred;
	int error;

	KKASSERT(vap->va_type == VDIR);

	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *vpp);
		tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK);
	}
	return error;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_nrmdir(struct vop_nrmdir_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct vnode *vp;
	struct tmpfs_dirent *de;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *dnode;
	struct tmpfs_node *node;
	int error;

	/*
	 * We have to acquire the vp from ap->a_nch because we will likely
	 * unresolve the namecache entry, and a vrele/vput is needed to
	 * trigger the tmpfs_inactive/tmpfs_reclaim sequence.
	 *
	 * We have to use vget to clear any inactive state on the vnode,
	 * otherwise the vnode may remain inactive and thus tmpfs_inactive
	 * will not get called when we release it.
	 */
	error = cache_vget(ap->a_nch, ap->a_cred, LK_SHARED, &vp);
	KKASSERT(error == 0);
	vn_unlock(vp);

	/*
	 * Prevalidate so we don't hit an assertion later
	 */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	tmp = VFS_TO_TMPFS(dvp->v_mount);
	dnode = VP_TO_TMPFS_DIR(dvp);
	node = VP_TO_TMPFS_DIR(vp);

	TMPFS_NODE_LOCK(dnode);
	TMPFS_NODE_LOCK(node);

	/*
	 * Only empty directories can be removed.
	 */
	if (node->tn_size > 0) {
		error = ENOTEMPTY;
		goto out_locked;
	}

	if ((dnode->tn_flags & APPEND)
	    || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
		error = EPERM;
		goto out_locked;
	}

	/*
	 * This invariant holds only if we are not trying to
	 * remove "..".  We checked for that above so this is safe now.
	 */
	KKASSERT(node->tn_dir.tn_parent == dnode);

	/*
	 * Get the directory entry associated with node (vp)
	 */
	de = tmpfs_dir_lookup(dnode, node, ncp);
	KKASSERT(TMPFS_DIRENT_MATCHES(de, ncp->nc_name, ncp->nc_nlen));

	/* Check flags to see if we are allowed to remove the directory. */
	if ((dnode->tn_flags & APPEND) ||
	    node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) {
		error = EPERM;
		goto out_locked;
	}

	/* Detach the directory entry from the directory (dnode). */
	tmpfs_dir_detach(dnode, de);

	/*
	 * Must set parent linkage to NULL (tested by ncreate to disallow
	 * the creation of new files/dirs in a deleted directory)
	 */
	node->tn_status |= TMPFS_NODE_CHANGED;

	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
			    TMPFS_NODE_MODIFIED;

	/* Free the directory entry we just deleted.  Note that the node
	 * referred by it will not be removed until the vnode is really
	 * reclaimed. */
	tmpfs_free_dirent(tmp, de);

	/* Release the deleted vnode (will destroy the node, notify
	 * interested parties and clean it from the cache). */

	dnode->tn_status |= TMPFS_NODE_CHANGED;

	TMPFS_NODE_UNLOCK(node);
	TMPFS_NODE_UNLOCK(dnode);

	tmpfs_update(dvp);
	cache_unlink(ap->a_nch);
	tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK);
	vrele(vp);
	return 0;

out_locked:
	TMPFS_NODE_UNLOCK(node);
	TMPFS_NODE_UNLOCK(dnode);

out:
	vrele(vp);

	return error;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_nsymlink(struct vop_nsymlink_args *ap)
{
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct namecache *ncp = ap->a_nch->ncp;
	struct vattr *vap = ap->a_vap;
	struct ucred *cred = ap->a_cred;
	char *target = ap->a_target;
	int error;

	vap->va_type = VLNK;
	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, target);
	if (error == 0) {
		tmpfs_knote(*vpp, NOTE_WRITE);
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *vpp);
	}
	return error;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_readdir(struct vop_readdir_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	int *eofflag = ap->a_eofflag;
	off_t **cookies = ap->a_cookies;
	int *ncookies = ap->a_ncookies;
	struct tmpfs_mount *tmp;
	int error;
	off_t startoff;
	off_t cnt = 0;
	struct tmpfs_node *node;

	/* This operation only makes sense on directory nodes. */
	if (vp->v_type != VDIR) {
		return ENOTDIR;
	}

	tmp = VFS_TO_TMPFS(vp->v_mount);
	node = VP_TO_TMPFS_DIR(vp);
	startoff = uio->uio_offset;

	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) {
		error = tmpfs_dir_getdotdent(node, uio);
		if (error != 0) {
			TMPFS_NODE_LOCK_SH(node);
			goto outok;
		}
		cnt++;
	}

	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) {
		/* may lock parent, cannot hold node lock */
		error = tmpfs_dir_getdotdotdent(tmp, node, uio);
		if (error != 0) {
			TMPFS_NODE_LOCK_SH(node);
			goto outok;
		}
		cnt++;
	}

	TMPFS_NODE_LOCK_SH(node);
	error = tmpfs_dir_getdents(node, uio, &cnt);

outok:
	KKASSERT(error >= -1);

	if (error == -1)
		error = 0;

	if (eofflag != NULL)
		*eofflag =
		    (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF);

	/* Update NFS-related variables. */
	if (error == 0 && cookies != NULL && ncookies != NULL) {
		off_t i;
		off_t off = startoff;
		struct tmpfs_dirent *de = NULL;

		*ncookies = cnt;
		*cookies = kmalloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK);

		for (i = 0; i < cnt; i++) {
			KKASSERT(off != TMPFS_DIRCOOKIE_EOF);
			if (off == TMPFS_DIRCOOKIE_DOT) {
				off = TMPFS_DIRCOOKIE_DOTDOT;
			} else {
				if (off == TMPFS_DIRCOOKIE_DOTDOT) {
					de = RB_MIN(tmpfs_dirtree_cookie,
						&node->tn_dir.tn_cookietree);
				} else if (de != NULL) {
					de = RB_NEXT(tmpfs_dirtree_cookie,
					       &node->tn_dir.tn_cookietree, de);
				} else {
					de = tmpfs_dir_lookupbycookie(node,
								      off);
					KKASSERT(de != NULL);
					de = RB_NEXT(tmpfs_dirtree_cookie,
					       &node->tn_dir.tn_cookietree, de);
				}
				if (de == NULL)
					off = TMPFS_DIRCOOKIE_EOF;
				else
					off = tmpfs_dircookie(de);
			}
			(*cookies)[i] = off;
		}
		KKASSERT(uio->uio_offset == off);
	}
	TMPFS_NODE_UNLOCK(node);

	if ((node->tn_status & TMPFS_NODE_ACCESSED) == 0) {
		TMPFS_NODE_LOCK(node);
		node->tn_status |= TMPFS_NODE_ACCESSED;
		TMPFS_NODE_UNLOCK(node);
	}
	return error;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_readlink(struct vop_readlink_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	int error;
	struct tmpfs_node *node;

	KKASSERT(uio->uio_offset == 0);
	KKASSERT(vp->v_type == VLNK);

	node = VP_TO_TMPFS_NODE(vp);
	TMPFS_NODE_LOCK_SH(node);
	error = uiomove(node->tn_link,
			MIN(node->tn_size, uio->uio_resid), uio);
	TMPFS_NODE_UNLOCK(node);
	if ((node->tn_status & TMPFS_NODE_ACCESSED) == 0) {
		TMPFS_NODE_LOCK(node);
		node->tn_status |= TMPFS_NODE_ACCESSED;
		TMPFS_NODE_UNLOCK(node);
	}
	return error;
}

/* --------------------------------------------------------------------- */

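/*
 * Vnode deactivation.  An unlinked node has its data thrown away via
 * tmpfs_truncate() and the vnode is recycled immediately.  Otherwise,
 * for regular files, the resident pages are parked in the backing
 * anonymous object (tn_aobj) so they survive a later reclaim of the
 * vnode; tmpfs_open()/read()/write()/setattr() move them back on next
 * use.
 */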
static int
tmpfs_inactive(struct vop_inactive_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct tmpfs_node *node;
	struct mount *mp;

	mp = vp->v_mount;
	lwkt_gettoken(&mp->mnt_token);
	node = VP_TO_TMPFS_NODE(vp);

	/*
	 * Degenerate case
	 */
	if (node == NULL) {
		vrecycle(vp);
		lwkt_reltoken(&mp->mnt_token);
		return(0);
	}

	/*
	 * Get rid of unreferenced deleted vnodes sooner rather than
	 * later so the data memory can be recovered immediately.
	 *
	 * We must truncate the vnode to prevent the normal reclamation
	 * path from flushing the data for the removed file to disk.
	 */
	TMPFS_NODE_LOCK(node);
	if (node->tn_links == 0) {
		node->tn_vpstate = TMPFS_VNODE_DOOMED;
		TMPFS_NODE_UNLOCK(node);
		if (node->tn_type == VREG)
			tmpfs_truncate(vp, 0);
		vrecycle(vp);
	} else {
		/*
		 * We must retain any VM pages belonging to the vnode's
		 * object as the vnode will destroy the object during a
		 * later reclaim.  We call vinvalbuf(V_SAVE) to clean
		 * out the buffer cache.
		 *
		 * On DragonFlyBSD, vnodes are not immediately deactivated
		 * on the 1->0 refs, so this is a relatively optimal
		 * operation.  We have to do this in tmpfs_inactive()
		 * because the pages will have already been thrown away
		 * at the time tmpfs_reclaim() is called.
		 */
		if (node->tn_type == VREG &&
		    node->tn_reg.tn_pages_in_aobj == 0) {
			vinvalbuf(vp, V_SAVE, 0, 0);
			KKASSERT(RB_EMPTY(&vp->v_rbdirty_tree));
			KKASSERT(RB_EMPTY(&vp->v_rbclean_tree));
			tmpfs_move_pages(vp->v_object, node->tn_reg.tn_aobj,
					 TMPFS_MOVF_DEACTIVATE);
			node->tn_reg.tn_pages_in_aobj = 1;
		}

		TMPFS_NODE_UNLOCK(node);
	}
	lwkt_reltoken(&mp->mnt_token);

	return 0;
}

/* --------------------------------------------------------------------- */

int
tmpfs_reclaim(struct vop_reclaim_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *node;
	struct mount *mp;

	mp = vp->v_mount;
	lwkt_gettoken(&mp->mnt_token);

	node = VP_TO_TMPFS_NODE(vp);
	tmp = VFS_TO_TMPFS(vp->v_mount);
	KKASSERT(mp == tmp->tm_mount);

	TMPFS_NODE_LOCK(node);
	KKASSERT(node->tn_vnode == vp);
	node->tn_vnode = NULL;
	vp->v_data = NULL;

	/*
	 * If the node referenced by this vnode was deleted by the
	 * user, we must free its associated data structures now that
	 * the vnode is being reclaimed.
	 *
	 * Directories have an extra link ref.
	 */
	if (node->tn_links == 0) {
		node->tn_vpstate = TMPFS_VNODE_DOOMED;
		tmpfs_free_node(tmp, node);
		/* eats the lock */
	} else {
		TMPFS_NODE_UNLOCK(node);
	}
	lwkt_reltoken(&mp->mnt_token);

	KKASSERT(vp->v_data == NULL);
	return 0;
}

/* --------------------------------------------------------------------- */

static int
tmpfs_mountctl(struct vop_mountctl_args *ap)
{
	struct tmpfs_mount *tmp;
	struct mount *mp;
	int rc;

	mp = ap->a_head.a_ops->head.vv_mount;
	lwkt_gettoken(&mp->mnt_token);

	switch (ap->a_op) {
	case (MOUNTCTL_SET_EXPORT):
		tmp = (struct tmpfs_mount *) mp->mnt_data;

		if (ap->a_ctllen != sizeof(struct export_args))
			rc = (EINVAL);
		else
			rc = vfs_export(mp, &tmp->tm_export,
					(const struct export_args *) ap->a_ctl);
		break;
	default:
		rc = vop_stdmountctl(ap);
		break;
	}

	lwkt_reltoken(&mp->mnt_token);
	return (rc);
}

/* --------------------------------------------------------------------- */

static int
tmpfs_print(struct vop_print_args *ap)
{
	struct vnode *vp = ap->a_vp;

	struct tmpfs_node *node;

	node = VP_TO_TMPFS_NODE(vp);

	kprintf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n",
	    node, node->tn_flags, node->tn_links);
	kprintf("\tmode 0%o, owner %d, group %d, size %ju, status 0x%x\n",
	    node->tn_mode, node->tn_uid, node->tn_gid,
	    (uintmax_t)node->tn_size, node->tn_status);

	if (vp->v_type == VFIFO)
		fifo_printinfo(vp);

	kprintf("\n");

	return 0;
}

/* --------------------------------------------------------------------- */

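/*
 * pathconf(2)/fpathconf(2) queries.  _PC_FILESIZEBITS is derived from
 * the mount's tm_pages_max limit; the others are system constants.
 * Illustrative userland usage (hypothetical path):
 *
 *	long bits = pathconf("/mnt/tmpfs/file", _PC_FILESIZEBITS);
 */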
static int
tmpfs_pathconf(struct vop_pathconf_args *ap)
{
	struct vnode *vp = ap->a_vp;
	int name = ap->a_name;
	register_t *retval = ap->a_retval;
	struct tmpfs_mount *tmp;
	int error;

	error = 0;

	switch (name) {
	case _PC_CHOWN_RESTRICTED:
		*retval = 1;
		break;

	case _PC_FILESIZEBITS:
		tmp = VFS_TO_TMPFS(vp->v_mount);
		*retval = max(32, flsll(tmp->tm_pages_max * PAGE_SIZE) + 1);
		break;

	case _PC_LINK_MAX:
		*retval = LINK_MAX;
		break;

	case _PC_NAME_MAX:
		*retval = NAME_MAX;
		break;

	case _PC_NO_TRUNC:
		*retval = 1;
		break;

	case _PC_PATH_MAX:
		*retval = PATH_MAX;
		break;

	case _PC_PIPE_BUF:
		*retval = PIPE_BUF;
		break;

	case _PC_SYNC_IO:
		*retval = 1;
		break;

	case _PC_2_SYMLINKS:
		*retval = 1;
		break;

	default:
		error = EINVAL;
	}

	return error;
}

/************************************************************************
 *                          KQFILTER OPS                                *
 ************************************************************************/

1941 static void filt_tmpfsdetach(struct knote *kn);
1942 static int filt_tmpfsread(struct knote *kn, long hint);
1943 static int filt_tmpfswrite(struct knote *kn, long hint);
1944 static int filt_tmpfsvnode(struct knote *kn, long hint);
1945 
1946 static struct filterops tmpfsread_filtops =
1947 	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
1948 	  NULL, filt_tmpfsdetach, filt_tmpfsread };
1949 static struct filterops tmpfswrite_filtops =
1950 	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
1951 	  NULL, filt_tmpfsdetach, filt_tmpfswrite };
1952 static struct filterops tmpfsvnode_filtops =
1953 	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
1954 	  NULL, filt_tmpfsdetach, filt_tmpfsvnode };
1955 
1956 static int
1957 tmpfs_kqfilter(struct vop_kqfilter_args *ap)
1958 {
1959 	struct vnode *vp = ap->a_vp;
1960 	struct knote *kn = ap->a_kn;
1961 
1962 	switch (kn->kn_filter) {
1963 	case EVFILT_READ:
1964 		kn->kn_fop = &tmpfsread_filtops;
1965 		break;
1966 	case EVFILT_WRITE:
1967 		kn->kn_fop = &tmpfswrite_filtops;
1968 		break;
1969 	case EVFILT_VNODE:
1970 		kn->kn_fop = &tmpfsvnode_filtops;
1971 		break;
1972 	default:
1973 		return (EOPNOTSUPP);
1974 	}
1975 
1976 	kn->kn_hook = (caddr_t)vp;
1977 
1978 	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1979 
1980 	return(0);
1981 }
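
/*
 * Illustrative userland sketch (not part of this revision): watching a
 * tmpfs file with kqueue(2).  The EVFILT_VNODE registration below maps
 * to tmpfsvnode_filtops above; EVFILT_READ would map to
 * tmpfsread_filtops.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <fcntl.h>
 *
 *	int
 *	wait_for_write(const char *path)
 *	{
 *		struct kevent kev, ev;
 *		int fd, kq;
 *
 *		fd = open(path, O_RDONLY);
 *		kq = kqueue();
 *		if (fd < 0 || kq < 0)
 *			return (-1);
 *		EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *		       NOTE_WRITE | NOTE_DELETE, 0, NULL);
 *		// Blocks until the file is written to or deleted.
 *		return (kevent(kq, &kev, 1, &ev, 1, NULL));
 *	}
 */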
1982 
1983 static void
1984 filt_tmpfsdetach(struct knote *kn)
1985 {
1986 	struct vnode *vp = (void *)kn->kn_hook;
1987 
1988 	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1989 }
1990 
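/*
 * EVFILT_READ: report the number of bytes readable at the knote's
 * current file offset.  NOTE_OLDAPI callers are always reported ready.
 */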
1991 static int
1992 filt_tmpfsread(struct knote *kn, long hint)
1993 {
1994 	struct vnode *vp = (void *)kn->kn_hook;
1995 	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
1996 	off_t off;
1997 
1998 	if (hint == NOTE_REVOKE) {
1999 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2000 		return(1);
2001 	}
2002 
2003 	/*
2004 	 * Interlock against MP races when performing this function.
2005 	 */
2006 	TMPFS_NODE_LOCK_SH(node);
2007 	off = node->tn_size - kn->kn_fp->f_offset;
2008 	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
2009 	if (kn->kn_sfflags & NOTE_OLDAPI) {
2010 		TMPFS_NODE_UNLOCK(node);
2011 		return(1);
2012 	}
2016 	TMPFS_NODE_UNLOCK(node);
2017 	return (kn->kn_data != 0);
2018 }
2019 
2020 static int
2021 filt_tmpfswrite(struct knote *kn, long hint)
2022 {
2023 	if (hint == NOTE_REVOKE)
2024 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2025 	kn->kn_data = 0;
2026 	return (1);
2027 }
2028 
2029 static int
2030 filt_tmpfsvnode(struct knote *kn, long hint)
2031 {
2032 	if (kn->kn_sfflags & hint)
2033 		kn->kn_fflags |= hint;
2034 	if (hint == NOTE_REVOKE) {
2035 		kn->kn_flags |= (EV_EOF | EV_NODATA);
2036 		return (1);
2037 	}
2038 	return (kn->kn_fflags != 0);
2039 }
2040 
2041 /*
2042  * Helper to move VM pages between objects
2043  *
2044  * NOTE: vm_page_rename() dirties the page, so we can clear the
2045  *	 PG_NEED_COMMIT flag.  If the pages are being moved into tn_aobj,
2046  *	 the pageout daemon will be able to page them out.
2047  */
2048 static int
2049 tmpfs_move_pages_callback(vm_page_t p, void *data)
2050 {
2051 	struct rb_vm_page_scan_info *info = data;
2052 	vm_pindex_t pindex;
2053 
2054 	pindex = p->pindex;
2055 	if (vm_page_busy_try(p, TRUE)) {
2056 		vm_page_sleep_busy(p, TRUE, "tpgmov");
2057 		info->error = -1;
2058 		return -1;
2059 	}
2060 	if (p->object != info->object || p->pindex != pindex) {
2061 		vm_page_wakeup(p);
2062 		info->error = -1;
2063 		return -1;
2064 	}
2065 
2066 	if ((info->pagerflags & TMPFS_MOVF_FROMBACKING) &&
2067 	    (p->flags & PG_SWAPPED) &&
2068 	    (p->flags & PG_NEED_COMMIT) == 0 &&
2069 	    p->dirty == 0) {
2070 		/*
2071 		 * If the page in the backing aobj was paged out to swap
2072 		 * it will be clean and it is better to free it rather
2073 		 * than re-dirty it.  We will assume that the page was
2074 		 * paged out to swap for a reason!
2075 		 *
2076 		 * This helps avoid unnecessary swap thrashing on the page.
2077 		 */
2078 		vm_page_free(p);
2079 	} else if ((info->pagerflags & TMPFS_MOVF_FROMBACKING) == 0 &&
2080 		   (p->flags & PG_NEED_COMMIT) == 0 &&
2081 		   p->dirty == 0) {
2082 		/*
2083 		 * If the page associated with the vnode was cleaned via
2084 		 * a tmpfs_strategy() call, it exists as a swap block in
2085 		 * aobj and it is again better to free it rather than
2086 		 * re-dirty it.  We will assume that the page was
2087 		 * paged out to swap for a reason!
2088 		 *
2089 		 * This helps avoid unnecessary swap thrashing on the page.
2090 		 */
2091 		vm_page_free(p);
2092 	} else {
2093 		/*
2094 		 * Rename the page into the target object.  This also
2095 		 * dirties the page and checks whether a swap block
2096 		 * association exists in the target object, setting the
2097 		 * appropriate flags if it does.
2098 		 */
2099 		vm_page_rename(p, info->dest_object, pindex);
2100 		vm_page_clear_commit(p);
2101 		if (info->pagerflags & TMPFS_MOVF_DEACTIVATE)
2102 			vm_page_deactivate(p);
2103 		vm_page_wakeup(p);
2104 		/* page automatically made dirty by vm_page_rename() */
2105 	}
2106 
2107 	return 0;
2108 }
2109 
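/*
 * Move all pages from the src object to the dst object, freeing clean
 * pages that are already backed by swap rather than re-dirtying them.
 * The RB-tree scan aborts and restarts (info.error < 0) whenever a
 * page cannot be busied or is raced away, and the loop does not
 * terminate until the source object is empty and no paging I/O is in
 * progress.
 */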
2110 static
2111 void
2112 tmpfs_move_pages(vm_object_t src, vm_object_t dst, int movflags)
2113 {
2114 	struct rb_vm_page_scan_info info;
2115 
2116 	vm_object_hold(src);
2117 	vm_object_hold(dst);
2118 	info.object = src;
2119 	info.dest_object = dst;
2120 	info.pagerflags = movflags;
2121 	do {
2122 		if (src->paging_in_progress)
2123 			vm_object_pip_wait(src, "objtfs");
2124 		info.error = 1;
2125 		vm_page_rb_tree_RB_SCAN(&src->rb_memq, NULL,
2126 					tmpfs_move_pages_callback, &info);
2127 	} while (info.error < 0 || !RB_EMPTY(&src->rb_memq) ||
2128 		 src->paging_in_progress);
2129 	vm_object_drop(dst);
2130 	vm_object_drop(src);
2131 }
2132 
2133 /* --------------------------------------------------------------------- */
2134 
2135 /*
2136  * vnode operations vector used for files stored in a tmpfs file system.
2137  */
2138 struct vop_ops tmpfs_vnode_vops = {
2139 	.vop_default =			vop_defaultop,
2140 	.vop_getpages = 		vop_stdgetpages,
2141 	.vop_putpages = 		vop_stdputpages,
2142 	.vop_ncreate =			tmpfs_ncreate,
2143 	.vop_nresolve =			tmpfs_nresolve,
2144 	.vop_nlookupdotdot =		tmpfs_nlookupdotdot,
2145 	.vop_nmknod =			tmpfs_nmknod,
2146 	.vop_open =			tmpfs_open,
2147 	.vop_close =			tmpfs_close,
2148 	.vop_access =			tmpfs_access,
2149 	.vop_getattr =			tmpfs_getattr,
2150 	.vop_getattr_quick =		tmpfs_getattr_quick,
2151 	.vop_setattr =			tmpfs_setattr,
2152 	.vop_read =			tmpfs_read,
2153 	.vop_write =			tmpfs_write,
2154 	.vop_fsync =			tmpfs_fsync,
2155 	.vop_mountctl =			tmpfs_mountctl,
2156 	.vop_nremove =			tmpfs_nremove,
2157 	.vop_nlink =			tmpfs_nlink,
2158 	.vop_nrename =			tmpfs_nrename,
2159 	.vop_nmkdir =			tmpfs_nmkdir,
2160 	.vop_nrmdir =			tmpfs_nrmdir,
2161 	.vop_nsymlink =			tmpfs_nsymlink,
2162 	.vop_readdir =			tmpfs_readdir,
2163 	.vop_readlink =			tmpfs_readlink,
2164 	.vop_inactive =			tmpfs_inactive,
2165 	.vop_reclaim =			tmpfs_reclaim,
2166 	.vop_print =			tmpfs_print,
2167 	.vop_pathconf =			tmpfs_pathconf,
2168 	.vop_bmap =			tmpfs_bmap,
2169 	.vop_strategy =			tmpfs_strategy,
2170 	.vop_advlock =			tmpfs_advlock,
2171 	.vop_kqfilter =			tmpfs_kqfilter
2172 };
2173