xref: /dragonfly/sys/vfs/tmpfs/tmpfs_vnops.c (revision 7b8d89eb)
1 /*-
2  * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The NetBSD Foundation
6  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
7  * 2005 program.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $
31  */
32 
33 /*
34  * tmpfs vnode interface.
35  */
36 
37 #include <sys/kernel.h>
38 #include <sys/kern_syscall.h>
39 #include <sys/param.h>
40 #include <sys/uio.h>
41 #include <sys/fcntl.h>
42 #include <sys/lockf.h>
43 #include <sys/priv.h>
44 #include <sys/proc.h>
45 #include <sys/resourcevar.h>
46 #include <sys/sched.h>
47 #include <sys/stat.h>
48 #include <sys/systm.h>
49 #include <sys/sysctl.h>
50 #include <sys/unistd.h>
51 #include <sys/vfsops.h>
52 #include <sys/vnode.h>
53 #include <sys/mountctl.h>
54 
55 #include <vm/vm.h>
56 #include <vm/vm_extern.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/vm_pager.h>
61 #include <vm/swap_pager.h>
62 
63 #include <sys/buf2.h>
64 #include <vm/vm_page2.h>
65 
66 #include <vfs/fifofs/fifo.h>
67 #include <vfs/tmpfs/tmpfs_vnops.h>
68 #include "tmpfs.h"
69 
70 static void tmpfs_strategy_done(struct bio *bio);
71 static void tmpfs_move_pages(vm_object_t src, vm_object_t dst, int movflags);
72 
73 /*
74  * bufcache_mode:
75  *	0	Normal page queue operation on flush.  Try to keep in memory.
76  *	1	Try to cache on flush to swap (default).
77  *	2	Always page to swap (not recommended).
78  */
79 __read_mostly static int tmpfs_cluster_rd_enable = 1;
80 __read_mostly static int tmpfs_cluster_wr_enable = 1;
81 __read_mostly static int tmpfs_bufcache_mode = 1;
82 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "TMPFS filesystem");
83 SYSCTL_INT(_vfs_tmpfs, OID_AUTO, cluster_rd_enable, CTLFLAG_RW,
84 		&tmpfs_cluster_rd_enable, 0, "Enable read clustering");
85 SYSCTL_INT(_vfs_tmpfs, OID_AUTO, cluster_wr_enable, CTLFLAG_RW,
86 		&tmpfs_cluster_wr_enable, 0, "Enable write clustering");
87 SYSCTL_INT(_vfs_tmpfs, OID_AUTO, bufcache_mode, CTLFLAG_RW,
88 		&tmpfs_bufcache_mode, 0, "Page flush behavior (0-2, see above)");
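
/*
 * Runtime tuning example (userland, illustrative values):
 *
 *	sysctl vfs.tmpfs.bufcache_mode=2	# always page to swap
 *	sysctl vfs.tmpfs.cluster_rd_enable=0	# disable read clustering
 */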
89 
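/*
 * Flags for tmpfs_move_pages(): FROMBACKING indicates pages are being
 * recovered from the backing aobj into the vnode's VM object (rather
 * than being stashed the other way); DEACTIVATE asks that moved pages
 * be placed on the inactive queue.
 */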
90 #define TMPFS_MOVF_FROMBACKING	0x0001
91 #define TMPFS_MOVF_DEACTIVATE	0x0002
92 
93 
94 static __inline
95 void
96 tmpfs_knote(struct vnode *vp, int flags)
97 {
98 	if (flags)
99 		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
100 }
101 
102 
103 /* --------------------------------------------------------------------- */
104 
105 static int
106 tmpfs_nresolve(struct vop_nresolve_args *ap)
107 {
108 	struct vnode *dvp = ap->a_dvp;
109 	struct vnode *vp = NULL;
110 	struct namecache *ncp = ap->a_nch->ncp;
111 	struct tmpfs_node *tnode;
112 	struct tmpfs_dirent *de;
113 	struct tmpfs_node *dnode;
114 	int error;
115 
116 	dnode = VP_TO_TMPFS_DIR(dvp);
117 
118 	TMPFS_NODE_LOCK_SH(dnode);
119 loop:
120 	de = tmpfs_dir_lookup(dnode, NULL, ncp);
121 	if (de == NULL) {
122 		error = ENOENT;
123 	} else {
124 		/*
125 		 * Allocate a vnode for the node we found.  Use
126 		 * tmpfs_alloc_vp()'s deadlock handling mode.
127 		 */
128 		tnode = de->td_node;
129 		error = tmpfs_alloc_vp(dvp->v_mount, dnode, tnode,
130 				       LK_EXCLUSIVE | LK_RETRY, &vp);
131 		if (error == EAGAIN)
132 			goto loop;
133 		if (error)
134 			goto out;
135 		KKASSERT(vp);
136 	}
137 
138 out:
139 	TMPFS_NODE_UNLOCK(dnode);
140 
141 	if ((dnode->tn_status & TMPFS_NODE_ACCESSED) == 0) {
142 		TMPFS_NODE_LOCK(dnode);
143 		dnode->tn_status |= TMPFS_NODE_ACCESSED;
144 		TMPFS_NODE_UNLOCK(dnode);
145 	}
146 
147 	/*
148 	 * Store the result of this lookup in the cache.  Avoid this if the
149 	 * request was for creation, as doing so does not improve timings in
150 	 * empirical tests.
151 	 */
152 	if (vp) {
153 		vn_unlock(vp);
154 		cache_setvp(ap->a_nch, vp);
155 		vrele(vp);
156 	} else if (error == ENOENT) {
157 		cache_setvp(ap->a_nch, NULL);
158 	}
159 	return (error);
160 }
161 
162 static int
163 tmpfs_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
164 {
165 	struct vnode *dvp = ap->a_dvp;
166 	struct vnode **vpp = ap->a_vpp;
167 	struct tmpfs_node *dnode = VP_TO_TMPFS_NODE(dvp);
168 	struct ucred *cred = ap->a_cred;
169 	int error;
170 
171 	*vpp = NULL;
172 
173 	/* Check accessibility of requested node as a first step. */
174 	error = VOP_ACCESS(dvp, VEXEC, cred);
175 	if (error != 0)
176 		return error;
177 
178 	if (dnode->tn_dir.tn_parent != NULL) {
179 		/* Allocate a new vnode on the matching entry. */
180 		error = tmpfs_alloc_vp(dvp->v_mount,
181 				       NULL, dnode->tn_dir.tn_parent,
182 				       LK_EXCLUSIVE | LK_RETRY, vpp);
183 
184 		if (*vpp)
185 			vn_unlock(*vpp);
186 	}
187 	return (*vpp == NULL) ? ENOENT : 0;
188 }
189 
190 /* --------------------------------------------------------------------- */
191 
192 static int
193 tmpfs_ncreate(struct vop_ncreate_args *ap)
194 {
195 	struct vnode *dvp = ap->a_dvp;
196 	struct vnode **vpp = ap->a_vpp;
197 	struct namecache *ncp = ap->a_nch->ncp;
198 	struct vattr *vap = ap->a_vap;
199 	struct ucred *cred = ap->a_cred;
200 	int error;
201 
202 	KKASSERT(vap->va_type == VREG || vap->va_type == VSOCK);
203 
204 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
205 	if (error == 0) {
206 		cache_setunresolved(ap->a_nch);
207 		cache_setvp(ap->a_nch, *vpp);
208 		tmpfs_knote(dvp, NOTE_WRITE);
209 	}
210 	return (error);
211 }

212 /* --------------------------------------------------------------------- */
213 
214 static int
215 tmpfs_nmknod(struct vop_nmknod_args *ap)
216 {
217 	struct vnode *dvp = ap->a_dvp;
218 	struct vnode **vpp = ap->a_vpp;
219 	struct namecache *ncp = ap->a_nch->ncp;
220 	struct vattr *vap = ap->a_vap;
221 	struct ucred *cred = ap->a_cred;
222 	int error;
223 
224 	if (vap->va_type != VBLK && vap->va_type != VCHR &&
225 	    vap->va_type != VFIFO) {
226 		return (EINVAL);
227 	}
228 
229 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
230 	if (error == 0) {
231 		cache_setunresolved(ap->a_nch);
232 		cache_setvp(ap->a_nch, *vpp);
233 		tmpfs_knote(dvp, NOTE_WRITE);
234 	}
235 	return error;
236 }
237 
238 /* --------------------------------------------------------------------- */
239 
240 static int
241 tmpfs_open(struct vop_open_args *ap)
242 {
243 	struct vnode *vp = ap->a_vp;
244 	int mode = ap->a_mode;
245 	struct tmpfs_node *node;
246 	int error;
247 
248 	node = VP_TO_TMPFS_NODE(vp);
249 
250 #if 0
251 	/* The file is still active but all its names have been removed
252 	 * (e.g. by a "rmdir $(pwd)").  It cannot be opened any more as
253 	 * it is about to die. */
254 	if (node->tn_links < 1)
255 		return (ENOENT);
256 #endif
257 
258 	/* If the file is marked append-only, deny write requests. */
259 	if ((node->tn_flags & APPEND) &&
260 	    (mode & (FWRITE | O_APPEND)) == FWRITE) {
261 		error = EPERM;
262 	} else {
263 		if (node->tn_reg.tn_pages_in_aobj) {
264 			TMPFS_NODE_LOCK(node);
265 			if (node->tn_reg.tn_pages_in_aobj) {
266 				tmpfs_move_pages(node->tn_reg.tn_aobj,
267 						 vp->v_object,
268 						 TMPFS_MOVF_FROMBACKING);
269 				node->tn_reg.tn_pages_in_aobj = 0;
270 			}
271 			TMPFS_NODE_UNLOCK(node);
272 		}
273 		error = vop_stdopen(ap);
274 	}
275 
276 	return (error);
277 }
278 
279 /* --------------------------------------------------------------------- */
280 
281 static int
282 tmpfs_close(struct vop_close_args *ap)
283 {
284 	struct vnode *vp = ap->a_vp;
285 	struct tmpfs_node *node;
286 	int error;
287 
288 	node = VP_TO_TMPFS_NODE(vp);
289 
290 	if (node->tn_links > 0) {
291 		/*
292 		 * Update node times.  No need to do it if the node has
293 		 * been deleted, because it will vanish after we return.
294 		 */
295 		tmpfs_update(vp);
296 	}
297 
298 	error = vop_stdclose(ap);
299 
300 	return (error);
301 }
302 
303 /* --------------------------------------------------------------------- */
304 
305 int
306 tmpfs_access(struct vop_access_args *ap)
307 {
308 	struct vnode *vp = ap->a_vp;
309 	int error;
310 	struct tmpfs_node *node;
311 
312 	node = VP_TO_TMPFS_NODE(vp);
313 
314 	switch (vp->v_type) {
315 	case VDIR:
316 		/* FALLTHROUGH */
317 	case VLNK:
318 		/* FALLTHROUGH */
319 	case VREG:
320 		if ((ap->a_mode & VWRITE) &&
321 	            (vp->v_mount->mnt_flag & MNT_RDONLY)) {
322 			error = EROFS;
323 			goto out;
324 		}
325 		break;
326 
327 	case VBLK:
328 		/* FALLTHROUGH */
329 	case VCHR:
330 		/* FALLTHROUGH */
331 	case VSOCK:
332 		/* FALLTHROUGH */
333 	case VFIFO:
334 		break;
335 
336 	default:
337 		error = EINVAL;
338 		goto out;
339 	}
340 
341 	if ((ap->a_mode & VWRITE) && (node->tn_flags & IMMUTABLE)) {
342 		error = EPERM;
343 		goto out;
344 	}
345 
346 	error = vop_helper_access(ap, node->tn_uid, node->tn_gid,
347 			          node->tn_mode, 0);
348 out:
349 	return error;
350 }
351 
352 /* --------------------------------------------------------------------- */
353 
354 int
355 tmpfs_getattr(struct vop_getattr_args *ap)
356 {
357 	struct vnode *vp = ap->a_vp;
358 	struct vattr *vap = ap->a_vap;
359 	struct tmpfs_node *node;
360 
361 	node = VP_TO_TMPFS_NODE(vp);
362 
363 	tmpfs_update(vp);
364 
365 	TMPFS_NODE_LOCK_SH(node);
366 	vap->va_type = vp->v_type;
367 	vap->va_mode = node->tn_mode;
368 	vap->va_nlink = node->tn_links;
369 	vap->va_uid = node->tn_uid;
370 	vap->va_gid = node->tn_gid;
371 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
372 	vap->va_fileid = node->tn_id;
373 	vap->va_size = node->tn_size;
374 	vap->va_blocksize = PAGE_SIZE;
375 	vap->va_atime.tv_sec = node->tn_atime;
376 	vap->va_atime.tv_nsec = node->tn_atimensec;
377 	vap->va_mtime.tv_sec = node->tn_mtime;
378 	vap->va_mtime.tv_nsec = node->tn_mtimensec;
379 	vap->va_ctime.tv_sec = node->tn_ctime;
380 	vap->va_ctime.tv_nsec = node->tn_ctimensec;
381 	vap->va_gen = node->tn_gen;
382 	vap->va_flags = node->tn_flags;
383 	if (vp->v_type == VBLK || vp->v_type == VCHR) {
384 		vap->va_rmajor = umajor(node->tn_rdev);
385 		vap->va_rminor = uminor(node->tn_rdev);
386 	}
387 	vap->va_bytes = round_page(node->tn_size);
388 	vap->va_filerev = 0;
389 	TMPFS_NODE_UNLOCK(node);
390 
391 	return 0;
392 }
393 
394 /* --------------------------------------------------------------------- */
395 
396 int
397 tmpfs_setattr(struct vop_setattr_args *ap)
398 {
399 	struct vnode *vp = ap->a_vp;
400 	struct vattr *vap = ap->a_vap;
401 	struct ucred *cred = ap->a_cred;
402 	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
403 	int error = 0;
404 	int kflags = 0;
405 
406 	TMPFS_NODE_LOCK(node);
407 	if (error == 0 && (vap->va_flags != VNOVAL)) {
408 		error = tmpfs_chflags(vp, vap->va_flags, cred);
409 		kflags |= NOTE_ATTRIB;
410 	}
411 
412 	if (error == 0 && (vap->va_size != VNOVAL)) {
413 		/* restore any saved pages before proceeding */
414 		if (node->tn_reg.tn_pages_in_aobj) {
415 			tmpfs_move_pages(node->tn_reg.tn_aobj, vp->v_object,
416 					 TMPFS_MOVF_FROMBACKING |
417 					 TMPFS_MOVF_DEACTIVATE);
418 			node->tn_reg.tn_pages_in_aobj = 0;
419 		}
420 		if (vap->va_size > node->tn_size)
421 			kflags |= NOTE_WRITE | NOTE_EXTEND;
422 		else
423 			kflags |= NOTE_WRITE;
424 		error = tmpfs_chsize(vp, vap->va_size, cred);
425 	}
426 
427 	if (error == 0 && (vap->va_uid != (uid_t)VNOVAL ||
428 			   vap->va_gid != (gid_t)VNOVAL)) {
429 		error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred);
430 		kflags |= NOTE_ATTRIB;
431 	}
432 
433 	if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) {
434 		error = tmpfs_chmod(vp, vap->va_mode, cred);
435 		kflags |= NOTE_ATTRIB;
436 	}
437 
438 	if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL &&
439 	    vap->va_atime.tv_nsec != VNOVAL) ||
440 	    (vap->va_mtime.tv_sec != VNOVAL &&
441 	    vap->va_mtime.tv_nsec != VNOVAL) )) {
442 		error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime,
443 				      vap->va_vaflags, cred);
444 		kflags |= NOTE_ATTRIB;
445 	}
446 
447 	/*
448 	 * Update the node times.  We give preference to the error codes
449 	 * generated by this function rather than the ones that may arise
450 	 * from tmpfs_update.
451 	 */
452 	tmpfs_update(vp);
453 	TMPFS_NODE_UNLOCK(node);
454 	tmpfs_knote(vp, kflags);
455 
456 	return (error);
457 }
458 
459 /* --------------------------------------------------------------------- */
460 
461 /*
462  * fsync is usually a NOP, but we must take action when unmounting or
463  * when recycling.
464  */
465 static int
466 tmpfs_fsync(struct vop_fsync_args *ap)
467 {
468 	struct tmpfs_node *node;
469 	struct vnode *vp = ap->a_vp;
470 
471 	node = VP_TO_TMPFS_NODE(vp);
472 
473 	/*
474 	 * tmpfs vnodes typically remain dirty; avoid long syncer scans
475 	 * by forcing removal from the syncer list.
476 	 */
477 	vn_syncer_remove(vp, 1);
478 
479 	tmpfs_update(vp);
480 	if (vp->v_type == VREG) {
481 		if (vp->v_flag & VRECLAIMED) {
482 			if (node->tn_links == 0)
483 				tmpfs_truncate(vp, 0);
484 			else
485 				vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
486 		}
487 	}
488 
489 	return 0;
490 }
491 
492 /* --------------------------------------------------------------------- */
493 
494 static int
495 tmpfs_read(struct vop_read_args *ap)
496 {
497 	struct buf *bp;
498 	struct vnode *vp = ap->a_vp;
499 	struct uio *uio = ap->a_uio;
500 	struct tmpfs_node *node;
501 	off_t base_offset;
502 	size_t offset;
503 	size_t len;
504 	size_t resid;
505 	int error;
506 	int seqcount;
507 
508 	/*
509 	 * Check the basics
510 	 */
511 	if (uio->uio_offset < 0)
512 		return (EINVAL);
513 	if (vp->v_type != VREG)
514 		return (EINVAL);
515 
516 	/*
517 	 * Extract the node, then try to shortcut the operation through
518 	 * the VM page cache, allowing us to avoid buffer cache
519 	 * overheads.
520 	 */
521 	node = VP_TO_TMPFS_NODE(vp);
522 	resid = uio->uio_resid;
523 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
524 	error = vop_helper_read_shortcut(ap);
525 	if (error)
526 		return error;
527 	if (uio->uio_resid == 0) {
528 		if (resid)
529 			goto finished;
530 		return error;
531 	}
532 
533 	/*
534 	 * restore any saved pages before proceeding
535 	 */
536 	if (node->tn_reg.tn_pages_in_aobj) {
537 		TMPFS_NODE_LOCK(node);
538 		if (node->tn_reg.tn_pages_in_aobj) {
539 			tmpfs_move_pages(node->tn_reg.tn_aobj, vp->v_object,
540 					 TMPFS_MOVF_FROMBACKING);
541 			node->tn_reg.tn_pages_in_aobj = 0;
542 		}
543 		TMPFS_NODE_UNLOCK(node);
544 	}
545 
546 	/*
547 	 * Fall-through to our normal read code.
548 	 */
549 	while (uio->uio_resid > 0 && uio->uio_offset < node->tn_size) {
550 		/*
551 		 * Use buffer cache I/O (via tmpfs_strategy)
552 		 */
553 		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
554 		base_offset = (off_t)uio->uio_offset - offset;
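		/*
		 * e.g., assuming TMPFS_BLKSIZE is 16KB: uio_offset 20000
		 * gives offset 3616 within the block and base_offset 16384.
		 */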
555 		bp = getcacheblk(vp, base_offset, TMPFS_BLKSIZE, GETBLK_KVABIO);
556 		if (bp == NULL) {
557 			if (tmpfs_cluster_rd_enable) {
558 				error = cluster_readx(vp, node->tn_size,
559 						     base_offset,
560 						     TMPFS_BLKSIZE,
561 						     B_NOTMETA | B_KVABIO,
562 						     uio->uio_resid,
563 						     seqcount * MAXBSIZE,
564 						     &bp);
565 			} else {
566 				error = bread_kvabio(vp, base_offset,
567 						     TMPFS_BLKSIZE, &bp);
568 			}
569 			if (error) {
570 				brelse(bp);
571 				kprintf("tmpfs_read bread error %d\n", error);
572 				break;
573 			}
574 
575 			/*
576 			 * tmpfs pretty much fiddles directly with the VM
577 			 * system, don't let it exhaust it or we won't play
578 			 * nice with other processes.
579 			 *
580 			 * Only do this if the VOP is coming from a normal
581 			 * read/write.  The VM system handles the case for
582 			 * UIO_NOCOPY.
583 			 */
584 			if (uio->uio_segflg != UIO_NOCOPY)
585 				vm_wait_nominal();
586 		}
587 		bp->b_flags |= B_CLUSTEROK;
588 		bkvasync(bp);
589 
590 		/*
591 		 * Figure out how many bytes we can actually copy this loop.
592 		 */
593 		len = TMPFS_BLKSIZE - offset;
594 		if (len > uio->uio_resid)
595 			len = uio->uio_resid;
596 		if (len > node->tn_size - uio->uio_offset)
597 			len = (size_t)(node->tn_size - uio->uio_offset);
598 
599 		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
600 		bqrelse(bp);
601 		if (error) {
602 			kprintf("tmpfs_read uiomove error %d\n", error);
603 			break;
604 		}
605 	}
606 
607 finished:
608 	if ((node->tn_status & TMPFS_NODE_ACCESSED) == 0) {
609 		TMPFS_NODE_LOCK(node);
610 		node->tn_status |= TMPFS_NODE_ACCESSED;
611 		TMPFS_NODE_UNLOCK(node);
612 	}
613 	return (error);
614 }
615 
616 static int
617 tmpfs_write(struct vop_write_args *ap)
618 {
619 	struct buf *bp;
620 	struct vnode *vp = ap->a_vp;
621 	struct uio *uio = ap->a_uio;
622 	struct thread *td = uio->uio_td;
623 	struct tmpfs_node *node;
624 	boolean_t extended;
625 	off_t oldsize;
626 	int error;
627 	off_t base_offset;
628 	size_t offset;
629 	size_t len;
630 	struct rlimit limit;
631 	int trivial = 0;
632 	int kflags = 0;
633 	int seqcount;
634 
635 	error = 0;
636 	if (uio->uio_resid == 0) {
637 		return error;
638 	}
639 
640 	node = VP_TO_TMPFS_NODE(vp);
641 
642 	if (vp->v_type != VREG)
643 		return (EINVAL);
644 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
645 
646 	TMPFS_NODE_LOCK(node);
647 
648 	/*
649 	 * restore any saved pages before proceeding
650 	 */
651 	if (node->tn_reg.tn_pages_in_aobj) {
652 		tmpfs_move_pages(node->tn_reg.tn_aobj, vp->v_object,
653 				 TMPFS_MOVF_FROMBACKING);
654 		node->tn_reg.tn_pages_in_aobj = 0;
655 	}
656 
657 	oldsize = node->tn_size;
658 	if (ap->a_ioflag & IO_APPEND)
659 		uio->uio_offset = node->tn_size;
660 
661 	/*
662 	 * Check for illegal write offsets.
663 	 */
664 	if (uio->uio_offset + uio->uio_resid >
665 	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
666 		error = EFBIG;
667 		goto done;
668 	}
669 
670 	/*
671 	 * NOTE: Ignore if UIO does not come from a user thread (e.g. VN).
672 	 */
673 	if (vp->v_type == VREG && td != NULL && td->td_lwp != NULL) {
674 		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
675 		if (error)
676 			goto done;
677 		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
678 			ksignal(td->td_proc, SIGXFSZ);
679 			error = EFBIG;
680 			goto done;
681 		}
682 	}
683 
684 	/*
685 	 * Note whether this write will extend the file, for rollback on error
686 	 */
687 	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);
688 
689 	while (uio->uio_resid > 0) {
690 		/*
691 		 * Don't completely blow out running buffer I/O
692 		 * when being hit from the pageout daemon.
693 		 */
694 		if (uio->uio_segflg == UIO_NOCOPY &&
695 		    (ap->a_ioflag & IO_RECURSE) == 0) {
696 			bwillwrite(TMPFS_BLKSIZE);
697 		}
698 
699 		/*
700 		 * Use buffer cache I/O (via tmpfs_strategy)
701 		 */
702 		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
703 		base_offset = (off_t)uio->uio_offset - offset;
704 		len = TMPFS_BLKSIZE - offset;
705 		if (len > uio->uio_resid)
706 			len = uio->uio_resid;
707 
708 		if ((uio->uio_offset + len) > node->tn_size) {
709 			trivial = (uio->uio_offset <= node->tn_size);
710 			error = tmpfs_reg_resize(vp, uio->uio_offset + len,
711 						 trivial);
712 			if (error)
713 				break;
714 		}
715 
716 		/*
717 		 * Read to fill in any gaps.  Theoretically we could
718 		 * optimize this if the write covers the entire buffer
719 		 * and is not a UIO_NOCOPY write, however this can lead
720 		 * to a security violation exposing random kernel memory
721 		 * (whatever junk was in the backing VM pages before).
722 		 *
723 		 * So just use bread() to do the right thing.
724 		 */
725 		error = bread_kvabio(vp, base_offset, TMPFS_BLKSIZE, &bp);
		if (error) {
			kprintf("tmpfs_write bread error %d\n", error);
			brelse(bp);
			break;
		}
726 		bkvasync(bp);
727 		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
728 		if (error) {
729 			kprintf("tmpfs_write uiomove error %d\n", error);
730 			brelse(bp);
731 			break;
732 		}
733 
734 		if (uio->uio_offset > node->tn_size) {
735 			node->tn_size = uio->uio_offset;
736 			kflags |= NOTE_EXTEND;
737 		}
738 		kflags |= NOTE_WRITE;
739 
740 		/*
741 		 * UIO_NOCOPY is a sensitive state due to potentially being
742 		 * issued from the pageout daemon while in a low-memory
743 		 * situation.  However, in order to cluster the I/O nicely
744 		 * (e.g. 64KB+ writes instead of 16KB writes), we still try
745 		 * to follow the same semantics that any other filesystem
746 		 * might use.
747 		 *
748 		 * For the normal case we buwrite(), dirtying the underlying
749 		 * VM pages instead of dirtying the buffer and releasing the
750 		 * buffer as a clean buffer.  This allows tmpfs to use
751 		 * essentially all available memory to cache file data.
752 		 * If we used bdwrite() the buffer cache would wind up
753 		 * flushing the data to swap too quickly.
754 		 *
755 		 * But because tmpfs can seriously load the VM system we
756 		 * fall-back to using bdwrite() when free memory starts
757 		 * to get low.  This shifts the load away from the VM system
758 		 * and makes tmpfs act more like a normal filesystem with
759 		 * regards to disk activity.
760 		 *
761 		 * tmpfs pretty much fiddles directly with the VM
762 		 * system, don't let it exhaust it or we won't play
763 		 * nice with other processes.  Only do this if the
764 		 * VOP is coming from a normal read/write.  The VM system
765 		 * handles the case for UIO_NOCOPY.
766 		 */
767 		bp->b_flags |= B_CLUSTEROK;
768 		if (uio->uio_segflg == UIO_NOCOPY) {
769 			/*
770 			 * Flush from the pageout daemon, deal with
771 			 * potentially very heavy tmpfs write activity
772 			 * causing long stalls in the pageout daemon
773 			 * before pages get to free/cache.
774 			 *
775 			 * (a) Under severe pressure setting B_DIRECT will
776 			 *     cause a buffer release to try to free the
777 			 *     underlying pages.
778 			 *
779 			 * (b) Under modest memory pressure the B_RELBUF
780 			 *     alone is sufficient to get the pages moved
781 			 *     to the cache.  We could also force this by
782 			 *     setting B_NOTMETA but that might have other
783 			 *     unintended side-effects (e.g. setting
784 			 *     PG_NOTMETA on the VM page).
785 			 *
786 			 * (c) For the pageout->putpages->generic_putpages->
787 			 *     UIO_NOCOPY-write (here), issuing an immediate
788 			 *     write prevents any real clustering from
789 			 *     happening because the buffers probably aren't
790 			 *     (yet) marked dirty, or lost due to prior use
791 			 *     of buwrite().  Try to use the normal
792 			 *     cluster_write() mechanism for performance.
793 			 *
794 			 * Hopefully this will unblock the VM system more
795 			 * quickly under extreme tmpfs write load.
796 			 */
797 			if (vm_page_count_min(vm_page_free_hysteresis))
798 				bp->b_flags |= B_DIRECT;
799 			bp->b_flags |= B_AGE | B_RELBUF | B_TTC;
800 			bp->b_act_count = 0;	/* buffer->deactivate pgs */
801 			if (tmpfs_cluster_wr_enable &&
802 			    (ap->a_ioflag & (IO_SYNC | IO_DIRECT)) == 0) {
803 				cluster_write(bp, node->tn_size,
804 					      TMPFS_BLKSIZE, seqcount);
805 			} else {
806 				cluster_awrite(bp);
807 			}
808 		} else if (vm_pages_needed || vm_paging_needed(0) ||
809 			   tmpfs_bufcache_mode >= 2) {
810 			/*
811 			 * If the pageout daemon is running we cycle the
812 			 * write through the buffer cache normally to
813 			 * pipeline the flush, thus avoiding adding any
814 			 * more memory pressure to the pageout daemon.
815 			 */
816 			bp->b_act_count = 0;	/* buffer->deactivate pgs */
817 			if (tmpfs_cluster_wr_enable) {
818 				cluster_write(bp, node->tn_size,
819 					      TMPFS_BLKSIZE, seqcount);
820 			} else {
821 				bdwrite(bp);
822 			}
823 		} else {
824 			/*
825 			 * Otherwise run the buffer directly through to the
826 			 * backing VM store, leaving the buffer clean so
827 			 * buffer limits do not force early flushes to swap.
828 			 */
829 			buwrite(bp);
830 			/*vm_wait_nominal();*/
831 		}
832 
833 		if (bp->b_error) {
834 			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
835 			break;
836 		}
837 	}
838 
839 	if (error) {
840 		if (extended) {
841 			(void)tmpfs_reg_resize(vp, oldsize, trivial);
842 			kflags &= ~NOTE_EXTEND;
843 		}
844 		goto done;
845 	}
846 
847 	/*
848 	 * Currently we don't set the mtime on files modified via mmap()
849 	 * because we can't tell the difference between those modifications
850 	 * and an attempt by the pageout daemon to flush tmpfs pages to
851 	 * swap.
852 	 *
853 	 * This is because in order to defer flushes as long as possible
854 	 * buwrite() works by marking the underlying VM pages dirty in
855 	 * order to be able to dispose of the buffer cache buffer without
856 	 * flushing it.
857 	 */
858 	if (uio->uio_segflg == UIO_NOCOPY) {
859 		if (vp->v_flag & VLASTWRITETS) {
860 			node->tn_mtime = vp->v_lastwrite_ts.tv_sec;
861 			node->tn_mtimensec = vp->v_lastwrite_ts.tv_nsec;
862 		}
863 	} else {
864 		node->tn_status |= TMPFS_NODE_MODIFIED;
865 		vclrflags(vp, VLASTWRITETS);
866 	}
867 
868 	if (extended)
869 		node->tn_status |= TMPFS_NODE_CHANGED;
870 
871 	if (node->tn_mode & (S_ISUID | S_ISGID)) {
872 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
873 			node->tn_mode &= ~(S_ISUID | S_ISGID);
874 	}
875 done:
876 	TMPFS_NODE_UNLOCK(node);
877 	if (kflags)
878 		tmpfs_knote(vp, kflags);
879 
880 	return(error);
881 }
882 
883 static int
884 tmpfs_advlock(struct vop_advlock_args *ap)
885 {
886 	struct tmpfs_node *node;
887 	struct vnode *vp = ap->a_vp;
888 	int error;
889 
890 	node = VP_TO_TMPFS_NODE(vp);
891 	error = (lf_advlock(ap, &node->tn_advlock, node->tn_size));
892 
893 	return (error);
894 }
895 
896 /*
897  * The strategy function is typically only called when memory pressure
898  * forces the system to attempt to pageout pages.  It can also be called
899  * by [n]vtruncbuf() when a truncation cuts a page in half.  Normal write
900  * operations are buffered and reach this function only on flush.
901  *
902  * We set VKVABIO for VREG files so bp->b_data may not be synchronized to
903  * our cpu.  swap_pager_strategy() is all we really use, and it directly
904  * supports this.
905  */
906 static int
907 tmpfs_strategy(struct vop_strategy_args *ap)
908 {
909 	struct bio *bio = ap->a_bio;
910 	struct bio *nbio;
911 	struct buf *bp = bio->bio_buf;
912 	struct vnode *vp = ap->a_vp;
913 	struct tmpfs_node *node;
914 	vm_object_t uobj;
915 	vm_page_t m;
916 	int i;
917 
918 	if (vp->v_type != VREG) {
919 		bp->b_resid = bp->b_bcount;
920 		bp->b_flags |= B_ERROR | B_INVAL;
921 		bp->b_error = EINVAL;
922 		biodone(bio);
923 		return(0);
924 	}
925 
926 	node = VP_TO_TMPFS_NODE(vp);
927 
928 	uobj = node->tn_reg.tn_aobj;
929 
930 	/*
931 	 * Don't bother flushing to swap if there is no swap, just
932 	 * ensure that the pages are marked as needing a commit (still).
933 	 */
934 	if (bp->b_cmd == BUF_CMD_WRITE && vm_swap_size == 0) {
935 		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
936 			m = bp->b_xio.xio_pages[i];
937 			vm_page_need_commit(m);
938 		}
939 		bp->b_resid = 0;
940 		bp->b_error = 0;
941 		biodone(bio);
942 	} else {
943 		/*
944 		 * Tell the buffer cache to try to recycle the pages
945 		 * to PQ_CACHE on release.
946 		 */
947 		if (tmpfs_bufcache_mode >= 2 ||
948 		    (tmpfs_bufcache_mode == 1 && vm_paging_needed(0))) {
949 			bp->b_flags |= B_TTC;
950 		}
951 		nbio = push_bio(bio);
952 		nbio->bio_done = tmpfs_strategy_done;
953 		nbio->bio_offset = bio->bio_offset;
954 		swap_pager_strategy(uobj, nbio);
955 	}
956 	return 0;
957 }
958 
959 /*
960  * If we were unable to commit the pages to swap make sure they are marked
961  * as needing a commit (again).  If we succeeded, clear the flag so the
962  * pages can be freed.
963  *
964  * Do not error-out the buffer.  In particular, vinvalbuf() needs to
965  * always work.
966  */
967 static void
968 tmpfs_strategy_done(struct bio *bio)
969 {
970 	struct buf *bp;
971 	vm_page_t m;
972 	int i;
973 
974 	bp = bio->bio_buf;
975 
976 	if (bp->b_flags & B_ERROR) {
977 		bp->b_flags &= ~B_ERROR;
978 		bp->b_error = 0;
979 		bp->b_resid = 0;
980 		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
981 			m = bp->b_xio.xio_pages[i];
982 			vm_page_need_commit(m);
983 		}
984 	} else {
985 		for (i = 0; i < bp->b_xio.xio_npages; ++i) {
986 			m = bp->b_xio.xio_pages[i];
987 			vm_page_clear_commit(m);
988 		}
989 	}
990 	bio = pop_bio(bio);
991 	biodone(bio);
992 }
993 
994 /*
995  * To make write clustering work well, make the backing store look
996  * contiguous to the cluster_*() code.  The swap_strategy() function
997  * will take it from there.
998  *
999  * Use MAXBSIZE-sized chunks as a micro-optimization to make random
1000  * flushes leave full-sized gaps.
1001  */
1002 static int
1003 tmpfs_bmap(struct vop_bmap_args *ap)
1004 {
1005 	if (ap->a_doffsetp != NULL)
1006 		*ap->a_doffsetp = ap->a_loffset;
1007 	if (ap->a_runp != NULL)
1008 		*ap->a_runp = MAXBSIZE - (ap->a_loffset & (MAXBSIZE - 1));
1009 	if (ap->a_runb != NULL)
1010 		*ap->a_runb = ap->a_loffset & (MAXBSIZE - 1);
1011 
1012 	return 0;
1013 }
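
/*
 * Worked example, assuming MAXBSIZE is 64KB: a logical offset of 80KB
 * (0x14000) yields *a_doffsetp = 0x14000, *a_runp = 48KB of forward run
 * and *a_runb = 16KB of backward run, so the clustering code sees the
 * store as 64KB-aligned contiguous chunks.
 */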
1014 
1015 /* --------------------------------------------------------------------- */
1016 
1017 static int
1018 tmpfs_nremove(struct vop_nremove_args *ap)
1019 {
1020 	struct vnode *dvp = ap->a_dvp;
1021 	struct namecache *ncp = ap->a_nch->ncp;
1022 	struct vnode *vp;
1023 	int error;
1024 	struct tmpfs_dirent *de;
1025 	struct tmpfs_mount *tmp;
1026 	struct tmpfs_node *dnode;
1027 	struct tmpfs_node *node;
1028 
1029 	/*
1030 	 * We have to acquire the vp from ap->a_nch because we will likely
1031 	 * unresolve the namecache entry, and a vrele/vput is needed to
1032 	 * trigger the tmpfs_inactive/tmpfs_reclaim sequence.
1033 	 *
1034 	 * We have to use vget to clear any inactive state on the vnode,
1035 	 * otherwise the vnode may remain inactive and thus tmpfs_inactive
1036 	 * will not get called when we release it.
1037 	 */
1038 	error = cache_vget(ap->a_nch, ap->a_cred, LK_SHARED, &vp);
1039 	KKASSERT(vp->v_mount == dvp->v_mount);
1040 	KKASSERT(error == 0);
1041 	vn_unlock(vp);
1042 
1043 	if (vp->v_type == VDIR) {
1044 		error = EISDIR;
1045 		goto out2;
1046 	}
1047 
1048 	dnode = VP_TO_TMPFS_DIR(dvp);
1049 	node = VP_TO_TMPFS_NODE(vp);
1050 	tmp = VFS_TO_TMPFS(vp->v_mount);
1051 
1052 	TMPFS_NODE_LOCK(dnode);
1053 	de = tmpfs_dir_lookup(dnode, node, ncp);
1054 	if (de == NULL) {
1055 		error = ENOENT;
1056 		TMPFS_NODE_UNLOCK(dnode);
1057 		goto out;
1058 	}
1059 
1060 	/* Files marked as immutable or append-only cannot be deleted. */
1061 	if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
1062 	    (dnode->tn_flags & APPEND)) {
1063 		error = EPERM;
1064 		TMPFS_NODE_UNLOCK(dnode);
1065 		goto out;
1066 	}
1067 
1068 	/* Remove the entry from the directory; as it is a file, we do not
1069 	 * have to change the number of hard links of the directory. */
1070 	tmpfs_dir_detach(dnode, de);
1071 	TMPFS_NODE_UNLOCK(dnode);
1072 
1073 	/* Free the directory entry we just deleted.  Note that the node
1074 	 * referred by it will not be removed until the vnode is really
1075 	 * reclaimed. */
1076 	tmpfs_free_dirent(tmp, de);
1077 
1078 	if (node->tn_links > 0) {
1079 		TMPFS_NODE_LOCK(node);
1080 		node->tn_status |= TMPFS_NODE_CHANGED;
1081 		TMPFS_NODE_UNLOCK(node);
1082 	}
1083 
1084 	cache_unlink(ap->a_nch);
1085 	tmpfs_knote(vp, NOTE_DELETE);
1086 	error = 0;
1087 
1088 out:
1089 	if (error == 0)
1090 		tmpfs_knote(dvp, NOTE_WRITE);
1091 out2:
1092 	vrele(vp);
1093 
1094 	return error;
1095 }
1096 
1097 /* --------------------------------------------------------------------- */
1098 
1099 static int
1100 tmpfs_nlink(struct vop_nlink_args *ap)
1101 {
1102 	struct vnode *dvp = ap->a_dvp;
1103 	struct vnode *vp = ap->a_vp;
1104 	struct namecache *ncp = ap->a_nch->ncp;
1105 	struct tmpfs_dirent *de;
1106 	struct tmpfs_node *node;
1107 	struct tmpfs_node *dnode;
1108 	int error;
1109 
1110 	KKASSERT(dvp != vp); /* XXX When can this be false? */
1111 
1112 	node = VP_TO_TMPFS_NODE(vp);
1113 	dnode = VP_TO_TMPFS_NODE(dvp);
1114 	TMPFS_NODE_LOCK(dnode);
1115 
1116 	/* XXX: Why aren't the following two tests done by the caller? */
1117 
1118 	/* Hard links of directories are forbidden. */
1119 	if (vp->v_type == VDIR) {
1120 		error = EPERM;
1121 		goto out;
1122 	}
1123 
1124 	/* Cannot create cross-device links. */
1125 	if (dvp->v_mount != vp->v_mount) {
1126 		error = EXDEV;
1127 		goto out;
1128 	}
1129 
1130 	/* Ensure that we do not overflow the maximum number of links imposed
1131 	 * by the system. */
1132 	KKASSERT(node->tn_links <= LINK_MAX);
1133 	if (node->tn_links >= LINK_MAX) {
1134 		error = EMLINK;
1135 		goto out;
1136 	}
1137 
1138 	/* We cannot create links of files marked immutable or append-only. */
1139 	if (node->tn_flags & (IMMUTABLE | APPEND)) {
1140 		error = EPERM;
1141 		goto out;
1142 	}
1143 
1144 	/* Allocate a new directory entry to represent the node. */
1145 	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node,
1146 				   ncp->nc_name, ncp->nc_nlen, &de);
1147 	if (error != 0)
1148 		goto out;
1149 
1150 	/* Insert the new directory entry into the appropriate directory. */
1151 	tmpfs_dir_attach(dnode, de);
1152 
1153 	/* vp link count has changed, so update node times. */
1154 
1155 	TMPFS_NODE_LOCK(node);
1156 	node->tn_status |= TMPFS_NODE_CHANGED;
1157 	TMPFS_NODE_UNLOCK(node);
1158 	tmpfs_update(vp);
1159 
1160 	tmpfs_knote(vp, NOTE_LINK);
1161 	cache_setunresolved(ap->a_nch);
1162 	cache_setvp(ap->a_nch, vp);
1163 	error = 0;
1164 
1165 out:
1166 	TMPFS_NODE_UNLOCK(dnode);
1167 	if (error == 0)
1168 		tmpfs_knote(dvp, NOTE_WRITE);
1169 	return error;
1170 }
1171 
1172 /* --------------------------------------------------------------------- */
1173 
1174 static int
1175 tmpfs_nrename(struct vop_nrename_args *ap)
1176 {
1177 	struct vnode *fdvp = ap->a_fdvp;
1178 	struct namecache *fncp = ap->a_fnch->ncp;
1179 	struct vnode *fvp = fncp->nc_vp;
1180 	struct vnode *tdvp = ap->a_tdvp;
1181 	struct namecache *tncp = ap->a_tnch->ncp;
1182 	struct vnode *tvp;
1183 	struct tmpfs_dirent *de, *tde;
1184 	struct tmpfs_mount *tmp;
1185 	struct tmpfs_node *fdnode;
1186 	struct tmpfs_node *fnode;
1187 	struct tmpfs_node *tnode;
1188 	struct tmpfs_node *tdnode;
1189 	char *newname;
1190 	char *oldname;
1191 	int error;
1192 
1193 	KKASSERT(fdvp->v_mount == fvp->v_mount);
1194 
1195 	/*
1196 	 * Because tvp can get overwritten we have to vget it instead of
1197 	 * just vref'ing it, otherwise its VINACTIVE flag may not get
1198 	 * cleared and the node won't get destroyed.
1199 	 */
1200 	error = cache_vget(ap->a_tnch, ap->a_cred, LK_SHARED, &tvp);
1201 	if (error == 0) {
1202 		tnode = VP_TO_TMPFS_NODE(tvp);
1203 		vn_unlock(tvp);
1204 	} else {
1205 		tnode = NULL;
1206 	}
1207 
1208 	/* Disallow cross-device renames.
1209 	 * XXX Why isn't this done by the caller? */
1210 	if (fvp->v_mount != tdvp->v_mount ||
1211 	    (tvp != NULL && fvp->v_mount != tvp->v_mount)) {
1212 		error = EXDEV;
1213 		goto out;
1214 	}
1215 
1216 	tmp = VFS_TO_TMPFS(tdvp->v_mount);
1217 	tdnode = VP_TO_TMPFS_DIR(tdvp);
1218 
1219 	/* If source and target are the same file, there is nothing to do. */
1220 	if (fvp == tvp) {
1221 		error = 0;
1222 		goto out;
1223 	}
1224 
1225 	fdnode = VP_TO_TMPFS_DIR(fdvp);
1226 	fnode = VP_TO_TMPFS_NODE(fvp);
1227 	TMPFS_NODE_LOCK(fdnode);
1228 	de = tmpfs_dir_lookup(fdnode, fnode, fncp);
1229 	TMPFS_NODE_UNLOCK(fdnode);	/* XXX depend on namecache lock */
1230 
1231 	/* Avoid manipulating '.' and '..' entries. */
1232 	if (de == NULL) {
1233 		error = ENOENT;
1234 		goto out_locked;
1235 	}
1236 	KKASSERT(de->td_node == fnode);
1237 
1238 	/*
1239 	 * If replacing an entry in the target directory and that entry
1240 	 * is a directory, it must be empty.
1241 	 *
1242 	 * Kern_rename guarantees the destination to be a directory
1243 	 * if the source is one (it does?).
1244 	 */
1245 	if (tvp != NULL) {
1246 		KKASSERT(tnode != NULL);
1247 
1248 		if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
1249 		    (tdnode->tn_flags & (APPEND | IMMUTABLE))) {
1250 			error = EPERM;
1251 			goto out_locked;
1252 		}
1253 
1254 		if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) {
1255 			if (tnode->tn_size > 0) {
1256 				error = ENOTEMPTY;
1257 				goto out_locked;
1258 			}
1259 		} else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) {
1260 			error = ENOTDIR;
1261 			goto out_locked;
1262 		} else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) {
1263 			error = EISDIR;
1264 			goto out_locked;
1265 		} else {
1266 			KKASSERT(fnode->tn_type != VDIR &&
1267 				tnode->tn_type != VDIR);
1268 		}
1269 	}
1270 
1271 	if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
1272 	    (fdnode->tn_flags & (APPEND | IMMUTABLE))) {
1273 		error = EPERM;
1274 		goto out_locked;
1275 	}
1276 
1277 	/*
1278 	 * Ensure that we have enough memory to hold the new name, if it
1279 	 * has to be changed.
1280 	 */
1281 	if (fncp->nc_nlen != tncp->nc_nlen ||
1282 	    bcmp(fncp->nc_name, tncp->nc_name, fncp->nc_nlen) != 0) {
1283 		newname = kmalloc(tncp->nc_nlen + 1, tmp->tm_name_zone,
1284 				  M_WAITOK | M_NULLOK);
1285 		if (newname == NULL) {
1286 			error = ENOSPC;
1287 			goto out_locked;
1288 		}
1289 		bcopy(tncp->nc_name, newname, tncp->nc_nlen);
1290 		newname[tncp->nc_nlen] = '\0';
1291 	} else {
1292 		newname = NULL;
1293 	}
1294 
1295 	/*
1296 	 * Unlink entry from source directory.  Note that the kernel has
1297 	 * already checked for illegal recursion cases (renaming a directory
1298 	 * into a subdirectory of itself).
1299 	 */
1300 	if (fdnode != tdnode) {
1301 		tmpfs_dir_detach(fdnode, de);
1302 	} else {
1303 		/* XXX depend on namecache lock */
1304 		TMPFS_NODE_LOCK(fdnode);
1305 		KKASSERT(de == tmpfs_dir_lookup(fdnode, fnode, fncp));
1306 		RB_REMOVE(tmpfs_dirtree, &fdnode->tn_dir.tn_dirtree, de);
1307 		RB_REMOVE(tmpfs_dirtree_cookie,
1308 			  &fdnode->tn_dir.tn_cookietree, de);
1309 		TMPFS_NODE_UNLOCK(fdnode);
1310 	}
1311 
1312 	/*
1313 	 * Handle any name change.  Swap with newname, we will
1314 	 * deallocate it at the end.
1315 	 */
1316 	if (newname != NULL) {
1317 #if 0
1318 		TMPFS_NODE_LOCK(fnode);
1319 		fnode->tn_status |= TMPFS_NODE_CHANGED;
1320 		TMPFS_NODE_UNLOCK(fnode);
1321 #endif
1322 		oldname = de->td_name;
1323 		de->td_name = newname;
1324 		de->td_namelen = (uint16_t)tncp->nc_nlen;
1325 		newname = oldname;
1326 	}
1327 
1328 	/*
1329 	 * If we are overwriting an entry, we have to remove the old one
1330 	 * from the target directory.
1331 	 */
1332 	if (tvp != NULL) {
1333 		/* Remove the old entry from the target directory. */
1334 		TMPFS_NODE_LOCK(tdnode);
1335 		tde = tmpfs_dir_lookup(tdnode, tnode, tncp);
1336 		tmpfs_dir_detach(tdnode, tde);
1337 		TMPFS_NODE_UNLOCK(tdnode);
1338 		tmpfs_knote(tdnode->tn_vnode, NOTE_DELETE);
1339 
1340 		/*
1341 		 * Free the directory entry we just deleted.  Note that the
1342 		 * node referred by it will not be removed until the vnode is
1343 		 * really reclaimed.
1344 		 */
1345 		tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde);
1346 		/*cache_inval_vp(tvp, CINV_DESTROY);*/
1347 	}
1348 
1349 	/*
1350 	 * Link entry to target directory.  If the entry
1351 	 * represents a directory move the parent linkage
1352 	 * as well.
1353 	 */
1354 	if (fdnode != tdnode) {
1355 		if (de->td_node->tn_type == VDIR) {
1356 			TMPFS_VALIDATE_DIR(fnode);
1357 		}
1358 		tmpfs_dir_attach(tdnode, de);
1359 	} else {
1360 		TMPFS_NODE_LOCK(tdnode);
1361 		tdnode->tn_status |= TMPFS_NODE_MODIFIED;
1362 		RB_INSERT(tmpfs_dirtree, &tdnode->tn_dir.tn_dirtree, de);
1363 		RB_INSERT(tmpfs_dirtree_cookie,
1364 			  &tdnode->tn_dir.tn_cookietree, de);
1365 		TMPFS_NODE_UNLOCK(tdnode);
1366 	}
1367 
1368 	/*
1369 	 * Finish up
1370 	 */
1371 	if (newname) {
1372 		kfree(newname, tmp->tm_name_zone);
1373 		newname = NULL;
1374 	}
1375 	cache_rename(ap->a_fnch, ap->a_tnch);
1376 	tmpfs_knote(ap->a_fdvp, NOTE_WRITE);
1377 	tmpfs_knote(ap->a_tdvp, NOTE_WRITE);
1378 	if (fnode->tn_vnode)
1379 		tmpfs_knote(fnode->tn_vnode, NOTE_RENAME);
1380 	error = 0;
1381 
1382 out_locked:
1383 	;
1384 out:
1385 	if (tvp)
1386 		vrele(tvp);
1387 	return error;
1388 }
1389 
1390 /* --------------------------------------------------------------------- */
1391 
1392 static int
1393 tmpfs_nmkdir(struct vop_nmkdir_args *ap)
1394 {
1395 	struct vnode *dvp = ap->a_dvp;
1396 	struct vnode **vpp = ap->a_vpp;
1397 	struct namecache *ncp = ap->a_nch->ncp;
1398 	struct vattr *vap = ap->a_vap;
1399 	struct ucred *cred = ap->a_cred;
1400 	int error;
1401 
1402 	KKASSERT(vap->va_type == VDIR);
1403 
1404 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL);
1405 	if (error == 0) {
1406 		cache_setunresolved(ap->a_nch);
1407 		cache_setvp(ap->a_nch, *vpp);
1408 		tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK);
1409 	}
1410 	return error;
1411 }
1412 
1413 /* --------------------------------------------------------------------- */
1414 
1415 static int
1416 tmpfs_nrmdir(struct vop_nrmdir_args *ap)
1417 {
1418 	struct vnode *dvp = ap->a_dvp;
1419 	struct namecache *ncp = ap->a_nch->ncp;
1420 	struct vnode *vp;
1421 	struct tmpfs_dirent *de;
1422 	struct tmpfs_mount *tmp;
1423 	struct tmpfs_node *dnode;
1424 	struct tmpfs_node *node;
1425 	int error;
1426 
1427 	/*
1428 	 * We have to acquire the vp from ap->a_nch because we will likely
1429 	 * unresolve the namecache entry, and a vrele/vput is needed to
1430 	 * trigger the tmpfs_inactive/tmpfs_reclaim sequence.
1431 	 *
1432 	 * We have to use vget to clear any inactive state on the vnode,
1433 	 * otherwise the vnode may remain inactive and thus tmpfs_inactive
1434 	 * will not get called when we release it.
1435 	 */
1436 	error = cache_vget(ap->a_nch, ap->a_cred, LK_SHARED, &vp);
1437 	KKASSERT(error == 0);
1438 	vn_unlock(vp);
1439 
1440 	/*
1441 	 * Prevalidate so we don't hit an assertion later
1442 	 */
1443 	if (vp->v_type != VDIR) {
1444 		error = ENOTDIR;
1445 		goto out;
1446 	}
1447 
1448 	tmp = VFS_TO_TMPFS(dvp->v_mount);
1449 	dnode = VP_TO_TMPFS_DIR(dvp);
1450 	node = VP_TO_TMPFS_DIR(vp);
1451 
1452 	/*
1453 	 * Directories with more than two entries ('.' and '..') cannot
1454 	 * be removed.
1455 	 */
1456 	if (node->tn_size > 0) {
1457 		error = ENOTEMPTY;
1458 		goto out;
1459 	}
1460 
1461 	if ((dnode->tn_flags & APPEND)
1462 	    || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
1463 		error = EPERM;
1464 		goto out;
1465 	}
1466 
1467 	/*
1468 	 * This invariant holds only if we are not trying to
1469 	 * remove "..".  We checked for that above so this is safe now.
1470 	 */
1471 	KKASSERT(node->tn_dir.tn_parent == dnode);
1472 
1473 	/*
1474 	 * Get the directory entry associated with node (vp).  This
1475 	 * was filled by tmpfs_lookup while looking up the entry.
1476 	 */
1477 	TMPFS_NODE_LOCK(dnode);
1478 	de = tmpfs_dir_lookup(dnode, node, ncp);
1479 	KKASSERT(TMPFS_DIRENT_MATCHES(de, ncp->nc_name, ncp->nc_nlen));
1480 
1481 	/* Check flags to see if we are allowed to remove the directory. */
1482 	if ((dnode->tn_flags & APPEND) ||
1483 	    node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) {
1484 		error = EPERM;
1485 		TMPFS_NODE_UNLOCK(dnode);
1486 		goto out;
1487 	}
1488 
1489 	/* Detach the directory entry from the directory (dnode). */
1490 	tmpfs_dir_detach(dnode, de);
1491 	TMPFS_NODE_UNLOCK(dnode);
1492 
1493 	/* No vnode should be allocated for this entry from this point */
1494 	TMPFS_NODE_LOCK(dnode);
1495 	TMPFS_ASSERT_ELOCKED(dnode);
1496 	TMPFS_NODE_LOCK(node);
1497 	TMPFS_ASSERT_ELOCKED(node);
1498 
1499 	/*
1500 	 * Must set parent linkage to NULL (tested by ncreate to disallow
1501 	 * the creation of new files/dirs in a deleted directory)
1502 	 */
1503 	node->tn_status |= TMPFS_NODE_CHANGED;
1504 
1505 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
1506 			    TMPFS_NODE_MODIFIED;
1507 
1508 	TMPFS_NODE_UNLOCK(node);
1509 	TMPFS_NODE_UNLOCK(dnode);
1510 
1511 	/* Free the directory entry we just deleted.  Note that the node
1512 	 * referred by it will not be removed until the vnode is really
1513 	 * reclaimed. */
1514 	tmpfs_free_dirent(tmp, de);
1515 
1516 	/* Release the deleted vnode (will destroy the node, notify
1517 	 * interested parties and clean it from the cache). */
1518 
1519 	TMPFS_NODE_LOCK(dnode);
1520 	dnode->tn_status |= TMPFS_NODE_CHANGED;
1521 	TMPFS_NODE_UNLOCK(dnode);
1522 	tmpfs_update(dvp);
1523 
1524 	cache_unlink(ap->a_nch);
1525 	tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK);
1526 	error = 0;
1527 
1528 out:
1529 	vrele(vp);
1530 
1531 	return error;
1532 }
1533 
1534 /* --------------------------------------------------------------------- */
1535 
1536 static int
1537 tmpfs_nsymlink(struct vop_nsymlink_args *ap)
1538 {
1539 	struct vnode *dvp = ap->a_dvp;
1540 	struct vnode **vpp = ap->a_vpp;
1541 	struct namecache *ncp = ap->a_nch->ncp;
1542 	struct vattr *vap = ap->a_vap;
1543 	struct ucred *cred = ap->a_cred;
1544 	char *target = ap->a_target;
1545 	int error;
1546 
1547 	vap->va_type = VLNK;
1548 	error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, target);
1549 	if (error == 0) {
1550 		tmpfs_knote(*vpp, NOTE_WRITE);
1551 		cache_setunresolved(ap->a_nch);
1552 		cache_setvp(ap->a_nch, *vpp);
1553 	}
1554 	return error;
1555 }
1556 
1557 /* --------------------------------------------------------------------- */
1558 
1559 static int
1560 tmpfs_readdir(struct vop_readdir_args *ap)
1561 {
1562 	struct vnode *vp = ap->a_vp;
1563 	struct uio *uio = ap->a_uio;
1564 	int *eofflag = ap->a_eofflag;
1565 	off_t **cookies = ap->a_cookies;
1566 	int *ncookies = ap->a_ncookies;
1567 	struct tmpfs_mount *tmp;
1568 	int error;
1569 	off_t startoff;
1570 	off_t cnt = 0;
1571 	struct tmpfs_node *node;
1572 
1573 	/* This operation only makes sense on directory nodes. */
1574 	if (vp->v_type != VDIR) {
1575 		return ENOTDIR;
1576 	}
1577 
1578 	tmp = VFS_TO_TMPFS(vp->v_mount);
1579 	node = VP_TO_TMPFS_DIR(vp);
1580 	startoff = uio->uio_offset;
1581 
1582 	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) {
1583 		error = tmpfs_dir_getdotdent(node, uio);
1584 		if (error != 0) {
1585 			TMPFS_NODE_LOCK_SH(node);
1586 			goto outok;
1587 		}
1588 		cnt++;
1589 	}
1590 
1591 	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) {
1592 		/* may lock parent, cannot hold node lock */
1593 		error = tmpfs_dir_getdotdotdent(tmp, node, uio);
1594 		if (error != 0) {
1595 			TMPFS_NODE_LOCK_SH(node);
1596 			goto outok;
1597 		}
1598 		cnt++;
1599 	}
1600 
1601 	TMPFS_NODE_LOCK_SH(node);
1602 	error = tmpfs_dir_getdents(node, uio, &cnt);
1603 
1604 outok:
1605 	KKASSERT(error >= -1);
1606 
1607 	if (error == -1)
1608 		error = 0;
1609 
1610 	if (eofflag != NULL)
1611 		*eofflag =
1612 		    (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF);
1613 
1614 	/* Update NFS-related variables. */
1615 	if (error == 0 && cookies != NULL && ncookies != NULL) {
1616 		off_t i;
1617 		off_t off = startoff;
1618 		struct tmpfs_dirent *de = NULL;
1619 
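		/*
		 * Replay the scan from startoff: each cookie records the
		 * directory offset at which a subsequent readdir should
		 * resume after its entry ('.' resumes at '..', '..' at the
		 * first tree entry, and so on until EOF).
		 */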
1620 		*ncookies = cnt;
1621 		*cookies = kmalloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK);
1622 
1623 		for (i = 0; i < cnt; i++) {
1624 			KKASSERT(off != TMPFS_DIRCOOKIE_EOF);
1625 			if (off == TMPFS_DIRCOOKIE_DOT) {
1626 				off = TMPFS_DIRCOOKIE_DOTDOT;
1627 			} else {
1628 				if (off == TMPFS_DIRCOOKIE_DOTDOT) {
1629 					de = RB_MIN(tmpfs_dirtree_cookie,
1630 						&node->tn_dir.tn_cookietree);
1631 				} else if (de != NULL) {
1632 					de = RB_NEXT(tmpfs_dirtree_cookie,
1633 					       &node->tn_dir.tn_cookietree, de);
1634 				} else {
1635 					de = tmpfs_dir_lookupbycookie(node,
1636 								      off);
1637 					KKASSERT(de != NULL);
1638 					de = RB_NEXT(tmpfs_dirtree_cookie,
1639 					       &node->tn_dir.tn_cookietree, de);
1640 				}
1641 				if (de == NULL)
1642 					off = TMPFS_DIRCOOKIE_EOF;
1643 				else
1644 					off = tmpfs_dircookie(de);
1645 			}
1646 			(*cookies)[i] = off;
1647 		}
1648 		KKASSERT(uio->uio_offset == off);
1649 	}
1650 	TMPFS_NODE_UNLOCK(node);
1651 
1652 	if ((node->tn_status & TMPFS_NODE_ACCESSED) == 0) {
1653 		TMPFS_NODE_LOCK(node);
1654 		node->tn_status |= TMPFS_NODE_ACCESSED;
1655 		TMPFS_NODE_UNLOCK(node);
1656 	}
1657 	return error;
1658 }
1659 
1660 /* --------------------------------------------------------------------- */
1661 
1662 static int
1663 tmpfs_readlink(struct vop_readlink_args *ap)
1664 {
1665 	struct vnode *vp = ap->a_vp;
1666 	struct uio *uio = ap->a_uio;
1667 	int error;
1668 	struct tmpfs_node *node;
1669 
1670 	KKASSERT(uio->uio_offset == 0);
1671 	KKASSERT(vp->v_type == VLNK);
1672 
1673 	node = VP_TO_TMPFS_NODE(vp);
1674 	TMPFS_NODE_LOCK_SH(node);
1675 	error = uiomove(node->tn_link,
1676 			MIN(node->tn_size, uio->uio_resid), uio);
1677 	TMPFS_NODE_UNLOCK(node);
1678 	if ((node->tn_status & TMPFS_NODE_ACCESSED) == 0) {
1679 		TMPFS_NODE_LOCK(node);
1680 		node->tn_status |= TMPFS_NODE_ACCESSED;
1681 		TMPFS_NODE_UNLOCK(node);
1682 	}
1683 	return error;
1684 }
1685 
1686 /* --------------------------------------------------------------------- */
1687 
1688 static int
1689 tmpfs_inactive(struct vop_inactive_args *ap)
1690 {
1691 	struct vnode *vp = ap->a_vp;
1692 	struct tmpfs_node *node;
1693 	struct mount *mp;
1694 
1695 	mp = vp->v_mount;
1696 	lwkt_gettoken(&mp->mnt_token);
1697 	node = VP_TO_TMPFS_NODE(vp);
1698 
1699 	/*
1700 	 * Degenerate case
1701 	 */
1702 	if (node == NULL) {
1703 		vrecycle(vp);
1704 		lwkt_reltoken(&mp->mnt_token);
1705 		return(0);
1706 	}
1707 
1708 	/*
1709 	 * Get rid of unreferenced deleted vnodes sooner rather than
1710 	 * later so the data memory can be recovered immediately.
1711 	 *
1712 	 * We must truncate the vnode to prevent the normal reclamation
1713 	 * path from flushing the data for the removed file to disk.
1714 	 */
1715 	TMPFS_NODE_LOCK(node);
1716 	if ((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0 &&
1717 	    node->tn_links == 0)
1718 	{
1719 		node->tn_vpstate = TMPFS_VNODE_DOOMED;
1720 		TMPFS_NODE_UNLOCK(node);
1721 		if (node->tn_type == VREG)
1722 			tmpfs_truncate(vp, 0);
1723 		vrecycle(vp);
1724 	} else {
1725 		/*
1726 		 * We must retain any VM pages belonging to the vnode's
1727 		 * object as the vnode will destroy the object during a
1728 		 * later reclaim.  We call vinvalbuf(V_SAVE) to clean
1729 		 * out the buffer cache.
1730 		 *
1731 		 * On DragonFlyBSD, vnodes are not immediately deactivated
1732 		 * on the 1->0 ref transition, so this is a relatively optimal
1733 		 * operation.  We have to do this in tmpfs_inactive()
1734 		 * because the pages will have already been thrown away
1735 		 * at the time tmpfs_reclaim() is called.
1736 		 */
1737 		if (node->tn_type == VREG &&
1738 		    node->tn_reg.tn_pages_in_aobj == 0) {
1739 			vinvalbuf(vp, V_SAVE, 0, 0);
1740 			KKASSERT(RB_EMPTY(&vp->v_rbdirty_tree));
1741 			KKASSERT(RB_EMPTY(&vp->v_rbclean_tree));
1742 			tmpfs_move_pages(vp->v_object, node->tn_reg.tn_aobj,
1743 					 TMPFS_MOVF_DEACTIVATE);
1744 			node->tn_reg.tn_pages_in_aobj = 1;
1745 		}
1746 
1747 		TMPFS_NODE_UNLOCK(node);
1748 	}
1749 	lwkt_reltoken(&mp->mnt_token);
1750 
1751 	return 0;
1752 }
1753 
1754 /* --------------------------------------------------------------------- */
1755 
1756 int
1757 tmpfs_reclaim(struct vop_reclaim_args *ap)
1758 {
1759 	struct vnode *vp = ap->a_vp;
1760 	struct tmpfs_mount *tmp;
1761 	struct tmpfs_node *node;
1762 	struct mount *mp;
1763 
1764 	mp = vp->v_mount;
1765 	lwkt_gettoken(&mp->mnt_token);
1766 
1767 	node = VP_TO_TMPFS_NODE(vp);
1768 	tmp = VFS_TO_TMPFS(vp->v_mount);
1769 	KKASSERT(mp == tmp->tm_mount);
1770 
1771 	TMPFS_NODE_LOCK(node);
1772 	KKASSERT(node->tn_vnode == vp);
1773 	node->tn_vnode = NULL;
1774 	vp->v_data = NULL;
1775 
1776 	/*
1777 	 * If the node referenced by this vnode was deleted by the
1778 	 * user, we must free its associated data structures now that
1779 	 * the vnode is being reclaimed.
1780 	 *
1781 	 * Directories have an extra link ref.
1782 	 */
1783 	if ((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0 &&
1784 	    node->tn_links == 0) {
1785 		node->tn_vpstate = TMPFS_VNODE_DOOMED;
1786 		tmpfs_free_node(tmp, node);
1787 		/* eats the lock */
1788 	} else {
1789 		TMPFS_NODE_UNLOCK(node);
1790 	}
1791 	lwkt_reltoken(&mp->mnt_token);
1792 
1793 	KKASSERT(vp->v_data == NULL);
1794 	return 0;
1795 }
1796 
1797 /* --------------------------------------------------------------------- */
1798 
1799 static int
1800 tmpfs_mountctl(struct vop_mountctl_args *ap)
1801 {
1802 	struct tmpfs_mount *tmp;
1803 	struct mount *mp;
1804 	int rc;
1805 
1806 	mp = ap->a_head.a_ops->head.vv_mount;
1807 	lwkt_gettoken(&mp->mnt_token);
1808 
1809 	switch (ap->a_op) {
1810 	case (MOUNTCTL_SET_EXPORT):
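		/*
		 * Export information is pushed down from userland through
		 * mountctl(2); see the vfs_export() call below (note is
		 * illustrative).
		 */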
1811 		tmp = (struct tmpfs_mount *) mp->mnt_data;
1812 
1813 		if (ap->a_ctllen != sizeof(struct export_args))
1814 			rc = (EINVAL);
1815 		else
1816 			rc = vfs_export(mp, &tmp->tm_export,
1817 					(const struct export_args *) ap->a_ctl);
1818 		break;
1819 	default:
1820 		rc = vop_stdmountctl(ap);
1821 		break;
1822 	}
1823 
1824 	lwkt_reltoken(&mp->mnt_token);
1825 	return (rc);
1826 }
1827 
1828 /* --------------------------------------------------------------------- */
1829 
1830 static int
1831 tmpfs_print(struct vop_print_args *ap)
1832 {
1833 	struct vnode *vp = ap->a_vp;
1834 
1835 	struct tmpfs_node *node;
1836 
1837 	node = VP_TO_TMPFS_NODE(vp);
1838 
1839 	kprintf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n",
1840 	    node, node->tn_flags, node->tn_links);
1841 	kprintf("\tmode 0%o, owner %d, group %d, size %ju, status 0x%x\n",
1842 	    node->tn_mode, node->tn_uid, node->tn_gid,
1843 	    (uintmax_t)node->tn_size, node->tn_status);
1844 
1845 	if (vp->v_type == VFIFO)
1846 		fifo_printinfo(vp);
1847 
1848 	kprintf("\n");
1849 
1850 	return 0;
1851 }
1852 
1853 /* --------------------------------------------------------------------- */
1854 
1855 static int
1856 tmpfs_pathconf(struct vop_pathconf_args *ap)
1857 {
1858 	struct vnode *vp = ap->a_vp;
1859 	int name = ap->a_name;
1860 	register_t *retval = ap->a_retval;
1861 	struct tmpfs_mount *tmp;
1862 	int error;
1863 
1864 	error = 0;
1865 
1866 	switch (name) {
1867 	case _PC_CHOWN_RESTRICTED:
1868 		*retval = 1;
1869 		break;
1870 
1871 	case _PC_FILESIZEBITS:
1872 		tmp = VFS_TO_TMPFS(vp->v_mount);
1873 		*retval = max(32, flsll(tmp->tm_pages_max * PAGE_SIZE) + 1);
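		/*
		 * e.g. (illustrative): with tm_pages_max * PAGE_SIZE = 1GB,
		 * flsll(2^30) + 1 = 32, so 32 filesize bits are reported.
		 */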
1874 		break;
1875 
1876 	case _PC_LINK_MAX:
1877 		*retval = LINK_MAX;
1878 		break;
1879 
1880 	case _PC_NAME_MAX:
1881 		*retval = NAME_MAX;
1882 		break;
1883 
1884 	case _PC_NO_TRUNC:
1885 		*retval = 1;
1886 		break;
1887 
1888 	case _PC_PATH_MAX:
1889 		*retval = PATH_MAX;
1890 		break;
1891 
1892 	case _PC_PIPE_BUF:
1893 		*retval = PIPE_BUF;
1894 		break;
1895 
1896 	case _PC_SYNC_IO:
1897 		*retval = 1;
1898 		break;
1899 
1900 	case _PC_2_SYMLINKS:
1901 		*retval = 1;
1902 		break;
1903 
1904 	default:
1905 		error = EINVAL;
1906 	}
1907 
1908 	return error;
1909 }
1910 
1911 /************************************************************************
1912  *                          KQFILTER OPS                                *
1913  ************************************************************************/
1914 
1915 static void filt_tmpfsdetach(struct knote *kn);
1916 static int filt_tmpfsread(struct knote *kn, long hint);
1917 static int filt_tmpfswrite(struct knote *kn, long hint);
1918 static int filt_tmpfsvnode(struct knote *kn, long hint);
1919 
1920 static struct filterops tmpfsread_filtops =
1921 	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
1922 	  NULL, filt_tmpfsdetach, filt_tmpfsread };
1923 static struct filterops tmpfswrite_filtops =
1924 	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
1925 	  NULL, filt_tmpfsdetach, filt_tmpfswrite };
1926 static struct filterops tmpfsvnode_filtops =
1927 	{ FILTEROP_ISFD | FILTEROP_MPSAFE,
1928 	  NULL, filt_tmpfsdetach, filt_tmpfsvnode };
1929 
1930 static int
1931 tmpfs_kqfilter (struct vop_kqfilter_args *ap)
1932 {
1933 	struct vnode *vp = ap->a_vp;
1934 	struct knote *kn = ap->a_kn;
1935 
1936 	switch (kn->kn_filter) {
1937 	case EVFILT_READ:
1938 		kn->kn_fop = &tmpfsread_filtops;
1939 		break;
1940 	case EVFILT_WRITE:
1941 		kn->kn_fop = &tmpfswrite_filtops;
1942 		break;
1943 	case EVFILT_VNODE:
1944 		kn->kn_fop = &tmpfsvnode_filtops;
1945 		break;
1946 	default:
1947 		return (EOPNOTSUPP);
1948 	}
1949 
1950 	kn->kn_hook = (caddr_t)vp;
1951 
1952 	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1953 
1954 	return(0);
1955 }
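
/*
 * Userland usage sketch (hypothetical, not compiled here): watch a
 * tmpfs file for delete/rename events through EVFILT_VNODE.
 */
#if 0
	struct kevent kev;
	int kq = kqueue();
	int fd = open("/tmp/watched", O_RDONLY);

	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	       NOTE_DELETE | NOTE_RENAME, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* register the filter */
	kevent(kq, NULL, 0, &kev, 1, NULL);	/* block until an event */
#endif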
1956 
1957 static void
1958 filt_tmpfsdetach(struct knote *kn)
1959 {
1960 	struct vnode *vp = (void *)kn->kn_hook;
1961 
1962 	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
1963 }
1964 
1965 static int
1966 filt_tmpfsread(struct knote *kn, long hint)
1967 {
1968 	struct vnode *vp = (void *)kn->kn_hook;
1969 	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
1970 	off_t off;
1971 
1972 	if (hint == NOTE_REVOKE) {
1973 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
1974 		return(1);
1975 	}
1976 
1977 	/*
1978 	 * Interlock against MP races when performing this function.
1979 	 */
1980 	TMPFS_NODE_LOCK_SH(node);
1981 	off = node->tn_size - kn->kn_fp->f_offset;
1982 	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
1983 	if (kn->kn_sfflags & NOTE_OLDAPI) {
1984 		TMPFS_NODE_UNLOCK(node);
1985 		return(1);
1986 	}
1990 	TMPFS_NODE_UNLOCK(node);
1991 	return (kn->kn_data != 0);
1992 }
1993 
1994 static int
1995 filt_tmpfswrite(struct knote *kn, long hint)
1996 {
1997 	if (hint == NOTE_REVOKE)
1998 		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
1999 	kn->kn_data = 0;
2000 	return (1);
2001 }
2002 
2003 static int
2004 filt_tmpfsvnode(struct knote *kn, long hint)
2005 {
2006 	if (kn->kn_sfflags & hint)
2007 		kn->kn_fflags |= hint;
2008 	if (hint == NOTE_REVOKE) {
2009 		kn->kn_flags |= (EV_EOF | EV_NODATA);
2010 		return (1);
2011 	}
2012 	return (kn->kn_fflags != 0);
2013 }
2014 
2015 /*
2016  * Helper to move VM pages between objects
2017  *
2018  * NOTE: The vm_page_rename() dirties the page, so we can clear the
2019  *	 PG_NEED_COMMIT flag.  If the pages are being moved into tn_aobj,
2020  *	 the pageout daemon will be able to page them out.
2021  */
2022 static int
2023 tmpfs_move_pages_callback(vm_page_t p, void *data)
2024 {
2025 	struct rb_vm_page_scan_info *info = data;
2026 	vm_pindex_t pindex;
2027 
2028 	pindex = p->pindex;
2029 	if (vm_page_busy_try(p, TRUE)) {
2030 		vm_page_sleep_busy(p, TRUE, "tpgmov");
2031 		info->error = -1;
2032 		return -1;
2033 	}
2034 	if (p->object != info->object || p->pindex != pindex) {
2035 		vm_page_wakeup(p);
2036 		info->error = -1;
2037 		return -1;
2038 	}
2039 
2040 	if ((info->pagerflags & TMPFS_MOVF_FROMBACKING) &&
2041 	    (p->flags & PG_SWAPPED) &&
2042 	    (p->flags & PG_NEED_COMMIT) == 0 &&
2043 	    p->dirty == 0) {
2044 		/*
2045 		 * If the page in the backing aobj was paged out to swap
2046 		 * it will be clean and it is better to free it rather
2047 		 * than re-dirty it.  We will assume that the page was
2048 		 * paged out to swap for a reason!
2049 		 *
2050 		 * This helps avoid unnecessary swap thrashing on the page.
2051 		 */
2052 		vm_page_free(p);
2053 	} else if ((info->pagerflags & TMPFS_MOVF_FROMBACKING) == 0 &&
2054 		   (p->flags & PG_NEED_COMMIT) == 0 &&
2055 		   p->dirty == 0) {
2056 		/*
2057 		 * If the page associated with the vnode was cleaned via
2058 		 * a tmpfs_strategy() call, it exists as a swap block in
2059 		 * aobj and it is again better to free it rather than
2060 		 * re-dirty it.  We will assume that the page was
2061 		 * paged out to swap for a reason!
2062 		 *
2063 		 * This helps avoid unnecessary swap thrashing on the page.
2064 		 */
2065 		vm_page_free(p);
2066 	} else {
2067 		/*
2068 		 * Rename the page into the target object.  The rename also
2069 		 * flags the page dirty and checks whether a swap block
2070 		 * association exists in the target object, setting the
2071 		 * appropriate flags if it does.
2072 		 */
2073 		vm_page_rename(p, info->dest_object, pindex);
2074 		vm_page_clear_commit(p);
2075 		if (info->pagerflags & TMPFS_MOVF_DEACTIVATE)
2076 			vm_page_deactivate(p);
2077 		vm_page_wakeup(p);
2078 		/* page automatically made dirty */
2079 	}
2080 
2081 	return 0;
2082 }
2083 
2084 static
2085 void
2086 tmpfs_move_pages(vm_object_t src, vm_object_t dst, int movflags)
2087 {
2088 	struct rb_vm_page_scan_info info;
2089 
2090 	vm_object_hold(src);
2091 	vm_object_hold(dst);
2092 	info.object = src;
2093 	info.dest_object = dst;
2094 	info.pagerflags = movflags;
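	/*
	 * Rescan until the callback completes without a busy-page restart
	 * (info.error < 0 forces a retry), the source object is empty,
	 * and no paging remains in progress.
	 */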
2095 	do {
2096 		if (src->paging_in_progress)
2097 			vm_object_pip_wait(src, "objtfs");
2098 		info.error = 1;
2099 		vm_page_rb_tree_RB_SCAN(&src->rb_memq, NULL,
2100 					tmpfs_move_pages_callback, &info);
2101 	} while (info.error < 0 || !RB_EMPTY(&src->rb_memq) ||
2102 		 src->paging_in_progress);
2103 	vm_object_drop(dst);
2104 	vm_object_drop(src);
2105 }
2106 
2107 /* --------------------------------------------------------------------- */
2108 
2109 /*
2110  * vnode operations vector used for files stored in a tmpfs file system.
2111  */
2112 struct vop_ops tmpfs_vnode_vops = {
2113 	.vop_default =			vop_defaultop,
2114 	.vop_getpages = 		vop_stdgetpages,
2115 	.vop_putpages = 		vop_stdputpages,
2116 	.vop_ncreate =			tmpfs_ncreate,
2117 	.vop_nresolve =			tmpfs_nresolve,
2118 	.vop_nlookupdotdot =		tmpfs_nlookupdotdot,
2119 	.vop_nmknod =			tmpfs_nmknod,
2120 	.vop_open =			tmpfs_open,
2121 	.vop_close =			tmpfs_close,
2122 	.vop_access =			tmpfs_access,
2123 	.vop_getattr =			tmpfs_getattr,
2124 	.vop_setattr =			tmpfs_setattr,
2125 	.vop_read =			tmpfs_read,
2126 	.vop_write =			tmpfs_write,
2127 	.vop_fsync =			tmpfs_fsync,
2128 	.vop_mountctl =			tmpfs_mountctl,
2129 	.vop_nremove =			tmpfs_nremove,
2130 	.vop_nlink =			tmpfs_nlink,
2131 	.vop_nrename =			tmpfs_nrename,
2132 	.vop_nmkdir =			tmpfs_nmkdir,
2133 	.vop_nrmdir =			tmpfs_nrmdir,
2134 	.vop_nsymlink =			tmpfs_nsymlink,
2135 	.vop_readdir =			tmpfs_readdir,
2136 	.vop_readlink =			tmpfs_readlink,
2137 	.vop_inactive =			tmpfs_inactive,
2138 	.vop_reclaim =			tmpfs_reclaim,
2139 	.vop_print =			tmpfs_print,
2140 	.vop_pathconf =			tmpfs_pathconf,
2141 	.vop_bmap =			tmpfs_bmap,
2142 	.vop_strategy =			tmpfs_strategy,
2143 	.vop_advlock =			tmpfs_advlock,
2144 	.vop_kqfilter =			tmpfs_kqfilter
2145 };
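
/*
 * Mount-time sketch (hypothetical): tmpfs_mount() typically registers
 * this vector with vfs_add_vnodeops() so that vnodes instantiated by
 * tmpfs_alloc_vp() dispatch through it.
 */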
2146