xref: /netbsd/sys/fs/tmpfs/tmpfs_subr.c (revision 6550d01e)
1 /*	$NetBSD: tmpfs_subr.c,v 1.62 2011/01/13 13:35:12 pooka Exp $	*/
2 
3 /*
4  * Copyright (c) 2005, 2006, 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
9  * 2005 program.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Efficient memory file system supporting functions.
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.62 2011/01/13 13:35:12 pooka Exp $");
39 
40 #include <sys/param.h>
41 #include <sys/dirent.h>
42 #include <sys/event.h>
43 #include <sys/kmem.h>
44 #include <sys/mount.h>
45 #include <sys/namei.h>
46 #include <sys/time.h>
47 #include <sys/stat.h>
48 #include <sys/systm.h>
49 #include <sys/swap.h>
50 #include <sys/vnode.h>
51 #include <sys/kauth.h>
52 #include <sys/proc.h>
53 #include <sys/atomic.h>
54 
55 #include <uvm/uvm.h>
56 
57 #include <miscfs/specfs/specdev.h>
58 #include <miscfs/genfs/genfs.h>
59 #include <fs/tmpfs/tmpfs.h>
60 #include <fs/tmpfs/tmpfs_fifoops.h>
61 #include <fs/tmpfs/tmpfs_specops.h>
62 #include <fs/tmpfs/tmpfs_vnops.h>
63 
64 /* --------------------------------------------------------------------- */
65 
66 /*
67  * Allocates a new node of type 'type' inside the 'tmp' mount point, with
68  * its owner set to 'uid', its group to 'gid' and its mode set to 'mode',
69  * using the credentials of the process 'p'.
70  *
71  * If the node type is set to 'VDIR', then the parent parameter must point
72  * to the parent directory of the node being created.  It may only be NULL
73  * while allocating the root node.
74  *
75  * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter
76  * specifies the device the node represents.
77  *
78  * If the node type is set to 'VLNK', then the parameter target specifies
79  * the file name of the target file for the symbolic link that is being
80  * created.
81  *
 * Note that new nodes are allocated through tmpfs_node_get() as long as
 * the mount point's limit on the number of nodes (tm_nodes_max) has not
 * been reached; otherwise ENOSPC is returned.
85  *
86  * Returns zero on success or an appropriate error code on failure.
87  */
int
tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type,
    uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
    char *target, dev_t rdev, struct tmpfs_node **node)
{
	struct tmpfs_node *nnode;

	/* If the root directory of the 'tmp' file system is not yet
	 * allocated, this must be the request to do it. */
	KASSERT(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));

	KASSERT(IFF(type == VLNK, target != NULL));
	KASSERT(IFF(type == VBLK || type == VCHR, rdev != VNOVAL));

	KASSERT(uid != VNOVAL && gid != VNOVAL && mode != VNOVAL);

	nnode = NULL;
	/*
	 * Reserve a slot in the node counter up front; the atomic
	 * increment-and-test prevents concurrent allocators from
	 * oversubscribing tm_nodes_max.  Back the reservation out
	 * on failure.
	 */
	if (atomic_inc_uint_nv(&tmp->tm_nodes_cnt) >= tmp->tm_nodes_max) {
		atomic_dec_uint(&tmp->tm_nodes_cnt);
		return ENOSPC;
	}

	nnode = tmpfs_node_get(tmp);
	if (nnode == NULL) {
		/* Undo the counter reservation made above. */
		atomic_dec_uint(&tmp->tm_nodes_cnt);
		return ENOSPC;
	}

	/*
	 * XXX Where the pool is backed by a map larger than (4GB *
	 * sizeof(*nnode)), this may produce duplicate inode numbers
	 * for applications that do not understand 64-bit ino_t.
	 */
	nnode->tn_id = (ino_t)((uintptr_t)nnode / sizeof(*nnode));
	/* Random generation number: guards against stale NFS-style handles. */
	nnode->tn_gen = arc4random();

	/* Generic initialization. */
	nnode->tn_type = type;
	nnode->tn_size = 0;
	nnode->tn_status = 0;
	nnode->tn_flags = 0;
	nnode->tn_links = 0;

	/* All four timestamps start out identical. */
	vfs_timestamp(&nnode->tn_atime);
	nnode->tn_birthtime = nnode->tn_atime;
	nnode->tn_ctime = nnode->tn_atime;
	nnode->tn_mtime = nnode->tn_atime;

	nnode->tn_uid = uid;
	nnode->tn_gid = gid;
	nnode->tn_mode = mode;
	nnode->tn_lockf = NULL;
	nnode->tn_vnode = NULL;

	/* Type-specific initialization. */
	switch (nnode->tn_type) {
	case VBLK:
	case VCHR:
		nnode->tn_spec.tn_dev.tn_rdev = rdev;
		break;

	case VDIR:
		TAILQ_INIT(&nnode->tn_spec.tn_dir.tn_dir);
		/* The root directory is its own parent. */
		nnode->tn_spec.tn_dir.tn_parent =
		    (parent == NULL) ? nnode : parent;
		nnode->tn_spec.tn_dir.tn_readdir_lastn = 0;
		nnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
		/* Extra link accounts for the directory's '.' entry. */
		nnode->tn_links++;
		break;

	case VFIFO:
		/* FALLTHROUGH */
	case VSOCK:
		break;

	case VLNK:
		/* Copy the symlink target (not NUL-terminated in tn_link). */
		KASSERT(strlen(target) < MAXPATHLEN);
		nnode->tn_size = strlen(target);
		nnode->tn_spec.tn_lnk.tn_link =
		    tmpfs_strname_alloc(tmp, nnode->tn_size);
		if (nnode->tn_spec.tn_lnk.tn_link == NULL) {
			atomic_dec_uint(&tmp->tm_nodes_cnt);
			tmpfs_node_put(tmp, nnode);
			return ENOSPC;
		}
		memcpy(nnode->tn_spec.tn_lnk.tn_link, target, nnode->tn_size);
		break;

	case VREG:
		/* Anonymous UVM object provides the file's backing store. */
		nnode->tn_spec.tn_reg.tn_aobj =
		    uao_create(INT32_MAX - PAGE_SIZE, 0);
		nnode->tn_spec.tn_reg.tn_aobj_pages = 0;
		break;

	default:
		KASSERT(0);
	}

	mutex_init(&nnode->tn_vlock, MUTEX_DEFAULT, IPL_NONE);

	/* Publish the node on the per-mount list. */
	mutex_enter(&tmp->tm_lock);
	LIST_INSERT_HEAD(&tmp->tm_nodes, nnode, tn_entries);
	mutex_exit(&tmp->tm_lock);

	*node = nnode;
	return 0;
}
195 
196 /* --------------------------------------------------------------------- */
197 
198 /*
199  * Destroys the node pointed to by node from the file system 'tmp'.
200  * If the node does not belong to the given mount point, the results are
201  * unpredicted.
202  *
203  * If the node references a directory; no entries are allowed because
204  * their removal could need a recursive algorithm, something forbidden in
205  * kernel space.  Furthermore, there is not need to provide such
206  * functionality (recursive removal) because the only primitives offered
207  * to the user are the removal of empty directories and the deletion of
208  * individual files.
209  *
 * Note that, on destruction, the node is returned to the pool it was
 * allocated from (via tmpfs_node_put) and the mount's node counter is
 * decreased, so the slot becomes available to future allocations.
214  */
215 void
216 tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node)
217 {
218 	size_t objsz;
219 
220 	mutex_enter(&tmp->tm_lock);
221 	LIST_REMOVE(node, tn_entries);
222 	mutex_exit(&tmp->tm_lock);
223 	atomic_dec_uint(&tmp->tm_nodes_cnt);
224 
225 	switch (node->tn_type) {
226 	case VLNK:
227 		tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
228 		    node->tn_size);
229 		break;
230 	case VREG:
231 		/*
232 		 * Calculate the size of node data, decrease the used-memory
233 		 * counter, and destroy the memory object (if any).
234 		 */
235 		objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
236 		if (objsz != 0) {
237 			tmpfs_mem_decr(tmp, objsz);
238 		}
239 		if (node->tn_spec.tn_reg.tn_aobj != NULL) {
240 			uao_detach(node->tn_spec.tn_reg.tn_aobj);
241 		}
242 		break;
243 	default:
244 		break;
245 	}
246 
247 	mutex_destroy(&node->tn_vlock);
248 	tmpfs_node_put(tmp, node);
249 }
250 
251 /* --------------------------------------------------------------------- */
252 
253 /*
254  * Allocates a new directory entry for the node node with a name of name.
255  * The new directory entry is returned in *de.
256  *
257  * The link count of node is increased by one to reflect the new object
258  * referencing it.  This takes care of notifying kqueue listeners about
259  * this change.
260  *
261  * Returns zero on success or an appropriate error code on failure.
262  */
263 int
264 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
265     const char *name, uint16_t len, struct tmpfs_dirent **de)
266 {
267 	struct tmpfs_dirent *nde;
268 
269 	nde = tmpfs_dirent_get(tmp);
270 	if (nde == NULL)
271 		return ENOSPC;
272 
273 	nde->td_name = tmpfs_strname_alloc(tmp, len);
274 	if (nde->td_name == NULL) {
275 		tmpfs_dirent_put(tmp, nde);
276 		return ENOSPC;
277 	}
278 	nde->td_namelen = len;
279 	memcpy(nde->td_name, name, len);
280 	nde->td_node = node;
281 
282 	if (node != TMPFS_NODE_WHITEOUT) {
283 		node->tn_links++;
284 		if (node->tn_links > 1 && node->tn_vnode != NULL)
285 			VN_KNOTE(node->tn_vnode, NOTE_LINK);
286 	}
287 	*de = nde;
288 
289 	return 0;
290 }
291 
292 /* --------------------------------------------------------------------- */
293 
294 /*
295  * Frees a directory entry.  It is the caller's responsibility to destroy
296  * the node referenced by it if needed.
297  *
298  * The link count of node is decreased by one to reflect the removal of an
299  * object that referenced it.  This only happens if 'node_exists' is true;
300  * otherwise the function will not access the node referred to by the
301  * directory entry, as it may already have been released from the outside.
302  *
303  * Interested parties (kqueue) are notified of the link count change; note
304  * that this can include both the node pointed to by the directory entry
305  * as well as its parent.
306  */
307 void
308 tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de,
309     bool node_exists)
310 {
311 	if (node_exists && de->td_node != TMPFS_NODE_WHITEOUT) {
312 		struct tmpfs_node *node;
313 
314 		node = de->td_node;
315 
316 		KASSERT(node->tn_links > 0);
317 		node->tn_links--;
318 		if (node->tn_vnode != NULL)
319 			VN_KNOTE(node->tn_vnode, node->tn_links == 0 ?
320 			    NOTE_DELETE : NOTE_LINK);
321 		if (node->tn_type == VDIR)
322 			VN_KNOTE(node->tn_spec.tn_dir.tn_parent->tn_vnode,
323 			    NOTE_LINK);
324 	}
325 
326 	tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
327 	tmpfs_dirent_put(tmp, de);
328 }
329 
330 /* --------------------------------------------------------------------- */
331 
332 /*
333  * Allocates a new vnode for the node node or returns a new reference to
334  * an existing one if the node had already a vnode referencing it.  The
335  * resulting locked vnode is returned in *vpp.
336  *
337  * Returns zero on success or an appropriate error code on failure.
338  */
int
tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, struct vnode **vpp)
{
	int error;
	struct vnode *vp;

	/* If there is already a vnode, then lock it. */
	for (;;) {
		/*
		 * tn_vlock guards node->tn_vnode.  Take the vnode's
		 * interlock before dropping tn_vlock so the vnode cannot
		 * be freed from under us before vget() grabs a reference.
		 */
		mutex_enter(&node->tn_vlock);
		if ((vp = node->tn_vnode) != NULL) {
			mutex_enter(&vp->v_interlock);
			mutex_exit(&node->tn_vlock);
			error = vget(vp, LK_EXCLUSIVE);
			if (error == ENOENT) {
				/* vnode was reclaimed; retry the lookup. */
				continue;
			}
			*vpp = vp;
			return error;
		}
		break;
	}

	/*
	 * Get a new vnode and associate it with our node.  tn_vlock is
	 * still held here, so nobody else can attach a vnode to the node
	 * while we set one up.
	 */
	error = getnewvnode(VT_TMPFS, mp, tmpfs_vnodeop_p, &vp);
	if (error != 0) {
		mutex_exit(&node->tn_vlock);
		return error;
	}

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error != 0) {
		/* Return the unused vnode; it was never published. */
		mutex_exit(&node->tn_vlock);
		ungetnewvnode(vp);
		return error;
	}

	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* Device nodes use the special-file vnode operations. */
		vp->v_op = tmpfs_specop_p;
		spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
		break;

	case VDIR:
		/* The root directory is its own parent (see alloc_node). */
		vp->v_vflag |= node->tn_spec.tn_dir.tn_parent == node ?
		    VV_ROOT : 0;
		break;

	case VFIFO:
		vp->v_op = tmpfs_fifoop_p;
		break;

	case VLNK:
		/* FALLTHROUGH */
	case VREG:
		/* FALLTHROUGH */
	case VSOCK:
		break;

	default:
		KASSERT(0);
	}

	uvm_vnp_setsize(vp, node->tn_size);
	vp->v_data = node;
	/* Publish the vnode on the node, then drop tn_vlock. */
	node->tn_vnode = vp;
	mutex_exit(&node->tn_vlock);
	*vpp = vp;

	KASSERT(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp)));
	KASSERT(*vpp == node->tn_vnode);

	return error;
}
418 
419 /* --------------------------------------------------------------------- */
420 
421 /*
422  * Destroys the association between the vnode vp and the node it
423  * references.
424  */
425 void
426 tmpfs_free_vp(struct vnode *vp)
427 {
428 	struct tmpfs_node *node;
429 
430 	node = VP_TO_TMPFS_NODE(vp);
431 
432 	mutex_enter(&node->tn_vlock);
433 	node->tn_vnode = NULL;
434 	mutex_exit(&node->tn_vlock);
435 	vp->v_data = NULL;
436 }
437 
438 /* --------------------------------------------------------------------- */
439 
440 /*
441  * Allocates a new file of type 'type' and adds it to the parent directory
442  * 'dvp'; this addition is done using the component name given in 'cnp'.
443  * The ownership of the new file is automatically assigned based on the
444  * credentials of the caller (through 'cnp'), the group is set based on
445  * the parent directory and the mode is determined from the 'vap' argument.
446  * If successful, *vpp holds a vnode to the newly created file and zero
447  * is returned.  Otherwise *vpp is NULL and the function returns an
448  * appropriate error code.
449  */
int
tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap,
    struct componentname *cnp, char *target)
{
	int error;
	struct tmpfs_dirent *de;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *dnode;
	struct tmpfs_node *node;
	struct tmpfs_node *parent;

	KASSERT(VOP_ISLOCKED(dvp));

	tmp = VFS_TO_TMPFS(dvp->v_mount);
	dnode = VP_TO_TMPFS_DIR(dvp);
	*vpp = NULL;

	/* If the entry we are creating is a directory, we cannot overflow
	 * the number of links of its parent, because it will get a new
	 * link. */
	if (vap->va_type == VDIR) {
		/* Ensure that we do not overflow the maximum number of links
		 * imposed by the system. */
		KASSERT(dnode->tn_links <= LINK_MAX);
		if (dnode->tn_links == LINK_MAX) {
			error = EMLINK;
			goto out;
		}

		parent = dnode;
	} else
		parent = NULL;

	/*
	 * Allocate a node that represents the new file.  Owner comes
	 * from the caller's credentials; group is inherited from the
	 * parent directory.
	 */
	error = tmpfs_alloc_node(tmp, vap->va_type, kauth_cred_geteuid(cnp->cn_cred),
	    dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev, &node);
	if (error != 0)
		goto out;

	/* Allocate a directory entry that points to the new file.
	 * This takes a link on the node; tmpfs_free_dirent undoes it. */
	error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen,
	    &de);
	if (error != 0) {
		tmpfs_free_node(tmp, node);
		goto out;
	}

	/* Allocate a vnode for the new file. */
	error = tmpfs_alloc_vp(dvp->v_mount, node, vpp);
	if (error != 0) {
		/* Unwind in reverse order of allocation. */
		tmpfs_free_dirent(tmp, de, true);
		tmpfs_free_node(tmp, node);
		goto out;
	}

	/* Now that all required items are allocated, we can proceed to
	 * insert the new node into the directory, an operation that
	 * cannot fail. */
	tmpfs_dir_attach(dvp, de);
	if (vap->va_type == VDIR) {
		/* The new subdirectory's ".." adds a link to the parent. */
		VN_KNOTE(dvp, NOTE_LINK);
		dnode->tn_links++;
		KASSERT(dnode->tn_links <= LINK_MAX);
	}

out:
	/* The parent directory vnode is released on every path. */
	vput(dvp);

	KASSERT(IFF(error == 0, *vpp != NULL));

	return error;
}
522 
523 /* --------------------------------------------------------------------- */
524 
525 /*
526  * Attaches the directory entry de to the directory represented by vp.
527  * Note that this does not change the link count of the node pointed by
528  * the directory entry, as this is done by tmpfs_alloc_dirent.
529  *
530  * As the "parent" directory changes, interested parties are notified of
531  * a write to it.
532  */
533 void
534 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
535 {
536 	struct tmpfs_node *dnode;
537 
538 	KASSERT(VOP_ISLOCKED(vp));
539 	dnode = VP_TO_TMPFS_DIR(vp);
540 
541 	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
542 	dnode->tn_size += sizeof(struct tmpfs_dirent);
543 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
544 	    TMPFS_NODE_MODIFIED;
545 	uvm_vnp_setsize(vp, dnode->tn_size);
546 
547 	VN_KNOTE(vp, NOTE_WRITE);
548 }
549 
550 /* --------------------------------------------------------------------- */
551 
552 /*
553  * Detaches the directory entry de from the directory represented by vp.
554  * Note that this does not change the link count of the node pointed by
555  * the directory entry, as this is done by tmpfs_free_dirent.
556  *
557  * As the "parent" directory changes, interested parties are notified of
558  * a write to it.
559  */
560 void
561 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
562 {
563 	struct tmpfs_node *dnode;
564 
565 	KASSERT(VOP_ISLOCKED(vp));
566 	dnode = VP_TO_TMPFS_DIR(vp);
567 
568 	if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
569 		dnode->tn_spec.tn_dir.tn_readdir_lastn = 0;
570 		dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
571 	}
572 
573 	TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
574 	dnode->tn_size -= sizeof(struct tmpfs_dirent);
575 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
576 	    TMPFS_NODE_MODIFIED;
577 	uvm_vnp_setsize(vp, dnode->tn_size);
578 
579 	VN_KNOTE(vp, NOTE_WRITE);
580 }
581 
582 /* --------------------------------------------------------------------- */
583 
584 /*
585  * Looks for a directory entry in the directory represented by node.
586  * 'cnp' describes the name of the entry to look for.  Note that the .
587  * and .. components are not allowed as they do not physically exist
588  * within directories.
589  *
590  * Returns a pointer to the entry when found, otherwise NULL.
591  */
592 struct tmpfs_dirent *
593 tmpfs_dir_lookup(struct tmpfs_node *node, struct componentname *cnp)
594 {
595 	struct tmpfs_dirent *de;
596 
597 	KASSERT(VOP_ISLOCKED(node->tn_vnode));
598 	KASSERT(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
599 	KASSERT(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
600 	    cnp->cn_nameptr[1] == '.')));
601 	TMPFS_VALIDATE_DIR(node);
602 
603 	node->tn_status |= TMPFS_NODE_ACCESSED;
604 
605 	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
606 		KASSERT(cnp->cn_namelen < 0xffff);
607 		if (de->td_namelen == (uint16_t)cnp->cn_namelen &&
608 		    memcmp(de->td_name, cnp->cn_nameptr, de->td_namelen) == 0) {
609 			break;
610 		}
611 	}
612 
613 	return de;
614 }
615 
616 /* --------------------------------------------------------------------- */
617 
618 /*
619  * Helper function for tmpfs_readdir.  Creates a '.' entry for the given
620  * directory and returns it in the uio space.  The function returns 0
621  * on success, -1 if there was not enough space in the uio structure to
622  * hold the directory entry or an appropriate error code if another
623  * error happens.
624  */
625 int
626 tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio)
627 {
628 	int error;
629 	struct dirent *dentp;
630 
631 	TMPFS_VALIDATE_DIR(node);
632 	KASSERT(uio->uio_offset == TMPFS_DIRCOOKIE_DOT);
633 
634 	dentp = kmem_alloc(sizeof(struct dirent), KM_SLEEP);
635 
636 	dentp->d_fileno = node->tn_id;
637 	dentp->d_type = DT_DIR;
638 	dentp->d_namlen = 1;
639 	dentp->d_name[0] = '.';
640 	dentp->d_name[1] = '\0';
641 	dentp->d_reclen = _DIRENT_SIZE(dentp);
642 
643 	if (dentp->d_reclen > uio->uio_resid)
644 		error = -1;
645 	else {
646 		error = uiomove(dentp, dentp->d_reclen, uio);
647 		if (error == 0)
648 			uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT;
649 	}
650 
651 	node->tn_status |= TMPFS_NODE_ACCESSED;
652 
653 	kmem_free(dentp, sizeof(struct dirent));
654 	return error;
655 }
656 
657 /* --------------------------------------------------------------------- */
658 
659 /*
660  * Helper function for tmpfs_readdir.  Creates a '..' entry for the given
661  * directory and returns it in the uio space.  The function returns 0
662  * on success, -1 if there was not enough space in the uio structure to
663  * hold the directory entry or an appropriate error code if another
664  * error happens.
665  */
666 int
667 tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio)
668 {
669 	int error;
670 	struct dirent *dentp;
671 
672 	TMPFS_VALIDATE_DIR(node);
673 	KASSERT(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT);
674 
675 	dentp = kmem_alloc(sizeof(struct dirent), KM_SLEEP);
676 
677 	dentp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
678 	dentp->d_type = DT_DIR;
679 	dentp->d_namlen = 2;
680 	dentp->d_name[0] = '.';
681 	dentp->d_name[1] = '.';
682 	dentp->d_name[2] = '\0';
683 	dentp->d_reclen = _DIRENT_SIZE(dentp);
684 
685 	if (dentp->d_reclen > uio->uio_resid)
686 		error = -1;
687 	else {
688 		error = uiomove(dentp, dentp->d_reclen, uio);
689 		if (error == 0) {
690 			struct tmpfs_dirent *de;
691 
692 			de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
693 			if (de == NULL)
694 				uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
695 			else
696 				uio->uio_offset = tmpfs_dircookie(de);
697 		}
698 	}
699 
700 	node->tn_status |= TMPFS_NODE_ACCESSED;
701 
702 	kmem_free(dentp, sizeof(struct dirent));
703 	return error;
704 }
705 
706 /* --------------------------------------------------------------------- */
707 
708 /*
709  * Lookup a directory entry by its associated cookie.
710  */
711 struct tmpfs_dirent *
712 tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie)
713 {
714 	struct tmpfs_dirent *de;
715 
716 	KASSERT(VOP_ISLOCKED(node->tn_vnode));
717 
718 	if (cookie == node->tn_spec.tn_dir.tn_readdir_lastn &&
719 	    node->tn_spec.tn_dir.tn_readdir_lastp != NULL) {
720 		return node->tn_spec.tn_dir.tn_readdir_lastp;
721 	}
722 
723 	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
724 		if (tmpfs_dircookie(de) == cookie) {
725 			break;
726 		}
727 	}
728 
729 	return de;
730 }
731 
732 /* --------------------------------------------------------------------- */
733 
734 /*
735  * Helper function for tmpfs_readdir.  Returns as much directory entries
736  * as can fit in the uio space.  The read starts at uio->uio_offset.
737  * The function returns 0 on success, -1 if there was not enough space
738  * in the uio structure to hold the directory entry or an appropriate
739  * error code if another error happens.
740  */
741 int
742 tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp)
743 {
744 	int error;
745 	off_t startcookie;
746 	struct dirent *dentp;
747 	struct tmpfs_dirent *de;
748 
749 	KASSERT(VOP_ISLOCKED(node->tn_vnode));
750 	TMPFS_VALIDATE_DIR(node);
751 
752 	/* Locate the first directory entry we have to return.  We have cached
753 	 * the last readdir in the node, so use those values if appropriate.
754 	 * Otherwise do a linear scan to find the requested entry. */
755 	startcookie = uio->uio_offset;
756 	KASSERT(startcookie != TMPFS_DIRCOOKIE_DOT);
757 	KASSERT(startcookie != TMPFS_DIRCOOKIE_DOTDOT);
758 	if (startcookie == TMPFS_DIRCOOKIE_EOF) {
759 		return 0;
760 	} else {
761 		de = tmpfs_dir_lookupbycookie(node, startcookie);
762 	}
763 	if (de == NULL) {
764 		return EINVAL;
765 	}
766 
767 	dentp = kmem_alloc(sizeof(struct dirent), KM_SLEEP);
768 
769 	/* Read as much entries as possible; i.e., until we reach the end of
770 	 * the directory or we exhaust uio space. */
771 	do {
772 		/* Create a dirent structure representing the current
773 		 * tmpfs_node and fill it. */
774 		if (de->td_node == TMPFS_NODE_WHITEOUT) {
775 			dentp->d_fileno = 1;
776 			dentp->d_type = DT_WHT;
777 		} else {
778 			dentp->d_fileno = de->td_node->tn_id;
779 			switch (de->td_node->tn_type) {
780 			case VBLK:
781 				dentp->d_type = DT_BLK;
782 			break;
783 
784 			case VCHR:
785 				dentp->d_type = DT_CHR;
786 				break;
787 
788 			case VDIR:
789 				dentp->d_type = DT_DIR;
790 				break;
791 
792 			case VFIFO:
793 				dentp->d_type = DT_FIFO;
794 				break;
795 
796 			case VLNK:
797 				dentp->d_type = DT_LNK;
798 				break;
799 
800 			case VREG:
801 				dentp->d_type = DT_REG;
802 				break;
803 
804 			case VSOCK:
805 				dentp->d_type = DT_SOCK;
806 			break;
807 
808 			default:
809 				KASSERT(0);
810 			}
811 		}
812 		dentp->d_namlen = de->td_namelen;
813 		KASSERT(de->td_namelen < sizeof(dentp->d_name));
814 		(void)memcpy(dentp->d_name, de->td_name, de->td_namelen);
815 		dentp->d_name[de->td_namelen] = '\0';
816 		dentp->d_reclen = _DIRENT_SIZE(dentp);
817 
818 		/* Stop reading if the directory entry we are treating is
819 		 * bigger than the amount of data that can be returned. */
820 		if (dentp->d_reclen > uio->uio_resid) {
821 			error = -1;
822 			break;
823 		}
824 
825 		/* Copy the new dirent structure into the output buffer and
826 		 * advance pointers. */
827 		error = uiomove(dentp, dentp->d_reclen, uio);
828 
829 		(*cntp)++;
830 		de = TAILQ_NEXT(de, td_entries);
831 	} while (error == 0 && uio->uio_resid > 0 && de != NULL);
832 
833 	/* Update the offset and cache. */
834 	if (de == NULL) {
835 		uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
836 		node->tn_spec.tn_dir.tn_readdir_lastn = 0;
837 		node->tn_spec.tn_dir.tn_readdir_lastp = NULL;
838 	} else {
839 		node->tn_spec.tn_dir.tn_readdir_lastn = uio->uio_offset =
840 		    tmpfs_dircookie(de);
841 		node->tn_spec.tn_dir.tn_readdir_lastp = de;
842 	}
843 
844 	node->tn_status |= TMPFS_NODE_ACCESSED;
845 
846 	kmem_free(dentp, sizeof(struct dirent));
847 	return error;
848 }
849 
850 /* --------------------------------------------------------------------- */
851 
852 /*
853  * Resizes the aobj associated to the regular file pointed to by vp to
854  * the size newsize.  'vp' must point to a vnode that represents a regular
855  * file.  'newsize' must be positive.
856  *
857  * If the file is extended, the appropriate kevent is raised.  This does
858  * not rise a write event though because resizing is not the same as
859  * writing.
860  *
861  * Returns zero on success or an appropriate error code on failure.
862  */
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
	size_t newpages, oldpages;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *node;
	off_t oldsize;

	KASSERT(vp->v_type == VREG);
	KASSERT(newsize >= 0);

	node = VP_TO_TMPFS_NODE(vp);
	tmp = VFS_TO_TMPFS(vp->v_mount);

	oldsize = node->tn_size;
	oldpages = round_page(oldsize) >> PAGE_SHIFT;
	newpages = round_page(newsize) >> PAGE_SHIFT;
	KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);

	if (newpages > oldpages) {
		/* Increase the used-memory counter if getting extra pages;
		 * fail before touching anything if the quota is exhausted. */
		if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
			return ENOSPC;
		}
	} else if (newsize < oldsize) {
		/*
		 * Shrinking: node->tn_size still holds the old size here,
		 * so zerolen spans from the new EOF to the end of the
		 * partially-kept last page (clamped by the old size).
		 */
		int zerolen = MIN(round_page(newsize), node->tn_size) - newsize;

		/* Zero out the truncated part of the last page so stale
		 * data is not exposed if the file is extended later. */
		uvm_vnp_zerorange(vp, newsize, zerolen);
	}

	node->tn_spec.tn_reg.tn_aobj_pages = newpages;
	node->tn_size = newsize;
	uvm_vnp_setsize(vp, newsize);

	/*
	 * Free "backing store".
	 */
	if (newpages < oldpages) {
		struct uvm_object *uobj;

		uobj = node->tn_spec.tn_reg.tn_aobj;

		/* Drop swap slots for the no-longer-needed page range;
		 * the object's lock must be held across the call. */
		mutex_enter(&uobj->vmobjlock);
		uao_dropswap_range(uobj, newpages, oldpages);
		mutex_exit(&uobj->vmobjlock);

		/* Decrease the used-memory counter. */
		tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
	}

	/* Only extension raises an event; see the function comment. */
	if (newsize > oldsize)
		VN_KNOTE(vp, NOTE_EXTEND);

	return 0;
}
919 
920 /*
921  * Change flags of the given vnode.
922  * Caller should execute tmpfs_update on vp after a successful execution.
923  * The vnode must be locked on entry and remain locked on exit.
924  */
int
tmpfs_chflags(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l)
{
	int error;
	struct tmpfs_node *node;
	kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
	/* File-system-level decision handed to kauth; 0 means "allow". */
	int fs_decision = 0;

	KASSERT(VOP_ISLOCKED(vp));

	node = VP_TO_TMPFS_NODE(vp);

	/* Disallow this operation if the file system is mounted read-only. */
	if (vp->v_mount->mnt_flag & MNT_RDONLY)
		return EROFS;

	/* Non-owners are denied unless kauth overrides the decision. */
	if (kauth_cred_geteuid(cred) != node->tn_uid)
		fs_decision = EACCES;

	/*
	 * If the new flags have non-user flags that are different than
	 * those on the node, we need special permission to change them.
	 */
	if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
		action |= KAUTH_VNODE_WRITE_SYSFLAGS;
		if (!fs_decision)
			fs_decision = EPERM;
	}

	/*
	 * Indicate that this node's flags have system attributes in them if
	 * that's the case.
	 */
	if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
		action |= KAUTH_VNODE_HAS_SYSFLAGS;
	}

	/* kauth may override fs_decision either way. */
	error = kauth_authorize_vnode(cred, action, vp, NULL, fs_decision);
	if (error)
		return error;

	/*
	 * Set the flags. If we're not setting non-user flags, be careful not
	 * to overwrite them.
	 *
	 * XXX: Can't we always assign here? if the system flags are different,
	 *      the code above should catch attempts to change them without
	 *      proper permissions, and if we're here it means it's okay to
	 *      change them...
	 */
	if (action & KAUTH_VNODE_WRITE_SYSFLAGS) {
		node->tn_flags = flags;
	} else {
		/* Clear all user-settable flags and re-set them. */
		node->tn_flags &= SF_SETTABLE;
		node->tn_flags |= (flags & UF_SETTABLE);
	}

	node->tn_status |= TMPFS_NODE_CHANGED;
	VN_KNOTE(vp, NOTE_ATTRIB);

	KASSERT(VOP_ISLOCKED(vp));

	return 0;
}
990 
991 /* --------------------------------------------------------------------- */
992 
993 /*
994  * Change access mode on the given vnode.
995  * Caller should execute tmpfs_update on vp after a successful execution.
996  * The vnode must be locked on entry and remain locked on exit.
997  */
998 int
999 tmpfs_chmod(struct vnode *vp, mode_t mode, kauth_cred_t cred, struct lwp *l)
1000 {
1001 	int error;
1002 	struct tmpfs_node *node;
1003 
1004 	KASSERT(VOP_ISLOCKED(vp));
1005 
1006 	node = VP_TO_TMPFS_NODE(vp);
1007 
1008 	/* Disallow this operation if the file system is mounted read-only. */
1009 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1010 		return EROFS;
1011 
1012 	/* Immutable or append-only files cannot be modified, either. */
1013 	if (node->tn_flags & (IMMUTABLE | APPEND))
1014 		return EPERM;
1015 
1016 	error = genfs_can_chmod(vp, cred, node->tn_uid, node->tn_gid,
1017 	    mode);
1018 
1019 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
1020 	    NULL, error);
1021 	if (error)
1022 		return (error);
1023 
1024 	node->tn_mode = (mode & ALLPERMS);
1025 
1026 	node->tn_status |= TMPFS_NODE_CHANGED;
1027 	VN_KNOTE(vp, NOTE_ATTRIB);
1028 
1029 	KASSERT(VOP_ISLOCKED(vp));
1030 
1031 	return 0;
1032 }
1033 
1034 /* --------------------------------------------------------------------- */
1035 
1036 /*
1037  * Change ownership of the given vnode.  At least one of uid or gid must
1038  * be different than VNOVAL.  If one is set to that value, the attribute
1039  * is unchanged.
1040  * Caller should execute tmpfs_update on vp after a successful execution.
1041  * The vnode must be locked on entry and remain locked on exit.
1042  */
1043 int
1044 tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred,
1045     struct lwp *l)
1046 {
1047 	int error;
1048 	struct tmpfs_node *node;
1049 
1050 	KASSERT(VOP_ISLOCKED(vp));
1051 
1052 	node = VP_TO_TMPFS_NODE(vp);
1053 
1054 	/* Assign default values if they are unknown. */
1055 	KASSERT(uid != VNOVAL || gid != VNOVAL);
1056 	if (uid == VNOVAL)
1057 		uid = node->tn_uid;
1058 	if (gid == VNOVAL)
1059 		gid = node->tn_gid;
1060 	KASSERT(uid != VNOVAL && gid != VNOVAL);
1061 
1062 	/* Disallow this operation if the file system is mounted read-only. */
1063 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1064 		return EROFS;
1065 
1066 	/* Immutable or append-only files cannot be modified, either. */
1067 	if (node->tn_flags & (IMMUTABLE | APPEND))
1068 		return EPERM;
1069 
1070 	error = genfs_can_chown(vp, cred, node->tn_uid, node->tn_gid, uid,
1071 	    gid);
1072 
1073 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
1074 	    NULL, error);
1075 	if (error)
1076 		return (error);
1077 
1078 	node->tn_uid = uid;
1079 	node->tn_gid = gid;
1080 
1081 	node->tn_status |= TMPFS_NODE_CHANGED;
1082 	VN_KNOTE(vp, NOTE_ATTRIB);
1083 
1084 	KASSERT(VOP_ISLOCKED(vp));
1085 
1086 	return 0;
1087 }
1088 
1089 /* --------------------------------------------------------------------- */
1090 
1091 /*
1092  * Change size of the given vnode.
1093  * Caller should execute tmpfs_update on vp after a successful execution.
1094  * The vnode must be locked on entry and remain locked on exit.
1095  */
1096 int
1097 tmpfs_chsize(struct vnode *vp, u_quad_t size, kauth_cred_t cred,
1098     struct lwp *l)
1099 {
1100 	int error;
1101 	struct tmpfs_node *node;
1102 
1103 	KASSERT(VOP_ISLOCKED(vp));
1104 
1105 	node = VP_TO_TMPFS_NODE(vp);
1106 
1107 	/* Decide whether this is a valid operation based on the file type. */
1108 	error = 0;
1109 	switch (vp->v_type) {
1110 	case VDIR:
1111 		return EISDIR;
1112 
1113 	case VREG:
1114 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
1115 			return EROFS;
1116 		break;
1117 
1118 	case VBLK:
1119 		/* FALLTHROUGH */
1120 	case VCHR:
1121 		/* FALLTHROUGH */
1122 	case VFIFO:
1123 		/* Allow modifications of special files even if in the file
1124 		 * system is mounted read-only (we are not modifying the
1125 		 * files themselves, but the objects they represent). */
1126 		return 0;
1127 
1128 	default:
1129 		/* Anything else is unsupported. */
1130 		return EOPNOTSUPP;
1131 	}
1132 
1133 	/* Immutable or append-only files cannot be modified, either. */
1134 	if (node->tn_flags & (IMMUTABLE | APPEND))
1135 		return EPERM;
1136 
1137 	error = tmpfs_truncate(vp, size);
1138 	/* tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
1139 	 * for us, as will update tn_status; no need to do that here. */
1140 
1141 	KASSERT(VOP_ISLOCKED(vp));
1142 
1143 	return error;
1144 }
1145 
1146 /* --------------------------------------------------------------------- */
1147 
1148 /*
1149  * Change access and modification times of the given vnode.
1150  * Caller should execute tmpfs_update on vp after a successful execution.
1151  * The vnode must be locked on entry and remain locked on exit.
1152  */
1153 int
1154 tmpfs_chtimes(struct vnode *vp, const struct timespec *atime,
1155     const struct timespec *mtime, const struct timespec *btime,
1156     int vaflags, kauth_cred_t cred, struct lwp *l)
1157 {
1158 	int error;
1159 	struct tmpfs_node *node;
1160 
1161 	KASSERT(VOP_ISLOCKED(vp));
1162 
1163 	node = VP_TO_TMPFS_NODE(vp);
1164 
1165 	/* Disallow this operation if the file system is mounted read-only. */
1166 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1167 		return EROFS;
1168 
1169 	/* Immutable or append-only files cannot be modified, either. */
1170 	if (node->tn_flags & (IMMUTABLE | APPEND))
1171 		return EPERM;
1172 
1173 	error = genfs_can_chtimes(vp, vaflags, node->tn_uid, cred);
1174 
1175 	error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
1176 	    error);
1177 	if (error)
1178 		return (error);
1179 
1180 	if (atime->tv_sec != VNOVAL && atime->tv_nsec != VNOVAL)
1181 		node->tn_status |= TMPFS_NODE_ACCESSED;
1182 
1183 	if (mtime->tv_sec != VNOVAL && mtime->tv_nsec != VNOVAL)
1184 		node->tn_status |= TMPFS_NODE_MODIFIED;
1185 
1186 	if (btime->tv_sec == VNOVAL && btime->tv_nsec == VNOVAL)
1187 		btime = NULL;
1188 
1189 	tmpfs_update(vp, atime, mtime, btime, 0);
1190 	VN_KNOTE(vp, NOTE_ATTRIB);
1191 
1192 	KASSERT(VOP_ISLOCKED(vp));
1193 
1194 	return 0;
1195 }
1196 
1197 /* --------------------------------------------------------------------- */
1198 
1199 /* Sync timestamps */
1200 void
1201 tmpfs_itimes(struct vnode *vp, const struct timespec *acc,
1202     const struct timespec *mod, const struct timespec *birth)
1203 {
1204 	struct tmpfs_node *node;
1205 	struct timespec nowtm;
1206 
1207 	node = VP_TO_TMPFS_NODE(vp);
1208 
1209 	if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
1210 	    TMPFS_NODE_CHANGED)) == 0)
1211 		return;
1212 
1213 	if (birth != NULL) {
1214 		node->tn_birthtime = *birth;
1215 	}
1216 	vfs_timestamp(&nowtm);
1217 
1218 	if (node->tn_status & TMPFS_NODE_ACCESSED) {
1219 		node->tn_atime = acc ? *acc : nowtm;
1220 	}
1221 	if (node->tn_status & TMPFS_NODE_MODIFIED) {
1222 		node->tn_mtime = mod ? *mod : nowtm;
1223 	}
1224 	if (node->tn_status & TMPFS_NODE_CHANGED) {
1225 		node->tn_ctime = nowtm;
1226 	}
1227 
1228 	node->tn_status &=
1229 	    ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED);
1230 }
1231 
1232 /* --------------------------------------------------------------------- */
1233 
/*
 * Sync the node's pending timestamp updates; thin wrapper around
 * tmpfs_itimes().  The vnode must be locked on entry and remain
 * locked on exit.
 *
 * "flags" (e.g. UPDATE_CLOSE) is currently ignored: the original code
 * carried an empty "#if 0" stub asking whether UPDATE_CLOSE needs
 * special handling, and none has been needed so far.
 */
void
tmpfs_update(struct vnode *vp, const struct timespec *acc,
    const struct timespec *mod, const struct timespec *birth, int flags)
{

	struct tmpfs_node *node;

	KASSERT(VOP_ISLOCKED(vp));

	/*
	 * The node itself is not used here; the lookup is kept because
	 * VP_TO_TMPFS_NODE may perform validation on DIAGNOSTIC kernels
	 * -- NOTE(review): confirm before removing entirely.
	 */
	node = VP_TO_TMPFS_NODE(vp);
	(void)node;

	tmpfs_itimes(vp, acc, mod, birth);

	KASSERT(VOP_ISLOCKED(vp));
}
1254 
1255 /* --------------------------------------------------------------------- */
1256 
1257 int
1258 tmpfs_truncate(struct vnode *vp, off_t length)
1259 {
1260 	bool extended;
1261 	int error;
1262 	struct tmpfs_node *node;
1263 
1264 	node = VP_TO_TMPFS_NODE(vp);
1265 	extended = length > node->tn_size;
1266 
1267 	if (length < 0) {
1268 		error = EINVAL;
1269 		goto out;
1270 	}
1271 
1272 	if (node->tn_size == length) {
1273 		error = 0;
1274 		goto out;
1275 	}
1276 
1277 	error = tmpfs_reg_resize(vp, length);
1278 	if (error == 0)
1279 		node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED;
1280 
1281 out:
1282 	tmpfs_update(vp, NULL, NULL, NULL, 0);
1283 
1284 	return error;
1285 }
1286