xref: /minix/minix/servers/mib/tree.c (revision 03de4d97)
1 /* MIB service - tree.c - tree access and management */
2 
3 #include "mib.h"
4 
5 /*
6  * Does the given identifier fall within the range of static identifiers in the
7  * given parent?  This check can be used to enumerate all static array entries
8  * in the given parent, starting from zero.  The check does not guarantee that
9  * the entry is actually for a valid node, nor does it guarantee that there is
10  * not a dynamic node with this identifier.
11  */
12 #define IS_STATIC_ID(parent, id) ((unsigned int)(id) < (parent)->node_size)
13 
14 /*
15  * Scratch buffer, used for various cases of temporary data storage.  It must
16  * be large enough to fit a sysctldesc structure followed by the longest
17  * supported description.  It must also be large enough to serve as temporary
18  * storage for data being written in the majority of cases.  Finally, it must
19  * be large enough to contain an entire page, for mib_copyin_str().
20  */
21 #define MAXDESCLEN	1024	/* from NetBSD */
22 #define SCRATCH_SIZE	MAX(PAGE_SIZE, sizeof(struct sysctldesc) + MAXDESCLEN)
23 static char scratch[SCRATCH_SIZE] __aligned(sizeof(int32_t));
24 
25 unsigned int mib_nodes;		/* how many nodes are there in the tree? */
26 unsigned int mib_objects;	/* how many memory objects are allocated? */
27 unsigned int mib_remotes;	/* how many remote subtrees are there? */
28 
29 /*
30  * Find a node through its parent node and identifier.  Return the node if it
31  * was found, and optionally store a pointer to the pointer to its dynode
32  * superstructure (for removal).  If no matching node was found, return NULL.
33  */
34 static struct mib_node *
35 mib_find(struct mib_node * parent, int id, struct mib_dynode *** prevpp)
36 {
37 	struct mib_node *node;
38 	struct mib_dynode **dynp;
39 
40 	if (id < 0)
41 		return NULL;
42 
43 	/*
44 	 * Is there a static node with this identifier?  The static nodes are
45 	 * all in a single array, so lookup is O(1) for these nodes.  We use
46 	 * the node flags field to see whether the array entry is valid.
47 	 */
48 	if (IS_STATIC_ID(parent, id)) {
49 		node = &parent->node_scptr[id];
50 
51 		if (node->node_flags != 0) {
52 			/* Found a matching static node. */
53 			if (prevpp != NULL)
54 				*prevpp = NULL;
55 			return node;
56 		}
57 	}
58 
59 	/*
60 	 * Is there a dynamic node with this identifier?  The dynamic nodes
61 	 * form a linked list.  This is predominantly because userland may pick
62 	 * the identifier number at creation time, so we cannot rely on all
63 	 * dynamically created nodes falling into a small identifier range.
64 	 * That eliminates the option of a dynamic array indexed by identifier,
65 	 * and a linked list is the simplest next option.  Thus, dynamic node
66 	 * lookup is O(n).  However, since the list is sorted by identifier,
67 	 * we may be able to stop the search early.
68 	 */
69 	for (dynp = &parent->node_dcptr; *dynp != NULL;
70 	    dynp = &((*dynp)->dynode_next)) {
71 		if ((*dynp)->dynode_id == id) {
72 			/* Found a matching dynamic node. */
73 			if (prevpp != NULL)
74 				*prevpp = dynp;
75 			return &(*dynp)->dynode_node;
76 		} else if ((*dynp)->dynode_id > id)
77 			break; /* no need to look further */
78 	}
79 
80 	return NULL;
81 }
82 
83 /*
84  * Copy out a node to userland, using the exchange format for nodes (namely,
85  * a sysctlnode structure).  Return the size of the object that is (or, if the
86  * node falls outside the requested data range, would be) copied out on
87  * success, or a negative error code on failure.  The function may return 0
88  * to indicate that nothing was copied out after all (this is unused here).
89  */
90 static ssize_t
91 mib_copyout_node(struct mib_call * call, struct mib_oldp * oldp, size_t off,
92 	int id, const struct mib_node * node)
93 {
94 	struct sysctlnode scn;
95 	int visible;
96 
97 	if (!mib_inrange(oldp, off))
98 		return sizeof(scn); /* nothing to do */
99 
100 	memset(&scn, 0, sizeof(scn));
101 
102 	/*
103 	 * We use CTLFLAG_PARENT, CTLFLAG_VERIFY, and CTLFLAG_REMOTE internally
104 	 * only.  NetBSD uses the values of these flags for different purposes.
105 	 * Either way, do not expose them to userland.
106 	 */
107 	scn.sysctl_flags = SYSCTL_VERSION | (node->node_flags &
108 	    ~(CTLFLAG_PARENT | CTLFLAG_VERIFY | CTLFLAG_REMOTE));
109 	scn.sysctl_num = id;
110 	strlcpy(scn.sysctl_name, node->node_name, sizeof(scn.sysctl_name));
111 	scn.sysctl_ver = node->node_ver;
112 	scn.sysctl_size = node->node_size;
113 
114 	/* Some information is only visible if the user can access the node. */
115 	visible = (!(node->node_flags & CTLFLAG_PRIVATE) || mib_authed(call));
116 
117 	/*
118 	 * For immediate types, store the immediate value in the resulting
119 	 * structure, unless the caller is not authorized to obtain the value.
120 	 */
121 	if ((node->node_flags & CTLFLAG_IMMEDIATE) && visible) {
122 		switch (SYSCTL_TYPE(node->node_flags)) {
123 		case CTLTYPE_BOOL:
124 			scn.sysctl_bdata = node->node_bool;
125 			break;
126 		case CTLTYPE_INT:
127 			scn.sysctl_idata = node->node_int;
128 			break;
129 		case CTLTYPE_QUAD:
130 			scn.sysctl_qdata = node->node_quad;
131 		}
132 	}
133 
134 	/* Special rules apply to parent nodes. */
135 	if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE) {
136 		/* Report the node size the way NetBSD does, just in case. */
137 		scn.sysctl_size = sizeof(scn);
138 
139 		/*
140 		 * If this is a remote node, use the values we have of the root
141 		 * of the remote subtree.  If we did not have these values, we
142 		 * would have to call into the remote service here, which for
143 		 * reliability purposes is a bad idea.
144 		 *
145 		 * If this is a real parent node, report child information.  In
146 		 * both these cases, expose child information only if the node
147 		 * itself is accessible by the caller.
148 		 *
149 		 * If this is a function-driven node, indicate this by setting
150 		 * a nonzero function address.  This allows trace(1) to
151 		 * determine that it should not attempt to descend into this
152 		 * part of the tree as usual, because a) accessing subnodes may
153 		 * have side effects, and b) meta-identifiers may not work as
154 		 * expected in these parts of the tree.  Do not return the real
155 		 * function pointer, as this would leak anti-ASR information.
156 		 */
157 		if (node->node_flags & CTLFLAG_REMOTE) {
158 			if (visible) {
159 				scn.sysctl_csize = node->node_rcsize;
160 				scn.sysctl_clen = node->node_rclen;
161 			}
162 		} else if (node->node_flags & CTLFLAG_PARENT) {
163 			if (visible) {
164 				scn.sysctl_csize = node->node_csize;
165 				scn.sysctl_clen = node->node_clen;
166 			}
167 		} else
168 			scn.sysctl_func = SYSCTL_NODE_FN;
169 	}
170 
171 	/* Copy out the resulting node. */
172 	return mib_copyout(oldp, off, &scn, sizeof(scn));
173 }
174 
175 /*
176  * Given a query on a non-leaf (parent) node, provide the user with an array of
177  * this node's children.
178  */
179 static ssize_t
180 mib_query(struct mib_call * call, struct mib_node * parent,
181 	struct mib_oldp * oldp, struct mib_newp * newp)
182 {
183 	struct sysctlnode scn;
184 	struct mib_node *node;
185 	struct mib_dynode *dynode;
186 	size_t off;
187 	int r, id;
188 
189 	/* If the user passed in version numbers, check them. */
190 	if (newp != NULL) {
191 		if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
192 			return r;
193 
194 		if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
195 			return EINVAL;
196 
197 		/*
198 		 * If a node version number is given, it must match the version
199 		 * of the parent or the root.
200 		 */
201 		if (scn.sysctl_ver != 0 &&
202 		    scn.sysctl_ver != mib_root.node_ver &&
203 		    scn.sysctl_ver != parent->node_ver)
204 			return EINVAL;
205 	}
206 
207 	/*
208 	 * We need not return the nodes strictly in ascending order of
209 	 * identifiers, as this is not expected by userland.  For example,
210 	 * sysctlgetmibinfo(3) performs its own sorting after a query.
211 	 * Thus, we can go through the static and dynamic nodes separately.
212 	 */
213 	off = 0;
214 
215 	/* First enumerate the static nodes. */
216 	for (id = 0; IS_STATIC_ID(parent, id); id++) {
217 		node = &parent->node_scptr[id];
218 
219 		if (node->node_flags == 0)
220 			continue;
221 
222 		if ((r = mib_copyout_node(call, oldp, off, id, node)) < 0)
223 			return r;
224 		off += r;
225 	}
226 
227 	/* Then enumerate the dynamic nodes. */
228 	for (dynode = parent->node_dcptr; dynode != NULL;
229 	    dynode = dynode->dynode_next) {
230 		node = &dynode->dynode_node;
231 
232 		if ((r = mib_copyout_node(call, oldp, off, dynode->dynode_id,
233 		    node)) < 0)
234 			return r;
235 		off += r;
236 	}
237 
238 	return off;
239 }
240 
241 /*
242  * Check whether the given name buffer contains a valid node name string.  If
243  * the name is nonempty, properly terminated, and contains only acceptable
244  * characters, return the length of the string excluding null terminator.
245  * Otherwise, return zero to indicate failure.
246  */
247 static size_t
248 mib_check_name(const char * name, size_t namesize)
249 {
250 	size_t namelen;
251 	char c;
252 
253 	/* Names must be nonempty, null terminated, C symbol style strings. */
254 	for (namelen = 0; namelen < namesize; namelen++) {
255 		if ((c = name[namelen]) == '\0')
256 			break;
257 		/* A-Z, a-z, 0-9, _ only, and no digit as first character. */
258 		if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
259 		    c == '_' || (c >= '0' && c <= '9' && namelen > 0)))
260 			return 0;
261 	}
262 	if (namelen == 0 || namelen == namesize)
263 		return 0;
264 
265 	return namelen;
266 }
267 
268 /*
269  * Scan a parent node's children, as part of new node creation.  Search for
270  * either a free node identifier (if given_id < 0) or collisions with the node
271  * identifier to use (if given_id >= 0).  Also check for name collisions.  Upon
272  * success, return OK, with the resulting node identifier stored in 'idp' and a
273  * pointer to the pointer for the new dynamic node stored in 'prevpp'.  Upon
274  * failure, return an error code.  If the failure is EEXIST, 'idp' will contain
275  * the ID of the conflicting node, and 'nodep' will point to the node.
276  */
277 static int
278 mib_scan(struct mib_node * parent, int given_id, const char * name, int * idp,
279 	struct mib_dynode *** prevpp, struct mib_node ** nodep)
280 {
281 	struct mib_dynode **prevp, **dynp;
282 	struct mib_node *node;
283 	int id;
284 
285 	/*
286 	 * We must verify that no entry already exists with the given name.  In
287 	 * addition, if a nonnegative identifier is given, we should use that
288 	 * identifier and make sure it does not already exist.  Otherwise, we
289 	 * must find a free identifier.  Finally, we sort the dynamic nodes in
290 	 * ascending identifier order, so we must find the right place at which
291 	 * to insert the new node.
292 	 *
293 	 * For finding a free identifier, choose an ID that falls (well)
294 	 * outside the static range, both to avoid accidental retrieval by an
295 	 * application that uses a static ID, and to simplify verifying that
296 	 * the ID is indeed free.  The sorting of dynamic nodes by identifier
297 	 * ensures that searching for a free identifier is O(n).
298 	 *
299 	 * At this time, we do not support some NetBSD features.  We do not
300 	 * force success if the new node is sufficiently like an existing one.
301 	 * Also, we do not use global autoincrement for dynamic identifiers,
302 	 * although that could easily be changed.
303 	 */
304 
305 	/* First check the static node array, just for collisions. */
306 	for (id = 0; IS_STATIC_ID(parent, id); id++) {
307 		node = &parent->node_scptr[id];
308 		if (node->node_flags == 0)
309 			continue;
310 		if (id == given_id || !strcmp(name, node->node_name)) {
311 			*idp = id;
312 			*nodep = node;
313 			return EEXIST;
314 		}
315 	}
316 
317 	/*
318 	 * Then try to find the place to insert a new dynamic node.  At the
319 	 * same time, check for both identifier and name collisions.
320 	 */
321 	if (given_id >= 0)
322 		id = given_id;
323 	else
324 		id = MAX(CREATE_BASE, parent->node_size);
325 
326 	for (prevp = &parent->node_dcptr; *prevp != NULL;
327 	    prevp = &((*prevp)->dynode_next)) {
328 		if ((*prevp)->dynode_id > id)
329 			break;
330 		if ((*prevp)->dynode_id == id) {
331 			if (given_id >= 0) {
332 				*idp = id;
333 				*nodep = &(*prevp)->dynode_node;
334 				return EEXIST;
335 			} else
336 				id++;
337 		}
338 		if (!strcmp(name, (*prevp)->dynode_node.node_name)) {
339 			*idp = (*prevp)->dynode_id;
340 			*nodep = &(*prevp)->dynode_node;
341 			return EEXIST;
342 		}
343 	}
344 
345 	/* Finally, check the rest of the dynamic nodes for name collisions. */
346 	for (dynp = prevp; *dynp != NULL; dynp = &((*dynp)->dynode_next)) {
347 		assert((*dynp)->dynode_id > id);
348 
349 		if (!strcmp(name, (*dynp)->dynode_node.node_name)) {
350 			*idp = (*dynp)->dynode_id;
351 			*nodep = &(*dynp)->dynode_node;
352 			return EEXIST;
353 		}
354 	}
355 
356 	*idp = id;
357 	*prevpp = prevp;
358 	return OK;
359 }
360 
361 /*
362  * Copy in a string from the user process, located at the given remote address,
363  * into the given local buffer.  If no buffer is given, just compute the length
364  * of the string.  On success, return OK.  If 'sizep' is not NULL, it will be
365  * filled with the string size, including the null terminator.  If a non-NULL
366  * buffer was given, the string will be copied into the provided buffer (also
367  * including null terminator).  Return an error code on failure, which includes
368  * the case that no null terminator was found within the range of bytes that
369  * would fit in the given buffer.
370  */
371 static int
372 mib_copyin_str(struct mib_newp * __restrict newp, vir_bytes addr,
373 	char * __restrict buf, size_t bufsize, size_t * __restrict sizep)
374 {
375 	char *ptr, *endp;
376 	size_t chunk, len;
377 	int r;
378 
379 	assert(newp != NULL);
380 	assert(bufsize <= SSIZE_MAX);
381 
382 	if (addr == 0)
383 		return EINVAL;
384 
385 	/*
386 	 * NetBSD has a kernel routine for copying in a string from userland.
387 	 * MINIX3 does not, since its system call interface has always relied
388 	 * on userland passing in string lengths.  The sysctl(2) API does not
389 	 * provide the string length, and thus, we have to do a bit of guess
390 	 * work.  If we copy too little at once, performance suffers.  If we
391 	 * copy too much at once, we may trigger an unneeded page fault.  Make
392 	 * use of page boundaries to strike a balance between those two.  If we
393 	 * are requested to just get the string length, use the scratch buffer.
394 	 */
395 	len = 0;
396 
397 	while (bufsize > 0) {
398 		chunk = PAGE_SIZE - (addr % PAGE_SIZE);
399 		if (chunk > bufsize)
400 			chunk = bufsize;
401 
402 		ptr = (buf != NULL) ? &buf[len] : scratch;
403 		if ((r = mib_copyin_aux(newp, addr, ptr, chunk)) != OK)
404 			return r;
405 
406 		if ((endp = memchr(ptr, '\0', chunk)) != NULL) {
407 			/* A null terminator was found - success. */
408 			if (sizep != NULL)
409 				*sizep = len + (size_t)(endp - ptr) + 1;
410 			return OK;
411 		}
412 
413 		addr += chunk;
414 		len += chunk;
415 		bufsize -= chunk;
416 	}
417 
418 	/* No null terminator found. */
419 	return EINVAL;
420 }
421 
422 /*
423  * Increase the version of the root node, and copy this new version to all
424  * nodes on the path to the given node, including that node itself.
425  */
426 static void
427 mib_upgrade(struct mib_node * node)
428 {
429 	uint32_t ver;
430 
431 	assert(node != NULL);
432 
433 	/*
434 	 * The root node determines the version of the entire tree.  Do not use
435 	 * version number 0, as a zero version number indicates no interest in
436 	 * versions elsewhere.
437 	 */
438 
439 	ver = mib_root.node_ver + 1;
440 	if (ver == 0)
441 		ver = 1;
442 
443 	/* Copy the new version to all the nodes on the path. */
444 	do {
445 		node->node_ver = ver;
446 
447 		node = node->node_parent;
448 	} while (node != NULL);
449 }
450 
451 /*
452  * Add a new dynamically allocated node into the tree, inserting it into the
453  * linked-list position of the parent tree as given by 'prevp'.  Also update
454  * versions and counters accordingly.  This function never fails.
455  */
456 static void
457 mib_add(struct mib_dynode * dynode, struct mib_dynode ** prevp)
458 {
459 	struct mib_node *parent;
460 
461 	parent = dynode->dynode_node.node_parent;
462 	assert(parent != NULL);
463 
464 	/* Link the dynamic node into the list, in the right place. */
465 	assert(prevp != NULL);
466 	dynode->dynode_next = *prevp;
467 	*prevp = dynode;
468 
469 	/* The parent node now has one more child. */
470 	parent->node_csize++;
471 	parent->node_clen++;
472 
473 	/* There is now one more node in the tree. */
474 	mib_nodes++;
475 
476 	/*
477 	 * Bump the version of all nodes on the path to the new node, including
478 	 * the node itself.
479 	 */
480 	mib_upgrade(&dynode->dynode_node);
481 }
482 
483 /*
484  * Create a node.
485  */
486 static ssize_t
487 mib_create(struct mib_call * call, struct mib_node * parent,
488 	struct mib_oldp * oldp, struct mib_newp * newp)
489 {
490 	struct mib_dynode *dynode, **prevp;
491 	struct mib_node *node;
492 	struct sysctlnode scn;
493 	size_t namelen, size;
494 	ssize_t len;
495 	bool b;
496 	char c;
497 	int r, id;
498 
499 	/* This is a privileged operation. */
500 	if (!mib_authed(call))
501 		return EPERM;
502 
503 	/*
504 	 * The parent must not be a remote node, but this is already implied by
505 	 * the fact that we got here at all.
506 	 */
507 	assert(SYSCTL_TYPE(parent->node_flags) == CTLTYPE_NODE);
508 	assert(!(parent->node_flags & CTLFLAG_REMOTE));
509 
510 	/* The parent node must not be marked as read-only. */
511 	if (!(parent->node_flags & CTLFLAG_READWRITE))
512 		return EPERM;
513 
514 	/*
515 	 * Has the parent reached its child node limit?  This check is entirely
516 	 * theoretical as long as we support only 32-bit virtual memory.
517 	 */
518 	if (parent->node_csize == INT_MAX)
519 		return EINVAL;
520 	assert(parent->node_clen <= parent->node_csize);
521 
522 	/* The caller must supply information on the child node to create. */
523 	if (newp == NULL)
524 		return EINVAL;
525 
526 	if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
527 		return r;
528 
529 	/*
530 	 * We perform as many checks as possible before we start allocating
531 	 * memory.  Then again, after allocation, copying in data may still
532 	 * fail.  Unlike when setting values, we do not first copy data into a
533 	 * temporary buffer here, because we do not need to: if the copy fails,
534 	 * the entire create operation fails, so atomicity is not an issue.
535 	 */
536 	if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
537 		return EINVAL;
538 
539 	/*
540 	 * If a node version number is given, it must match the version of
541 	 * either the parent or the root node.  The given version number is
542 	 * *not* used for the node being created.
543 	 */
544 	if (scn.sysctl_ver != 0 && scn.sysctl_ver != mib_root.node_ver &&
545 	    scn.sysctl_ver != parent->node_ver)
546 		return EINVAL;
547 
548 	/*
549 	 * Validate the node flags.  In addition to the NetBSD-allowed flags,
550 	 * we also allow UNSIGNED, and leave its interpretation to userland.
551 	 */
552 	if (SYSCTL_FLAGS(scn.sysctl_flags) &
553 	    ~(SYSCTL_USERFLAGS | CTLFLAG_UNSIGNED))
554 		return EINVAL;
555 
556 	if (!(scn.sysctl_flags & CTLFLAG_IMMEDIATE)) {
557 		/*
558 		 * Without either IMMEDIATE or OWNDATA, data pointers are
559 		 * actually kernel addresses--a concept we do not support.
560 		 * Otherwise, if IMMEDIATE is not set, we are going to have to
561 		 * allocate extra memory for the data, so force OWNDATA to be.
562 		 * set.  Node-type nodes have no data, though.
563 		 */
564 		if (SYSCTL_TYPE(scn.sysctl_flags) != CTLTYPE_NODE) {
565 			if (!(scn.sysctl_flags & CTLFLAG_OWNDATA) &&
566 			    scn.sysctl_data != NULL)
567 				return EINVAL; /* not meaningful on MINIX3 */
568 
569 			scn.sysctl_flags |= CTLFLAG_OWNDATA;
570 		}
571 	} else if (scn.sysctl_flags & CTLFLAG_OWNDATA)
572 		return EINVAL;
573 
574 	/* The READWRITE flag consists of multiple bits.  Sanitize. */
575 	if (scn.sysctl_flags & CTLFLAG_READWRITE)
576 		scn.sysctl_flags |= CTLFLAG_READWRITE;
577 
578 	/* Validate the node type and size, and do some additional checks. */
579 	switch (SYSCTL_TYPE(scn.sysctl_flags)) {
580 	case CTLTYPE_BOOL:
581 		if (scn.sysctl_size != sizeof(bool))
582 			return EINVAL;
583 		break;
584 	case CTLTYPE_INT:
585 		if (scn.sysctl_size != sizeof(int))
586 			return EINVAL;
587 		break;
588 	case CTLTYPE_QUAD:
589 		if (scn.sysctl_size != sizeof(u_quad_t))
590 			return EINVAL;
591 		break;
592 	case CTLTYPE_STRING:
593 		/*
594 		 * For strings, a zero length means that we are supposed to
595 		 * allocate a buffer size based on the given string size.
596 		 */
597 		if (scn.sysctl_size == 0 && scn.sysctl_data != NULL) {
598 			if ((r = mib_copyin_str(newp,
599 			    (vir_bytes)scn.sysctl_data, NULL, SSIZE_MAX,
600 			    &size)) != OK)
601 				return r;
602 			scn.sysctl_size = size;
603 		}
604 		/* FALLTHROUGH */
605 	case CTLTYPE_STRUCT:
606 		/*
607 		 * We do not set an upper size on the data size, since it would
608 		 * still be possible to create a large number of nodes, and
609 		 * this is a privileged operation ayway.
610 		 */
611 		if (scn.sysctl_size == 0 || scn.sysctl_size > SSIZE_MAX)
612 			return EINVAL;
613 		if (scn.sysctl_flags & CTLFLAG_IMMEDIATE)
614 			return EINVAL;
615 		break;
616 	case CTLTYPE_NODE:
617 		/*
618 		 * The zero size is an API requirement, but we also rely on the
619 		 * zero value internally, as the node has no static children.
620 		 */
621 		if (scn.sysctl_size != 0)
622 			return EINVAL;
623 		if (scn.sysctl_flags & (CTLFLAG_IMMEDIATE | CTLFLAG_OWNDATA))
624 			return EINVAL;
625 		if (scn.sysctl_csize != 0 || scn.sysctl_clen != 0 ||
626 		    scn.sysctl_child != NULL)
627 			return EINVAL;
628 		break;
629 	default:
630 		return EINVAL;
631 	}
632 
633 	if (scn.sysctl_func != NULL || scn.sysctl_parent != NULL)
634 		return EINVAL;
635 
636 	/* Verify that the given name is valid, and get its string length. */
637 	namelen = mib_check_name(scn.sysctl_name, sizeof(scn.sysctl_name));
638 
639 	if (namelen == 0)
640 		return EINVAL;
641 
642 	/*
643 	 * Find a free identifier, or check for ID collisions if a specific
644 	 * identifier was given.  At the same time, scan for name collisions,
645 	 * and find the location at which to insert the new node in the list.
646 	 */
647 	r = mib_scan(parent, scn.sysctl_num, scn.sysctl_name, &id, &prevp,
648 	    &node);
649 
650 	if (r != OK) {
651 		/*
652 		 * On collisions, if requested, copy out the existing node.
653 		 * This does not quite fit the general interaction model, as
654 		 * the service must now return a nonzero old length from a call
655 		 * that actually failed (in contrast to ENOMEM failures).
656 		 */
657 		if (r == EEXIST && oldp != NULL) {
658 			len = mib_copyout_node(call, oldp, 0, id, node);
659 
660 			if (len > 0)
661 				mib_setoldlen(call, len);
662 		}
663 
664 		return r;
665 	}
666 
667 	/*
668 	 * All checks so far have passed.  "id" now contains the new node
669 	 * identifier, and "prevp" points to the pointer at which to insert the
670 	 * new node in its parent's linked list of dynamic nodes.
671 	 *
672 	 * We can now attempt to create and initialize a new dynamic node.
673 	 * Allocating nodes this way may cause heavy memory fragmentation over
674 	 * time, but we do not expect the tree to see heavy modification at run
675 	 * time, and the superuser has easier ways to get the MIB service in
676 	 * trouble.  We note that even in low-memory conditions, the MIB
677 	 * service is always able to provide basic functionality.
678 	 */
679 	size = sizeof(*dynode) + namelen;
680 	if (!(scn.sysctl_flags & CTLFLAG_IMMEDIATE))
681 		size += scn.sysctl_size;
682 
683 	if ((dynode = malloc(size)) == NULL)
684 		return EINVAL; /* do not return ENOMEM */
685 	mib_objects++;
686 
687 	/* From here on, we have to free "dynode" before returning an error. */
688 	r = OK;
689 
690 	memset(dynode, 0, sizeof(*dynode)); /* no need to zero all of "size" */
691 	dynode->dynode_id = id;
692 	strlcpy(dynode->dynode_name, scn.sysctl_name, namelen + 1);
693 
694 	node = &dynode->dynode_node;
695 	node->node_flags = scn.sysctl_flags & ~SYSCTL_VERS_MASK;
696 	if (SYSCTL_TYPE(scn.sysctl_flags) == CTLTYPE_NODE)
697 		node->node_flags |= CTLFLAG_PARENT;
698 	node->node_size = scn.sysctl_size;
699 	node->node_parent = parent;
700 	node->node_name = dynode->dynode_name;
701 
702 	/* Initialize the node value. */
703 	if (scn.sysctl_flags & CTLFLAG_IMMEDIATE) {
704 		switch (SYSCTL_TYPE(scn.sysctl_flags)) {
705 		case CTLTYPE_BOOL:
706 			/* Sanitize booleans.  See the C99 _Bool comment. */
707 			memcpy(&c, &scn.sysctl_bdata, sizeof(c));
708 			node->node_bool = (bool)c;
709 			break;
710 		case CTLTYPE_INT:
711 			node->node_int = scn.sysctl_idata;
712 			break;
713 		case CTLTYPE_QUAD:
714 			node->node_quad = scn.sysctl_qdata;
715 			break;
716 		default:
717 			assert(0);
718 		}
719 	} else if (SYSCTL_TYPE(scn.sysctl_flags) != CTLTYPE_NODE) {
720 		node->node_data = dynode->dynode_name + namelen + 1;
721 
722 		/* Did the user supply initial data?  If not, use zeroes. */
723 		if (scn.sysctl_data != NULL) {
724 			/*
725 			 * For strings, do not copy in more than needed.  This
726 			 * is just a nice feature which allows initialization
727 			 * of large string buffers with short strings.
728 			 */
729 			if (SYSCTL_TYPE(scn.sysctl_flags) == CTLTYPE_STRING)
730 				r = mib_copyin_str(newp,
731 				    (vir_bytes)scn.sysctl_data,
732 				    node->node_data, scn.sysctl_size, NULL);
733 			else
734 				r = mib_copyin_aux(newp,
735 				    (vir_bytes)scn.sysctl_data,
736 				    node->node_data, scn.sysctl_size);
737 		} else
738 			memset(node->node_data, 0, scn.sysctl_size);
739 
740 		/*
741 		 * Sanitize booleans.  See the C99 _Bool comment elsewhere.
742 		 * In this case it is not as big of a deal, as we will not be
743 		 * accessing the boolean value directly ourselves.
744 		 */
745 		if (r == OK && SYSCTL_TYPE(scn.sysctl_flags) == CTLTYPE_BOOL) {
746 			b = (bool)*(char *)node->node_data;
747 			memcpy(node->node_data, &b, sizeof(b));
748 		}
749 	}
750 
751 	/*
752 	 * Even though it would be entirely possible to set a description right
753 	 * away as well, this does not seem to be supported on NetBSD at all.
754 	 */
755 
756 	/* Deal with earlier failures now. */
757 	if (r != OK) {
758 		free(dynode);
759 		mib_objects--;
760 
761 		return r;
762 	}
763 
764 	/*
765 	 * At this point, actual creation can no longer fail.  Add the node
766 	 * into the tree, and update versions and counters.
767 	 */
768 	mib_add(dynode, prevp);
769 
770 	/*
771 	 * Copy out the newly created node as resulting ("old") data.  Do not
772 	 * undo the creation if this fails, though.
773 	 */
774 	return mib_copyout_node(call, oldp, 0, id, node);
775 }
776 
777 /*
778  * Remove the given node from the tree.  If 'prevp' is NULL, the node is a
779  * static node which should be zeroed out.  If 'prevp' is not NULL, the node is
780  * a dynamic node which should be freed; 'prevp' will then point to the pointer
781  * to its dynode container.  Also update versions and counters as appropriate.
782  * This function never fails.
783  */
784 static void
785 mib_remove(struct mib_node * node, struct mib_dynode ** prevp)
786 {
787 	struct mib_dynode *dynode;
788 	struct mib_node *parent;
789 
790 	parent = node->node_parent;
791 	assert(parent != NULL);
792 
793 	/* If the description was allocated, free it. */
794 	if (node->node_flags & CTLFLAG_OWNDESC) {
795 		free(__UNCONST(node->node_desc));
796 		mib_objects--;
797 	}
798 
799 	/*
800 	 * Static nodes only use static memory, and dynamic nodes have the data
801 	 * area embedded in the dynode object.  In neither case is data memory
802 	 * allocated separately, and thus, it need never be freed separately.
803 	 * Therefore we *must not* check CTLFLAG_OWNDATA here.
804 	 */
805 
806 	assert(parent->node_csize > 0);
807 	assert(parent->node_clen > 0);
808 
809 	/*
810 	 * Dynamic nodes must be freed.  Freeing the dynode object also frees
811 	 * the node name and any associated data.  Static nodes are zeroed out,
812 	 * and the static memory they referenced will become inaccessible.
813 	 */
814 	if (prevp != NULL) {
815 		dynode = *prevp;
816 		*prevp = dynode->dynode_next;
817 
818 		assert(node == &dynode->dynode_node);
819 
820 		free(dynode);
821 		mib_objects--;
822 
823 		parent->node_csize--;
824 	} else
825 		memset(node, 0, sizeof(*node));
826 
827 	parent->node_clen--;
828 
829 	mib_nodes--;
830 
831 	/* Bump the version of all nodes on the path to the destroyed node. */
832 	mib_upgrade(parent);
833 }
834 
835 /*
836  * Destroy a node.
837  */
838 static ssize_t
839 mib_destroy(struct mib_call * call, struct mib_node * parent,
840 	struct mib_oldp * oldp, struct mib_newp * newp)
841 {
842 	struct mib_dynode **prevp;
843 	struct mib_node *node;
844 	struct sysctlnode scn;
845 	ssize_t r;
846 
847 	/* This is a privileged operation. */
848 	if (!mib_authed(call))
849 		return EPERM;
850 
851 	/* The parent node must not be marked as read-only. */
852 	if (!(parent->node_flags & CTLFLAG_READWRITE))
853 		return EPERM;
854 
855 	/* The caller must specify which child node to destroy. */
856 	if (newp == NULL)
857 		return EINVAL;
858 
859 	if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
860 		return r;
861 
862 	if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
863 		return EINVAL;
864 
865 	/* Locate the child node. */
866 	if ((node = mib_find(parent, scn.sysctl_num, &prevp)) == NULL)
867 		return ENOENT;
868 
869 	/* The node must not be marked as permanent. */
870 	if (node->node_flags & CTLFLAG_PERMANENT)
871 		return EPERM;
872 
873 	/* For node-type nodes, extra rules apply. */
874 	if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE) {
875 		/* The node must not be a mount point. */
876 		if (node->node_flags & CTLFLAG_REMOTE)
877 			return EBUSY;
878 
879 		/* The node must not have an associated function. */
880 		if (!(node->node_flags & CTLFLAG_PARENT))
881 			return EPERM;
882 
883 		/* The target node must itself not have child nodes. */
884 		if (node->node_clen != 0)
885 			return ENOTEMPTY;
886 	}
887 
888 	/* If the user supplied a version, it must match the node version. */
889 	if (scn.sysctl_ver != 0 && scn.sysctl_ver != node->node_ver)
890 		return EINVAL;	/* NetBSD inconsistently throws ENOENT here */
891 
892 	/* If the user supplied a name, it must match the node name. */
893 	if (scn.sysctl_name[0] != '\0') {
894 		if (memchr(scn.sysctl_name, '\0',
895 		    sizeof(scn.sysctl_name)) == NULL)
896 			return EINVAL;
897 		if (strcmp(scn.sysctl_name, node->node_name))
898 			return EINVAL;	/* also ENOENT on NetBSD */
899 	}
900 
901 	/*
902 	 * Copy out the old node if requested, otherwise return the length
903 	 * anyway.  The node will be destroyed even if this call fails,
904 	 * because that is how NetBSD behaves.
905 	 */
906 	r = mib_copyout_node(call, oldp, 0, scn.sysctl_num, node);
907 
908 	/*
909 	 * Remove the node from the tree.  The procedure depends on whether the
910 	 * node is static (prevp == NULL) or dynamic (prevp != NULL).  Also
911 	 * update versions and counters.
912 	 */
913 	mib_remove(node, prevp);
914 
915 	return r;
916 }
917 
918 /*
919  * Copy out a node description to userland, using the exchange format for node
920  * descriptions (namely, a sysctldesc structure).  Return the size of the
921  * object that is (or, if the description falls outside the requested data
922  * range, would be) copied out on success, or a negative error code on failure.
923  * The function may return 0 to indicate that nothing was copied out after all.
924  */
925 static ssize_t
926 mib_copyout_desc(struct mib_call * call, struct mib_oldp * oldp, size_t off,
927 	int id, const struct mib_node * node)
928 {
929 	struct sysctldesc *scd;
930 	size_t size;
931 	int r;
932 
933 	/* Descriptions of private nodes are considered private too. */
934 	if ((node->node_flags & CTLFLAG_PRIVATE) && !mib_authed(call))
935 		return 0;
936 
937 	/* The description length includes the null terminator. */
938 	if (node->node_desc != NULL)
939 		size = strlen(node->node_desc) + 1;
940 	else
941 		size = 1;
942 
943 	assert(sizeof(*scd) + size <= sizeof(scratch));
944 
945 	scd = (struct sysctldesc *)scratch;
946 	memset(scd, 0, sizeof(*scd));
947 	scd->descr_num = id;
948 	scd->descr_ver = node->node_ver;
949 	scd->descr_len = size;
950 	if (node->node_desc != NULL)
951 		strlcpy(scd->descr_str, node->node_desc,
952 		    sizeof(scratch) - sizeof(*scd));
953 	else
954 		scd->descr_str[0] = '\0';
955 
956 	size += offsetof(struct sysctldesc, descr_str);
957 
958 	if ((r = mib_copyout(oldp, off, scratch, size)) < 0)
959 		return r;
960 
961 	/*
962 	 * By aligning just the size, we may leave garbage between the entries
963 	 * copied out, which is fine because it is userland's own data.
964 	 */
965 	return roundup2(size, sizeof(int32_t));
966 }
967 
968 /*
969  * Retrieve node descriptions in bulk, or retrieve or assign a particular
970  * node's description.
971  */
972 static ssize_t
973 mib_describe(struct mib_call * call, struct mib_node * parent,
974 	struct mib_oldp * oldp, struct mib_newp * newp)
975 {
976 	struct sysctlnode scn;
977 	struct mib_node *node;
978 	struct mib_dynode *dynode;
979 	size_t off;
980 	int r, id;
981 
982 	/* If new data are given, they identify a particular target node. */
983 	if (newp != NULL) {
984 		if ((r = mib_copyin(newp, &scn, sizeof(scn))) != OK)
985 			return r;
986 
987 		if (SYSCTL_VERS(scn.sysctl_flags) != SYSCTL_VERSION)
988 			return EINVAL;
989 
990 		/* Locate the child node. */
991 		if ((node = mib_find(parent, scn.sysctl_num, NULL)) == NULL)
992 			return ENOENT;
993 
994 		/* Descriptions of private nodes are considered private too. */
995 		if ((node->node_flags & CTLFLAG_PRIVATE) && !mib_authed(call))
996 			return EPERM;
997 
998 		/*
999 		 * If a description pointer was given, this is a request to
1000 		 * set the node's description.
1001 		 */
1002 		if (scn.sysctl_desc != NULL) {
1003 			/* Such a request requires superuser privileges. */
1004 			if (!mib_authed(call))
1005 				return EPERM;
1006 
1007 			/*
1008 			 * The node must not be a mount point.  Arguably this
1009 			 * check is not necessary, since we use the description
1010 			 * of the preexisting underlying node anyway.
1011 			 */
1012 			if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE &&
1013 			    (node->node_flags & CTLFLAG_REMOTE))
1014 				return EBUSY;
1015 
1016 			/* The node must not already have a description. */
1017 			if (node->node_desc != NULL)
1018 				return EPERM;
1019 
1020 			/* The node must not be marked as permanent. */
1021 			if (node->node_flags & CTLFLAG_PERMANENT)
1022 				return EPERM;
1023 
1024 			/*
1025 			 * If the user supplied a version, it must match.
1026 			 * NetBSD performs this check only when setting
1027 			 * descriptions, and thus, so do we..
1028 			 */
1029 			if (scn.sysctl_ver != 0 &&
1030 			    scn.sysctl_ver != node->node_ver)
1031 				return EINVAL;
1032 
1033 			/*
1034 			 * Copy in the description to the scratch buffer.
1035 			 * The length of the description must be reasonable.
1036 			 */
1037 			if ((r = mib_copyin_str(newp,
1038 			    (vir_bytes)scn.sysctl_desc, scratch, MAXDESCLEN,
1039 			    NULL)) != OK)
1040 				return r;
1041 
1042 			/* Allocate memory and store the description. */
1043 			if ((node->node_desc = strdup(scratch)) == NULL) {
1044 				printf("MIB: out of memory!\n");
1045 
1046 				return EINVAL; /* do not return ENOMEM */
1047 			}
1048 			mib_objects++;
1049 
1050 			/* The description must now be freed with the node. */
1051 			node->node_flags |= CTLFLAG_OWNDESC;
1052 		}
1053 
1054 		/*
1055 		 * Either way, copy out the requested node's description, which
1056 		 * should indeed be the new description if one was just set.
1057 		 * Note that we have already performed the permission check
1058 		 * that could make this call return zero, so here it will not.
1059 		 */
1060 		return mib_copyout_desc(call, oldp, 0, scn.sysctl_num, node);
1061 	}
1062 
1063 	/* See also the considerations laid out in mib_query(). */
1064 	off = 0;
1065 
1066 	/* First describe the static nodes. */
1067 	for (id = 0; IS_STATIC_ID(parent, id); id++) {
1068 		node = &parent->node_scptr[id];
1069 
1070 		if (node->node_flags == 0)
1071 			continue;
1072 
1073 		if ((r = mib_copyout_desc(call, oldp, off, id, node)) < 0)
1074 			return r;
1075 		off += r;
1076 	}
1077 
1078 	/* Then describe the dynamic nodes. */
1079 	for (dynode = parent->node_dcptr; dynode != NULL;
1080 	    dynode = dynode->dynode_next) {
1081 		node = &dynode->dynode_node;
1082 
1083 		if ((r = mib_copyout_desc(call, oldp, off, dynode->dynode_id,
1084 		    node)) < 0)
1085 			return r;
1086 		off += r;
1087 	}
1088 
1089 	return off;
1090 }
1091 
1092 /*
1093  * Return a pointer to the data associated with the given node, or NULL if the
1094  * node has no associated data.  Actual calls to this function should never
1095  * result in NULL - as long as the proper rules are followed elsewhere.
1096  */
1097 static void *
1098 mib_getptr(struct mib_node * node)
1099 {
1100 
1101 	switch (SYSCTL_TYPE(node->node_flags)) {
1102 	case CTLTYPE_BOOL:
1103 		if (node->node_flags & CTLFLAG_IMMEDIATE)
1104 			return &node->node_bool;
1105 		break;
1106 	case CTLTYPE_INT:
1107 		if (node->node_flags & CTLFLAG_IMMEDIATE)
1108 			return &node->node_int;
1109 		break;
1110 	case CTLTYPE_QUAD:
1111 		if (node->node_flags & CTLFLAG_IMMEDIATE)
1112 			return &node->node_quad;
1113 		break;
1114 	case CTLTYPE_STRING:
1115 	case CTLTYPE_STRUCT:
1116 		if (node->node_flags & CTLFLAG_IMMEDIATE)
1117 			return NULL;
1118 		break;
1119 	default:
1120 		return NULL;
1121 	}
1122 
1123 	return node->node_data;
1124 }
1125 
1126 /*
1127  * Read current (old) data from a regular data node, if requested.  Return the
1128  * old data length.
1129  */
1130 static ssize_t
1131 mib_read(struct mib_node * node, struct mib_oldp * oldp)
1132 {
1133 	void *ptr;
1134 	size_t oldlen;
1135 	int r;
1136 
1137 	if ((ptr = mib_getptr(node)) == NULL)
1138 		return EINVAL;
1139 
1140 	if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_STRING)
1141 		oldlen = strlen(node->node_data) + 1;
1142 	else
1143 		oldlen = node->node_size;
1144 
1145 	if (oldlen > SSIZE_MAX)
1146 		return EINVAL;
1147 
1148 	/* Copy out the current data, if requested at all. */
1149 	if (oldp != NULL && (r = mib_copyout(oldp, 0, ptr, oldlen)) < 0)
1150 		return r;
1151 
1152 	/* Return the current length in any case. */
1153 	return (ssize_t)oldlen;
1154 }
1155 
1156 /*
1157  * Write new data into a regular data node, if requested.
1158  */
1159 static int
1160 mib_write(struct mib_call * call, struct mib_node * node,
1161 	struct mib_newp * newp, mib_verify_ptr verify)
1162 {
1163 	bool b[(sizeof(bool) == sizeof(char)) ? 1 : -1]; /* explained below */
1164 	char *src, *dst;
1165 	size_t newlen;
1166 	int r;
1167 
1168 	if (newp == NULL)
1169 		return OK; /* nothing to do */
1170 
1171 	/*
1172 	 * When setting a new value, we cannot risk doing an in-place update:
1173 	 * the copy from userland may fail halfway through, in which case an
1174 	 * in-place update could leave the node value in a corrupted state.
1175 	 * Thus, we must first fetch any new data into a temporary buffer.
1176 	 *
1177 	 * Given that we use intermediate data storage, we could support value
1178 	 * swapping, where the user provides the same buffer for new and old
1179 	 * data.  We choose not to: NetBSD does not support it, it would make
1180 	 * trace(1)'s job a lot harder, and it would convolute the code here.
1181 	 */
1182 	newlen = mib_getnewlen(newp);
1183 
1184 	if ((dst = mib_getptr(node)) == NULL)
1185 		return EINVAL;
1186 
1187 	switch (SYSCTL_TYPE(node->node_flags)) {
1188 	case CTLTYPE_BOOL:
1189 	case CTLTYPE_INT:
1190 	case CTLTYPE_QUAD:
1191 	case CTLTYPE_STRUCT:
1192 		/* Non-string types must have an exact size match. */
1193 		if (newlen != node->node_size)
1194 			return EINVAL;
1195 		break;
1196 	case CTLTYPE_STRING:
1197 		/*
1198 		 * Strings must not exceed their buffer size.  There is a
1199 		 * second check further below, because we allow userland to
1200 		 * give us an unterminated string.  In that case we terminate
1201 		 * it ourselves, but then the null terminator must fit as well.
1202 		 */
1203 		if (newlen > node->node_size)
1204 			return EINVAL;
1205 		break;
1206 	default:
1207 		return EINVAL;
1208 	}
1209 
1210 	/*
1211 	 * If we cannot fit the data in the scratch buffer, then allocate a
1212 	 * temporary buffer.  We add one extra byte so that we can add a null
1213 	 * terminator at the end of strings in case userland did not supply
1214 	 * one.  Either way, we must free the temporary buffer later!
1215 	 *
1216 	 * The alternative is to ensure that the given memory is accessible
1217 	 * before starting the copy, but that would break if we ever add kernel
1218 	 * threads or anything that allows asynchronous memory unmapping, etc.
1219 	 */
1220 	if (newlen + 1 > sizeof(scratch)) {
1221 		/*
1222 		 * In practice, the temporary buffer is at least an entire
1223 		 * memory page, which is reasonable by any standard.  As a
1224 		 * result, we can get away with refusing to perform dynamic
1225 		 * allocation for unprivileged users.  This limits the impact
1226 		 * that unprivileged users can have on our memory space.
1227 		 */
1228 		if (!mib_authed(call))
1229 			return EPERM;
1230 
1231 		/*
1232 		 * Do not return ENOMEM on allocation failure, because ENOMEM
1233 		 * implies that a valid old length was returned.
1234 		 */
1235 		if ((src = malloc(newlen + 1)) == NULL) {
1236 			printf("MIB: out of memory!\n");
1237 
1238 			return EINVAL;
1239 		}
1240 		mib_objects++;
1241 	} else
1242 		src = scratch;
1243 
1244 	/* Copy in the data.  Note that newlen may be zero. */
1245 	r = mib_copyin(newp, src, newlen);
1246 
1247 	if (r == OK && verify != NULL && !verify(call, node, src, newlen))
1248 		r = EINVAL;
1249 
1250 	if (r == OK) {
1251 		/* Check and, if acceptable, store the new value. */
1252 		switch (SYSCTL_TYPE(node->node_flags)) {
1253 		case CTLTYPE_BOOL:
1254 			/*
1255 			 * Due to the nature of the C99 _Bool type, we can not
1256 			 * test directly whether the given boolean value is a
1257 			 * value that is not "true" and not "false".  In the
1258 			 * worst case, another value could invoke undefined
1259 			 * behavior.  We try our best to sanitize the value
1260 			 * without looking at it directly, which unfortunately
1261 			 * requires us to test for the size of the bool type.
1262 			 * We do that at compile time, hence the 'b' "array".
1263 			 * Any size other than one byte is an ABI violation.
1264 			 */
1265 			b[0] = (bool)src[0];
1266 			memcpy(dst, &b[0], sizeof(b[0]));
1267 			break;
1268 		case CTLTYPE_INT:
1269 		case CTLTYPE_QUAD:
1270 		case CTLTYPE_STRUCT:
1271 			memcpy(dst, src, node->node_size);
1272 			break;
1273 		case CTLTYPE_STRING:
1274 			if (newlen == node->node_size &&
1275 			    src[newlen - 1] != '\0') {
1276 				/* Our null terminator does not fit! */
1277 				r = EINVAL;
1278 				break;
1279 			}
1280 			/*
1281 			 * We do not mind null characters in the middle.  In
1282 			 * general, the buffer may contain garbage after the
1283 			 * first null terminator, but such garbage will never
1284 			 * end up being copied out.
1285 			 */
1286 			src[newlen] = '\0';
1287 			strlcpy(dst, src, node->node_size);
1288 			break;
1289 		default:
1290 			r = EINVAL;
1291 		}
1292 	}
1293 
1294 	if (src != scratch) {
1295 		free(src);
1296 		mib_objects--;
1297 	}
1298 
1299 	return r;
1300 }
1301 
1302 /*
1303  * Read and/or write the value of a regular data node.  A regular data node is
1304  * a leaf node.  Typically, a leaf node has no associated function, in which
1305  * case this function will be used instead.  In addition, this function may be
1306  * used from handler functions as part of their functionality.
1307  */
1308 ssize_t
1309 mib_readwrite(struct mib_call * call, struct mib_node * node,
1310 	struct mib_oldp * oldp, struct mib_newp * newp, mib_verify_ptr verify)
1311 {
1312 	ssize_t len;
1313 	int r;
1314 
1315 	/* Copy out old data, if requested.  Always get the old data length. */
1316 	if ((r = len = mib_read(node, oldp)) < 0)
1317 		return r;
1318 
1319 	/* Copy in new data, if requested. */
1320 	if ((r = mib_write(call, node, newp, verify)) != OK)
1321 		return r;
1322 
1323 	/* Return the old data length. */
1324 	return len;
1325 }
1326 
1327 /*
1328  * Dispatch a sysctl call, by looking up the target node by its MIB name and
1329  * taking the appropriate action on the resulting node, if found.  Return the
1330  * old data length on success, or a negative error code on failure.
1331  */
1332 ssize_t
1333 mib_dispatch(struct mib_call * call, struct mib_oldp * oldp,
1334 	struct mib_newp * newp)
1335 {
1336 	struct mib_node *parent, *node;
1337 	ssize_t r;
1338 	int id, is_leaf, can_restart, has_verify, has_func;
1339 
1340 	assert(call->call_namelen <= CTL_MAXNAME);
1341 
1342 	/*
1343 	 * Resolve the name by descending into the node tree, level by level,
1344 	 * starting at the MIB root.
1345 	 */
1346 	for (parent = &mib_root; call->call_namelen > 0; parent = node) {
1347 		id = call->call_name[0];
1348 		call->call_name++;
1349 		call->call_namelen--;
1350 
1351 		assert(SYSCTL_TYPE(parent->node_flags) == CTLTYPE_NODE);
1352 		assert(parent->node_flags & CTLFLAG_PARENT);
1353 
1354 		/*
1355 		 * Check for meta-identifiers.  Regular identifiers are never
1356 		 * negative, although node handler functions may take subpaths
1357 		 * with negative identifiers that are not meta-identifiers
1358 		 * (e.g., see KERN_PROC2).
1359 		 */
1360 		if (id < 0) {
1361 			/*
1362 			 * A meta-identifier must always be the last name
1363 			 * component.
1364 			 */
1365 			if (call->call_namelen > 0)
1366 				return EINVAL;
1367 
1368 			switch (id) {
1369 			case CTL_QUERY:
1370 				return mib_query(call, parent, oldp, newp);
1371 			case CTL_CREATE:
1372 				return mib_create(call, parent, oldp, newp);
1373 			case CTL_DESTROY:
1374 				return mib_destroy(call, parent, oldp, newp);
1375 			case CTL_DESCRIBE:
1376 				return mib_describe(call, parent, oldp, newp);
1377 			case CTL_CREATESYM:
1378 			case CTL_MMAP:
1379 			default:
1380 				return EOPNOTSUPP;
1381 			}
1382 		}
1383 
1384 		/* Locate the child node. */
1385 		if ((node = mib_find(parent, id, NULL /*prevp*/)) == NULL)
1386 			return ENOENT;
1387 
1388 		/* Check if access is permitted at this level. */
1389 		if ((node->node_flags & CTLFLAG_PRIVATE) && !mib_authed(call))
1390 			return EPERM;
1391 
1392 		/*
1393 		 * Start by checking if the node is a remote node.  If so, let
1394 		 * a remote service handle the remainder of this request.
1395 		 * However, as part of attempting the remote call, we may
1396 		 * discover that the remote service has died or that it is
1397 		 * unmounting the subtree.  If the node was not a temporary
1398 		 * mountpoint, we should (and do) continue with the request
1399 		 * locally - if it was, it will already be deallocated and we
1400 		 * must be very careful not to access 'node' again!
1401 		 */
1402 		is_leaf = (SYSCTL_TYPE(node->node_flags) != CTLTYPE_NODE);
1403 
1404 		if (!is_leaf && (node->node_flags & CTLFLAG_REMOTE)) {
1405 			/* Determine this before 'node' may disappear.. */
1406 			can_restart = (node->node_flags & CTLFLAG_PARENT);
1407 
1408 			r = mib_remote_call(call, node, oldp, newp);
1409 
1410 			if (r != ERESTART || !can_restart)
1411 				return (r != ERESTART) ? r : ENOENT;
1412 
1413 			/* Service died, subtree is unmounted, keep going. */
1414 			assert(SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE);
1415 			assert(!(node->node_flags & CTLFLAG_REMOTE));
1416 		}
1417 
1418 		/*
1419 		 * Is this a leaf node, and/or is this node handled by a
1420 		 * function?  If either is true, resolution ends at this level.
1421 		 * In order to save a few bytes of memory per node, we use
1422 		 * different ways to determine whether there is a function
1423 		 * depending on whether the node is a leaf or not.
1424 		 */
1425 		if (is_leaf) {
1426 			has_verify = (node->node_flags & CTLFLAG_VERIFY);
1427 			has_func = (!has_verify && node->node_func != NULL);
1428 		} else {
1429 			has_verify = FALSE;
1430 			has_func = !(node->node_flags & CTLFLAG_PARENT);
1431 		}
1432 
1433 		/*
1434 		 * The name may be longer only if the node is not a leaf.  That
1435 		 * also applies to leaves with functions, so check this first.
1436 		 */
1437 		if (is_leaf && call->call_namelen > 0)
1438 			return ENOTDIR;
1439 
1440 		/*
1441 		 * If resolution indeed ends here, and the user supplied new
1442 		 * data, check if writing is allowed.  For functions, it is
1443 		 * arguable whether we should do this check here already.
1444 		 * However, for now, this approach covers all our use cases.
1445 		 */
1446 		if ((is_leaf || has_func) && newp != NULL) {
1447 			if (!(node->node_flags & CTLFLAG_READWRITE))
1448 				return EPERM;
1449 
1450 			/*
1451 			 * Unless nonprivileged users may write to this node,
1452 			 * ensure that the user has superuser privileges.  The
1453 			 * ANYWRITE flag does not override the READWRITE flag.
1454 			 */
1455 			if (!(node->node_flags & CTLFLAG_ANYWRITE) &&
1456 			    !mib_authed(call))
1457 				return EPERM;
1458 		}
1459 
1460 		/* If this node has a handler function, let it do the work. */
1461 		if (has_func)
1462 			return node->node_func(call, node, oldp, newp);
1463 
1464 		/* For regular data leaf nodes, handle generic access. */
1465 		if (is_leaf)
1466 			return mib_readwrite(call, node, oldp, newp,
1467 			    has_verify ? node->node_verify : NULL);
1468 
1469 		/* No function and not a leaf?  Descend further. */
1470 	}
1471 
1472 	/* If we get here, the name refers to a node array. */
1473 	return EISDIR;
1474 }
1475 
1476 /*
1477  * Recursively initialize the static tree at initialization time.
1478  */
1479 static void
1480 mib_tree_recurse(struct mib_node * parent)
1481 {
1482 	struct mib_node *node;
1483 	int id;
1484 
1485 	assert(SYSCTL_TYPE(parent->node_flags) == CTLTYPE_NODE);
1486 	assert(parent->node_flags & CTLFLAG_PARENT);
1487 
1488 	/*
1489 	 * Later on, node_csize and node_clen will also include dynamically
1490 	 * created nodes.  This means that we cannot use node_csize to iterate
1491 	 * over the static nodes.
1492 	 */
1493 	parent->node_csize = parent->node_size;
1494 
1495 	node = parent->node_scptr;
1496 
1497 	for (id = 0; IS_STATIC_ID(parent, id); id++, node++) {
1498 		if (node->node_flags == 0)
1499 			continue;
1500 
1501 		mib_nodes++;
1502 
1503 		parent->node_clen++;
1504 
1505 		node->node_ver = parent->node_ver;
1506 		node->node_parent = parent;
1507 
1508 		/* Recursively apply this function to all node children. */
1509 		if (SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE &&
1510 		    (node->node_flags & CTLFLAG_PARENT))
1511 			mib_tree_recurse(node);
1512 	}
1513 }
1514 
1515 /*
1516  * Go through the entire static tree, recursively, initializing some values
1517  * that could not be assigned at compile time.
1518  */
1519 void
1520 mib_tree_init(void)
1521 {
1522 
1523 	/* Initialize some variables. */
1524 	mib_nodes = 1; /* the root node itself */
1525 	mib_objects = 0;
1526 
1527 	/*
1528 	 * The entire tree starts with the same, nonzero node version.
1529 	 * The root node is the only node without a parent.
1530 	 */
1531 	mib_root.node_ver = 1;
1532 	mib_root.node_parent = NULL;
1533 
1534 	/* Recursively initialize the static tree. */
1535 	mib_tree_recurse(&mib_root);
1536 }
1537 
1538 /*
1539  * Process a subtree mount request from a remote service.  Return OK on
1540  * success, with a pointer to the resulting static-node structure stored in
1541  * 'nodep'.  Return a negative error code on failure.
1542  */
1543 int
1544 mib_mount(const int * mib, unsigned int miblen, unsigned int eid, uint32_t rid,
1545 	uint32_t flags, unsigned int csize, unsigned int clen,
1546 	struct mib_node ** nodep)
1547 {
1548 	struct mib_dynode *dynode, **prevp;
1549 	struct mib_node *parent, *node;
1550 	char name[SYSCTL_NAMELEN], *desc;
1551 	size_t size, namelen, desclen;
1552 	unsigned int n;
1553 	int r, id;
1554 
1555 	/*
1556 	 * Perform initial verification of the given parameters.  Even stricter
1557 	 * checks may be performed later.
1558 	 */
1559 	/*
1560 	 * By policy, we forbid mounting top-level nodes.  This is in effect
1561 	 * also the only security-like restriction: a service should not be
1562 	 * able to just take over, say, the entire "kern" subtree.  There is
1563 	 * currently little in the way of a service taking over an important
1564 	 * set of second-level nodes, though.
1565 	 *
1566 	 * TODO: allow mounting of predefined mount points only, for example by
1567 	 * having an internal node flag that permits mounting the subtree or
1568 	 * any node in it.  As an even better alternative, allow this to be
1569 	 * controlled through a policy specification; unfortunately, this would
1570 	 * also add a substantial amount of infrastructure.
1571 	 */
1572 	if (miblen < 2) {
1573 		MIB_DEBUG_MOUNT(("MIB: mounting failed, path too short\n"));
1574 
1575 		return EPERM;
1576 	}
1577 
1578 	/*
1579 	 * The flags field is highly restricted right now.  Only a few flags
1580 	 * may be given at all, and then when using an existing node as mount
1581 	 * point, the flag must exactly match the existing node's flags.
1582 	 */
1583 	if (SYSCTL_VERS(flags) != SYSCTL_VERSION ||
1584 	    SYSCTL_TYPE(flags) != CTLTYPE_NODE ||
1585 	    (SYSCTL_FLAGS(flags) & ~(CTLFLAG_READONLY | CTLFLAG_READWRITE |
1586 	    CTLFLAG_PERMANENT | CTLFLAG_HIDDEN)) != 0) {
1587 		MIB_DEBUG_MOUNT(("MIB: mounting failed, invalid flags %"PRIx32
1588 		    "\n", flags));
1589 
1590 		return EINVAL;
1591 	}
1592 
1593 	if (csize > (1U << MIB_RC_BITS) || clen > csize) {
1594 		MIB_DEBUG_MOUNT(("MIB: mounting failed, invalid child size or "
1595 		    "length (%u, %u)\n", csize, clen));
1596 
1597 		return EINVAL;
1598 	}
1599 
1600 	/*
1601 	 * Look up the parent node of the mount point.  This parent node must
1602 	 * exist - we don't want to create more than one temporary node in any
1603 	 * case.  All the nodes leading up to and including the parent node
1604 	 * must be real, local, non-private, node-type nodes.  The path may not
1605 	 * be private, because that would allow an unprivileged service to
1606 	 * intercept writes to privileged nodes--currently a total nonissue in
1607 	 * practice, but still.  Note that the service may itself restrict
1608 	 * access to nodes in its own mounted subtree in any way it wishes.
1609 	 */
1610 	parent = &mib_root;
1611 
1612 	for (n = 0; n < miblen - 1; n++) {
1613 		/* Meta-identifiers are obviously not allowed in the path. */
1614 		if ((id = mib[n]) < 0) {
1615 			MIB_DEBUG_MOUNT(("MIB: mounting failed, meta-ID in "
1616 			    "path\n"));
1617 
1618 			return EINVAL;
1619 		}
1620 
1621 		/* Locate the child node. */
1622 		if ((node = mib_find(parent, id, NULL /*prevp*/)) == NULL) {
1623 			MIB_DEBUG_MOUNT(("MIB: mounting failed, path not "
1624 			    "found\n"));
1625 
1626 			return ENOENT;
1627 		}
1628 
1629 		/* Make sure it is a regular node-type node. */
1630 		if (SYSCTL_TYPE(node->node_flags) != CTLTYPE_NODE ||
1631 		    !(node->node_flags & CTLFLAG_PARENT) ||
1632 		    (node->node_flags & (CTLFLAG_REMOTE | CTLFLAG_PRIVATE))) {
1633 			MIB_DEBUG_MOUNT(("MIB: mounting failed, unacceptable "
1634 			    "node on path\n"));
1635 
1636 			return EPERM;
1637 		}
1638 
1639 		parent = node;
1640 	}
1641 
1642 	/* Now see if the mount point itself exists. */
1643 	if ((id = mib[miblen - 1]) < 0) {
1644 		MIB_DEBUG_MOUNT(("MIB: mounting failed, meta-ID in path\n"));
1645 
1646 		return EINVAL;
1647 	}
1648 
1649 	/*
1650 	 * If the target node exists and passes all tests, it will simply be
1651 	 * converted to a mount point.  If the target node does not exist, we
1652 	 * have to allocate a temporary node as mount point.
1653 	 */
1654 	if ((node = mib_find(parent, id, NULL /*prevp*/)) != NULL) {
1655 		/*
1656 		 * We are about to mount on an existing node.  As stated above,
1657 		 * the node flags must match the given flags exactly.
1658 		 */
1659 		if (SYSCTL_TYPE(node->node_flags) != CTLTYPE_NODE ||
1660 		    SYSCTL_FLAGS(node->node_flags) !=
1661 		    (SYSCTL_FLAGS(flags) | CTLFLAG_PARENT)) {
1662 			MIB_DEBUG_MOUNT(("MIB: mounting failed, target node "
1663 			    "mismatch (%"PRIx32", %"PRIx32")\n",
1664 			    node->node_flags, flags));
1665 
1666 			return EPERM;
1667 		}
1668 
1669 		/*
1670 		 * If the node has dynamically added children, we will not be
1671 		 * able to restore the node to its old state when unmounting.
1672 		 */
1673 		if (node->node_size != node->node_csize) {
1674 			MIB_DEBUG_MOUNT(("MIB: mounting failed, node has "
1675 			    "dynamic children\n"));
1676 
1677 			return EBUSY;
1678 		}
1679 
1680 		mib_upgrade(node);
1681 	} else {
1682 		/*
1683 		 * We are going to create a temporary mount point.  Much of the
1684 		 * procedure that follows is a rather selective extract from
1685 		 * mib_create().  Start with a check for the impossible.
1686 		 */
1687 		if (parent->node_csize == INT_MAX) {
1688 			MIB_DEBUG_MOUNT(("MIB: mounting failed, parent node "
1689 			    "full\n"));
1690 
1691 			return EINVAL;
1692 		}
1693 
1694 		/*
1695 		 * In order to create the new node, we also need the node's
1696 		 * name and description; those did not fit in the request
1697 		 * message.  Ask the caller to copy these strings to us.
1698 		 */
1699 		name[0] = '\0';
1700 		scratch[0] = '\0';
1701 
1702 		if ((r = mib_remote_info(eid, rid, name, sizeof(name), scratch,
1703 		    MAXDESCLEN)) != OK) {
1704 			MIB_DEBUG_MOUNT(("MIB: mounting failed, node info "
1705 			    "request yielded %d\n", r));
1706 
1707 			return r;
1708 		}
1709 
1710 		/* Make sure the name is valid. */
1711 		if ((namelen = mib_check_name(name, sizeof(name))) == 0) {
1712 			printf("MIB: mounting failed, bad name\n");
1713 
1714 			return EINVAL;
1715 		}
1716 
1717 		/* Just forcefully terminate the description. */
1718 		scratch[MAXDESCLEN - 1] = '\0';
1719 		desclen = strlen(scratch);
1720 
1721 		/*
1722 		 * We know the identifier is not in use yet; make sure that the
1723 		 * name is not, either.  As a side effect, find out where the
1724 		 * new node should be inserted upon success.
1725 		 */
1726 		if (mib_scan(parent, id, name, &id /*unused*/, &prevp,
1727 		    &node /*unused*/) != OK) {
1728 			MIB_DEBUG_MOUNT(("MIB: mounting failed, name "
1729 			    "conflict\n"));
1730 
1731 			return EEXIST;
1732 		}
1733 
1734 		/*
1735 		 * Allocate a dynamic node.  Unlike for user-created dynamic
1736 		 * nodes, temporary mount points also include the description
1737 		 * in the dynode object.
1738 		 */
1739 		size = sizeof(*dynode) + namelen + desclen + 1;
1740 
1741 		if ((dynode = malloc(size)) == NULL) {
1742 			printf("MIB: out of memory!\n");
1743 
1744 			return ENOMEM;
1745 		}
1746 		mib_objects++;
1747 
1748 		/* Initialize the dynamic node. */
1749 		memset(dynode, 0, sizeof(*dynode));
1750 		dynode->dynode_id = id;
1751 		strlcpy(dynode->dynode_name, name, namelen + 1);
1752 		desc = &dynode->dynode_name[namelen + 1];
1753 		strlcpy(desc, scratch, desclen + 1);
1754 
1755 		node = &dynode->dynode_node;
1756 		node->node_flags = flags & ~SYSCTL_VERS_MASK;
1757 		node->node_size = 0;
1758 		node->node_parent = parent;
1759 		node->node_name = dynode->dynode_name;
1760 		node->node_desc = desc;
1761 
1762 		/*
1763 		 * Add the new dynamic node into the tree, and adjust versions
1764 		 * and counters.
1765 		 */
1766 		mib_add(dynode, prevp);
1767 	}
1768 
1769 	/* Success!  Perform the actual mount, and return the target node. */
1770 	node->node_flags |= CTLFLAG_REMOTE;
1771 	node->node_eid = eid;
1772 	node->node_rcsize = csize;
1773 	node->node_rclen = clen;
1774 	node->node_rid = rid;
1775 
1776 	mib_remotes++;
1777 
1778 	*nodep = node;
1779 	return OK;
1780 }
1781 
1782 /*
1783  * Unmount the remote subtree identified by the given node.  Release the mount
1784  * point by reversing the action performed while mounting.  Also bump the
1785  * version numbers on the path, so that userland knows that it is to expect a
1786  * change of contents in the subtree.  This function always succeeds, and may
1787  * deallocate the given node.
1788  */
1789 void
1790 mib_unmount(struct mib_node * node)
1791 {
1792 	struct mib_dynode **prevp;
1793 	struct mib_node *child;
1794 	int id;
1795 
1796 	assert(SYSCTL_TYPE(node->node_flags) == CTLTYPE_NODE);
1797 	assert(node->node_flags & CTLFLAG_REMOTE);
1798 
1799 	/*
1800 	 * Given that the node has the CTLFLAG_REMOTE flag set, we can now tell
1801 	 * whether the remote subtree obscured a preexisting node or we created
1802 	 * a temporary mount point, by checking its CTLFLAG_PARENT flag.
1803 	 */
1804 	if (node->node_flags & CTLFLAG_PARENT) {
1805 		/*
1806 		 * Return the node to its former pre-mount state.  Restore the
1807 		 * original node_clen field by recomputing it.
1808 		 */
1809 		node->node_flags &= ~CTLFLAG_REMOTE;
1810 		node->node_csize = node->node_size;
1811 		node->node_clen = 0;
1812 
1813 		for (id = 0; IS_STATIC_ID(node, id); id++) {
1814 			child = &node->node_scptr[id];
1815 
1816 			if (child->node_flags != 0)
1817 				node->node_clen++;
1818 		}
1819 
1820 		node->node_dcptr = NULL;
1821 
1822 		/* Increase version numbers on the path to the node. */
1823 		mib_upgrade(node);
1824 	} else {
1825 		/*
1826 		 * We know that we dynamically allocated this node; find its
1827 		 * parent's pointer to it.
1828 		 */
1829 		for (prevp = &node->node_parent->node_dcptr; *prevp != NULL;
1830 		    prevp = &(*prevp)->dynode_next) {
1831 			if (&(*prevp)->dynode_node == node)
1832 				break;
1833 		}
1834 		assert(*prevp != NULL);
1835 
1836 		/* Free the node, and adjust counts and versions. */
1837 		mib_remove(node, prevp);
1838 	}
1839 
1840 	assert(mib_remotes > 0);
1841 	mib_remotes--;
1842 }
1843