xref: /freebsd/sys/netlink/netlink_domain.c (revision 2a58b312)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * This file contains socket and protocol bindings for netlink.
31  */
32 
33 #include <sys/param.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/lock.h>
37 #include <sys/rmlock.h>
38 #include <sys/domain.h>
39 #include <sys/jail.h>
40 #include <sys/mbuf.h>
41 #include <sys/protosw.h>
42 #include <sys/proc.h>
43 #include <sys/ck.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysent.h>
47 #include <sys/syslog.h>
48 #include <sys/priv.h> /* priv_check */
49 
50 #include <netlink/netlink.h>
51 #include <netlink/netlink_ctl.h>
52 #include <netlink/netlink_var.h>
53 
54 #define	DEBUG_MOD_NAME	nl_domain
55 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
56 #include <netlink/netlink_debug.h>
57 _DECLARE_DEBUG(LOG_DEBUG);
58 
59 _Static_assert((NLP_MAX_GROUPS % 64) == 0,
60     "NLP_MAX_GROUPS has to be multiple of 64");
61 _Static_assert(NLP_MAX_GROUPS >= 64,
62     "NLP_MAX_GROUPS has to be at least 64");
63 
/*
 * Wrappers around the rmlock stored in the ctl_lock member of the control
 * structure.  The read-side macros refer to a local variable named
 * nl_tracker, so any function using NLCTL_RLOCK()/NLCTL_RUNLOCK() must
 * declare it with NLCTL_TRACKER first.
 */
#define	NLCTL_TRACKER		struct rm_priotracker nl_tracker
#define	NLCTL_RLOCK(c)		rm_rlock(&((c)->ctl_lock), &nl_tracker)
#define	NLCTL_RUNLOCK(c)	rm_runlock(&((c)->ctl_lock), &nl_tracker)

#define	NLCTL_WLOCK(c)		rm_wlock(&((c)->ctl_lock))
#define	NLCTL_WUNLOCK(c)	rm_wunlock(&((c)->ctl_lock))
70 
71 static u_long nl_sendspace = NLSNDQ;
72 SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
73     "Default netlink socket send space");
74 
75 static u_long nl_recvspace = NLSNDQ;
76 SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
77     "Default netlink socket receive space");
78 
79 extern u_long sb_max_adj;
80 static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
81 static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS);
82 SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf,
83     CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0,
84     sysctl_handle_nl_maxsockbuf, "LU",
85     "Maximum Netlink socket buffer size");
86 
87 /*
88  * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx.
89  * Returns nlpcb pointer if present else NULL
90  */
91 static struct nlpcb *
92 nl_port_lookup(uint32_t port_id)
93 {
94 	struct nlpcb *nlp;
95 
96 	CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) {
97 		if (nlp->nl_port == port_id)
98 			return (nlp);
99 	}
100 	return (NULL);
101 }
102 
103 static void
104 nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id)
105 {
106 	MPASS(group_id <= NLP_MAX_GROUPS);
107 	--group_id;
108 
109 	/* TODO: add family handler callback */
110 	if (!nlp_unconstrained_vnet(nlp))
111 		return;
112 
113 	nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64);
114 }
115 
116 static void
117 nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id)
118 {
119 	MPASS(group_id <= NLP_MAX_GROUPS);
120 	--group_id;
121 
122 	nlp->nl_groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64));
123 }
124 
125 static bool
126 nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id)
127 {
128 	MPASS(group_id <= NLP_MAX_GROUPS);
129 	--group_id;
130 
131 	return (nlp->nl_groups[group_id / 64] & ((uint64_t)1 << (group_id % 64)));
132 }
133 
/*
 * Builds the legacy 32-bit group mask for @nlp: bit i of the result is set
 * when the socket is a member of group i + 1.  Only the first 32 groups
 * can be represented.
 */
static uint32_t
nl_get_groups_compat(struct nlpcb *nlp)
{
	uint32_t groups_mask = 0;

	for (unsigned int i = 0; i < 32; i++) {
		/*
		 * Shift an unsigned 32-bit one: a plain (signed int) 1 << 31
		 * is undefined behaviour.
		 */
		if (nl_isset_group_locked(nlp, i + 1))
			groups_mask |= (uint32_t)1 << i;
	}

	return (groups_mask);
}
146 
147 /*
148  * Broadcasts message @m to the protocol @proto group specified by @group_id
149  */
150 void
151 nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
152 {
153 	struct nlpcb *nlp_last = NULL;
154 	struct nlpcb *nlp;
155 	NLCTL_TRACKER;
156 
157 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
158 		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
159 		NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d",
160 		    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id);
161 	}
162 
163 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
164 	if (__predict_false(ctl == NULL)) {
165 		/*
166 		 * Can be the case when notification is sent within VNET
167 		 * which doesn't have any netlink sockets.
168 		 */
169 		m_freem(m);
170 		return;
171 	}
172 
173 	NLCTL_RLOCK(ctl);
174 
175 	int io_flags = NL_IOF_UNTRANSLATED;
176 
177 	CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
178 		if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) {
179 			if (nlp_last != NULL) {
180 				struct mbuf *m_copy;
181 				m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
182 				if (m_copy != NULL)
183 					nl_send_one(m_copy, nlp_last, num_messages, io_flags);
184 				else {
185 					NLP_LOCK(nlp_last);
186 					if (nlp_last->nl_socket != NULL)
187 						sorwakeup(nlp_last->nl_socket);
188 					NLP_UNLOCK(nlp_last);
189 				}
190 			}
191 			nlp_last = nlp;
192 		}
193 	}
194 	if (nlp_last != NULL)
195 		nl_send_one(m, nlp_last, num_messages, io_flags);
196 	else
197 		m_freem(m);
198 
199 	NLCTL_RUNLOCK(ctl);
200 }
201 
202 bool
203 nl_has_listeners(int netlink_family, uint32_t groups_mask)
204 {
205 	return (V_nl_ctl != NULL);
206 }
207 
208 static uint32_t
209 nl_find_port(void)
210 {
211 	/*
212 	 * app can open multiple netlink sockets.
213 	 * Start with current pid, if already taken,
214 	 * try random numbers in 65k..256k+65k space,
215 	 * avoiding clash with pids.
216 	 */
217 	if (nl_port_lookup(curproc->p_pid) == NULL)
218 		return (curproc->p_pid);
219 	for (int i = 0; i < 16; i++) {
220 		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
221 		if (nl_port_lookup(nl_port) == 0)
222 			return (nl_port);
223 		NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port);
224 	}
225 	return (curproc->p_pid);
226 }
227 
228 static int
229 nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
230 {
231 	if (nlp->nl_bound) {
232 		if (nlp->nl_port != snl->nl_pid) {
233 			NL_LOG(LOG_DEBUG,
234 			    "bind() failed: program pid %d "
235 			    "is different from provided pid %d",
236 			    nlp->nl_port, snl->nl_pid);
237 			return (EINVAL); // XXX: better error
238 		}
239 	} else {
240 		if (snl->nl_pid == 0)
241 			snl->nl_pid = nl_find_port();
242 		if (nl_port_lookup(snl->nl_pid) != NULL)
243 			return (EADDRINUSE);
244 		nlp->nl_port = snl->nl_pid;
245 		nlp->nl_bound = true;
246 		CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next);
247 	}
248 	for (int i = 0; i < 32; i++) {
249 		if (snl->nl_groups & ((uint32_t)1 << i))
250 			nl_add_group_locked(nlp, i + 1);
251 		else
252 			nl_del_group_locked(nlp, i + 1);
253 	}
254 
255 	return (0);
256 }
257 
258 static int
259 nl_pru_attach(struct socket *so, int proto, struct thread *td)
260 {
261 	struct nlpcb *nlp;
262 	int error;
263 
264 	if (__predict_false(netlink_unloading != 0))
265 		return (EAFNOSUPPORT);
266 
267 	error = nl_verify_proto(proto);
268 	if (error != 0)
269 		return (error);
270 
271 	bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
272 	NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
273 	    so, is_linux ? "(linux) " : "", curproc->p_pid,
274 	    nl_get_proto_name(proto));
275 
276 	/* Create per-VNET state on first socket init */
277 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
278 	if (ctl == NULL)
279 		ctl = vnet_nl_ctl_init();
280 	KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed"));
281 
282 	MPASS(sotonlpcb(so) == NULL);
283 
284 	nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
285 	error = soreserve(so, nl_sendspace, nl_recvspace);
286 	if (error != 0) {
287 		free(nlp, M_PCB);
288 		return (error);
289 	}
290 	so->so_pcb = nlp;
291 	nlp->nl_socket = so;
292 	/* Copy so_cred to avoid having socket_var.h in every header */
293 	nlp->nl_cred = so->so_cred;
294 	nlp->nl_proto = proto;
295 	nlp->nl_process_id = curproc->p_pid;
296 	nlp->nl_linux = is_linux;
297 	nlp->nl_active = true;
298 	nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred);
299 	NLP_LOCK_INIT(nlp);
300 	refcount_init(&nlp->nl_refcount, 1);
301 	nl_init_io(nlp);
302 
303 	nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
304 	    taskqueue_thread_enqueue, &nlp->nl_taskqueue);
305 	TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
306 	taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
307 	    "netlink_socket (PID %u)", nlp->nl_process_id);
308 
309 	NLCTL_WLOCK(ctl);
310 	/* XXX: check ctl is still alive */
311 	CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next);
312 	NLCTL_WUNLOCK(ctl);
313 
314 	soisconnected(so);
315 
316 	return (0);
317 }
318 
319 static void
320 nl_pru_abort(struct socket *so)
321 {
322 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
323 	MPASS(sotonlpcb(so) != NULL);
324 	soisdisconnected(so);
325 }
326 
327 static int
328 nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td)
329 {
330 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
331 	struct nlpcb *nlp = sotonlpcb(so);
332 	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
333 	int error;
334 
335 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
336 	if (snl->nl_len != sizeof(*snl)) {
337 		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
338 		return (EINVAL);
339 	}
340 
341 
342 	NLCTL_WLOCK(ctl);
343 	NLP_LOCK(nlp);
344 	error = nl_bind_locked(nlp, snl);
345 	NLP_UNLOCK(nlp);
346 	NLCTL_WUNLOCK(ctl);
347 	NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
348 	    snl->nl_pid, snl->nl_groups, error);
349 
350 	return (error);
351 }
352 
353 
354 static int
355 nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
356 {
357 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
358 	struct sockaddr_nl snl = {
359 		.nl_pid = port_id,
360 	};
361 	int error;
362 
363 	NLCTL_WLOCK(ctl);
364 	NLP_LOCK(nlp);
365 	snl.nl_groups = nl_get_groups_compat(nlp);
366 	error = nl_bind_locked(nlp, &snl);
367 	NLP_UNLOCK(nlp);
368 	NLCTL_WUNLOCK(ctl);
369 
370 	NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error);
371 	return (error);
372 }
373 
374 /*
375  * nl_autobind_port binds a unused portid to @nlp
376  * @nlp: pcb data for the netlink socket
377  * @candidate_id: first id to consider
378  */
379 static int
380 nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
381 {
382 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
383 	uint32_t port_id = candidate_id;
384 	NLCTL_TRACKER;
385 	bool exist;
386 	int error = EADDRINUSE;
387 
388 	for (int i = 0; i < 10; i++) {
389 		NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
390 		NLCTL_RLOCK(ctl);
391 		exist = nl_port_lookup(port_id) != 0;
392 		NLCTL_RUNLOCK(ctl);
393 		if (!exist) {
394 			error = nl_assign_port(nlp, port_id);
395 			if (error != EADDRINUSE)
396 				break;
397 		}
398 		port_id++;
399 	}
400 	NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
401 	return (error);
402 }
403 
404 static int
405 nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
406 {
407 	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
408 	struct nlpcb *nlp;
409 
410 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
411 	if (snl->nl_len != sizeof(*snl)) {
412 		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
413 		return (EINVAL);
414 	}
415 
416 	nlp = sotonlpcb(so);
417 	if (!nlp->nl_bound) {
418 		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
419 		if (error != 0) {
420 			NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
421 			return (error);
422 		}
423 	}
424 	/* XXX: Handle socket flags & multicast */
425 	soisconnected(so);
426 
427 	NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
428 
429 	return (0);
430 }
431 
432 static void
433 destroy_nlpcb(struct nlpcb *nlp)
434 {
435 	NLP_LOCK(nlp);
436 	nl_free_io(nlp);
437 	NLP_LOCK_DESTROY(nlp);
438 	free(nlp, M_PCB);
439 }
440 
441 static void
442 destroy_nlpcb_epoch(epoch_context_t ctx)
443 {
444 	struct nlpcb *nlp;
445 
446 	nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);
447 
448 	destroy_nlpcb(nlp);
449 }
450 
451 
452 static void
453 nl_pru_detach(struct socket *so)
454 {
455 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
456 	MPASS(sotonlpcb(so) != NULL);
457 	struct nlpcb *nlp;
458 
459 	NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
460 	nlp = sotonlpcb(so);
461 
462 	/* Mark as inactive so no new work can be enqueued */
463 	NLP_LOCK(nlp);
464 	bool was_bound = nlp->nl_bound;
465 	nlp->nl_active = false;
466 	NLP_UNLOCK(nlp);
467 
468 	/* Wait till all scheduled work has been completed  */
469 	taskqueue_drain_all(nlp->nl_taskqueue);
470 	taskqueue_free(nlp->nl_taskqueue);
471 
472 	NLCTL_WLOCK(ctl);
473 	NLP_LOCK(nlp);
474 	if (was_bound) {
475 		CK_LIST_REMOVE(nlp, nl_port_next);
476 		NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
477 	}
478 	CK_LIST_REMOVE(nlp, nl_next);
479 	nlp->nl_socket = NULL;
480 	NLP_UNLOCK(nlp);
481 	NLCTL_WUNLOCK(ctl);
482 
483 	so->so_pcb = NULL;
484 
485 	NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
486 
487 	/* XXX: is delayed free needed? */
488 	NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
489 }
490 
491 static int
492 nl_pru_disconnect(struct socket *so)
493 {
494 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
495 	MPASS(sotonlpcb(so) != NULL);
496 	return (ENOTCONN);
497 }
498 
499 static int
500 nl_pru_peeraddr(struct socket *so, struct sockaddr **sa)
501 {
502 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
503 	MPASS(sotonlpcb(so) != NULL);
504 	return (ENOTCONN);
505 }
506 
507 static int
508 nl_pru_shutdown(struct socket *so)
509 {
510 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
511 	MPASS(sotonlpcb(so) != NULL);
512 	socantsendmore(so);
513 	return (0);
514 }
515 
516 static int
517 nl_pru_sockaddr(struct socket *so, struct sockaddr **sa)
518 {
519 	struct sockaddr_nl *snl;
520 
521 	snl = malloc(sizeof(struct sockaddr_nl), M_SONAME, M_WAITOK | M_ZERO);
522 	/* TODO: set other fields */
523 	snl->nl_len = sizeof(struct sockaddr_nl);
524 	snl->nl_family = AF_NETLINK;
525 	snl->nl_pid = sotonlpcb(so)->nl_port;
526 	*sa = (struct sockaddr *)snl;
527 	return (0);
528 }
529 
530 static void
531 nl_pru_close(struct socket *so)
532 {
533 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
534 	MPASS(sotonlpcb(so) != NULL);
535 	soisdisconnected(so);
536 }
537 
538 static int
539 nl_pru_output(struct mbuf *m, struct socket *so, ...)
540 {
541 
542 	if (__predict_false(m == NULL ||
543 	    ((m->m_len < sizeof(struct nlmsghdr)) &&
544 		(m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL)))
545 		return (ENOBUFS);
546 	MPASS((m->m_flags & M_PKTHDR) != 0);
547 
548 	NL_LOG(LOG_DEBUG3, "sending message to kernel async processing");
549 	nl_receive_async(m, so);
550 	return (0);
551 }
552 
553 
554 static int
555 nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *sa,
556     struct mbuf *control, struct thread *td)
557 {
558         NL_LOG(LOG_DEBUG2, "sending message to kernel");
559 
560 	if (__predict_false(control != NULL)) {
561 		if (control->m_len) {
562 			m_freem(control);
563 			return (EINVAL);
564 		}
565 		m_freem(control);
566 	}
567 
568 	return (nl_pru_output(m, so));
569 }
570 
571 static int
572 nl_pru_rcvd(struct socket *so, int flags)
573 {
574 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
575 	MPASS(sotonlpcb(so) != NULL);
576 
577 	nl_on_transmit(sotonlpcb(so));
578 
579 	return (0);
580 }
581 
582 static int
583 nl_getoptflag(int sopt_name)
584 {
585 	switch (sopt_name) {
586 	case NETLINK_CAP_ACK:
587 		return (NLF_CAP_ACK);
588 	case NETLINK_EXT_ACK:
589 		return (NLF_EXT_ACK);
590 	case NETLINK_GET_STRICT_CHK:
591 		return (NLF_STRICT);
592 	}
593 
594 	return (0);
595 }
596 
597 static int
598 nl_ctloutput(struct socket *so, struct sockopt *sopt)
599 {
600 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
601 	struct nlpcb *nlp = sotonlpcb(so);
602 	uint32_t flag;
603 	int optval, error = 0;
604 	NLCTL_TRACKER;
605 
606 	NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
607 	    so, sopt->sopt_name);
608 
609 	switch (sopt->sopt_dir) {
610 	case SOPT_SET:
611 		switch (sopt->sopt_name) {
612 		case NETLINK_ADD_MEMBERSHIP:
613 		case NETLINK_DROP_MEMBERSHIP:
614 			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
615 			if (error != 0)
616 				break;
617 			if (optval <= 0 || optval >= NLP_MAX_GROUPS) {
618 				error = ERANGE;
619 				break;
620 			}
621 			NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval);
622 
623 			NLCTL_WLOCK(ctl);
624 			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
625 				nl_add_group_locked(nlp, optval);
626 			else
627 				nl_del_group_locked(nlp, optval);
628 			NLCTL_WUNLOCK(ctl);
629 			break;
630 		case NETLINK_CAP_ACK:
631 		case NETLINK_EXT_ACK:
632 		case NETLINK_GET_STRICT_CHK:
633 			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
634 			if (error != 0)
635 				break;
636 
637 			flag = nl_getoptflag(sopt->sopt_name);
638 
639 			NLCTL_WLOCK(ctl);
640 			if (optval != 0)
641 				nlp->nl_flags |= flag;
642 			else
643 				nlp->nl_flags &= ~flag;
644 			NLCTL_WUNLOCK(ctl);
645 			break;
646 		default:
647 			error = ENOPROTOOPT;
648 		}
649 		break;
650 	case SOPT_GET:
651 		switch (sopt->sopt_name) {
652 		case NETLINK_LIST_MEMBERSHIPS:
653 			NLCTL_RLOCK(ctl);
654 			optval = nl_get_groups_compat(nlp);
655 			NLCTL_RUNLOCK(ctl);
656 			error = sooptcopyout(sopt, &optval, sizeof(optval));
657 			break;
658 		case NETLINK_CAP_ACK:
659 		case NETLINK_EXT_ACK:
660 		case NETLINK_GET_STRICT_CHK:
661 			NLCTL_RLOCK(ctl);
662 			optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0;
663 			NLCTL_RUNLOCK(ctl);
664 			error = sooptcopyout(sopt, &optval, sizeof(optval));
665 			break;
666 		default:
667 			error = ENOPROTOOPT;
668 		}
669 		break;
670 	default:
671 		error = ENOPROTOOPT;
672 	}
673 
674 	return (error);
675 }
676 
677 static int
678 sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS)
679 {
680 	int error = 0;
681 	u_long tmp_maxsockbuf = nl_maxsockbuf;
682 
683 	error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req);
684 	if (error || !req->newptr)
685 		return (error);
686 	if (tmp_maxsockbuf < MSIZE + MCLBYTES)
687 		return (EINVAL);
688 	nl_maxsockbuf = tmp_maxsockbuf;
689 
690 	return (0);
691 }
692 
693 static int
694 nl_setsbopt(struct socket *so, struct sockopt *sopt)
695 {
696 	int error, optval;
697 	bool result;
698 
699 	if (sopt->sopt_name != SO_RCVBUF)
700 		return (sbsetopt(so, sopt));
701 
702 	/* Allow to override max buffer size in certain conditions */
703 
704 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
705 	if (error != 0)
706 		return (error);
707 	NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval);
708 	if (optval > sb_max_adj) {
709 		if (priv_check(curthread, PRIV_NET_ROUTE) != 0)
710 			return (EPERM);
711 	}
712 
713 	SOCK_RECVBUF_LOCK(so);
714 	result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread);
715 	SOCK_RECVBUF_UNLOCK(so);
716 
717 	return (result ? 0 : ENOBUFS);
718 }
719 
/*
 * Initialisers shared by both netlink protosw definitions; only pr_type
 * differs between them.  The expansion deliberately ends without a comma.
 */
#define	NETLINK_PROTOSW						\
	.pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD,		\
	/* socket lifecycle */					\
	.pr_attach = nl_pru_attach,				\
	.pr_detach = nl_pru_detach,				\
	.pr_abort = nl_pru_abort,				\
	.pr_close = nl_pru_close,				\
	.pr_shutdown = nl_pru_shutdown,				\
	/* addressing */					\
	.pr_bind = nl_pru_bind,					\
	.pr_connect = nl_pru_connect,				\
	.pr_disconnect = nl_pru_disconnect,			\
	.pr_peeraddr = nl_pru_peeraddr,				\
	.pr_sockaddr = nl_pru_sockaddr,				\
	/* data path */						\
	.pr_send = nl_pru_send,					\
	.pr_rcvd = nl_pru_rcvd,					\
	/* socket options */					\
	.pr_ctloutput = nl_ctloutput,				\
	.pr_setsbopt = nl_setsbopt
736 
737 static struct protosw netlink_raw_sw = {
738 	.pr_type = SOCK_RAW,
739 	NETLINK_PROTOSW
740 };
741 
742 static struct protosw netlink_dgram_sw = {
743 	.pr_type = SOCK_DGRAM,
744 	NETLINK_PROTOSW
745 };
746 
747 static struct domain netlinkdomain = {
748 	.dom_family = PF_NETLINK,
749 	.dom_name = "netlink",
750 	.dom_flags = DOMF_UNLOADABLE,
751 	.dom_nprotosw =		2,
752 	.dom_protosw =		{ &netlink_raw_sw, &netlink_dgram_sw },
753 };
754 
755 DOMAIN_SET(netlink);
756