xref: /freebsd/sys/netlink/netlink_io.c (revision 315ee00f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Ng Peng Nam Sean
5  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "opt_netlink.h"
30 
31 #include <sys/cdefs.h>
32 #include <sys/param.h>
33 #include <sys/ck.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/mutex.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/syslog.h>
41 
42 #include <netlink/netlink.h>
43 #include <netlink/netlink_ctl.h>
44 #include <netlink/netlink_linux.h>
45 #include <netlink/netlink_var.h>
46 
47 #define	DEBUG_MOD_NAME	nl_io
48 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
49 #include <netlink/netlink_debug.h>
50 _DECLARE_DEBUG(LOG_INFO);
51 
/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
 */
56 
/* Source address stamped on every kernel-originated netlink datagram. */
static const struct sockaddr_nl _nl_empty_src = {
	.nl_len = sizeof(struct sockaddr_nl),
	.nl_family = PF_NETLINK,
	.nl_pid = 0 /* comes from the kernel */
};
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;

/* Forward declaration: parses one received packet, see definition below. */
static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
65 
66 
67 static void
68 queue_push(struct nl_io_queue *q, struct mbuf *mq)
69 {
70 	while (mq != NULL) {
71 		struct mbuf *m = mq;
72 		mq = mq->m_nextpkt;
73 		m->m_nextpkt = NULL;
74 
75 		q->length += m_length(m, NULL);
76 		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
77 	}
78 }
79 
80 static void
81 queue_push_head(struct nl_io_queue *q, struct mbuf *m)
82 {
83 	MPASS(m->m_nextpkt == NULL);
84 
85 	q->length += m_length(m, NULL);
86 	STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
87 }
88 
89 static struct mbuf *
90 queue_pop(struct nl_io_queue *q)
91 {
92 	if (!STAILQ_EMPTY(&q->head)) {
93 		struct mbuf *m = STAILQ_FIRST(&q->head);
94 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
95 		m->m_nextpkt = NULL;
96 		q->length -= m_length(m, NULL);
97 
98 		return (m);
99 	}
100 	return (NULL);
101 }
102 
/* Peeks at the first packet of @q without removing it; NULL if empty. */
static struct mbuf *
queue_head(const struct nl_io_queue *q)
{
	return (STAILQ_FIRST(&q->head));
}
108 
/*
 * Returns true when @q accounts for zero bytes of queued data.
 * NOTE(review): checks the byte counter, not STAILQ_EMPTY() — assumes no
 * zero-length packets are ever queued; confirm against callers.
 */
static inline bool
queue_empty(const struct nl_io_queue *q)
{
	return (q->length == 0);
}
114 
115 static void
116 queue_free(struct nl_io_queue *q)
117 {
118 	while (!STAILQ_EMPTY(&q->head)) {
119 		struct mbuf *m = STAILQ_FIRST(&q->head);
120 		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
121 		m->m_nextpkt = NULL;
122 		m_freem(m);
123 	}
124 	q->length = 0;
125 }
126 
/*
 * Appends a NETLINK_MSG_INFO control mbuf to the tail of chain @m,
 * carrying the originating process PID and socket port ID recovered
 * from the current thread.  No-op if the thread has no associated nlp.
 */
void
nl_add_msg_info(struct mbuf *m)
{
	struct nlpcb *nlp = nl_get_thread_nlp(curthread);
	NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
	    curthread, nlp);

	if (nlp == NULL)
		return;

	/* Prepare what we want to encode - process PID & socket port ID */
	struct {
		struct nlattr nla;
		uint32_t val;
	} data[] = {
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
			.val = nlp->nl_process_id,
		},
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
			.val = nlp->nl_port,
		},
	};


	/* Walk to the last mbuf of the chain and hang the control data there. */
	while (m->m_next != NULL)
		m = m->m_next;
	/*
	 * NOTE(review): sbcreatecontrol() can return NULL under M_NOWAIT;
	 * the chain then simply carries no msg info mbuf.
	 */
	m->m_next = sbcreatecontrol(data, sizeof(data),
	    NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);

	NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p",
	    (unsigned)sizeof(data), m->m_next);
}
163 
164 static __noinline struct mbuf *
165 extract_msg_info(struct mbuf *m)
166 {
167 	while (m->m_next != NULL) {
168 		if (m->m_next->m_type == MT_CONTROL) {
169 			struct mbuf *ctl = m->m_next;
170 			m->m_next = NULL;
171 			return (ctl);
172 		}
173 		m = m->m_next;
174 	}
175 	return (NULL);
176 }
177 
178 static void
179 nl_schedule_taskqueue(struct nlpcb *nlp)
180 {
181 	if (!nlp->nl_task_pending) {
182 		nlp->nl_task_pending = true;
183 		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
184 		NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
185 	} else {
186 		NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
187 	}
188 }
189 
190 int
191 nl_receive_async(struct mbuf *m, struct socket *so)
192 {
193 	struct nlpcb *nlp = sotonlpcb(so);
194 	int error = 0;
195 
196 	m->m_nextpkt = NULL;
197 
198 	NLP_LOCK(nlp);
199 
200 	if ((__predict_true(nlp->nl_active))) {
201 		sbappend(&so->so_snd, m, 0);
202 		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
203 		nl_schedule_taskqueue(nlp);
204 	} else {
205 		NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
206 		    m_length(m, NULL));
207 		m_free(m);
208 		error = EINVAL;
209 	}
210 
211 	NLP_UNLOCK(nlp);
212 
213 	return (error);
214 }
215 
216 static bool
217 tx_check_locked(struct nlpcb *nlp)
218 {
219 	if (queue_empty(&nlp->tx_queue))
220 		return (true);
221 
222 	/*
223 	 * Check if something can be moved from the internal TX queue
224 	 * to the socket queue.
225 	 */
226 
227 	bool appended = false;
228 	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
229 	SOCKBUF_LOCK(sb);
230 
231 	while (true) {
232 		struct mbuf *m = queue_head(&nlp->tx_queue);
233 		if (m != NULL) {
234 			struct mbuf *ctl = NULL;
235 			if (__predict_false(m->m_next != NULL))
236 				ctl = extract_msg_info(m);
237 			if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) {
238 				/* appended successfully */
239 				queue_pop(&nlp->tx_queue);
240 				appended = true;
241 			} else
242 				break;
243 		} else
244 			break;
245 	}
246 
247 	SOCKBUF_UNLOCK(sb);
248 
249 	if (appended)
250 		sorwakeup(nlp->nl_socket);
251 
252 	return (queue_empty(&nlp->tx_queue));
253 }
254 
/*
 * Runs one pass of the socket processing task: drains the TX overflow
 * queue first, moves pending data from the socket send buffer into the
 * internal RX queue, then parses queued packets.  Returns true when
 * another pass should be scheduled.
 */
static bool
nl_process_received_one(struct nlpcb *nlp)
{
	bool reschedule = false;

	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;

	if (!tx_check_locked(nlp)) {
		/* TX overflow queue still not empty, ignore RX */
		NLP_UNLOCK(nlp);
		return (false);
	}

	if (queue_empty(&nlp->rx_queue)) {
		/*
		 * Grab all data we have from the socket TX queue
		 * and store it in the internal queue, so it can be worked on
		 * w/o holding socket lock.
		 */
		struct sockbuf *sb = &nlp->nl_socket->so_snd;

		SOCKBUF_LOCK(sb);
		unsigned int avail = sbavail(sb);
		if (avail > 0) {
			NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
			queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
		}
		SOCKBUF_UNLOCK(sb);
	} else {
		/* Schedule another pass to read from the socket queue */
		reschedule = true;
	}

	/* Remember hiwat to detect TX overflow growth during parsing below. */
	int prev_hiwat = nlp->tx_queue.hiwat;
	NLP_UNLOCK(nlp);

	while (!queue_empty(&nlp->rx_queue)) {
		struct mbuf *m = queue_pop(&nlp->rx_queue);

		m = nl_process_mbuf(m, nlp);
		if (m != NULL) {
			/* Partially processed: re-queue remainder, stop. */
			queue_push_head(&nlp->rx_queue, m);
			reschedule = false;
			break;
		}
	}
	if (nlp->tx_queue.hiwat > prev_hiwat) {
		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);

	}

	return (reschedule);
}
309 
310 static void
311 nl_process_received(struct nlpcb *nlp)
312 {
313 	NL_LOG(LOG_DEBUG3, "taskqueue called");
314 
315 	if (__predict_false(nlp->nl_need_thread_setup)) {
316 		nl_set_thread_nlp(curthread, nlp);
317 		NLP_LOCK(nlp);
318 		nlp->nl_need_thread_setup = false;
319 		NLP_UNLOCK(nlp);
320 	}
321 
322 	while (nl_process_received_one(nlp))
323 		;
324 }
325 
/* Initializes the RX/TX packet queues of a freshly created @nlp. */
void
nl_init_io(struct nlpcb *nlp)
{
	STAILQ_INIT(&nlp->rx_queue.head);
	STAILQ_INIT(&nlp->tx_queue.head);
}
332 
/* Releases all packets still queued on @nlp at teardown time. */
void
nl_free_io(struct nlpcb *nlp)
{
	queue_free(&nlp->rx_queue);
	queue_free(&nlp->tx_queue);
}
339 
/*
 * Called after some data have been read from the socket.
 * Logs (and resets) the drop counters accumulated while the socket
 * receive buffer was full, then reschedules the processing task.
 */
void
nl_on_transmit(struct nlpcb *nlp)
{
	NLP_LOCK(nlp);

	struct socket *so = nlp->nl_socket;
	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
		/* Snapshot & clear counters so each overflow is reported once. */
		unsigned long dropped_bytes = nlp->nl_dropped_bytes;
		unsigned long dropped_messages = nlp->nl_dropped_messages;
		nlp->nl_dropped_bytes = 0;
		nlp->nl_dropped_messages = 0;

		struct sockbuf *sb = &so->so_rcv;
		NLP_LOG(LOG_DEBUG, nlp,
		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
		/* TODO: send netlink message */
	}

	nl_schedule_taskqueue(nlp);
	NLP_UNLOCK(nlp);
}
366 
367 void
368 nl_taskqueue_handler(void *_arg, int pending)
369 {
370 	struct nlpcb *nlp = (struct nlpcb *)_arg;
371 
372 	CURVNET_SET(nlp->nl_socket->so_vnet);
373 	nl_process_received(nlp);
374 	CURVNET_RESTORE();
375 }
376 
377 static __noinline void
378 queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
379 {
380 	queue_push(&nlp->tx_queue, m);
381 	nlp->nl_tx_blocked = true;
382 
383 	if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
384 		nlp->tx_queue.hiwat = nlp->tx_queue.length;
385 }
386 
/*
 * Tries to send @m to the socket @nlp.
 *
 * @m: mbuf(s) to send. Consumed in any case.
 * @nlp: socket to send to
 * @num_messages: number of netlink messages in @m
 * @io_flags: combination of NL_IOF_* flags
 *
 * Returns true on success.
 * If no queue overruns happened, wakes up socket owner.
 */
bool
nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
{
	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
	bool result = true;

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
		NLP_LOG(LOG_DEBUG2, nlp,
		    "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
		    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
		    io_flags);
	}

	/* Translate to the Linux netlink wire format for Linux sockets. */
	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
		if (m == NULL)
			return (false);
	}

	NLP_LOCK(nlp);

	if (__predict_false(nlp->nl_socket == NULL)) {
		/* Socket already detached: nothing to deliver to. */
		NLP_UNLOCK(nlp);
		m_freem(m);
		return (false);
	}

	/* Preserve ordering: if overflow data is queued, don't jump ahead. */
	if (!queue_empty(&nlp->tx_queue)) {
		if (ignore_limits) {
			queue_push_tx(nlp, m);
		} else {
			m_free(m);
			result = false;
		}
		NLP_UNLOCK(nlp);
		return (result);
	}

	struct socket *so = nlp->nl_socket;
	struct mbuf *ctl = NULL;
	if (__predict_false(m->m_next != NULL))
		ctl = extract_msg_info(m);
	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) {
		sorwakeup(so);
		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
	} else {
		if (ignore_limits) {
			queue_push_tx(nlp, m);
		} else {
			/*
			 * Store dropped data so it can be reported
			 * on the next read
			 */
			nlp->nl_dropped_bytes += m_length(m, NULL);
			nlp->nl_dropped_messages += num_messages;
			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
			    (unsigned long)nlp->nl_dropped_messages, num_messages,
			    (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
			soroverflow(so);
			m_freem(m);
			result = false;
		}
	}
	NLP_UNLOCK(nlp);

	return (result);
}
467 
/*
 * Validates and dispatches a single netlink message @hdr to the
 * protocol handler, translating from the Linux format first when
 * needed, and sends an ack/error reply when requested or on failure.
 * Returns non-zero only when the message framing itself is invalid.
 */
static int
nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
    struct nlpcb *nlp, struct nl_pstate *npt)
{
	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
	int error = 0;

	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
	    hdr->nlmsg_pid);

	/* Reject truncated or impossibly short messages. */
	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
		    hdr->nlmsg_len, remaining_length);
		return (EINVAL);
	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
		return (EINVAL);
	}
	/* Stamp each message with sender pid */
	hdr->nlmsg_pid = nlp->nl_port;

	npt->hdr = hdr;

	/* Only genuine requests (not control messages) reach the handler. */
	if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
		   hdr->nlmsg_type);

		if (nlp->nl_linux && linux_netlink_p != NULL) {
			struct nlmsghdr *hdr_orig = hdr;
			hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
			if (hdr == NULL) {
				 /* Failed to translate to kernel format. Report an error back */
				hdr = hdr_orig;
				npt->hdr = hdr;
				if (hdr->nlmsg_flags & NLM_F_ACK)
					nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt);
				return (0);
			}
		}
		error = handler(hdr, npt);
		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
	}
	/* Ack when explicitly requested or when the handler failed. */
	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
		if (!npt->nw->suppress_ack) {
			NL_LOG(LOG_DEBUG3, "ack");
			nlmsg_ack(nlp, error, hdr, npt);
		}
	}

	return (0);
}
520 
521 static void
522 npt_clear(struct nl_pstate *npt)
523 {
524 	lb_clear(&npt->lb);
525 	npt->error = 0;
526 	npt->err_msg = NULL;
527 	npt->err_off = 0;
528 	npt->hdr = NULL;
529 	npt->nw->suppress_ack = false;
530 }
531 
/*
 * Processes an incoming packet, which can contain multiple netlink messages.
 * Copies the mbuf data into a linear buffer (with scratch space appended)
 * and iterates over the contained messages.  Returns NULL when the whole
 * packet was consumed, or the remaining mbuf (trimmed to the unprocessed
 * part) when processing stopped early due to a blocked TX queue.
 */
static struct mbuf *
nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
{
	int offset, buffer_length;
	struct nlmsghdr *hdr;
	char *buffer;
	int error;

	NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);

	struct nl_writer nw = {};
	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
		m_freem(m);
		NL_LOG(LOG_DEBUG, "error allocating socket writer");
		return (NULL);
	}

	nlmsg_ignore_limit(&nw);
	/* TODO: alloc this buf once for nlp */
	int data_length = m_length(m, NULL);
	/* Layout: [message data (8-aligned)] [scratch buffer for npt.lb]. */
	buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
	if (nlp->nl_linux)
		/* Extra room for message translation from the Linux format. */
		buffer_length += roundup2(data_length, 8);
	buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
	if (buffer == NULL) {
		m_freem(m);
		nlmsg_flush(&nw);
		NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
		    buffer_length);
		return (NULL);
	}
	m_copydata(m, 0, data_length, buffer);

	struct nl_pstate npt = {
		.nlp = nlp,
		.lb.base = &buffer[roundup2(data_length, 8)],
		.lb.size = buffer_length - roundup2(data_length, 8),
		.nw = &nw,
		.strict = nlp->nl_flags & NLF_STRICT,
	};

	for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
		hdr = (struct nlmsghdr *)&buffer[offset];
		/* Save length prior to calling handler */
		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
		npt_clear(&npt);
		error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
		offset += msglen;
		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
			break;
	}
	NL_LOG(LOG_DEBUG3, "packet parsing done");
	free(buffer, M_NETLINK);
	nlmsg_flush(&nw);

	if (nlp->nl_tx_blocked) {
		/* Hand back the unprocessed tail so it can be retried later. */
		NLP_LOCK(nlp);
		nlp->nl_tx_blocked = false;
		NLP_UNLOCK(nlp);
		m_adj(m, offset);
		return (m);
	} else {
		m_freem(m);
		return (NULL);
	}
}
602