xref: /minix/minix/net/lwip/pktsock.c (revision fb9c64b2)
1 /* LWIP service - pktsock.c - packet code shared between UDP and RAW */
2 
3 #include "lwip.h"
4 #include "pktsock.h"
5 #include "ifaddr.h"
6 
/*
 * Scratch buffer for control (ancillary) data.  It is used both to copy in
 * and parse control data supplied with a send call, and to assemble control
 * data before copying it out on a receive call.  According to RFC 3542 this
 * buffer should be at least 10KB, but none of the ancillary options that are
 * supported here need anywhere near that much space.
 */
#define PKTSOCK_CTLBUF_SIZE		256

static char pktsock_ctlbuf[PKTSOCK_CTLBUF_SIZE];
14 
/*
 * Per-packet receive metadata.  For each queued packet, this information is
 * stored in the space that used to hold the packet's IP header, so that it can
 * be turned into a source address and ancillary data once the packet is
 * actually received by userland.
 *
 * A single generic structure with two ip_addr_t fields (source, destination)
 * would be simpler, but it would include IPv6-size addresses and therefore
 * not fit in the header space of an IPv4 packet.  The metadata is thus split
 * in two parts: a generic header (struct pkthdr), and an address structure
 * that is specific to the IP version (struct pktaddr4 or struct pktaddr6).
 * The PKTHF_IPV6 flag in the generic header tells which address structure
 * accompanies it.  The generic header and the address structure together must
 * fit in the IP header of the respective IP version.  For IPv6, there is so
 * little room to spare that packed addresses have to be used; for IPv4, the
 * regular address type is used for simplicity.
 */
struct pkthdr {
	uint16_t port;		/* source port number (UDP only) */
	uint8_t dstif;		/* index of interface that received the pkt */
	uint8_t addrif;		/* index of interface that accepted the pkt */
	uint8_t tos;		/* TOS (IPv4) or TC (IPv6) header value */
	uint8_t ttl;		/* TTL (IPv4) or HL (IPv6) header value */
	uint8_t flags;		/* packet flags (PKTHF_) */
	uint8_t _unused;	/* the only space that is still available */
};
37 
/* Flags for the 'flags' field of struct pkthdr. */
#define PKTHF_IPV6	0x01	/* IPv6 packet; a pktaddr6 structure is used */
#define PKTHF_MCAST	0x02	/* packet was sent to a multicast destination */
#define PKTHF_BCAST	0x04	/* packet was sent to a broadcast destination */
41 
42 struct pktaddr4 {
43 	ip4_addr_t srcaddr;
44 	ip4_addr_t dstaddr;
45 };
46 
47 struct pktaddr6 {
48 	ip6_addr_p_t srcaddr;
49 	ip6_addr_p_t dstaddr;
50 };
51 
52 /*
53  * Create a packet socket.  Relay parameters and return values to and from the
54  * IP module's socket creation function.  This function must not allocate any
55  * resources in any form, as socket creation may still fail later, in which
56  * case no destruction function is called.
57  */
58 int
59 pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, size_t rcvbuf,
60 	struct sock ** sockp)
61 {
62 
63 	pkt->pkt_rcvhead = NULL;
64 	pkt->pkt_rcvtailp = &pkt->pkt_rcvhead;
65 	pkt->pkt_rcvlen = 0;
66 
67 	mcast_reset(&pkt->pkt_mcast);
68 
69 	memset(&pkt->pkt_srcaddr, 0, sizeof(pkt->pkt_srcaddr));
70 	pkt->pkt_ifindex = 0;
71 
72 	/*
73 	 * Any PKTF_ type flags should be initialized on the socket only after
74 	 * the following call, as this call will clear the flags field.  For
75 	 * now, no PKTF_ flags need to be set by default, though.
76 	 */
77 	return ipsock_socket(&pkt->pkt_ipsock, domain, sndbuf, rcvbuf, sockp);
78 }
79 
80 /*
81  * Return TRUE if the given packet can and should be received on the given
82  * socket, or FALSE if there is a reason not to receive the packet.
83  */
84 static int
85 pktsock_may_recv(struct pktsock * pkt, struct pbuf * pbuf)
86 {
87 
88 	/*
89 	 * By policy, multicast packets should not be received on sockets of
90 	 * which the owning application is not multicast aware.
91 	 */
92 	if (ip_addr_ismulticast(ip_current_dest_addr()) &&
93 	    !(ipsock_get_flag(&pkt->pkt_ipsock, PKTF_MCAWARE)))
94 		return FALSE;
95 
96 	/*
97 	 * Due to fragment reassembly, we might end up with packets that take
98 	 * up more buffer space than their byte size, even after rounding up
99 	 * the latter.  The user probably does not want packets to get dropped
100 	 * for that reason, e.g. when they set a 64K limit and the packet ends
101 	 * up being estimated as 65K and dropped.  So, we test against
102 	 * 'pbuf->tot_len' rather than the rounded-up packet size.  However,
103 	 * 'pkt->pkt_rcvlen' itself is increased by the rounded-up packet size
104 	 * when enqueuing the packet, so that we still count the memory
105 	 * consumption (generally) conservatively, which is what we want.
106 	 */
107 	return (pkt->pkt_rcvlen + pbuf->tot_len <=
108 	    ipsock_get_rcvbuf(&pkt->pkt_ipsock));
109 }
110 
111 /*
112  * Check whether the given packet can and should be received on the given
113  * socket.  If so, return the amount of space for ancillary information that
114  * will be necessary for the packet.  If not, return a negative value.
115  */
116 int
117 pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf)
118 {
119 
120 	/*
121 	 * This check will be done again in pktsock_input(), but this function
122 	 * is called for raw packets only (not for UDP packets) and, if this
123 	 * (cheap) check fails, we can avoid a (rather expensive) packet copy.
124 	 */
125 	if (!pktsock_may_recv(pkt, pbuf))
126 		return -1;
127 
128 	if (ip_current_is_v6())
129 		return (int)(sizeof(struct pktaddr6) + sizeof(struct pkthdr));
130 	else
131 		return (int)(sizeof(struct pktaddr4) + sizeof(struct pkthdr));
132 }
133 
134 /*
135  * A packet has arrived on a packet socket.  We own the given packet buffer,
136  * and so we must free it if we do not want to keep it.
137  */
138 void
139 pktsock_input(struct pktsock * pkt, struct pbuf * pbuf,
140 	const ip_addr_t * srcaddr, uint16_t port)
141 {
142 	struct pktaddr4 pktaddr4;
143 	struct pktaddr6 pktaddr6;
144 	struct pkthdr pkthdr;
145 	void *pktaddr;
146 	struct ifdev *ifdev;
147 	size_t pktaddrlen;
148 
149 	/*
150 	 * We are going to mess with the packet's header and contents, so we
151 	 * must be the exclusive owner of the packet.  For UDP packets, lwIP
152 	 * must have made a copy for us in case of non-exclusive delivery
153 	 * (e.g., multicast packets).  For raw packets, we have made a copy of
154 	 * the packet ourselves just before the call to this function.
155 	 */
156 	if (pbuf->ref != 1)
157 		panic("input packet has multiple references!");
158 
159 	/* If the packet should not be received on this socket, drop it. */
160 	if (!pktsock_may_recv(pkt, pbuf)) {
161 		pbuf_free(pbuf);
162 
163 		return;
164 	}
165 
166 	/*
167 	 * Enqueue the packet.  Overwrite the leading IP header with packet
168 	 * information that is used at the time of receipt by userland.  The
169 	 * data structures are such that the information always fits in what
170 	 * was the IP header.  The reference count check earlier ensures that
171 	 * we never overwrite part of a packet that is still in use elsewhere.
172 	 */
173 	if (ip_current_is_v6()) {
174 		assert(IP_IS_V6(srcaddr));
175 		assert(ip6_current_dest_addr() != NULL);
176 
177 		ip6_addr_copy_to_packed(pktaddr6.srcaddr, *ip_2_ip6(srcaddr));
178 		ip6_addr_copy_to_packed(pktaddr6.dstaddr,
179 		    *ip6_current_dest_addr());
180 		pktaddr = &pktaddr6;
181 		pktaddrlen = sizeof(pktaddr6);
182 
183 		assert(pktaddrlen + sizeof(pkthdr) <= IP6_HLEN);
184 
185 		pkthdr.tos = IP6H_TC(ip6_current_header());
186 		pkthdr.ttl = IP6H_HOPLIM(ip6_current_header());
187 		pkthdr.flags = PKTHF_IPV6;
188 	} else {
189 		assert(IP_IS_V4(srcaddr));
190 		assert(ip4_current_dest_addr() != NULL);
191 
192 		memcpy(&pktaddr4.srcaddr, ip_2_ip4(srcaddr),
193 		    sizeof(pktaddr4.srcaddr));
194 		memcpy(&pktaddr4.dstaddr, ip4_current_dest_addr(),
195 		    sizeof(pktaddr4.srcaddr));
196 		pktaddr = &pktaddr4;
197 		pktaddrlen = sizeof(pktaddr4);
198 
199 		assert(pktaddrlen + sizeof(pkthdr) <= IP_HLEN);
200 
201 		pkthdr.tos = IPH_TOS(ip4_current_header());
202 		pkthdr.ttl = IPH_TTL(ip4_current_header());
203 		pkthdr.flags = 0;
204 	}
205 
206 	/*
207 	 * Save both the interface on which the packet was received (for
208 	 * PKTINFO) and the interface that owns the destination address of the
209 	 * packet (for the source address's zone ID).
210 	 */
211 	assert(ip_current_input_netif() != NULL);
212 	ifdev = netif_get_ifdev(ip_current_input_netif());
213 	pkthdr.dstif = (uint16_t)ifdev_get_index(ifdev);
214 
215 	assert(ip_current_netif() != NULL);
216 	ifdev = netif_get_ifdev(ip_current_netif());
217 	pkthdr.addrif = (uint16_t)ifdev_get_index(ifdev);
218 
219 	if ((pbuf->flags & PBUF_FLAG_LLMCAST) ||
220 	    ip_addr_ismulticast(ip_current_dest_addr()))
221 		pkthdr.flags |= PKTHF_MCAST;
222 	else if ((pbuf->flags & PBUF_FLAG_LLBCAST) ||
223 	    ip_addr_isbroadcast(ip_current_dest_addr(), ip_current_netif()))
224 		pkthdr.flags |= PKTHF_BCAST;
225 
226 	pkthdr.port = port;
227 
228 	util_pbuf_header(pbuf, sizeof(pkthdr));
229 
230 	memcpy(pbuf->payload, &pkthdr, sizeof(pkthdr));
231 
232 	util_pbuf_header(pbuf, pktaddrlen);
233 
234 	memcpy(pbuf->payload, pktaddr, pktaddrlen);
235 
236 	util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + pktaddrlen));
237 
238 	*pkt->pkt_rcvtailp = pbuf;
239 	pkt->pkt_rcvtailp = pchain_end(pbuf);
240 	pkt->pkt_rcvlen += pchain_size(pbuf);
241 
242 	sockevent_raise(ipsock_get_sock(&pkt->pkt_ipsock), SEV_RECV);
243 }
244 
245 /*
246  * Obtain interface and source address information for an outgoing packet.  In
247  * particular, parse any IPV6_PKTINFO options provided as either sticky options
248  * on the socket 'pkt' or as ancillary options in the packet options 'pkto'.
249  * On success, return OK, with 'ifdevp' set to either the outgoing interface to
250  * use for the packet, or NULL if no outgoing interface was specified using
251  * either of the aforementioned options.  If, and only if, 'ifdevp' is set to
252  * an actual interface (i.e., not NULL), then 'src_addrp' is filled with either
253  * a locally owned, validated, unicast address to use as source of the packet,
254  * or the unspecified ('any') address if no source address was specified using
255  * the options.  On failure, return a negative error code.
256  */
257 int
258 pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto,
259 	struct ifdev ** ifdevp, ip_addr_t * src_addrp)
260 {
261 	struct ifdev *ifdev, *ifdev2;
262 	ip_addr_t ipaddr;
263 	uint32_t ifindex;
264 	int r;
265 
266 	/* We support only IPV6_PKTINFO.  IP_PKTINFO is not supported. */
267 	if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) {
268 		*ifdevp = NULL;
269 		return OK;
270 	}
271 
272 	/*
273 	 * TODO: we are spending a lot of effort on initializing and copying
274 	 * stuff around, even just to find out whether there is anything to do
275 	 * at all here.  See if this can be optimized.
276 	 */
277 	ip_addr_set_zero_ip6(&ipaddr);
278 
279 	/*
280 	 * Ancillary data takes precedence over sticky options.  We treat the
281 	 * source address and interface index fields as separate, overriding
282 	 * each earlier value only if non-zero.  TODO: is that correct?
283 	 */
284 	if (pkto->pkto_flags & PKTOF_PKTINFO) {
285 		memcpy(ip_2_ip6(&ipaddr)->addr, &pkto->pkto_srcaddr.addr,
286 		    sizeof(ip_2_ip6(&ipaddr)->addr));
287 		ifindex = pkto->pkto_ifindex;
288 	} else
289 		ifindex = 0;
290 
291 	if (ip6_addr_isany(ip_2_ip6(&ipaddr)))
292 		memcpy(ip_2_ip6(&ipaddr)->addr, &pkt->pkt_srcaddr.addr,
293 		    sizeof(ip_2_ip6(&ipaddr)->addr));
294 	if (ifindex == 0)
295 		ifindex = pkt->pkt_ifindex;
296 
297 	/* If both fields are blank, there is nothing more to do. */
298 	if (ip6_addr_isany(ip_2_ip6(&ipaddr)) && ifindex == 0) {
299 		*ifdevp = NULL;
300 		return OK;
301 	}
302 
303 	/* If an interface index is specified, it must be valid. */
304 	ifdev = NULL;
305 
306 	if (ifindex != 0 && (ifdev = ifdev_get_by_index(ifindex)) == NULL)
307 		return ENXIO;
308 
309 	/*
310 	 * Use the interface index to set a zone on the source address, if the
311 	 * source address has a scope.
312 	 */
313 	if (ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) {
314 		if (ifindex == 0)
315 			return EADDRNOTAVAIL;
316 
317 		ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex);
318 	}
319 
320 	/*
321 	 * We need to validate the given address just as thoroughly as an
322 	 * address given through bind().  If we don't, we could allow forged
323 	 * source addresses etcetera.  To be sure: this call may change the
324 	 * address to an IPv4 type address if needed.
325 	 */
326 	if ((r = ipsock_check_src_addr(pktsock_get_ipsock(pkt), &ipaddr,
327 	    FALSE /*allow_mcast*/, &ifdev2)) != OK)
328 		return r;
329 
330 	if (ifdev2 != NULL) {
331 		if (ifdev == NULL)
332 			ifdev = ifdev2;
333 		else if (ifdev != ifdev2)
334 			return EADDRNOTAVAIL;
335 	} else {
336 		/*
337 		 * There should be no cases where the (non-multicast) address
338 		 * successfully parsed, is not unspecified, and yet did not map
339 		 * to an interface.  Eliminate the possibility anyway by
340 		 * throwing an error for this case.  As a result, we are left
341 		 * with one of two cases:
342 		 *
343 		 * 1) ifdevp is not NULL, and src_addrp is unspecified;
344 		 * 2) ifdevp is not NULL, and src_addrp is a locally assigned
345 		 *    (unicast) address.
346 		 *
347 		 * This is why we need not fill src_addrp when ifdevp is NULL.
348 		 */
349 		if (!ip_addr_isany(&ipaddr))
350 			return EADDRNOTAVAIL;
351 	}
352 
353 	*ifdevp = ifdev;
354 	if (ifdev != NULL)
355 		*src_addrp = ipaddr;
356 	return OK;
357 }
358 
359 /*
360  * Parse a chunk of user-provided control data, on an IPv4 socket provided as
361  * 'pkt'.  The control chunk is given as 'cmsg', and the length of the data
362  * following the control header (possibly zero) is given as 'len'.  On success,
363  * return OK, with any parsed options merged into the set of packet options
364  * 'pkto'.  On failure, return a negative error code.
365  */
366 static int
367 pktsock_parse_ctl_v4(struct pktsock * pkt __unused, struct cmsghdr * cmsg,
368 	socklen_t len, struct pktopt * pkto)
369 {
370 	uint8_t byte;
371 	int val;
372 
373 	if (cmsg->cmsg_level != IPPROTO_IP)
374 		return EAFNOSUPPORT;
375 
376 	switch (cmsg->cmsg_type) {
377 	case IP_TOS:
378 		/*
379 		 * Some userland code (bind's libisc in particular) supplies
380 		 * a single byte instead of a full integer for this option.
381 		 * We go out of our way to accept that format, too.
382 		 */
383 		if (len != sizeof(val) && len != sizeof(byte))
384 			return EINVAL;
385 
386 		if (len == sizeof(byte)) {
387 			memcpy(&byte, CMSG_DATA(cmsg), sizeof(byte));
388 			val = (int)byte;
389 		} else
390 			memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
391 
392 		if (val < 0 || val > UINT8_MAX)
393 			return EINVAL;
394 
395 		pkto->pkto_flags |= PKTOF_TOS;
396 		pkto->pkto_tos = (uint8_t)val;
397 
398 		return OK;
399 
400 	case IP_TTL:
401 		if (len != sizeof(val))
402 			return EINVAL;
403 
404 		memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
405 
406 		if (val < 0 || val > UINT8_MAX)
407 			return EINVAL;
408 
409 		pkto->pkto_flags |= PKTOF_TTL;
410 		pkto->pkto_ttl = (uint8_t)val;
411 
412 		return OK;
413 
414 	/*
415 	 * Implementing IP_PKTINFO might be a bit harder than its IPV6_PKTINFO
416 	 * sibling, because it would require the use of zone IDs (interface
417 	 * indices) for IPv4, which is not supported yet.
418 	 */
419 	}
420 
421 	return EINVAL;
422 }
423 
424 /*
425  * Parse a chunk of user-provided control data, on an IPv6 socket provided as
426  * 'pkt'.  The control chunk is given as 'cmsg', and the length of the data
427  * following the control header (possibly zero) is given as 'len'.  On success,
428  * return OK, with any parsed options merged into the set of packet options
429  * 'pkto'.  On failure, return a negative error code.
430  */
431 static int
432 pktsock_parse_ctl_v6(struct pktsock * pkt, struct cmsghdr * cmsg,
433 	socklen_t len, struct pktopt * pkto)
434 {
435 	struct in6_pktinfo ipi6;
436 	int val;
437 
438 	if (cmsg->cmsg_level != IPPROTO_IPV6)
439 		return EAFNOSUPPORT;
440 
441 	switch (cmsg->cmsg_type) {
442 	case IPV6_TCLASS:
443 		if (len != sizeof(val))
444 			return EINVAL;
445 
446 		memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
447 
448 		if (val < -1 || val > UINT8_MAX)
449 			return EINVAL;
450 
451 		if (val == -1)
452 			val = 0;
453 
454 		pkto->pkto_flags |= PKTOF_TOS;
455 		pkto->pkto_tos = (uint8_t)val;
456 
457 		return OK;
458 
459 	case IPV6_HOPLIMIT:
460 		if (len != sizeof(val))
461 			return EINVAL;
462 
463 		memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
464 
465 		if (val < -1 || val > UINT8_MAX)
466 			return EINVAL;
467 
468 		if (val == -1)
469 			val = IP_DEFAULT_TTL;
470 
471 		pkto->pkto_flags |= PKTOF_TTL;
472 		pkto->pkto_ttl = (uint8_t)val;
473 
474 		return OK;
475 
476 	case IPV6_PKTINFO:
477 		if (len != sizeof(ipi6))
478 			return EINVAL;
479 
480 		memcpy(&ipi6, CMSG_DATA(cmsg), sizeof(ipi6));
481 
482 		pkto->pkto_flags |= PKTOF_PKTINFO;
483 		memcpy(&pkto->pkto_srcaddr.addr, &ipi6.ipi6_addr,
484 		    sizeof(pkto->pkto_srcaddr.addr));
485 		pkto->pkto_ifindex = ipi6.ipi6_ifindex;
486 
487 		return OK;
488 
489 	case IPV6_USE_MIN_MTU:
490 		if (len != sizeof(int))
491 			return EINVAL;
492 
493 		memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
494 
495 		if (val < -1 || val > 1)
496 			return EINVAL;
497 
498 		/* TODO: not supported by lwIP, but needed by applications. */
499 		return OK;
500 	}
501 
502 	return EINVAL;
503 }
504 
505 /*
506  * Copy in and parse control data, as part of sending a packet on socket 'pkt'.
507  * The control data is accessible through 'ctl', with a user-provided length of
508  * 'ctl_len'.  On success, return OK, with any parsed packet options stored in
509  * 'pkto'.  On failure, return a negative error code.
510  */
511 int
512 pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl,
513 	socklen_t ctl_len, struct pktopt * pkto)
514 {
515 	struct msghdr msghdr;
516 	struct cmsghdr *cmsg;
517 	socklen_t left, len;
518 	int r;
519 
520 	/* The default: no packet options are being overridden. */
521 	assert(pkto->pkto_flags == 0);
522 
523 	/* If no control length is given, we are done here. */
524 	if (ctl_len == 0)
525 		return OK;
526 
527 	/*
528 	 * For now, we put a rather aggressive limit on the size of the control
529 	 * data.  We copy in and parse the whole thing in a single buffer.
530 	 */
531 	if (ctl_len > sizeof(pktsock_ctlbuf)) {
532 		printf("LWIP: too much control data given (%u bytes)\n",
533 		    ctl_len);
534 
535 		return ENOBUFS;
536 	}
537 
538 	if ((r = sockdriver_copyin(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK)
539 		return r;
540 
541 	memset(&msghdr, 0, sizeof(msghdr));
542 	msghdr.msg_control = pktsock_ctlbuf;
543 	msghdr.msg_controllen = ctl_len;
544 
545 	for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
546 	    cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
547 		/* Check for bogus lengths. */
548 		assert((socklen_t)((char *)cmsg - pktsock_ctlbuf) <= ctl_len);
549 		left = ctl_len - (socklen_t)((char *)cmsg - pktsock_ctlbuf);
550 		assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
551 
552 		if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
553 			printf("LWIP: malformed control data rejected\n");
554 
555 			return EINVAL;
556 		}
557 
558 		len = cmsg->cmsg_len - CMSG_LEN(0);
559 
560 		if (ipsock_is_ipv6(&pkt->pkt_ipsock))
561 			r = pktsock_parse_ctl_v6(pkt, cmsg, len, pkto);
562 		else
563 			r = pktsock_parse_ctl_v4(pkt, cmsg, len, pkto);
564 
565 		if (r != OK)
566 			return r;
567 	}
568 
569 	return OK;
570 }
571 
572 /*
573  * Copy in the packet data from the calling user process, and store it in the
574  * buffer 'pbuf' that must already have been allocated with the appropriate
575  * size.
576  */
577 int
578 pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data,
579 	size_t len, struct pbuf * pbuf)
580 
581 {
582 
583 	return util_copy_data(data, len, 0, pbuf, 0, TRUE /*copy_in*/);
584 }
585 
586 /*
587  * Dequeue and free the head of the receive queue of a packet socket.
588  */
589 static void
590 pktsock_dequeue(struct pktsock * pkt)
591 {
592 	struct pbuf *pbuf, **pnext;
593 	size_t size;
594 
595 	pbuf = pkt->pkt_rcvhead;
596 	assert(pbuf != NULL);
597 
598 	pnext = pchain_end(pbuf);
599 	size = pchain_size(pbuf);
600 
601 	if ((pkt->pkt_rcvhead = *pnext) == NULL)
602 		pkt->pkt_rcvtailp = &pkt->pkt_rcvhead;
603 
604 	assert(pkt->pkt_rcvlen >= size);
605 	pkt->pkt_rcvlen -= size;
606 
607 	*pnext = NULL;
608 	pbuf_free(pbuf);
609 }
610 
611 /*
612  * Perform preliminary checks on a receive request.
613  */
614 int
615 pktsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
616 	int flags)
617 {
618 
619 	/*
620 	 * We accept the same flags across all socket types in LWIP, and then
621 	 * simply ignore the ones we do not support for packet sockets.
622 	 */
623 	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
624 		return EOPNOTSUPP;
625 
626 	return OK;
627 }
628 
629 /*
630  * Add a chunk of control data to the global control buffer, starting from
631  * offset 'off'.  The chunk has the given level and type, and its data is given
632  * in the buffer 'ptr' with size 'len'.  Return the (padded) size of the chunk
633  * that was generated as a result.
634  */
635 static size_t
636 pktsock_add_ctl(int level, int type, void * ptr, socklen_t len, size_t off)
637 {
638 	struct cmsghdr cmsg;
639 	size_t size;
640 
641 	size = CMSG_SPACE(len);
642 
643 	/*
644 	 * The global control buffer must be large enough to store one chunk
645 	 * of each of the supported options.  If this panic triggers, increase
646 	 * PKTSOCK_CTLBUF_SIZE by as much as needed.
647 	 */
648 	if (off + size > sizeof(pktsock_ctlbuf))
649 		panic("control buffer too small, increase "
650 		    "PKTSOCK_CTLBUF_SIZE");
651 
652 	memset(&cmsg, 0, sizeof(cmsg));
653 	cmsg.cmsg_len = CMSG_LEN(len);
654 	cmsg.cmsg_level = level;
655 	cmsg.cmsg_type = type;
656 
657 	/*
658 	 * Clear any padding space.  This can be optimized, but in any case we
659 	 * must be careful not to copy out any bytes that have not been
660 	 * initialized at all.
661 	 */
662 	memset(&pktsock_ctlbuf[off], 0, size);
663 
664 	memcpy(&pktsock_ctlbuf[off], &cmsg, sizeof(cmsg));
665 	memcpy(CMSG_DATA((struct cmsghdr *)&pktsock_ctlbuf[off]), ptr, len);
666 
667 	return size;
668 }
669 
670 /*
671  * Generate and copy out control data, as part of delivering a packet from
672  * socket 'pkt' to userland.  The control data buffer is given as 'ctl', with
673  * a user-given length of 'ctl_len' bytes.  The packet's header information is
674  * provided as 'pkthdr', and its source and destination addresses as 'pktaddr',
675  * which maybe a pktaddr4 or pktaddr6 structure depending on the value of the
676  * PKTHF_IPV6 flag in the 'flags' field in 'pkthdr'.  Note that we support
677  * dual-stack sockets, and as such it is possible that the socket is of domain
678  * AF_INET6 while the received packet is an IPv4 packet.  On success, return
679  * the size of the control data copied out (possibly zero).  If more control
680  * data were generated than copied out, also merge the MSG_CTRUNC flag into
681  * 'rflags'.  On failure, return a negative error code.
682  */
683 static int
684 pktsock_put_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl,
685 	socklen_t ctl_len, struct pkthdr * pkthdr, void * pktaddr,
686 	int * rflags)
687 {
688 	struct pktaddr6 *pktaddr6;
689 	struct pktaddr4 *pktaddr4;
690 	struct in_pktinfo ipi;
691 	struct in6_pktinfo ipi6;
692 	ip_addr_t ipaddr;
693 	unsigned int flags;
694 	uint8_t byte;
695 	size_t off;
696 	int r, val;
697 
698 	flags = ipsock_get_flags(&pkt->pkt_ipsock);
699 
700 	if (!(flags & (PKTF_RECVINFO | PKTF_RECVTOS | PKTF_RECVTTL)))
701 		return 0;
702 
703 	/*
704 	 * Important: all generated control chunks must fit in the global
705 	 * control buffer together.  When adding more options here, ensure that
706 	 * the control buffer remains large enough to receive all options at
707 	 * once.  See also the panic in pktsock_add_ctl().
708 	 */
709 	off = 0;
710 
711 	/*
712 	 * IPv6 sockets may receive IPv4 packets.  The ancillary data is in the
713 	 * format corresponding to the socket, which means we may have to
714 	 * convert any IPv4 addresses from the packet to IPv4-mapped IPv6
715 	 * addresses for the ancillary data, just like the source address.
716 	 */
717 	if (ipsock_is_ipv6(&pkt->pkt_ipsock)) {
718 		if (flags & PKTF_RECVTTL) {
719 			val = pkthdr->ttl;
720 
721 			off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_HOPLIMIT,
722 			    &val, sizeof(val), off);
723 		}
724 
725 		if (flags & PKTF_RECVTOS) {
726 			val = pkthdr->tos;
727 
728 			off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_TCLASS, &val,
729 			    sizeof(val), off);
730 		}
731 
732 		if (flags & PKTF_RECVINFO) {
733 			memset(&ipi6, 0, sizeof(ipi6));
734 
735 			if (pkthdr->flags & PKTHF_IPV6) {
736 				pktaddr6 = (struct pktaddr6 *)pktaddr;
737 				memcpy(&ipi6.ipi6_addr, &pktaddr6->dstaddr,
738 				    sizeof(ipi6.ipi6_addr));
739 			} else {
740 				pktaddr4 = (struct pktaddr4 *)pktaddr;
741 
742 				addr_make_v4mapped_v6(&ipaddr,
743 				    &pktaddr4->dstaddr);
744 
745 				memcpy(&ipi6.ipi6_addr,
746 				    ip_2_ip6(&ipaddr)->addr,
747 				    sizeof(ipi6.ipi6_addr));
748 			}
749 			ipi6.ipi6_ifindex = pkthdr->dstif;
750 
751 			off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_PKTINFO,
752 			    &ipi6, sizeof(ipi6), off);
753 		}
754 	} else {
755 		if (flags & PKTF_RECVTTL) {
756 			byte = pkthdr->ttl;
757 
758 			off += pktsock_add_ctl(IPPROTO_IP, IP_TTL, &byte,
759 			    sizeof(byte), off);
760 		}
761 
762 		if (flags & PKTF_RECVINFO) {
763 			assert(!(pkthdr->flags & PKTHF_IPV6));
764 			pktaddr4 = (struct pktaddr4 *)pktaddr;
765 
766 			memset(&ipi, 0, sizeof(ipi));
767 			memcpy(&ipi.ipi_addr, &pktaddr4->dstaddr,
768 			    sizeof(ipi.ipi_addr));
769 			ipi.ipi_ifindex = pkthdr->dstif;
770 
771 			off += pktsock_add_ctl(IPPROTO_IP, IP_PKTINFO, &ipi,
772 			    sizeof(ipi), off);
773 		}
774 	}
775 
776 	assert(off > 0);
777 
778 	if (ctl_len >= off)
779 		ctl_len = off;
780 	else
781 		*rflags |= MSG_CTRUNC;
782 
783 	if (ctl_len > 0 &&
784 	    (r = sockdriver_copyout(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK)
785 		return r;
786 
787 	return ctl_len;
788 }
789 
790 /*
791  * Receive data on a packet socket.
792  */
793 int
794 pktsock_recv(struct sock * sock, const struct sockdriver_data * data,
795 	size_t len, size_t * off, const struct sockdriver_data * ctl,
796 	socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr,
797 	socklen_t * addr_len, endpoint_t user_endpt __unused, int flags,
798 	size_t min __unused, int * rflags)
799 {
800 	struct pktsock *pkt = (struct pktsock *)sock;
801 	struct pktaddr4 pktaddr4;
802 	struct pktaddr6 pktaddr6;
803 	struct pkthdr pkthdr;
804 	void *pktaddr;
805 	struct pbuf *pbuf;
806 	ip_addr_t srcaddr;
807 	int r;
808 
809 	if ((pbuf = pkt->pkt_rcvhead) == NULL)
810 		return SUSPEND;
811 
812 	/*
813 	 * Get the ancillary data for the packet.  The format of the ancillary
814 	 * data depends on the received packet type, which may be different
815 	 * from the socket type.
816 	 */
817 	util_pbuf_header(pbuf, sizeof(pkthdr));
818 
819 	memcpy(&pkthdr, pbuf->payload, sizeof(pkthdr));
820 
821 	if (pkthdr.flags & PKTHF_IPV6) {
822 		util_pbuf_header(pbuf, sizeof(pktaddr6));
823 
824 		memcpy(&pktaddr6, pbuf->payload, sizeof(pktaddr6));
825 		pktaddr = &pktaddr6;
826 
827 		ip_addr_copy_from_ip6_packed(srcaddr, pktaddr6.srcaddr);
828 		if (ip6_addr_has_scope(ip_2_ip6(&srcaddr), IP6_UNICAST))
829 			ip6_addr_set_zone(ip_2_ip6(&srcaddr), pkthdr.addrif);
830 
831 		util_pbuf_header(pbuf,
832 		    -(int)(sizeof(pkthdr) + sizeof(pktaddr6)));
833 	} else {
834 		util_pbuf_header(pbuf, sizeof(pktaddr4));
835 
836 		memcpy(&pktaddr4, pbuf->payload, sizeof(pktaddr4));
837 		pktaddr = &pktaddr4;
838 
839 		ip_addr_copy_from_ip4(srcaddr, pktaddr4.srcaddr);
840 
841 		util_pbuf_header(pbuf,
842 		    -(int)(sizeof(pkthdr) + sizeof(pktaddr4)));
843 	}
844 
845 	/* Copy out the packet data to the calling user process. */
846 	if (len >= pbuf->tot_len)
847 		len = pbuf->tot_len;
848 	else
849 		*rflags |= MSG_TRUNC;
850 
851 	r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/);
852 
853 	if (r != OK)
854 		return r;
855 
856 	/* Generate and copy out ancillary (control) data, if requested. */
857 	if ((r = pktsock_put_ctl(pkt, ctl, ctl_len, &pkthdr, pktaddr,
858 	    rflags)) < 0)
859 		return r;
860 
861 	/* Store the source IP address. */
862 	ipsock_put_addr(&pkt->pkt_ipsock, addr, addr_len, &srcaddr,
863 	    pkthdr.port);
864 
865 	/* Set multicast or broadcast message flag, if applicable. */
866 	if (pkthdr.flags & PKTHF_MCAST)
867 		*rflags |= MSG_MCAST;
868 	else if (pkthdr.flags & PKTHF_BCAST)
869 		*rflags |= MSG_BCAST;
870 
871 	/* Discard the packet now, unless we were instructed to peek only. */
872 	if (!(flags & MSG_PEEK))
873 		pktsock_dequeue(pkt);
874 
875 	/* Return the received part of the packet length. */
876 	*off = len;
877 	*ctl_off = r;
878 	return OK;
879 }
880 
881 /*
882  * Test whether data can be received on a packet socket, and if so, how many
883  * bytes of data.
884  */
885 int
886 pktsock_test_recv(struct sock * sock, size_t min __unused, size_t * size)
887 {
888 	struct pktsock *pkt = (struct pktsock *)sock;
889 
890 	if (pkt->pkt_rcvhead == NULL)
891 		return SUSPEND;
892 
893 	if (size != NULL)
894 		*size = pkt->pkt_rcvhead->tot_len;
895 	return OK;
896 }
897 
898 /*
899  * The caller has performed a multicast operation on the given socket.  Thus,
900  * the caller is multicast aware.  Remember this, because that means the socket
901  * may also receive traffic to multicast destinations.
902  */
903 void
904 pktsock_set_mcaware(struct pktsock * pkt)
905 {
906 
907 	ipsock_set_flag(&pkt->pkt_ipsock, PKTF_MCAWARE);
908 }
909 
910 /*
911  * Set socket options on a packet socket.
912  */
913 int
914 pktsock_setsockopt(struct pktsock * pkt, int level, int name,
915 	const struct sockdriver_data * data, socklen_t len,
916 	struct ipopts * ipopts)
917 {
918 	struct ip_mreq imr;
919 	struct ipv6_mreq ipv6mr;
920 	struct in6_pktinfo ipi6;
921 	ip_addr_t ipaddr, ifaddr;
922 	struct ifdev *ifdev;
923 	unsigned int flag;
924 	uint32_t ifindex;
925 	int r, val, has_scope;
926 
927 	switch (level) {
928 	case IPPROTO_IP:
929 		if (ipsock_is_ipv6(&pkt->pkt_ipsock))
930 			break;
931 
932 		switch (name) {
933 		case IP_ADD_MEMBERSHIP:
934 		case IP_DROP_MEMBERSHIP:
935 			pktsock_set_mcaware(pkt);
936 
937 			if ((r = sockdriver_copyin_opt(data, &imr, sizeof(imr),
938 			    len)) != OK)
939 				return r;
940 
941 			ip_addr_set_ip4_u32(&ipaddr, imr.imr_multiaddr.s_addr);
942 			ip_addr_set_ip4_u32(&ifaddr, imr.imr_interface.s_addr);
943 
944 			if (!ip_addr_isany(&ifaddr)) {
945 				ifdev = ifaddr_map_by_addr(&ifaddr);
946 
947 				if (ifdev == NULL)
948 					return EADDRNOTAVAIL;
949 			} else
950 				ifdev = NULL;
951 
952 			if (name == IP_ADD_MEMBERSHIP)
953 				r = mcast_join(&pkt->pkt_mcast, &ipaddr,
954 				    ifdev);
955 			else
956 				r = mcast_leave(&pkt->pkt_mcast, &ipaddr,
957 				    ifdev);
958 
959 			return r;
960 
961 		case IP_RECVTTL:
962 		case IP_RECVPKTINFO:
963 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
964 			    len)) != OK)
965 				return r;
966 
967 			switch (name) {
968 			case IP_RECVTTL:	flag = PKTF_RECVTTL; break;
969 			case IP_RECVPKTINFO:	flag = PKTF_RECVINFO; break;
970 			default:		flag = 0; assert(0); break;
971 			}
972 
973 			if (val)
974 				ipsock_set_flag(&pkt->pkt_ipsock, flag);
975 			else
976 				ipsock_clear_flag(&pkt->pkt_ipsock, flag);
977 
978 			return OK;
979 		}
980 
981 		break;
982 
983 	case IPPROTO_IPV6:
984 		if (!ipsock_is_ipv6(&pkt->pkt_ipsock))
985 			break;
986 
987 		switch (name) {
988 		case IPV6_JOIN_GROUP:
989 		case IPV6_LEAVE_GROUP:
990 			pktsock_set_mcaware(pkt);
991 
992 			if ((r = sockdriver_copyin_opt(data, &ipv6mr,
993 			    sizeof(ipv6mr), len)) != OK)
994 				return r;
995 
996 			ip_addr_set_zero_ip6(&ipaddr);
997 			memcpy(ip_2_ip6(&ipaddr)->addr,
998 			    &ipv6mr.ipv6mr_multiaddr,
999 			    sizeof(ip_2_ip6(&ipaddr)->addr));
1000 
1001 			/*
1002 			 * We currently do not support joining IPv4 multicast
1003 			 * groups on IPv6 sockets.  The reason for this is that
1004 			 * this would require decisions on what to do if the
1005 			 * socket is set to V6ONLY later, as well as various
1006 			 * additional exceptions for a case that hopefully
1007 			 * doesn't occur in practice anyway.
1008 			 */
1009 			if (ip6_addr_isipv4mappedipv6(ip_2_ip6(&ipaddr)))
1010 				return EADDRNOTAVAIL;
1011 
1012 			has_scope = ip6_addr_has_scope(ip_2_ip6(&ipaddr),
1013 			    IP6_UNKNOWN);
1014 
1015 			if ((ifindex = ipv6mr.ipv6mr_interface) != 0) {
1016 				ifdev = ifdev_get_by_index(ifindex);
1017 
1018 				if (ifdev == NULL)
1019 					return ENXIO;
1020 
1021 				if (has_scope)
1022 					ip6_addr_set_zone(ip_2_ip6(&ipaddr),
1023 					    ifindex);
1024 			} else {
1025 				if (has_scope)
1026 					return EADDRNOTAVAIL;
1027 
1028 				ifdev = NULL;
1029 			}
1030 
1031 			if (name == IPV6_JOIN_GROUP)
1032 				r = mcast_join(&pkt->pkt_mcast, &ipaddr,
1033 				    ifdev);
1034 			else
1035 				r = mcast_leave(&pkt->pkt_mcast, &ipaddr,
1036 				    ifdev);
1037 
1038 			return r;
1039 
1040 		case IPV6_USE_MIN_MTU:
1041 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1042 			    len)) != OK)
1043 				return r;
1044 
1045 			if (val < -1 || val > 1)
1046 				return EINVAL;
1047 
1048 			/*
1049 			 * lwIP does not support path MTU discovery, so do
1050 			 * nothing.  TODO: see if this is actually good enough.
1051 			 */
1052 			return OK;
1053 
1054 		case IPV6_PKTINFO:
1055 			if ((r = sockdriver_copyin_opt(data, &ipi6,
1056 			    sizeof(ipi6), len)) != OK)
1057 				return r;
1058 
1059 			/*
1060 			 * Simply copy in what is given.  The values will be
1061 			 * parsed only once a packet is sent, in
1062 			 * pktsock_get_pktinfo().  Otherwise, if we perform
1063 			 * checks here, they may be outdated by the time the
1064 			 * values are actually used.
1065 			 */
1066 			memcpy(&pkt->pkt_srcaddr.addr, &ipi6.ipi6_addr,
1067 			    sizeof(pkt->pkt_srcaddr.addr));
1068 			pkt->pkt_ifindex = ipi6.ipi6_ifindex;
1069 
1070 			return OK;
1071 
1072 		case IPV6_RECVPKTINFO:
1073 		case IPV6_RECVHOPLIMIT:
1074 		case IPV6_RECVTCLASS:
1075 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1076 			    len)) != OK)
1077 				return r;
1078 
1079 			switch (name) {
1080 			case IPV6_RECVPKTINFO:	flag = PKTF_RECVINFO; break;
1081 			case IPV6_RECVHOPLIMIT:	flag = PKTF_RECVTTL; break;
1082 			case IPV6_RECVTCLASS:	flag = PKTF_RECVTOS; break;
1083 			default:		flag = 0; assert(0); break;
1084 			}
1085 
1086 			if (val)
1087 				ipsock_set_flag(&pkt->pkt_ipsock, flag);
1088 			else
1089 				ipsock_clear_flag(&pkt->pkt_ipsock, flag);
1090 
1091 			return OK;
1092 		}
1093 
1094 		break;
1095 	}
1096 
1097 	return ipsock_setsockopt(&pkt->pkt_ipsock, level, name, data, len,
1098 	    ipopts);
1099 }
1100 
1101 /*
1102  * Retrieve socket options on a packet socket.
1103  */
1104 int
1105 pktsock_getsockopt(struct pktsock * pkt, int level, int name,
1106 	const struct sockdriver_data * data, socklen_t * len,
1107 	struct ipopts * ipopts)
1108 {
1109 	struct in6_pktinfo ipi6;
1110 	unsigned int flag;
1111 	int val;
1112 
1113 	switch (level) {
1114 	case IPPROTO_IP:
1115 		if (ipsock_is_ipv6(&pkt->pkt_ipsock))
1116 			break;
1117 
1118 		switch (name) {
1119 		case IP_RECVTTL:
1120 		case IP_RECVPKTINFO:
1121 			switch (name) {
1122 			case IP_RECVTTL:	flag = PKTF_RECVTTL; break;
1123 			case IP_RECVPKTINFO:	flag = PKTF_RECVINFO; break;
1124 			default:		flag = 0; assert(0); break;
1125 			}
1126 
1127 			val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag));
1128 
1129 			return sockdriver_copyout_opt(data, &val, sizeof(val),
1130 			    len);
1131 		}
1132 
1133 		break;
1134 
1135 	case IPPROTO_IPV6:
1136 		if (!ipsock_is_ipv6(&pkt->pkt_ipsock))
1137 			break;
1138 
1139 		switch (name) {
1140 		case IPV6_USE_MIN_MTU:
1141 			/*
1142 			 * TODO: sort out exactly what lwIP actually supports
1143 			 * in the way of path MTU discovery.  Value 1 means
1144 			 * that path MTU discovery is disabled and packets are
1145 			 * sent at the minimum MTU (RFC 3542).
1146 			 */
1147 			val = 1;
1148 
1149 			return sockdriver_copyout_opt(data, &val, sizeof(val),
1150 			    len);
1151 
1152 		case IPV6_PKTINFO:
1153 			memset(&ipi6, 0, sizeof(ipi6));
1154 
1155 			/*
1156 			 * Simply copy out whatever was given before.  These
1157 			 * fields are initialized to zero on socket creation.
1158 			 */
1159 			memcpy(&ipi6.ipi6_addr, &pkt->pkt_srcaddr.addr,
1160 			    sizeof(ipi6.ipi6_addr));
1161 			ipi6.ipi6_ifindex = pkt->pkt_ifindex;
1162 
1163 			return sockdriver_copyout_opt(data, &ipi6,
1164 			    sizeof(ipi6), len);
1165 
1166 		case IPV6_RECVPKTINFO:
1167 		case IPV6_RECVHOPLIMIT:
1168 		case IPV6_RECVTCLASS:
1169 			switch (name) {
1170 			case IPV6_RECVPKTINFO:	flag = PKTF_RECVINFO; break;
1171 			case IPV6_RECVHOPLIMIT:	flag = PKTF_RECVTTL; break;
1172 			case IPV6_RECVTCLASS:	flag = PKTF_RECVTOS; break;
1173 			default:		flag = 0; assert(0); break;
1174 			}
1175 
1176 			val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag));
1177 
1178 			return sockdriver_copyout_opt(data, &val, sizeof(val),
1179 			    len);
1180 		}
1181 
1182 		break;
1183 	}
1184 
1185 	return ipsock_getsockopt(&pkt->pkt_ipsock, level, name, data, len,
1186 	    ipopts);
1187 }
1188 
1189 /*
1190  * Drain the receive queue of a packet socket.
1191  */
1192 static void
1193 pktsock_drain(struct pktsock * pkt)
1194 {
1195 
1196 	while (pkt->pkt_rcvhead != NULL)
1197 		pktsock_dequeue(pkt);
1198 
1199 	assert(pkt->pkt_rcvlen == 0);
1200 	assert(pkt->pkt_rcvtailp == &pkt->pkt_rcvhead);
1201 }
1202 
1203 /*
1204  * Shut down a packet socket for reading and/or writing.
1205  */
1206 void
1207 pktsock_shutdown(struct pktsock * pkt, unsigned int mask)
1208 {
1209 
1210 	if (mask & SFL_SHUT_RD)
1211 		pktsock_drain(pkt);
1212 }
1213 
1214 /*
1215  * Close a packet socket.
1216  */
1217 void
1218 pktsock_close(struct pktsock * pkt)
1219 {
1220 
1221 	pktsock_drain(pkt);
1222 
1223 	mcast_leave_all(&pkt->pkt_mcast);
1224 }
1225 
1226 /*
1227  * Return the rounded-up number of bytes in the packet socket's receive queue,
1228  * for sysctl(7).  NetBSD returns the used portion of each buffer, but that
1229  * would be quite some extra effort for us (TODO).
1230  */
1231 size_t
1232 pktsock_get_recvlen(struct pktsock * pkt)
1233 {
1234 
1235 	return pkt->pkt_rcvlen;
1236 }
1237