xref: /freebsd/sys/net/debugnet.c (revision 716fd348)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Isilon Systems, LLC.
5  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
6  * Copyright (c) 2000 Darrell Anderson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/endian.h>
40 #include <sys/errno.h>
41 #include <sys/eventhandler.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 
48 #ifdef DDB
49 #include <ddb/ddb.h>
50 #include <ddb/db_lex.h>
51 #endif
52 
53 #include <net/ethernet.h>
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/if_dl.h>
57 #include <net/if_types.h>
58 #include <net/if_var.h>
59 #include <net/vnet.h>
60 #include <net/route.h>
61 #include <net/route/nhop.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_fib.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_options.h>
70 #include <netinet/udp.h>
71 #include <netinet/udp_var.h>
72 
73 #include <machine/in_cksum.h>
74 #include <machine/pcb.h>
75 
76 #include <net/debugnet.h>
77 #define	DEBUGNET_INTERNAL
78 #include <net/debugnet_int.h>
79 
/* Register the "debugnet" kernel feature string. */
FEATURE(debugnet, "Debugnet support");

/* Parent sysctl node for the net.debugnet.* knobs declared below. */
SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "debugnet parameters");

/* Verbosity of debugnet diagnostics; see the sysctl description. */
unsigned debugnet_debug;
SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
    &debugnet_debug, 0,
    "Debug message verbosity (0: off; 1: on; 2: verbose)");

/* Poll budget debugnet_send() spends waiting for ACKs before retransmitting. */
int debugnet_npolls = 2000;
SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
    &debugnet_npolls, 0,
    "Number of times to poll before assuming packet loss (0.5ms per poll)");
/* Retransmit budget debugnet_send() spends before returning ETIMEDOUT. */
int debugnet_nretries = 10;
SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
    &debugnet_nretries, 0,
    "Number of retransmit attempts before giving up");
/* FIB passed to the route lookup in debugnet_connect(). */
int debugnet_fib = RT_DEFAULT_FIB;
SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
    &debugnet_fib, 0,
    "Fib to use when sending dump");

/*
 * There is exactly one, statically allocated, connection block.
 * g_debugnet_pcb_inuse is set by debugnet_connect() once it starts taking
 * over the interface and cleared again by debugnet_free().
 */
static bool g_debugnet_pcb_inuse;
static struct debugnet_pcb g_dnet_pcb;
105 
106 /*
107  * Simple accessors for opaque PCB.
108  */
109 const unsigned char *
110 debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
111 {
112 	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
113 	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
114 	return (pcb->dp_gw_mac.octet);
115 }
116 
117 /*
118  * Start of network primitives, beginning with output primitives.
119  */
120 
121 /*
122  * Handles creation of the ethernet header, then places outgoing packets into
123  * the tx buffer for the NIC
124  *
125  * Parameters:
126  *	m	The mbuf containing the packet to be sent (will be freed by
127  *		this function or the NIC driver)
128  *	ifp	The interface to send on
129  *	dst	The destination ethernet address (source address will be looked
130  *		up using ifp)
131  *	etype	The ETHERTYPE_* value for the protocol that is being sent
132  *
133  * Returns:
134  *	int	see errno.h, 0 for success
135  */
136 int
137 debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
138     u_short etype)
139 {
140 	struct ether_header *eh;
141 
142 	if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) ||
143 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) {
144 		if_printf(ifp, "%s: interface isn't up\n", __func__);
145 		m_freem(m);
146 		return (ENETDOWN);
147 	}
148 
149 	/* Fill in the ethernet header. */
150 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
151 	if (m == NULL) {
152 		printf("%s: out of mbufs\n", __func__);
153 		return (ENOBUFS);
154 	}
155 	eh = mtod(m, struct ether_header *);
156 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
157 	memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN);
158 	eh->ether_type = htons(etype);
159 	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
160 }
161 
162 /*
163  * Unreliable transmission of an mbuf chain to the debugnet server
164  * Note: can't handle fragmentation; fails if the packet is larger than
165  *	 ifp->if_mtu after adding the UDP/IP headers
166  *
167  * Parameters:
168  *	pcb	The debugnet context block
169  *	m	mbuf chain
170  *
171  * Returns:
172  *	int	see errno.h, 0 for success
173  */
174 static int
175 debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
176 {
177 	struct udphdr *udp;
178 
179 	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);
180 
181 	M_PREPEND(m, sizeof(*udp), M_NOWAIT);
182 	if (m == NULL) {
183 		printf("%s: out of mbufs\n", __func__);
184 		return (ENOBUFS);
185 	}
186 
187 	udp = mtod(m, void *);
188 	udp->uh_ulen = htons(m->m_pkthdr.len);
189 	/* Use this src port so that the server can connect() the socket */
190 	udp->uh_sport = htons(pcb->dp_client_port);
191 	udp->uh_dport = htons(pcb->dp_server_port);
192 	/* Computed later (protocol-dependent). */
193 	udp->uh_sum = 0;
194 
195 	return (debugnet_ip_output(pcb, m));
196 }
197 
198 int
199 debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
200 {
201 	struct debugnet_ack *dn_ack;
202 	struct mbuf *m;
203 
204 	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));
205 
206 	m = m_gethdr(M_NOWAIT, MT_DATA);
207 	if (m == NULL) {
208 		printf("%s: Out of mbufs\n", __func__);
209 		return (ENOBUFS);
210 	}
211 	m->m_len = sizeof(*dn_ack);
212 	m->m_pkthdr.len = sizeof(*dn_ack);
213 	MH_ALIGN(m, sizeof(*dn_ack));
214 	dn_ack = mtod(m, void *);
215 	dn_ack->da_seqno = seqno;
216 
217 	return (debugnet_udp_output(pcb, m));
218 }
219 
/*
 * Dummy free function for debugnet clusters.
 *
 * debugnet_send() attaches the caller's data buffer to an mbuf with MEXTADD()
 * and registers this routine as the external-storage free callback.  The
 * callback deliberately does nothing.
 */
static void
debugnet_mbuf_free(struct mbuf *m __unused)
{
}
227 
228 /*
229  * Construct and reliably send a debugnet packet.  May fail from a resource
230  * shortage or extreme number of unacknowledged retransmissions.  Wait for
231  * an acknowledgement before returning.  Splits packets into chunks small
232  * enough to be sent without fragmentation (looks up the interface MTU)
233  *
234  * Parameters:
235  *	type	debugnet packet type (HERALD, FINISHED, ...)
236  *	data	data
237  *	datalen	data size (bytes)
238  *	auxdata	optional auxiliary information
239  *
240  * Returns:
241  *	int see errno.h, 0 for success
242  */
243 int
244 debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
245     uint32_t datalen, const struct debugnet_proto_aux *auxdata)
246 {
247 	struct debugnet_msg_hdr *dn_msg_hdr;
248 	struct mbuf *m, *m2;
249 	uint64_t want_acks;
250 	uint32_t i, pktlen, sent_so_far;
251 	int retries, polls, error;
252 
253 	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
254 		return (ECONNRESET);
255 
256 	want_acks = 0;
257 	pcb->dp_rcvd_acks = 0;
258 	retries = 0;
259 
260 retransmit:
261 	/* Chunks can be too big to fit in packets. */
262 	for (i = sent_so_far = 0; sent_so_far < datalen ||
263 	    (i == 0 && datalen == 0); i++) {
264 		pktlen = datalen - sent_so_far;
265 
266 		/* Bound: the interface MTU (assume no IP options). */
267 		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
268 		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));
269 
270 		/*
271 		 * Check if it is retransmitting and this has been ACKed
272 		 * already.
273 		 */
274 		if ((pcb->dp_rcvd_acks & (1 << i)) != 0) {
275 			sent_so_far += pktlen;
276 			continue;
277 		}
278 
279 		/*
280 		 * Get and fill a header mbuf, then chain data as an extended
281 		 * mbuf.
282 		 */
283 		m = m_gethdr(M_NOWAIT, MT_DATA);
284 		if (m == NULL) {
285 			printf("%s: Out of mbufs\n", __func__);
286 			return (ENOBUFS);
287 		}
288 		m->m_len = sizeof(struct debugnet_msg_hdr);
289 		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
290 		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
291 		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
292 		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
293 		dn_msg_hdr->mh_type = htonl(type);
294 		dn_msg_hdr->mh_len = htonl(pktlen);
295 
296 		if (auxdata != NULL) {
297 			dn_msg_hdr->mh_offset =
298 			    htobe64(auxdata->dp_offset_start + sent_so_far);
299 			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
300 		} else {
301 			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
302 			dn_msg_hdr->mh_aux2 = 0;
303 		}
304 
305 		if (pktlen != 0) {
306 			m2 = m_get(M_NOWAIT, MT_DATA);
307 			if (m2 == NULL) {
308 				m_freem(m);
309 				printf("%s: Out of mbufs\n", __func__);
310 				return (ENOBUFS);
311 			}
312 			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
313 			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
314 			    EXT_DISPOSABLE);
315 			m2->m_len = pktlen;
316 
317 			m_cat(m, m2);
318 			m->m_pkthdr.len += pktlen;
319 		}
320 		error = debugnet_udp_output(pcb, m);
321 		if (error != 0)
322 			return (error);
323 
324 		/* Note that we're waiting for this packet in the bitfield. */
325 		want_acks |= (1 << i);
326 		sent_so_far += pktlen;
327 	}
328 	if (i >= DEBUGNET_MAX_IN_FLIGHT)
329 		printf("Warning: Sent more than %d packets (%d). "
330 		    "Acknowledgements will fail unless the size of "
331 		    "rcvd_acks/want_acks is increased.\n",
332 		    DEBUGNET_MAX_IN_FLIGHT, i);
333 
334 	/*
335 	 * Wait for acks.  A *real* window would speed things up considerably.
336 	 */
337 	polls = 0;
338 	while (pcb->dp_rcvd_acks != want_acks) {
339 		if (polls++ > debugnet_npolls) {
340 			if (retries++ > debugnet_nretries)
341 				return (ETIMEDOUT);
342 			printf(". ");
343 			goto retransmit;
344 		}
345 		debugnet_network_poll(pcb);
346 		DELAY(500);
347 		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
348 			return (ECONNRESET);
349 	}
350 	pcb->dp_seqno += i;
351 	return (0);
352 }
353 
354 /*
355  * Network input primitives.
356  */
357 
358 /*
359  * Just introspect the header enough to fire off a seqno ack and validate
360  * length fits.
361  */
362 static void
363 debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
364 {
365 	const struct debugnet_msg_hdr *dnh;
366 	struct mbuf *m;
367 	int error;
368 
369 	m = *mb;
370 
371 	if (m->m_pkthdr.len < sizeof(*dnh)) {
372 		DNETDEBUG("ignoring small debugnet_msg packet\n");
373 		return;
374 	}
375 
376 	/* Get ND header. */
377 	if (m->m_len < sizeof(*dnh)) {
378 		m = m_pullup(m, sizeof(*dnh));
379 		*mb = m;
380 		if (m == NULL) {
381 			DNETDEBUG("m_pullup failed\n");
382 			return;
383 		}
384 	}
385 	dnh = mtod(m, const void *);
386 
387 	if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) {
388 		DNETDEBUG("Dropping short packet.\n");
389 		return;
390 	}
391 
392 	/*
393 	 * If the issue is transient (ENOBUFS), sender should resend.  If
394 	 * non-transient (like driver objecting to rx -> tx from the same
395 	 * thread), not much else we can do.
396 	 */
397 	error = debugnet_ack_output(pcb, dnh->mh_seqno);
398 	if (error != 0)
399 		return;
400 
401 	if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) {
402 		printf("Remote shut down the connection on us!\n");
403 		pcb->dp_state = DN_STATE_REMOTE_CLOSED;
404 
405 		/*
406 		 * Continue through to the user handler so they are signalled
407 		 * not to wait for further rx.
408 		 */
409 	}
410 
411 	pcb->dp_rx_handler(pcb, mb);
412 }
413 
414 static void
415 debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
416 {
417 	const struct debugnet_ack *dn_ack;
418 	struct mbuf *m;
419 	uint32_t rcv_ackno;
420 
421 	m = *mb;
422 
423 	/* Get Ack. */
424 	if (m->m_len < sizeof(*dn_ack)) {
425 		m = m_pullup(m, sizeof(*dn_ack));
426 		*mb = m;
427 		if (m == NULL) {
428 			DNETDEBUG("m_pullup failed\n");
429 			return;
430 		}
431 	}
432 	dn_ack = mtod(m, const void *);
433 
434 	/* Debugnet processing. */
435 	/*
436 	 * Packet is meant for us.  Extract the ack sequence number and the
437 	 * port number if necessary.
438 	 */
439 	rcv_ackno = ntohl(dn_ack->da_seqno);
440 	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
441 		pcb->dp_server_port = sport;
442 		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
443 	}
444 	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
445 		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
446 	else if (rcv_ackno >= pcb->dp_seqno) {
447 		/* We're interested in this ack. Record it. */
448 		pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno);
449 	}
450 }
451 
452 void
453 debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
454 {
455 	const struct udphdr *udp;
456 	struct mbuf *m;
457 	uint16_t sport, ulen;
458 
459 	/* UDP processing. */
460 
461 	m = *mb;
462 	if (m->m_pkthdr.len < sizeof(*udp)) {
463 		DNETDEBUG("ignoring small UDP packet\n");
464 		return;
465 	}
466 
467 	/* Get UDP headers. */
468 	if (m->m_len < sizeof(*udp)) {
469 		m = m_pullup(m, sizeof(*udp));
470 		*mb = m;
471 		if (m == NULL) {
472 			DNETDEBUG("m_pullup failed\n");
473 			return;
474 		}
475 	}
476 	udp = mtod(m, const void *);
477 
478 	/* We expect to receive UDP packets on the configured client port. */
479 	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
480 		DNETDEBUG("not on the expected port.\n");
481 		return;
482 	}
483 
484 	/* Check that ulen does not exceed actual size of data. */
485 	ulen = ntohs(udp->uh_ulen);
486 	if (m->m_pkthdr.len < ulen) {
487 		DNETDEBUG("ignoring runt UDP packet\n");
488 		return;
489 	}
490 
491 	sport = ntohs(udp->uh_sport);
492 
493 	m_adj(m, sizeof(*udp));
494 	ulen -= sizeof(*udp);
495 
496 	if (ulen == sizeof(struct debugnet_ack)) {
497 		debugnet_handle_ack(pcb, mb, sport);
498 		return;
499 	}
500 
501 	if (pcb->dp_rx_handler == NULL) {
502 		if (ulen < sizeof(struct debugnet_ack))
503 			DNETDEBUG("ignoring small ACK packet\n");
504 		else
505 			DNETDEBUG("ignoring unexpected non-ACK packet on "
506 			    "half-duplex connection.\n");
507 		return;
508 	}
509 
510 	debugnet_handle_rx_msg(pcb, mb);
511 }
512 
513 /*
514  * Handler for incoming packets directly from the network adapter
515  * Identifies the packet type (IP or ARP) and passes it along to one of the
516  * helper functions debugnet_handle_ip or debugnet_handle_arp.
517  *
518  * It needs to partially replicate the behaviour of ether_input() and
519  * ether_demux().
520  *
521  * Parameters:
522  *	ifp	the interface the packet came from
523  *	m	an mbuf containing the packet received
524  */
525 static void
526 debugnet_pkt_in(struct ifnet *ifp, struct mbuf *m)
527 {
528 	struct ifreq ifr;
529 	struct ether_header *eh;
530 	u_short etype;
531 
532 	/* Ethernet processing. */
533 	if ((m->m_flags & M_PKTHDR) == 0) {
534 		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
535 		goto done;
536 	}
537 	if (m->m_len < ETHER_HDR_LEN) {
538 		DNETDEBUG_IF(ifp,
539 	    "discard frame without leading eth header (len %u pktlen %u)\n",
540 		    m->m_len, m->m_pkthdr.len);
541 		goto done;
542 	}
543 	if ((m->m_flags & M_HASFCS) != 0) {
544 		m_adj(m, -ETHER_CRC_LEN);
545 		m->m_flags &= ~M_HASFCS;
546 	}
547 	eh = mtod(m, struct ether_header *);
548 	etype = ntohs(eh->ether_type);
549 	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
550 		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
551 		goto done;
552 	}
553 	if (if_gethwaddr(ifp, &ifr) != 0) {
554 		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
555 		goto done;
556 	}
557 	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
558 	    ETHER_ADDR_LEN) != 0 &&
559 	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
560 		DNETDEBUG_IF(ifp,
561 		    "discard frame with incorrect destination addr\n");
562 		goto done;
563 	}
564 
565 	MPASS(g_debugnet_pcb_inuse);
566 
567 	/* Done ethernet processing. Strip off the ethernet header. */
568 	m_adj(m, ETHER_HDR_LEN);
569 	switch (etype) {
570 	case ETHERTYPE_ARP:
571 		debugnet_handle_arp(&g_dnet_pcb, &m);
572 		break;
573 	case ETHERTYPE_IP:
574 		debugnet_handle_ip(&g_dnet_pcb, &m);
575 		break;
576 	default:
577 		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
578 		break;
579 	}
580 done:
581 	if (m != NULL)
582 		m_freem(m);
583 }
584 
585 /*
586  * Network polling primitive.
587  *
588  * Instead of assuming that most of the network stack is sane, we just poll the
589  * driver directly for packets.
590  */
591 void
592 debugnet_network_poll(struct debugnet_pcb *pcb)
593 {
594 	struct ifnet *ifp;
595 
596 	ifp = pcb->dp_ifp;
597 	ifp->if_debugnet_methods->dn_poll(ifp, 1000);
598 }
599 
600 /*
601  * Start of consumer API surface.
602  */
603 void
604 debugnet_free(struct debugnet_pcb *pcb)
605 {
606 	struct ifnet *ifp;
607 
608 	MPASS(pcb == &g_dnet_pcb);
609 	MPASS(pcb->dp_drv_input == NULL || g_debugnet_pcb_inuse);
610 
611 	ifp = pcb->dp_ifp;
612 	if (ifp != NULL) {
613 		if (pcb->dp_drv_input != NULL)
614 			ifp->if_input = pcb->dp_drv_input;
615 		if (pcb->dp_event_started)
616 			ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END);
617 	}
618 	debugnet_mbuf_finish();
619 
620 	g_debugnet_pcb_inuse = false;
621 	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
622 }
623 
624 int
625 debugnet_connect(const struct debugnet_conn_params *dcp,
626     struct debugnet_pcb **pcb_out)
627 {
628 	struct debugnet_proto_aux herald_auxdata;
629 	struct debugnet_pcb *pcb;
630 	struct ifnet *ifp;
631 	int error;
632 
633 	if (g_debugnet_pcb_inuse) {
634 		printf("%s: Only one connection at a time.\n", __func__);
635 		return (EBUSY);
636 	}
637 
638 	pcb = &g_dnet_pcb;
639 	*pcb = (struct debugnet_pcb) {
640 		.dp_state = DN_STATE_INIT,
641 		.dp_client = dcp->dc_client,
642 		.dp_server = dcp->dc_server,
643 		.dp_gateway = dcp->dc_gateway,
644 		.dp_server_port = dcp->dc_herald_port,	/* Initially */
645 		.dp_client_port = dcp->dc_client_port,
646 		.dp_seqno = 1,
647 		.dp_ifp = dcp->dc_ifp,
648 		.dp_rx_handler = dcp->dc_rx_handler,
649 		.dp_drv_input = NULL,
650 	};
651 
652 	/* Switch to the debugnet mbuf zones. */
653 	debugnet_mbuf_start();
654 
655 	/* At least one needed parameter is missing; infer it. */
656 	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
657 	    pcb->dp_ifp == NULL) {
658 		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
659 		struct ifnet *rt_ifp;
660 		struct nhop_object *nh;
661 
662 		memset(&dest_sin, 0, sizeof(dest_sin));
663 		dest_sin = (struct sockaddr_in) {
664 			.sin_len = sizeof(dest_sin),
665 			.sin_family = AF_INET,
666 			.sin_addr.s_addr = pcb->dp_server,
667 		};
668 
669 		CURVNET_SET(vnet0);
670 		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
671 		    NHR_NONE);
672 		CURVNET_RESTORE();
673 
674 		if (nh == NULL) {
675 			printf("%s: Could not get route for that server.\n",
676 			    __func__);
677 			error = ENOENT;
678 			goto cleanup;
679 		}
680 
681 		/* TODO support AF_INET6 */
682 		if (nh->gw_sa.sa_family == AF_INET)
683 			gw_sin = &nh->gw4_sa;
684 		else {
685 			if (nh->gw_sa.sa_family == AF_LINK)
686 				DNETDEBUG("Destination address is on link.\n");
687 			gw_sin = NULL;
688 		}
689 
690 		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
691 		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
692 
693 		rt_ifp = nh->nh_ifp;
694 
695 		if (pcb->dp_client == INADDR_ANY)
696 			pcb->dp_client = local_sin->sin_addr.s_addr;
697 		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
698 			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
699 		if (pcb->dp_ifp == NULL)
700 			pcb->dp_ifp = rt_ifp;
701 	}
702 
703 	ifp = pcb->dp_ifp;
704 
705 	if (debugnet_debug > 0) {
706 		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
707 		    gwbuf[INET_ADDRSTRLEN];
708 		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
709 		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
710 		if (pcb->dp_gateway != INADDR_ANY)
711 			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
712 		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
713 		    serbuf, pcb->dp_server_port,
714 		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
715 		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
716 		    clibuf, pcb->dp_client_port, if_name(ifp));
717 	}
718 
719 	/* Validate iface is online and supported. */
720 	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
721 		printf("%s: interface '%s' does not support debugnet\n",
722 		    __func__, if_name(ifp));
723 		error = ENODEV;
724 		goto cleanup;
725 	}
726 	if ((if_getflags(ifp) & IFF_UP) == 0) {
727 		printf("%s: interface '%s' link is down\n", __func__,
728 		    if_name(ifp));
729 		error = ENXIO;
730 		goto cleanup;
731 	}
732 
733 	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
734 	pcb->dp_event_started = true;
735 
736 	/*
737 	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
738 	 * while the debugnet ifp's if_input is overridden with
739 	 * debugnet_pkt_in.
740 	 */
741 	g_debugnet_pcb_inuse = true;
742 
743 	/* Make the card use *our* receive callback. */
744 	pcb->dp_drv_input = ifp->if_input;
745 	ifp->if_input = debugnet_pkt_in;
746 
747 	printf("%s: searching for %s MAC...\n", __func__,
748 	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");
749 
750 	error = debugnet_arp_gw(pcb);
751 	if (error != 0) {
752 		printf("%s: failed to locate MAC address\n", __func__);
753 		goto cleanup;
754 	}
755 	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);
756 
757 	herald_auxdata = (struct debugnet_proto_aux) {
758 		.dp_offset_start = dcp->dc_herald_offset,
759 		.dp_aux2 = dcp->dc_herald_aux2,
760 	};
761 	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
762 	    dcp->dc_herald_datalen, &herald_auxdata);
763 	if (error != 0) {
764 		printf("%s: failed to herald debugnet server\n", __func__);
765 		goto cleanup;
766 	}
767 
768 	*pcb_out = pcb;
769 	return (0);
770 
771 cleanup:
772 	debugnet_free(pcb);
773 	return (error);
774 }
775 
776 /*
777  * Pre-allocated dump-time mbuf tracking.
778  *
779  * We just track the high water mark we've ever seen and allocate appropriately
780  * for that iface/mtu combo.
781  */
782 static struct {
783 	int nmbuf;
784 	int ncl;
785 	int clsize;
786 } dn_hwm;
787 static struct mtx dn_hwm_lk;
788 MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);
789 
790 static void
791 dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
792 {
793 	bool any;
794 
795 	any = false;
796 	mtx_lock(&dn_hwm_lk);
797 
798 	if (nmbuf > dn_hwm.nmbuf) {
799 		any = true;
800 		dn_hwm.nmbuf = nmbuf;
801 	} else
802 		nmbuf = dn_hwm.nmbuf;
803 
804 	if (ncl > dn_hwm.ncl) {
805 		any = true;
806 		dn_hwm.ncl = ncl;
807 	} else
808 		ncl = dn_hwm.ncl;
809 
810 	if (clsize > dn_hwm.clsize) {
811 		any = true;
812 		dn_hwm.clsize = clsize;
813 	} else
814 		clsize = dn_hwm.clsize;
815 
816 	mtx_unlock(&dn_hwm_lk);
817 
818 	if (any)
819 		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
820 }
821 
822 void
823 debugnet_any_ifnet_update(struct ifnet *ifp)
824 {
825 	int clsize, nmbuf, ncl, nrxr;
826 
827 	if (!DEBUGNET_SUPPORTED_NIC(ifp))
828 		return;
829 
830 	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
831 	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));
832 
833 	/*
834 	 * We need two headers per message on the transmit side. Multiply by
835 	 * four to give us some breathing room.
836 	 */
837 	nmbuf = ncl * (4 + nrxr);
838 	ncl *= nrxr;
839 
840 	/*
841 	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
842 	 * dn_init method is available.
843 	 */
844 	if (nmbuf == 0 || ncl == 0 || clsize == 0) {
845 		printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
846 		    __func__, if_name(ifp), ifp);
847 		return;
848 	}
849 	dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
850 }
851 
852 /*
853  * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
854  * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
855  *
856  * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
857  * because the driver is still in attach.  Since we cannot use down interfaces,
858  * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
859  * with vtnet and dhcpclient that event just never occurs.
860  *
861  * So that's how I've landed on the lower level ifnet_link_event.
862  */
863 
864 static void
865 dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
866 {
867 	if (link_state == LINK_STATE_UP)
868 		debugnet_any_ifnet_update(ifp);
869 }
870 
871 static eventhandler_tag dn_attach_cookie;
872 static void
873 dn_evh_init(void *ctx __unused)
874 {
875 	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
876 	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
877 }
878 SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);
879 
880 /*
881  * DDB parsing helpers for debugnet(4) consumers.
882  */
883 #ifdef DDB
884 struct my_inet_opt {
885 	bool has_opt;
886 	const char *printname;
887 	in_addr_t *result;
888 };
889 
890 static int
891 dn_parse_optarg_ipv4(struct my_inet_opt *opt)
892 {
893 	in_addr_t tmp;
894 	unsigned octet;
895 	int t;
896 
897 	tmp = 0;
898 	for (octet = 0; octet < 4; octet++) {
899 		t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
900 		if (t != tNUMBER) {
901 			db_printf("%s:%s: octet %u expected number; found %d\n",
902 			    __func__, opt->printname, octet, t);
903 			return (EINVAL);
904 		}
905 		/*
906 		 * db_lex lexes '-' distinctly from the number itself, but
907 		 * let's document that invariant.
908 		 */
909 		MPASS(db_tok_number >= 0);
910 
911 		if (db_tok_number > UINT8_MAX) {
912 			db_printf("%s:%s: octet %u out of range: %jd\n", __func__,
913 			    opt->printname, octet, (intmax_t)db_tok_number);
914 			return (EDOM);
915 		}
916 
917 		/* Constructed host-endian and converted to network later. */
918 		tmp = (tmp << 8) | db_tok_number;
919 
920 		if (octet < 3) {
921 			t = db_read_token_flags(DRT_WSPACE);
922 			if (t != tDOT) {
923 				db_printf("%s:%s: octet %u expected '.'; found"
924 				    " %d\n", __func__, opt->printname, octet,
925 				    t);
926 				return (EINVAL);
927 			}
928 		}
929 	}
930 
931 	*opt->result = htonl(tmp);
932 	opt->has_opt = true;
933 	return (0);
934 }
935 
936 int
937 debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
938 {
939 	struct ifnet *ifp;
940 	int t, error;
941 	bool want_ifp;
942 	char ch;
943 
944 	struct my_inet_opt opt_client = {
945 		.printname = "client",
946 		.result = &result->dd_client,
947 	},
948 	opt_server = {
949 		.printname = "server",
950 		.result = &result->dd_server,
951 	},
952 	opt_gateway = {
953 		.printname = "gateway",
954 		.result = &result->dd_gateway,
955 	},
956 	*cur_inet_opt;
957 
958 	ifp = NULL;
959 	memset(result, 0, sizeof(*result));
960 
961 	/*
962 	 * command [space] [-] [opt] [[space] [optarg]] ...
963 	 *
964 	 * db_command has already lexed 'command' for us.
965 	 */
966 	t = db_read_token_flags(DRT_WSPACE);
967 	if (t == tWSPACE)
968 		t = db_read_token_flags(DRT_WSPACE);
969 
970 	while (t != tEOL) {
971 		if (t != tMINUS) {
972 			db_printf("%s: Bad syntax; expected '-', got %d\n",
973 			    cmd, t);
974 			goto usage;
975 		}
976 
977 		t = db_read_token_flags(DRT_WSPACE);
978 		if (t != tIDENT) {
979 			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
980 			    cmd, t);
981 			goto usage;
982 		}
983 
984 		if (strlen(db_tok_string) > 1) {
985 			db_printf("%s: Bad syntax; expected single option "
986 			    "flag, got '%s'\n", cmd, db_tok_string);
987 			goto usage;
988 		}
989 
990 		want_ifp = false;
991 		cur_inet_opt = NULL;
992 		switch ((ch = db_tok_string[0])) {
993 		default:
994 			DNETDEBUG("Unexpected: '%c'\n", ch);
995 			/* FALLTHROUGH */
996 		case 'h':
997 			goto usage;
998 		case 'c':
999 			cur_inet_opt = &opt_client;
1000 			break;
1001 		case 'g':
1002 			cur_inet_opt = &opt_gateway;
1003 			break;
1004 		case 's':
1005 			cur_inet_opt = &opt_server;
1006 			break;
1007 		case 'i':
1008 			want_ifp = true;
1009 			break;
1010 		}
1011 
1012 		t = db_read_token_flags(DRT_WSPACE);
1013 		if (t != tWSPACE) {
1014 			db_printf("%s: Bad syntax; expected space after "
1015 			    "flag %c, got %d\n", cmd, ch, t);
1016 			goto usage;
1017 		}
1018 
1019 		if (want_ifp) {
1020 			t = db_read_token_flags(DRT_WSPACE);
1021 			if (t != tIDENT) {
1022 				db_printf("%s: Expected interface but got %d\n",
1023 				    cmd, t);
1024 				goto usage;
1025 			}
1026 
1027 			CURVNET_SET(vnet0);
1028 			/*
1029 			 * We *don't* take a ref here because the only current
1030 			 * consumer, db_netdump_cmd, does not need it.  It
1031 			 * (somewhat redundantly) extracts the if_name(),
1032 			 * re-lookups the ifp, and takes its own reference.
1033 			 */
1034 			ifp = ifunit(db_tok_string);
1035 			CURVNET_RESTORE();
1036 			if (ifp == NULL) {
1037 				db_printf("Could not locate interface %s\n",
1038 				    db_tok_string);
1039 				goto cleanup;
1040 			}
1041 		} else {
1042 			MPASS(cur_inet_opt != NULL);
1043 			/* Assume IPv4 for now. */
1044 			error = dn_parse_optarg_ipv4(cur_inet_opt);
1045 			if (error != 0)
1046 				goto cleanup;
1047 		}
1048 
1049 		/* Skip (mandatory) whitespace after option, if not EOL. */
1050 		t = db_read_token_flags(DRT_WSPACE);
1051 		if (t == tEOL)
1052 			break;
1053 		if (t != tWSPACE) {
1054 			db_printf("%s: Bad syntax; expected space after "
1055 			    "flag %c option; got %d\n", cmd, ch, t);
1056 			goto usage;
1057 		}
1058 		t = db_read_token_flags(DRT_WSPACE);
1059 	}
1060 
1061 	if (!opt_server.has_opt) {
1062 		db_printf("%s: need a destination server address\n", cmd);
1063 		goto usage;
1064 	}
1065 
1066 	result->dd_has_client = opt_client.has_opt;
1067 	result->dd_has_gateway = opt_gateway.has_opt;
1068 	result->dd_ifp = ifp;
1069 
1070 	/* We parsed the full line to tEOL already, or bailed with an error. */
1071 	return (0);
1072 
1073 usage:
1074 	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
1075 	    "-i <interface>]\n", cmd);
1076 	error = EINVAL;
1077 	/* FALLTHROUGH */
1078 cleanup:
1079 	db_skip_to_eol();
1080 	return (error);
1081 }
1082 #endif /* DDB */
1083