xref: /freebsd/tools/tools/netmap/pkt-gen.c (revision 535af610)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28  * $FreeBSD$
29  * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
30  *
31  * Example program to show how to build a multithreaded packet
32  * source/sink using the netmap device.
33  *
34  * In this example we create a programmable number of threads
35  * to take care of all the queues of the interface used to
36  * send or receive traffic.
37  *
38  */
39 
40 #define _GNU_SOURCE	/* for CPU_SET() */
41 #include <arpa/inet.h>	/* ntohs */
42 #include <assert.h>
43 #include <ctype.h>	// isprint()
44 #include <errno.h>
45 #include <fcntl.h>
46 #include <ifaddrs.h>	/* getifaddrs */
47 #include <libnetmap.h>
48 #include <math.h>
49 #include <net/ethernet.h>
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/udp.h>
54 #ifndef NO_PCAP
55 #include <pcap/pcap.h>
56 #endif
57 #include <pthread.h>
58 #include <signal.h>
59 #include <stdio.h>
60 #include <stdlib.h>
61 #include <string.h>
62 #include <sys/ioctl.h>
63 #include <sys/poll.h>
64 #include <sys/stat.h>
65 #if !defined(_WIN32) && !defined(linux)
66 #include <sys/sysctl.h>	/* sysctl */
67 #endif
68 #include <sys/types.h>
69 #include <unistd.h>	// sysconf()
70 #ifdef linux
71 #define IPV6_VERSION	0x60
72 #define IPV6_DEFHLIM	64
73 #endif
74 
75 #include "ctrs.h"
76 
77 static void usage(int);
78 
79 #ifdef _WIN32
80 #define cpuset_t        DWORD_PTR   //uint64_t
81 static inline void CPU_ZERO(cpuset_t *p)
82 {
83 	*p = 0;
84 }
85 
86 static inline void CPU_SET(uint32_t i, cpuset_t *p)
87 {
88 	*p |= 1<< (i & 0x3f);
89 }
90 
91 #define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c)    //((void)a, 0)
92 #define TAP_CLONEDEV	"/dev/tap"
93 #define AF_LINK	18	//defined in winsocks.h
94 #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
95 #include <net/if_dl.h>
96 
97 /*
98  * Convert an ASCII representation of an ethernet address to
99  * binary form.
100  */
101 struct ether_addr *
102 ether_aton(const char *a)
103 {
104 	int i;
105 	static struct ether_addr o;
106 	unsigned int o0, o1, o2, o3, o4, o5;
107 
108 	i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5);
109 
110 	if (i != 6)
111 		return (NULL);
112 
113 	o.octet[0]=o0;
114 	o.octet[1]=o1;
115 	o.octet[2]=o2;
116 	o.octet[3]=o3;
117 	o.octet[4]=o4;
118 	o.octet[5]=o5;
119 
120 	return ((struct ether_addr *)&o);
121 }
122 
123 /*
124  * Convert a binary representation of an ethernet address to
125  * an ASCII string.
126  */
127 char *
128 ether_ntoa(const struct ether_addr *n)
129 {
130 	int i;
131 	static char a[18];
132 
133 	i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x",
134 	    n->octet[0], n->octet[1], n->octet[2],
135 	    n->octet[3], n->octet[4], n->octet[5]);
136 	return (i < 17 ? NULL : (char *)&a);
137 }
138 #endif /* _WIN32 */
139 
140 #ifdef linux
141 
142 #define cpuset_t        cpu_set_t
143 
144 #define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
145 #define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
146 #include <linux/ethtool.h>
147 #include <linux/sockios.h>
148 
149 #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
150 #include <netinet/ether.h>      /* ether_aton */
151 #include <linux/if_packet.h>    /* sockaddr_ll */
152 #endif  /* linux */
153 
154 #ifdef __FreeBSD__
155 #include <sys/endian.h> /* le64toh */
156 #include <machine/param.h>
157 
158 #include <pthread_np.h> /* pthread w/ affinity */
159 #include <sys/cpuset.h> /* cpu_set */
160 #include <net/if_dl.h>  /* LLADDR */
161 #endif  /* __FreeBSD__ */
162 
163 #ifdef __APPLE__
164 
165 #define cpuset_t        uint64_t        // XXX
166 static inline void CPU_ZERO(cpuset_t *p)
167 {
168 	*p = 0;
169 }
170 
171 static inline void CPU_SET(uint32_t i, cpuset_t *p)
172 {
173 	*p |= 1<< (i & 0x3f);
174 }
175 
176 #define pthread_setaffinity_np(a, b, c) ((void)a, 0)
177 
178 #define ifr_flagshigh  ifr_flags        // XXX
179 #define IFF_PPROMISC   IFF_PROMISC
180 #include <net/if_dl.h>  /* LLADDR */
181 #define clock_gettime(a,b)      \
182 	do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
183 #endif  /* __APPLE__ */
184 
185 static const char *default_payload = "netmap pkt-gen DIRECT payload\n"
186 	"http://info.iet.unipi.it/~luigi/netmap/ ";
187 
188 static const char *indirect_payload = "netmap pkt-gen indirect payload\n"
189 	"http://info.iet.unipi.it/~luigi/netmap/ ";
190 
191 static int verbose = 0;
192 static int normalize = 1;
193 
194 #define VIRT_HDR_1	10	/* length of a base vnet-hdr */
195 #define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
196 #define VIRT_HDR_MAX	VIRT_HDR_2
197 struct virt_header {
198 	uint8_t fields[VIRT_HDR_MAX];
199 };
200 
201 #define MAX_BODYSIZE	65536
202 
203 struct pkt {
204 	struct virt_header vh;
205 	struct ether_header eh;
206 	union {
207 		struct {
208 			struct ip ip;
209 			struct udphdr udp;
210 			uint8_t body[MAX_BODYSIZE];	/* hardwired */
211 		} ipv4;
212 		struct {
213 			struct ip6_hdr ip;
214 			struct udphdr udp;
215 			uint8_t body[MAX_BODYSIZE];	/* hardwired */
216 		} ipv6;
217 	};
218 } __attribute__((__packed__));
219 
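/* access field f of the IPv4 or the IPv6 variant of packet p, depending on af */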
220 #define	PKT(p, f, af)	\
221     ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f)
222 
223 struct ip_range {
224 	const char *name;
225 	union {
226 		struct {
227 			uint32_t start, end; /* same as struct in_addr */
228 		} ipv4;
229 		struct {
230 			struct in6_addr start, end;
231 			uint8_t sgroup, egroup;
232 		} ipv6;
233 	};
234 	uint16_t port0, port1;
235 };
236 
237 struct mac_range {
238 	const char *name;
239 	struct ether_addr start, end;
240 };
241 
242 /* ifname can be netmap:foo-xxxx */
243 #define MAX_IFNAMELEN	512	/* our buffer for ifname */
244 //#define MAX_PKTSIZE	1536
245 #define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
246 
247 /* compact timestamp that fits in a 60-byte packet (enough to measure the RTT) */
248 struct tstamp {
249 	uint32_t sec;
250 	uint32_t nsec;
251 };
252 
253 /*
254  * global arguments for all threads
255  */
256 
257 struct glob_arg {
258 	int af;		/* address family AF_INET/AF_INET6 */
259 	struct ip_range src_ip;
260 	struct ip_range dst_ip;
261 	struct mac_range dst_mac;
262 	struct mac_range src_mac;
263 	int pkt_size;
264 	int pkt_min_size;
265 	int burst;
266 	int forever;
267 	uint64_t npackets;	/* total packets to send */
268 	int frags;		/* fragments per packet */
269 	u_int frag_size;	/* size of each fragment */
270 	int nthreads;
271 	int cpus;	/* cpus used for running */
272 	int system_cpus;	/* cpus on the system */
273 
274 	int options;	/* testing */
275 #define OPT_PREFETCH	1
276 #define OPT_ACCESS	2
277 #define OPT_COPY	4
278 #define OPT_MEMCPY	8
279 #define OPT_TS		16	/* add a timestamp */
280 #define OPT_INDIRECT	32	/* use indirect buffers, tx only */
281 #define OPT_DUMP	64	/* dump rx/tx traffic */
282 #define OPT_RUBBISH	256	/* send whatever the buffers contain */
283 #define OPT_RANDOM_SRC  512
284 #define OPT_RANDOM_DST  1024
285 #define OPT_PPS_STATS   2048
286 #define OPT_UPDATE_CSUM 4096
287 	int dev_type;
288 #ifndef NO_PCAP
289 	pcap_t *p;
290 #endif
291 
292 	int tx_rate;
293 	struct timespec tx_period;
294 
295 	int affinity;
296 	int main_fd;
297 	struct nmport_d *nmd;
298 	uint32_t orig_mode;
299 	int report_interval;		/* milliseconds between prints */
300 	void *(*td_body)(void *);
301 	int td_type;
302 	void *mmap_addr;
303 	char ifname[MAX_IFNAMELEN];
304 	const char *nmr_config;
305 	int dummy_send;
306 	int virt_header;	/* send also the virt_header */
307 	char *packet_file;	/* -P option */
308 #define	STATS_WIN	15
309 	int win_idx;
310 	int64_t win[STATS_WIN];
311 	int wait_link;
312 	int framing;		/* #bits of framing (for bw output) */
313 };
314 enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
315 
316 enum {
317 	TD_TYPE_SENDER = 1,
318 	TD_TYPE_RECEIVER,
319 	TD_TYPE_OTHER,
320 };
321 
322 /*
323  * Arguments for a new thread. The same structure is used by
324  * the source and the sink
325  */
326 struct targ {
327 	struct glob_arg *g;
328 	int used;
329 	int completed;
330 	int cancel;
331 	int fd;
332 	struct nmport_d *nmd;
333 	/* these ought to be volatile, but they are
334 	 * only sampled and errors should not accumulate
335 	 */
336 	struct my_ctrs ctr;
337 
338 	struct timespec tic, toc;
339 	int me;
340 	pthread_t thread;
341 	int affinity;
342 
343 	struct pkt pkt;
344 	void *frame;
345 	uint16_t seed[3];
346 	u_int frags;
347 	u_int frag_size;
348 };
349 
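/* one's complement 16-bit addition with end-around carry (Internet checksum arithmetic) */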
350 static __inline uint16_t
351 cksum_add(uint16_t sum, uint16_t a)
352 {
353 	uint16_t res;
354 
355 	res = sum + a;
356 	return (res + (res < a));
357 }
358 
359 static void
360 extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port)
361 {
362 	struct in_addr a;
363 	char *pp;
364 
365 	pp = strchr(name, ':');
366 	if (pp != NULL) {	/* do we have ports ? */
367 		*pp++ = '\0';
368 		*port = (uint16_t)strtol(pp, NULL, 0);
369 	}
370 
371 	inet_pton(AF_INET, name, &a);
372 	*addr = ntohl(a.s_addr);
373 }
374 
375 static void
376 extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port,
377     uint8_t *group)
378 {
379 	char *pp;
380 
381 	/*
382 	 * We accept IPv6 address in the following form:
383 	 *  group@[2001:DB8::1001]:port	(w/ brackets and port)
384 	 *  group@[2001:DB8::1]		(w/ brackets and w/o port)
385 	 *  group@2001:DB8::1234	(w/o brackets and w/o port)
386 	 */
387 	pp = strchr(name, '@');
388 	if (pp != NULL) {
389 		*pp++ = '\0';
390 		*group = (uint8_t)strtol(name, NULL, 0);
391 		if (*group > 7)
392 			*group = 7;
393 		name = pp;
394 	}
395 	if (name[0] == '[')
396 		name++;
397 	pp = strchr(name, ']');
398 	if (pp != NULL)
399 		*pp++ = '\0';
400 	if (pp != NULL && *pp != ':')
401 		pp = NULL;
402 	if (pp != NULL) {	/* do we have ports ? */
403 		*pp++ = '\0';
404 		*port = (uint16_t)strtol(pp, NULL, 0);
405 	}
406 	inet_pton(AF_INET6, name, addr);
407 }
408 /*
409  * extract the extremes from a range of IPv4 or IPv6 addresses.
410  * addr_lo[-addr_hi][:port_lo[-port_hi]]
411  */
412 static int
413 extract_ip_range(struct ip_range *r, int af)
414 {
415 	char *name, *ap, start[INET6_ADDRSTRLEN];
416 	char end[INET6_ADDRSTRLEN];
417 	struct in_addr a;
418 	uint32_t tmp;
419 
420 	if (verbose)
421 		D("extract IP range from %s", r->name);
422 
423 	name = strdup(r->name);
424 	if (name == NULL) {
425 		D("strdup failed");
426 		usage(-1);
427 	}
428 	/* the first - splits start/end of range */
429 	ap = strchr(name, '-');
430 	if (ap != NULL)
431 		*ap++ = '\0';
432 	r->port0 = 1234;	/* default port */
433 	if (af == AF_INET6) {
434 		r->ipv6.sgroup = 7; /* default group */
435 		extract_ipv6_addr(name, &r->ipv6.start, &r->port0,
436 		    &r->ipv6.sgroup);
437 	} else
438 		extract_ipv4_addr(name, &r->ipv4.start, &r->port0);
439 
440 	r->port1 = r->port0;
441 	if (af == AF_INET6) {
442 		if (ap != NULL) {
443 			r->ipv6.egroup = r->ipv6.sgroup;
444 			extract_ipv6_addr(ap, &r->ipv6.end, &r->port1,
445 			    &r->ipv6.egroup);
446 		} else {
447 			r->ipv6.end = r->ipv6.start;
448 			r->ipv6.egroup = r->ipv6.sgroup;
449 		}
450 	} else {
451 		if (ap != NULL) {
452 			extract_ipv4_addr(ap, &r->ipv4.end, &r->port1);
453 			if (r->ipv4.start > r->ipv4.end) {
454 				tmp = r->ipv4.end;
455 				r->ipv4.end = r->ipv4.start;
456 				r->ipv4.start = tmp;
457 			}
458 		} else
459 			r->ipv4.end = r->ipv4.start;
460 	}
461 
462 	if (r->port0 > r->port1) {
463 		tmp = r->port0;
464 		r->port0 = r->port1;
465 		r->port1 = tmp;
466 	}
467 	if (af == AF_INET) {
468 		a.s_addr = htonl(r->ipv4.start);
469 		inet_ntop(af, &a, start, sizeof(start));
470 		a.s_addr = htonl(r->ipv4.end);
471 		inet_ntop(af, &a, end, sizeof(end));
472 	} else {
473 		inet_ntop(af, &r->ipv6.start, start, sizeof(start));
474 		inet_ntop(af, &r->ipv6.end, end, sizeof(end));
475 	}
476 	if (af == AF_INET)
477 		D("range is %s:%d to %s:%d", start, r->port0, end, r->port1);
478 	else
479 		D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup,
480 		    start, r->port0, r->ipv6.egroup, end, r->port1);
481 
482 	free(name);
483 	if (r->port0 != r->port1 ||
484 	    (af == AF_INET && r->ipv4.start != r->ipv4.end) ||
485 	    (af == AF_INET6 &&
486 		!IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end)))
487 		return (OPT_COPY);
488 	return (0);
489 }
490 
491 static int
492 extract_mac_range(struct mac_range *r)
493 {
494 	struct ether_addr *e;
495 	if (verbose)
496 	    D("extract MAC range from %s", r->name);
497 
498 	e = ether_aton(r->name);
499 	if (e == NULL) {
500 		D("invalid MAC address '%s'", r->name);
501 		return 1;
502 	}
503 	bcopy(e, &r->start, 6);
504 	bcopy(e, &r->end, 6);
505 #if 0
506 	bcopy(targ->src_mac, eh->ether_shost, 6);
507 	p = index(targ->g->src_mac, '-');
508 	if (p)
509 		targ->src_mac_range = atoi(p+1);
510 
511 	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
512 	bcopy(targ->dst_mac, eh->ether_dhost, 6);
513 	p = index(targ->g->dst_mac, '-');
514 	if (p)
515 		targ->dst_mac_range = atoi(p+1);
516 #endif
517 	if (verbose)
518 		D("%s starts at %s", r->name, ether_ntoa(&r->start));
519 	return 0;
520 }
521 
522 static int
523 get_if_mtu(const struct glob_arg *g)
524 {
525 	struct ifreq ifreq;
526 	int s, ret;
527 	const char *ifname = g->nmd->hdr.nr_name;
528 	size_t len;
529 
530 	if (!strncmp(g->ifname, "netmap:", 7) && !strchr(ifname, '{')
531 			&& !strchr(ifname, '}')) {
532 
533 		len = strlen(ifname);
534 
535 		if (len > IFNAMSIZ) {
536 			D("'%s' too long, cannot ask for MTU", ifname);
537 			return -1;
538 		}
539 
540 		s = socket(AF_INET, SOCK_DGRAM, 0);
541 		if (s < 0) {
542 			D("socket() failed: %s", strerror(errno));
543 			return s;
544 		}
545 
546 		memset(&ifreq, 0, sizeof(ifreq));
547 		memcpy(ifreq.ifr_name, ifname, len);
548 
549 		ret = ioctl(s, SIOCGIFMTU, &ifreq);
550 		if (ret) {
551 			D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno));
552 		}
553 
554 		close(s);
555 
556 		return ifreq.ifr_mtu;
557 	}
558 
559 	/* This is a pipe or a VALE port, where the MTU is very large,
560 	 * so we use some practical limit. */
561 	return 65536;
562 }
563 
564 static struct targ *targs;
565 static int global_nthreads;
566 
567 /* control-C handler */
568 static void
569 sigint_h(int sig)
570 {
571 	int i;
572 
573 	(void)sig;	/* UNUSED */
574 	D("received control-C on thread %p", (void *)pthread_self());
575 	for (i = 0; i < global_nthreads; i++) {
576 		targs[i].cancel = 1;
577 	}
578 }
579 
580 /* sysctl wrapper to return the number of active CPUs */
581 static int
582 system_ncpus(void)
583 {
584 	int ncpus;
585 #if defined (__FreeBSD__)
586 	int mib[2] = { CTL_HW, HW_NCPU };
587 	size_t len = sizeof(ncpus);
588 	sysctl(mib, 2, &ncpus, &len, NULL, 0);
589 #elif defined(linux)
590 	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
591 #elif defined(_WIN32)
592 	{
593 		SYSTEM_INFO sysinfo;
594 		GetSystemInfo(&sysinfo);
595 		ncpus = sysinfo.dwNumberOfProcessors;
596 	}
597 #else /* others */
598 	ncpus = 1;
599 #endif /* others */
600 	return (ncpus);
601 }
602 
603 #ifdef __linux__
604 #define sockaddr_dl    sockaddr_ll
605 #define sdl_family     sll_family
606 #define AF_LINK        AF_PACKET
607 #define LLADDR(s)      s->sll_addr;
608 #include <linux/if_tun.h>
609 #define TAP_CLONEDEV	"/dev/net/tun"
610 #endif /* __linux__ */
611 
612 #ifdef __FreeBSD__
613 #include <net/if_tun.h>
614 #define TAP_CLONEDEV	"/dev/tap"
615 #endif /* __FreeBSD */
616 
617 #ifdef __APPLE__
618 // #warning TAP not supported on apple ?
619 #include <net/if_utun.h>
620 #define TAP_CLONEDEV	"/dev/tap"
621 #endif /* __APPLE__ */
622 
623 
624 /*
625  * parse the vale configuration in conf and put it in nmr.
626  * Return the flag set if necessary.
627  * The configuration may consist of 1 to 4 numbers separated
628  * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
629  * Missing numbers or zeroes stand for default values.
630  * As an additional convenience, if exactly one number
631  * is specified, then this is assigned to both #tx-slots and #rx-slots.
632  * If there is no 4th number, then the 3rd is assigned to both #tx-rings
633  * and #rx-rings.
634  */
635 static int
636 parse_nmr_config(const char* conf, struct nmreq_register *nmr)
637 {
638 	char *w, *tok;
639 	int i, v;
640 
641 	if (conf == NULL || ! *conf)
642 		return 0;
643 	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
644 	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
645 	w = strdup(conf);
646 	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
647 		v = atoi(tok);
648 		switch (i) {
649 		case 0:
650 			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
651 			break;
652 		case 1:
653 			nmr->nr_rx_slots = v;
654 			break;
655 		case 2:
656 			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
657 			break;
658 		case 3:
659 			nmr->nr_rx_rings = v;
660 			break;
661 		default:
662 			D("ignored config: %s", tok);
663 			break;
664 		}
665 	}
666 	D("txr %d txd %d rxr %d rxd %d",
667 			nmr->nr_tx_rings, nmr->nr_tx_slots,
668 			nmr->nr_rx_rings, nmr->nr_rx_slots);
669 	free(w);
670 	return 0;
671 }
672 
673 
674 /*
675  * locate the src mac address for our interface, put it
676  * into the user-supplied buffer. return 0 if ok, -1 on error.
677  */
678 static int
679 source_hwaddr(const char *ifname, char *buf)
680 {
681 	struct ifaddrs *ifaphead, *ifap;
682 
683 	if (getifaddrs(&ifaphead) != 0) {
684 		D("getifaddrs %s failed", ifname);
685 		return (-1);
686 	}
687 
688 	/* remove 'netmap:' prefix before comparing interfaces */
689 	if (!strncmp(ifname, "netmap:", 7))
690 		ifname = &ifname[7];
691 
692 	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
693 		struct sockaddr_dl *sdl =
694 			(struct sockaddr_dl *)ifap->ifa_addr;
695 		uint8_t *mac;
696 
697 		if (!sdl || sdl->sdl_family != AF_LINK)
698 			continue;
699 		if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0)
700 			continue;
701 		mac = (uint8_t *)LLADDR(sdl);
702 		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
703 			mac[0], mac[1], mac[2],
704 			mac[3], mac[4], mac[5]);
705 		if (verbose)
706 			D("source hwaddr %s", buf);
707 		break;
708 	}
709 	freeifaddrs(ifaphead);
710 	return ifap ? 0 : 1;
711 }
712 
713 
714 /* set the thread affinity. */
715 static int
716 setaffinity(pthread_t me, int i)
717 {
718 	cpuset_t cpumask;
719 
720 	if (i == -1)
721 		return 0;
722 
723 	/* Set the thread affinity. */
724 	CPU_ZERO(&cpumask);
725 	CPU_SET(i, &cpumask);
726 
727 	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
728 		D("Unable to set affinity: %s", strerror(errno));
729 		return 1;
730 	}
731 	return 0;
732 }
733 
734 
735 /* Compute the checksum of the given ip header. */
736 static uint32_t
737 checksum(const void *data, uint16_t len, uint32_t sum)
738 {
739 	const uint8_t *addr = data;
740 	uint32_t i;
741 
742 	/* Checksum all the pairs of bytes first... */
743 	for (i = 0; i < (len & ~1U); i += 2) {
744 		sum += (uint16_t)ntohs(*((const uint16_t *)(addr + i)));
745 		if (sum > 0xFFFF)
746 			sum -= 0xFFFF;
747 	}
748 	/*
749 	 * If there's a single byte left over, checksum it, too.
750 	 * Network byte order is big-endian, so the remaining byte is
751 	 * the high byte.
752 	 */
753 	if (i < len) {
754 		sum += addr[i] << 8;
755 		if (sum > 0xFFFF)
756 			sum -= 0xFFFF;
757 	}
758 	return sum;
759 }
760 
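/* complement the accumulated checksum, truncate it to 16 bits and return it in network byte order */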
761 static uint16_t
762 wrapsum(uint32_t sum)
763 {
764 	sum = ~sum & 0xFFFF;
765 	return (htons(sum));
766 }
767 
768 /* Dump the payload of a packet in hex and printable ASCII
769  * (useful for debugging).
770  */
771 static void
772 dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur)
773 {
774 	char buf[128];
775 	int i, j, i0;
776 	const unsigned char *p = (const unsigned char *)_p;
777 
778 	/* get the length in ASCII of the length of the packet. */
779 
780 	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
781 		ring, cur, ring->slot[cur].buf_idx,
782 		ring->slot[cur].flags, len);
783 	/* hexdump routine */
784 	for (i = 0; i < len; ) {
785 		memset(buf, ' ', sizeof(buf));
786 		sprintf(buf, "%5d: ", i);
787 		i0 = i;
788 		for (j=0; j < 16 && i < len; i++, j++)
789 			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
790 		i = i0;
791 		for (j=0; j < 16 && i < len; i++, j++)
792 			sprintf(buf+7+j + 48, "%c",
793 				isprint(p[i]) ? p[i] : '.');
794 		printf("%s\n", buf);
795 	}
796 }
797 
798 /*
799  * Fill a packet with some payload.
800  * We create a UDP packet so the payload starts at
801  *	14+20+8 = 42 bytes.
802  */
803 #ifdef __linux__
804 #define uh_sport source
805 #define uh_dport dest
806 #define uh_ulen len
807 #define uh_sum check
808 #endif /* linux */
809 
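/* Incremental checksum helpers: fold out the old value of a field and fold
 * in the new one (RFC 1624 style), for 32-bit addresses and 16-bit ports.
 */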
810 static uint16_t
811 new_ip_sum(uint16_t ip_sum, uint32_t oaddr, uint32_t naddr)
812 {
813 	ip_sum = cksum_add(ip_sum, ~oaddr >> 16);
814 	ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff);
815 	ip_sum = cksum_add(ip_sum, naddr >> 16);
816 	ip_sum = cksum_add(ip_sum, naddr & 0xffff);
817 	return ip_sum;
818 }
819 
820 static uint16_t
821 new_udp_sum(uint16_t udp_sum, uint16_t oport, uint16_t nport)
822 {
823 	udp_sum = cksum_add(udp_sum, ~oport);
824 	udp_sum = cksum_add(udp_sum, nport);
825 	return udp_sum;
826 }
827 
828 
829 static void
830 update_ip(struct pkt *pkt, struct targ *t)
831 {
832 	struct glob_arg *g = t->g;
833 	struct ip ip;
834 	struct udphdr udp;
835 	uint32_t oaddr, naddr;
836 	uint16_t oport, nport;
837 	uint16_t ip_sum = 0, udp_sum = 0;
838 
839 	memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
840 	memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));
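	/* sequential mode advances one field at a time (carrying into the next
	 * on wrap-around) and breaks out; random mode rewrites both src and dst
	 * on every call */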
841 	do {
842 		ip_sum = udp_sum = 0;
843 		naddr = oaddr = ntohl(ip.ip_src.s_addr);
844 		nport = oport = ntohs(udp.uh_sport);
845 		if (g->options & OPT_RANDOM_SRC) {
846 			ip.ip_src.s_addr = nrand48(t->seed);
847 			udp.uh_sport = nrand48(t->seed);
848 			naddr = ntohl(ip.ip_src.s_addr);
849 			nport = ntohs(udp.uh_sport);
850 			ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
851 			udp_sum = new_udp_sum(udp_sum, oport, nport);
852 		} else {
853 			if (oport < g->src_ip.port1) {
854 				nport = oport + 1;
855 				udp.uh_sport = htons(nport);
856 				udp_sum = new_udp_sum(udp_sum, oport, nport);
857 				break;
858 			}
859 			nport = g->src_ip.port0;
860 			udp.uh_sport = htons(nport);
861 			if (oaddr < g->src_ip.ipv4.end) {
862 				naddr = oaddr + 1;
863 				ip.ip_src.s_addr = htonl(naddr);
864 				ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
865 				break;
866 			}
867 			naddr = g->src_ip.ipv4.start;
868 			ip.ip_src.s_addr = htonl(naddr);
869 			ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
870 		}
871 
872 		naddr = oaddr = ntohl(ip.ip_dst.s_addr);
873 		nport = oport = ntohs(udp.uh_dport);
874 		if (g->options & OPT_RANDOM_DST) {
875 			ip.ip_dst.s_addr = nrand48(t->seed);
876 			udp.uh_dport = nrand48(t->seed);
877 			naddr = ntohl(ip.ip_dst.s_addr);
878 			nport = ntohs(udp.uh_dport);
879 			ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
880 			udp_sum = new_udp_sum(udp_sum, oport, nport);
881 		} else {
882 			if (oport < g->dst_ip.port1) {
883 				nport = oport + 1;
884 				udp.uh_dport = htons(nport);
885 				udp_sum = new_udp_sum(udp_sum, oport, nport);
886 				break;
887 			}
888 			nport = g->dst_ip.port0;
889 			udp.uh_dport = htons(nport);
890 			if (oaddr < g->dst_ip.ipv4.end) {
891 				naddr = oaddr + 1;
892 				ip.ip_dst.s_addr = htonl(naddr);
893 				ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
894 				break;
895 			}
896 			naddr = g->dst_ip.ipv4.start;
897 			ip.ip_dst.s_addr = htonl(naddr);
898 			ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
899 		}
900 	} while (0);
901 	/* update checksums */
902 	if (udp_sum != 0)
903 		udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum));
904 	if (ip_sum != 0) {
905 		ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));
906 		udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum));
907 	}
908 	memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
909 	memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
910 }
911 
912 #ifndef s6_addr16
913 #define	s6_addr16	__u6_addr.__u6_addr16
914 #endif
915 static void
916 update_ip6(struct pkt *pkt, struct targ *t)
917 {
918 	struct glob_arg *g = t->g;
919 	struct ip6_hdr ip6;
920 	struct udphdr udp;
921 	uint16_t udp_sum;
922 	uint16_t oaddr, naddr;
923 	uint16_t oport, nport;
924 	uint8_t group;
925 
926 	memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
927 	memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));
928 	do {
929 		udp_sum = 0;
930 		group = g->src_ip.ipv6.sgroup;
931 		naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]);
932 		nport = oport = ntohs(udp.uh_sport);
933 		if (g->options & OPT_RANDOM_SRC) {
934 			ip6.ip6_src.s6_addr16[group] = nrand48(t->seed);
935 			udp.uh_sport = nrand48(t->seed);
936 			naddr = ntohs(ip6.ip6_src.s6_addr16[group]);
937 			nport = ntohs(udp.uh_sport);
938 			break;
939 		}
940 		if (oport < g->src_ip.port1) {
941 			nport = oport + 1;
942 			udp.uh_sport = htons(nport);
943 			break;
944 		}
945 		nport = g->src_ip.port0;
946 		udp.uh_sport = htons(nport);
947 		if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) {
948 			naddr = oaddr + 1;
949 			ip6.ip6_src.s6_addr16[group] = htons(naddr);
950 			break;
951 		}
952 		naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]);
953 		ip6.ip6_src.s6_addr16[group] = htons(naddr);
954 
955 		/* update checksums if needed */
956 		if (oaddr != naddr)
957 			udp_sum = cksum_add(~oaddr, naddr);
958 		if (oport != nport)
959 			udp_sum = cksum_add(udp_sum,
960 			    cksum_add(~oport, nport));
961 
962 		group = g->dst_ip.ipv6.egroup;
963 		naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
964 		nport = oport = ntohs(udp.uh_dport);
965 		if (g->options & OPT_RANDOM_DST) {
966 			ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed);
967 			udp.uh_dport = nrand48(t->seed);
968 			naddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
969 			nport = ntohs(udp.uh_dport);
970 			break;
971 		}
972 		if (oport < g->dst_ip.port1) {
973 			nport = oport + 1;
974 			udp.uh_dport = htons(nport);
975 			break;
976 		}
977 		nport = g->dst_ip.port0;
978 		udp.uh_dport = htons(nport);
979 		if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) {
980 			naddr = oaddr + 1;
981 			ip6.ip6_dst.s6_addr16[group] = htons(naddr);
982 			break;
983 		}
984 		naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]);
985 		ip6.ip6_dst.s6_addr16[group] = htons(naddr);
986 	} while (0);
987 	/* update checksums */
988 	if (oaddr != naddr)
989 		udp_sum = cksum_add(udp_sum,
990 		    cksum_add(~oaddr, naddr));
991 	if (oport != nport)
992 		udp_sum = cksum_add(udp_sum,
993 		    cksum_add(~oport, nport));
994 	if (udp_sum != 0)
995 		udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum);
996 	memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
997 	memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
998 }
999 
1000 static void
1001 update_addresses(struct pkt *pkt, struct targ *t)
1002 {
1003 
1004 	if (t->g->af == AF_INET)
1005 		update_ip(pkt, t);
1006 	else
1007 		update_ip6(pkt, t);
1008 }
1009 
1010 static void
1011 update_ip_size(struct pkt *pkt, int size)
1012 {
1013 	struct ip ip;
1014 	struct udphdr udp;
1015 	uint16_t oiplen, niplen;
1016 	uint16_t nudplen;
1017 	uint16_t ip_sum = 0;
1018 
1019 	memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
1020 	memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));
1021 
1022 	oiplen = ntohs(ip.ip_len);
1023 	niplen = size - sizeof(struct ether_header);
1024 	ip.ip_len = htons(niplen);
1025 	nudplen = niplen - sizeof(struct ip);
1026 	udp.uh_ulen = htons(nudplen);
1027 	ip_sum = new_udp_sum(ip_sum, oiplen, niplen);
1028 
1029 	/* update checksums */
1030 	if (ip_sum != 0)
1031 		ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));
1032 
1033 	udp.uh_sum = 0;
1034 	/* Magic: taken from sbin/dhclient/packet.c */
1035 	udp.uh_sum = wrapsum(
1036 		checksum(&udp, sizeof(udp),	/* udp header */
1037 		checksum(pkt->ipv4.body,	/* udp payload */
1038 		nudplen - sizeof(udp),
1039 		checksum(&ip.ip_src, /* pseudo header */
1040 		2 * sizeof(ip.ip_src),
1041 		IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
1042 
1043 	memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1044 	memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
1045 }
1046 
1047 static void
1048 update_ip6_size(struct pkt *pkt, int size)
1049 {
1050 	struct ip6_hdr ip6;
1051 	struct udphdr udp;
1052 	uint16_t niplen, nudplen;
1053 	uint32_t csum;
1054 
1055 	memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
1056 	memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));
1057 
1058 	nudplen = niplen = size - sizeof(struct ether_header) - sizeof(ip6);
1059 	ip6.ip6_plen = htons(niplen);
1060 	udp.uh_ulen = htons(nudplen);
1061 
1062 	/* Save part of pseudo header checksum into csum */
1063 	udp.uh_sum = 0;
1064 	csum = IPPROTO_UDP << 24;
1065 	csum = checksum(&csum, sizeof(csum), nudplen);
1066 	udp.uh_sum = wrapsum(
1067 		checksum(&udp, sizeof(udp),	/* udp header */
1068 		checksum(pkt->ipv6.body,	/* udp payload */
1069 		nudplen - sizeof(udp),
1070 		checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
1071 		2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));
1072 
1073 	memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
1074 	memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
1075 }
1076 
1077 static void
1078 update_size(struct pkt *pkt, struct targ *t, int size)
1079 {
1080 	if (t->g->options & OPT_UPDATE_CSUM) {
1081 		if (t->g->af == AF_INET)
1082 			update_ip_size(pkt, size);
1083 		else
1084 			update_ip6_size(pkt, size);
1085 	}
1086 }
1087 
1088 /*
1089  * initialize one packet and prepare for the next one.
1090  * The copy could be done better instead of repeating it each time.
1091  */
1092 static void
1093 initialize_packet(struct targ *targ)
1094 {
1095 	struct pkt *pkt = &targ->pkt;
1096 	struct ether_header *eh;
1097 	struct ip6_hdr ip6;
1098 	struct ip ip;
1099 	struct udphdr udp;
1100 	void *udp_ptr;
1101 	uint16_t paylen;
1102 	uint32_t csum = 0;
1103 	const char *payload = targ->g->options & OPT_INDIRECT ?
1104 		indirect_payload : default_payload;
1105 	int i, l0 = strlen(payload);
1106 
1107 #ifndef NO_PCAP
1108 	char errbuf[PCAP_ERRBUF_SIZE];
1109 	pcap_t *file;
1110 	struct pcap_pkthdr *header;
1111 	const unsigned char *packet;
1112 
1113 	/* Read a packet from a PCAP file if asked. */
1114 	if (targ->g->packet_file != NULL) {
1115 		if ((file = pcap_open_offline(targ->g->packet_file,
1116 			    errbuf)) == NULL)
1117 			D("failed to open pcap file %s",
1118 			    targ->g->packet_file);
1119 		if (pcap_next_ex(file, &header, &packet) < 0)
1120 			D("failed to read packet from %s",
1121 			    targ->g->packet_file);
1122 		if ((targ->frame = malloc(header->caplen)) == NULL)
1123 			D("out of memory");
1124 		bcopy(packet, (unsigned char *)targ->frame, header->caplen);
1125 		targ->g->pkt_size = header->caplen;
1126 		pcap_close(file);
1127 		return;
1128 	}
1129 #endif
1130 
1131 	paylen = targ->g->pkt_size - sizeof(*eh) -
1132 	    (targ->g->af == AF_INET ? sizeof(ip): sizeof(ip6));
1133 
1134 	/* create a nice NUL-terminated string */
1135 	for (i = 0; i < paylen; i += l0) {
1136 		if (l0 > paylen - i)
1137 			l0 = paylen - i; // last round
1138 		bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0);
1139 	}
1140 	PKT(pkt, body, targ->g->af)[i - 1] = '\0';
1141 
1142 	/* prepare the headers */
1143 	eh = &pkt->eh;
1144 	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
1145 	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
1146 
1147 	if (targ->g->af == AF_INET) {
1148 		eh->ether_type = htons(ETHERTYPE_IP);
1149 		memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
1150 		udp_ptr = &pkt->ipv4.udp;
1151 		ip.ip_v = IPVERSION;
1152 		ip.ip_hl = sizeof(ip) >> 2;
1153 		ip.ip_id = 0;
1154 		ip.ip_tos = IPTOS_LOWDELAY;
1155 		ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh));
1157 		ip.ip_off = htons(IP_DF); /* Don't fragment */
1158 		ip.ip_ttl = IPDEFTTL;
1159 		ip.ip_p = IPPROTO_UDP;
1160 		ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start);
1161 		ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start);
1162 		ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0));
1163 		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1164 	} else {
1165 		eh->ether_type = htons(ETHERTYPE_IPV6);
1166 		memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
1167 		udp_ptr = &pkt->ipv6.udp;
1168 		ip6.ip6_flow = 0;
1169 		ip6.ip6_plen = htons(paylen);
1170 		ip6.ip6_vfc = IPV6_VERSION;
1171 		ip6.ip6_nxt = IPPROTO_UDP;
1172 		ip6.ip6_hlim = IPV6_DEFHLIM;
1173 		ip6.ip6_src = targ->g->src_ip.ipv6.start;
1174 		ip6.ip6_dst = targ->g->dst_ip.ipv6.start;
1175 	}
1176 	memcpy(&udp, udp_ptr, sizeof(udp));
1177 
1178 	udp.uh_sport = htons(targ->g->src_ip.port0);
1179 	udp.uh_dport = htons(targ->g->dst_ip.port0);
1180 	udp.uh_ulen = htons(paylen);
1181 	if (targ->g->af == AF_INET) {
1182 		/* Magic: taken from sbin/dhclient/packet.c */
1183 		udp.uh_sum = wrapsum(
1184 		    checksum(&udp, sizeof(udp),	/* udp header */
1185 		    checksum(pkt->ipv4.body,	/* udp payload */
1186 		    paylen - sizeof(udp),
1187 		    checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */
1188 			2 * sizeof(pkt->ipv4.ip.ip_src),
1189 			IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
1190 		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1191 	} else {
1192 		/* Save part of pseudo header checksum into csum */
1193 		csum = IPPROTO_UDP << 24;
1194 		csum = checksum(&csum, sizeof(csum), paylen);
1195 		udp.uh_sum = wrapsum(
1196 		    checksum(&udp, sizeof(udp),	/* udp header */
1197 		    checksum(pkt->ipv6.body,	/* udp payload */
1198 		    paylen - sizeof(udp),
1199 		    checksum(&ip6.ip6_src, /* pseudo header */
1200 			2 * sizeof(ip6.ip6_src), csum))));
1201 		memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
1202 	}
1203 	memcpy(udp_ptr, &udp, sizeof(udp));
1204 
1205 	bzero(&pkt->vh, sizeof(pkt->vh));
1206 	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
1207 }
1208 
1209 static void
1210 get_vnet_hdr_len(struct glob_arg *g)
1211 {
1212 	struct nmreq_header hdr;
1213 	struct nmreq_port_hdr ph;
1214 	int err;
1215 
1216 	hdr = g->nmd->hdr; /* copy name and version */
1217 	hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
1218 	hdr.nr_options = 0;
1219 	memset(&ph, 0, sizeof(ph));
1220 	hdr.nr_body = (uintptr_t)&ph;
1221 	err = ioctl(g->main_fd, NIOCCTRL, &hdr);
1222 	if (err) {
1223 		D("Unable to get virtio-net header length");
1224 		return;
1225 	}
1226 
1227 	g->virt_header = ph.nr_hdr_len;
1228 	if (g->virt_header) {
1229 		D("Port requires virtio-net header, length = %d",
1230 		  g->virt_header);
1231 	}
1232 }
1233 
1234 static void
1235 set_vnet_hdr_len(struct glob_arg *g)
1236 {
1237 	int err, l = g->virt_header;
1238 	struct nmreq_header hdr;
1239 	struct nmreq_port_hdr ph;
1240 
1241 	if (l == 0)
1242 		return;
1243 
1244 	hdr = g->nmd->hdr; /* copy name and version */
1245 	hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
1246 	hdr.nr_options = 0;
1247 	memset(&ph, 0, sizeof(ph));
1248 	hdr.nr_body = (uintptr_t)&ph;
	ph.nr_hdr_len = l;	/* pass the requested vnet-hdr length to the port */
1249 	err = ioctl(g->main_fd, NIOCCTRL, &hdr);
1250 	if (err) {
1251 		D("Unable to set virtio-net header length %d", l);
1252 	}
1253 }
1254 
1255 /*
1256  * create and enqueue a batch of packets on a ring.
1257  * On the last one set NS_REPORT to tell the driver to generate
1258  * an interrupt when done.
1259  */
1260 static int
1261 send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
1262 		int size, struct targ *t, u_int count, int options)
1263 {
1264 	u_int n, sent, head = ring->head;
1265 	u_int frags = t->frags;
1266 	u_int frag_size = t->frag_size;
1267 	struct netmap_slot *slot = &ring->slot[head];
1268 
1269 	n = nm_ring_space(ring);
1270 #if 0
1271 	if (options & (OPT_COPY | OPT_PREFETCH) ) {
1272 		for (sent = 0; sent < count; sent++) {
1273 			struct netmap_slot *slot = &ring->slot[head];
1274 			char *p = NETMAP_BUF(ring, slot->buf_idx);
1275 
1276 			__builtin_prefetch(p);
1277 			head = nm_ring_next(ring, head);
1278 		}
1279 		head = ring->head;
1280 	}
1281 #endif
1282 	for (sent = 0; sent < count && n >= frags; sent++, n--) {
1283 		char *p;
1284 		int buf_changed;
1285 		u_int tosend = size;
1286 
1287 		slot = &ring->slot[head];
1288 		p = NETMAP_BUF(ring, slot->buf_idx);
1289 		buf_changed = slot->flags & NS_BUF_CHANGED;
1290 
1291 		slot->flags = 0;
1292 		if (options & OPT_RUBBISH) {
1293 			/* do nothing */
1294 		} else if (options & OPT_INDIRECT) {
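			/* zero copy: let netmap read the frame directly from our buffer */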
1295 			slot->flags |= NS_INDIRECT;
1296 			slot->ptr = (uint64_t)((uintptr_t)frame);
1297 		} else if (frags > 1) {
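			/* split the frame across consecutive slots, chained with NS_MOREFRAG */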
1298 			u_int i;
1299 			const char *f = frame;
1300 			char *fp = p;
1301 			for (i = 0; i < frags - 1; i++) {
1302 				memcpy(fp, f, frag_size);
1303 				slot->len = frag_size;
1304 				slot->flags = NS_MOREFRAG;
1305 				if (options & OPT_DUMP)
1306 					dump_payload(fp, frag_size, ring, head);
1307 				tosend -= frag_size;
1308 				f += frag_size;
1309 				head = nm_ring_next(ring, head);
1310 				slot = &ring->slot[head];
1311 				fp = NETMAP_BUF(ring, slot->buf_idx);
1312 			}
1313 			n -= (frags - 1);
1314 			p = fp;
1315 			slot->flags = 0;
1316 			memcpy(p, f, tosend);
1317 			update_addresses(pkt, t);
1318 		} else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) {
1319 			if (options & OPT_COPY)
1320 				nm_pkt_copy(frame, p, size);
1321 			else
1322 				memcpy(p, frame, size);
1323 			update_addresses(pkt, t);
1324 		} else if (options & OPT_PREFETCH) {
1325 			__builtin_prefetch(p);
1326 		}
1327 		slot->len = tosend;
1328 		if (options & OPT_DUMP)
1329 			dump_payload(p, tosend, ring, head);
1330 		head = nm_ring_next(ring, head);
1331 	}
1332 	if (sent) {
1333 		slot->flags |= NS_REPORT;
1334 		ring->head = ring->cur = head;
1335 	}
1336 	if (sent < count) {
1337 		/* tell netmap that we need more slots */
1338 		ring->cur = ring->tail;
1339 	}
1340 
1341 	return (sent);
1342 }
1343 
1344 /*
1345  * Index of the highest bit set
1346  */
1347 static uint32_t
1348 msb64(uint64_t x)
1349 {
1350 	uint64_t m = 1ULL << 63;
1351 	int i;
1352 
1353 	for (i = 63; i >= 0; i--, m >>=1)
1354 		if (m & x)
1355 			return i;
1356 	return 0;
1357 }
1358 
1359 /*
1360  * wait until ts, either busy or sleeping if more than 1ms.
1361  * Return wakeup time.
1362  */
1363 static struct timespec
1364 wait_time(struct timespec ts)
1365 {
1366 	for (;;) {
1367 		struct timespec w, cur;
1368 		clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
1369 		w = timespec_sub(ts, cur);
1370 		if (w.tv_sec < 0)
1371 			return cur;
1372 		else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
1373 			poll(NULL, 0, 1);
1374 	}
1375 }
1376 
1377 /*
1378  * Send a packet, and wait for a response.
1379  * The payload (after the UDP header, at offset 42) carries a 4-byte
1380  * sequence number followed by a struct tstamp (seconds and nanoseconds).
1381  */
1382 
1383 static void *
1384 ping_body(void *data)
1385 {
1386 	struct targ *targ = (struct targ *) data;
1387 	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1388 	struct netmap_if *nifp = targ->nmd->nifp;
1389 	int i, m;
1390 	void *frame;
1391 	int size;
1392 	struct timespec ts, now, last_print;
1393 	struct timespec nexttime = {0, 0}; /* silence compiler */
1394 	uint64_t sent = 0, n = targ->g->npackets;
1395 	uint64_t count = 0, t_cur, t_min = ~0, av = 0;
1396 	uint64_t g_min = ~0, g_av = 0;
1397 	uint64_t buckets[64];	/* bins for delays, ns */
1398 	int rate_limit = targ->g->tx_rate, tosend = 0;
1399 
1400 	frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header;
1401 	size = targ->g->pkt_size + targ->g->virt_header;
1402 
1403 
1404 	if (targ->g->nthreads > 1) {
1405 		D("can only ping with 1 thread");
1406 		return NULL;
1407 	}
1408 
1409 	if (targ->g->af == AF_INET6) {
1410 		D("Warning: ping-pong with IPv6 not supported");
1411 	}
1412 
1413 	bzero(&buckets, sizeof(buckets));
1414 	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
1415 	now = last_print;
1416 	if (rate_limit) {
1417 		targ->tic = timespec_add(now, (struct timespec){2,0});
1418 		targ->tic.tv_nsec = 0;
1419 		wait_time(targ->tic);
1420 		nexttime = targ->tic;
1421 	}
1422 	while (!targ->cancel && (n == 0 || sent < n)) {
1423 		struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1424 		struct netmap_slot *slot;
1425 		char *p;
1426 		int rv;
1427 		uint64_t limit, event = 0;
1428 
1429 		if (rate_limit && tosend <= 0) {
1430 			tosend = targ->g->burst;
1431 			nexttime = timespec_add(nexttime, targ->g->tx_period);
1432 			wait_time(nexttime);
1433 		}
1434 
1435 		limit = rate_limit ? tosend : targ->g->burst;
1436 		if (n > 0 && n - sent < limit)
1437 			limit = n - sent;
1438 		for (m = 0; (unsigned)m < limit; m++) {
1439 			slot = &ring->slot[ring->head];
1440 			slot->len = size;
1441 			p = NETMAP_BUF(ring, slot->buf_idx);
1442 
1443 			if (nm_ring_empty(ring)) {
1444 				D("-- ouch, cannot send");
1445 				break;
1446 			} else {
1447 				struct tstamp *tp;
1448 				nm_pkt_copy(frame, p, size);
1449 				clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
1450 				bcopy(&sent, p+42, sizeof(sent));
1451 				tp = (struct tstamp *)(p+46);
1452 				tp->sec = (uint32_t)ts.tv_sec;
1453 				tp->nsec = (uint32_t)ts.tv_nsec;
1454 				sent++;
1455 				ring->head = ring->cur = nm_ring_next(ring, ring->head);
1456 			}
1457 		}
1458 		if (m > 0)
1459 			event++;
1460 		targ->ctr.pkts = sent;
1461 		targ->ctr.bytes = sent*size;
1462 		targ->ctr.events = event;
1463 		if (rate_limit)
1464 			tosend -= m;
1465 #ifdef BUSYWAIT
1466 		rv = ioctl(pfd.fd, NIOCTXSYNC, NULL);
1467 		if (rv < 0) {
1468 			D("TXSYNC error on queue %d: %s", targ->me,
1469 				strerror(errno));
1470 		}
1471 	again:
1472 		ioctl(pfd.fd, NIOCRXSYNC, NULL);
1473 #else
1474 		/* should use a parameter to decide how often to send */
1475 		if ( (rv = poll(&pfd, 1, 3000)) <= 0) {
1476 			D("poll error on queue %d: %s", targ->me,
1477 				(rv ? strerror(errno) : "timeout"));
1478 			continue;
1479 		}
1480 #endif /* BUSYWAIT */
1481 		/* see what we got back */
1482 #ifdef BUSYWAIT
1483 		int rx = 0;
1484 #endif
1485 		for (i = targ->nmd->first_rx_ring;
1486 			i <= targ->nmd->last_rx_ring; i++) {
1487 			ring = NETMAP_RXRING(nifp, i);
1488 			while (!nm_ring_empty(ring)) {
1489 				uint32_t seq;
1490 				struct tstamp *tp;
1491 				int pos;
1492 
1493 				slot = &ring->slot[ring->head];
1494 				p = NETMAP_BUF(ring, slot->buf_idx);
1495 
1496 				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
1497 				bcopy(p+42, &seq, sizeof(seq));
1498 				tp = (struct tstamp *)(p+46);
1499 				ts.tv_sec = (time_t)tp->sec;
1500 				ts.tv_nsec = (long)tp->nsec;
1501 				ts.tv_sec = now.tv_sec - ts.tv_sec;
1502 				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
1503 				if (ts.tv_nsec < 0) {
1504 					ts.tv_nsec += 1000000000;
1505 					ts.tv_sec--;
1506 				}
1507 				if (0) D("seq %d/%llu delta %d.%09d", seq,
1508 					(unsigned long long)sent,
1509 					(int)ts.tv_sec, (int)ts.tv_nsec);
1510 				t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec;
1511 				if (t_cur < t_min)
1512 					t_min = t_cur;
1513 				count ++;
1514 				av += t_cur;
1515 				pos = msb64(t_cur);
1516 				buckets[pos]++;
1517 				/* now store it in a bucket */
1518 				ring->head = ring->cur = nm_ring_next(ring, ring->head);
1519 #ifdef BUSYWAIT
1520 				rx++;
1521 #endif
1522 			}
1523 		}
1524 		//D("tx %d rx %d", sent, rx);
1525 		//usleep(100000);
1526 		ts.tv_sec = now.tv_sec - last_print.tv_sec;
1527 		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
1528 		if (ts.tv_nsec < 0) {
1529 			ts.tv_nsec += 1000000000;
1530 			ts.tv_sec--;
1531 		}
1532 		if (ts.tv_sec >= 1) {
1533 			D("count %d RTT: min %d av %d ns",
1534 				(int)count, (int)t_min, (int)(count ? av/count : 0));
1535 			int k, j, kmin, off;
1536 			char buf[512];
1537 
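			/* print a log2 histogram of the RTTs seen in the last interval */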
1538 			for (kmin = 0; kmin < 64; kmin ++)
1539 				if (buckets[kmin])
1540 					break;
1541 			for (k = 63; k >= kmin; k--)
1542 				if (buckets[k])
1543 					break;
1544 			buf[0] = '\0';
1545 			off = 0;
1546 			for (j = kmin; j <= k; j++) {
1547 				off += sprintf(buf + off, " %5d", (int)buckets[j]);
1548 			}
1549 			D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf);
1550 			bzero(&buckets, sizeof(buckets));
1551 			count = 0;
1552 			g_av += av;
1553 			av = 0;
1554 			if (t_min < g_min)
1555 				g_min = t_min;
1556 			t_min = ~0;
1557 			last_print = now;
1558 		}
1559 #ifdef BUSYWAIT
1560 		if (rx < m && ts.tv_sec <= 3 && !targ->cancel)
1561 			goto again;
1562 #endif /* BUSYWAIT */
1563 	}
1564 
1565 	if (sent > 0) {
1566 		D("RTT over %llu packets: min %d av %d ns",
1567 			(long long unsigned)sent, (int)g_min,
1568 			(int)((double)g_av/sent));
1569 	}
1570 	targ->completed = 1;
1571 
1572 	/* reset the ``used`` flag. */
1573 	targ->used = 0;
1574 
1575 	return NULL;
1576 }
1577 
1578 
1579 /*
1580  * reply to ping requests
1581  */
1582 static void *
1583 pong_body(void *data)
1584 {
1585 	struct targ *targ = (struct targ *) data;
1586 	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1587 	struct netmap_if *nifp = targ->nmd->nifp;
1588 	struct netmap_ring *txring, *rxring;
1589 	int i;
1590 	uint64_t sent = 0, n = targ->g->npackets;
1591 
1592 	if (targ->g->nthreads > 1) {
1593 		D("can only reply ping with 1 thread");
1594 		return NULL;
1595 	}
1596 	if (n > 0)
1597 		D("understood ponger %llu but don't know how to do it",
1598 			(unsigned long long)n);
1599 
1600 	if (targ->g->af == AF_INET6) {
1601 		D("Warning: ping-pong with IPv6 not supported");
1602 	}
1603 
1604 	while (!targ->cancel && (n == 0 || sent < n)) {
1605 		uint32_t txhead, txavail;
1606 //#define BUSYWAIT
1607 #ifdef BUSYWAIT
1608 		ioctl(pfd.fd, NIOCRXSYNC, NULL);
1609 #else
1610 		int rv;
1611 		if ( (rv = poll(&pfd, 1, 1000)) <= 0) {
1612 			D("poll error on queue %d: %s", targ->me,
1613 				rv ? strerror(errno) : "timeout");
1614 			continue;
1615 		}
1616 #endif
1617 		txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1618 		txhead = txring->head;
1619 		txavail = nm_ring_space(txring);
1620 		/* see what we got back */
1621 		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1622 			rxring = NETMAP_RXRING(nifp, i);
1623 			while (!nm_ring_empty(rxring)) {
1624 				uint16_t *spkt, *dpkt;
1625 				uint32_t head = rxring->head;
1626 				struct netmap_slot *slot = &rxring->slot[head];
1627 				char *src, *dst;
1628 				src = NETMAP_BUF(rxring, slot->buf_idx);
1629 				//D("got pkt %p of size %d", src, slot->len);
1630 				rxring->head = rxring->cur = nm_ring_next(rxring, head);
1631 				if (txavail == 0)
1632 					continue;
1633 				dst = NETMAP_BUF(txring,
1634 				    txring->slot[txhead].buf_idx);
1635 				/* copy... */
1636 				dpkt = (uint16_t *)dst;
1637 				spkt = (uint16_t *)src;
1638 				nm_pkt_copy(src, dst, slot->len);
1639 				/* swap source and destination MAC */
1640 				dpkt[0] = spkt[3];
1641 				dpkt[1] = spkt[4];
1642 				dpkt[2] = spkt[5];
1643 				dpkt[3] = spkt[0];
1644 				dpkt[4] = spkt[1];
1645 				dpkt[5] = spkt[2];
1646 				/* swap source and destination IPv4 */
1647 				if (spkt[6] == htons(ETHERTYPE_IP)) {
1648 					dpkt[13] = spkt[15];
1649 					dpkt[14] = spkt[16];
1650 					dpkt[15] = spkt[13];
1651 					dpkt[16] = spkt[14];
1652 				}
1653 				txring->slot[txhead].len = slot->len;
1654 				//dump_payload(dst, slot->len, txring, txhead);
1655 				txhead = nm_ring_next(txring, txhead);
1656 				txavail--;
1657 				sent++;
1658 			}
1659 		}
1660 		txring->head = txring->cur = txhead;
1661 		targ->ctr.pkts = sent;
1662 #ifdef BUSYWAIT
1663 		ioctl(pfd.fd, NIOCTXSYNC, NULL);
1664 #endif
1665 	}
1666 
1667 	targ->completed = 1;
1668 
1669 	/* reset the ``used`` flag. */
1670 	targ->used = 0;
1671 
1672 	return NULL;
1673 }
1674 
1675 
1676 static void *
1677 sender_body(void *data)
1678 {
1679 	struct targ *targ = (struct targ *) data;
1680 	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1681 	struct netmap_if *nifp;
1682 	struct netmap_ring *txring = NULL;
1683 	int i;
1684 	uint64_t n = targ->g->npackets / targ->g->nthreads;
1685 	uint64_t sent = 0;
1686 	uint64_t event = 0;
1687 	int options = targ->g->options;
1688 	struct timespec nexttime = { 0, 0}; // XXX silence compiler
1689 	int rate_limit = targ->g->tx_rate;
1690 	struct pkt *pkt = &targ->pkt;
1691 	void *frame;
1692 	int size;
1693 
1694 	if (targ->frame == NULL) {
1695 		frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
1696 		size = targ->g->pkt_size + targ->g->virt_header;
1697 	} else {
1698 		frame = targ->frame;
1699 		size = targ->g->pkt_size;
1700 	}
1701 
1702 	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1703 	if (setaffinity(targ->thread, targ->affinity))
1704 		goto quit;
1705 
1706 	/* main loop.*/
1707 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1708 	if (rate_limit) {
1709 		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1710 		targ->tic.tv_nsec = 0;
1711 		wait_time(targ->tic);
1712 		nexttime = targ->tic;
1713 	}
1714 	if (targ->g->dev_type == DEV_TAP) {
1715 	    D("writing to file desc %d", targ->g->main_fd);
1716 
1717 	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1718 		if (write(targ->g->main_fd, frame, size) != -1)
1719 			sent++;
1720 		update_addresses(pkt, targ);
1721 		if (i > 10000) {
1722 			targ->ctr.pkts = sent;
1723 			targ->ctr.bytes = sent*size;
1724 			targ->ctr.events = sent;
1725 			i = 0;
1726 		}
1727 	    }
1728 #ifndef NO_PCAP
1729     } else if (targ->g->dev_type == DEV_PCAP) {
1730 	    pcap_t *p = targ->g->p;
1731 
1732 	    for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1733 		if (pcap_inject(p, frame, size) != -1)
1734 			sent++;
1735 		update_addresses(pkt, targ);
1736 		if (i > 10000) {
1737 			targ->ctr.pkts = sent;
1738 			targ->ctr.bytes = sent*size;
1739 			targ->ctr.events = sent;
1740 			i = 0;
1741 		}
1742 	    }
1743 #endif /* NO_PCAP */
1744     } else {
1745 	int tosend = 0;
1746 	u_int bufsz, frag_size = targ->g->frag_size;
1747 
1748 	nifp = targ->nmd->nifp;
1749 	txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1750 	bufsz = txring->nr_buf_size;
1751 	if (bufsz < frag_size)
1752 		frag_size = bufsz;
1753 	targ->frag_size = targ->g->pkt_size / targ->frags;
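	/* recompute frags and frag_size so that each fragment fits in a netmap buffer */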
1754 	if (targ->frag_size > frag_size) {
1755 		targ->frags = targ->g->pkt_size / frag_size;
1756 		targ->frag_size = frag_size;
1757 		if (targ->g->pkt_size % frag_size != 0)
1758 			targ->frags++;
1759 	}
1760 	D("frags %u frag_size %u", targ->frags, targ->frag_size);
1761 
1762 	/* mark all slots of all rings as changed so initial copy will be done */
1763 	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1764 		uint32_t j;
1765 		struct netmap_slot *slot;
1766 
1767 		txring = NETMAP_TXRING(nifp, i);
1768 		for (j = 0; j < txring->num_slots; j++) {
1769 			slot = &txring->slot[j];
1770 			slot->flags = NS_BUF_CHANGED;
1771 		}
1772 	}
1773 
1774 	while (!targ->cancel && (n == 0 || sent < n)) {
1775 		int rv;
1776 
1777 		if (rate_limit && tosend <= 0) {
1778 			tosend = targ->g->burst;
1779 			nexttime = timespec_add(nexttime, targ->g->tx_period);
1780 			wait_time(nexttime);
1781 		}
1782 
1783 		/*
1784 		 * wait for available room in the send queue(s)
1785 		 */
1786 #ifdef BUSYWAIT
1787 		(void)rv;
1788 		if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
1789 			D("ioctl error on queue %d: %s", targ->me,
1790 					strerror(errno));
1791 			goto quit;
1792 		}
1793 #else /* !BUSYWAIT */
1794 		if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
1795 			if (targ->cancel)
1796 				break;
1797 			D("poll error on queue %d: %s", targ->me,
1798 				rv ? strerror(errno) : "timeout");
1799 			// goto quit;
1800 		}
1801 		if (pfd.revents & POLLERR) {
1802 			D("poll error on %d ring %d-%d", pfd.fd,
1803 				targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
1804 			goto quit;
1805 		}
1806 #endif /* !BUSYWAIT */
1807 		/*
1808 		 * scan our queues and send on those with room
1809 		 */
1810 		for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1811 			int m;
1812 			uint64_t limit = rate_limit ?  tosend : targ->g->burst;
1813 
1814 			if (n > 0 && n == sent)
1815 				break;
1816 
1817 			if (n > 0 && n - sent < limit)
1818 				limit = n - sent;
1819 			txring = NETMAP_TXRING(nifp, i);
1820 			if (nm_ring_empty(txring))
1821 				continue;
1822 
1823 			if (targ->g->pkt_min_size > 0) {
1824 				size = nrand48(targ->seed) %
1825 					(targ->g->pkt_size - targ->g->pkt_min_size) +
1826 					targ->g->pkt_min_size;
1827 				update_size(pkt, targ, size);
1828 			}
1829 			m = send_packets(txring, pkt, frame, size, targ,
1830 					 limit, options);
1831 			ND("limit %lu tail %d m %d",
1832 				limit, txring->tail, m);
1833 			sent += m;
1834 			if (m > 0) //XXX-ste: can m be 0?
1835 				event++;
1836 			targ->ctr.pkts = sent;
1837 			targ->ctr.bytes += m*size;
1838 			targ->ctr.events = event;
1839 			if (rate_limit) {
1840 				tosend -= m;
1841 				if (tosend <= 0)
1842 					break;
1843 			}
1844 		}
1845 	}
1846 	/* flush any remaining packets */
1847 	if (txring != NULL) {
1848 		D("flush tail %d head %d on thread %p",
1849 			txring->tail, txring->head,
1850 			(void *)pthread_self());
1851 		ioctl(pfd.fd, NIOCTXSYNC, NULL);
1852 	}
1853 
1854 	/* final part: wait for all the TX queues to be empty. */
1855 	for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1856 		txring = NETMAP_TXRING(nifp, i);
1857 		while (!targ->cancel && nm_tx_pending(txring)) {
1858 			RD(5, "pending tx tail %d head %d on ring %d",
1859 				txring->tail, txring->head, i);
1860 			ioctl(pfd.fd, NIOCTXSYNC, NULL);
1861 			usleep(1); /* wait 1 tick */
1862 		}
1863 	}
1864     } /* end DEV_NETMAP */
1865 
1866 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1867 	targ->completed = 1;
1868 	targ->ctr.pkts = sent;
1869 	targ->ctr.bytes = sent*size;
1870 	targ->ctr.events = event;
1871 quit:
1872 	/* reset the ``used`` flag. */
1873 	targ->used = 0;
1874 
1875 	return (NULL);
1876 }
1877 
1878 
1879 #ifndef NO_PCAP
1880 static void
1881 receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1882 	const u_char * bytes)
1883 {
1884 	struct my_ctrs *ctr = (struct my_ctrs *)user;
1885 	(void)bytes;	/* UNUSED */
1886 	ctr->bytes += h->len;
1887 	ctr->pkts++;
1888 }
1889 #endif /* !NO_PCAP */
1890 
1891 
1892 static int
1893 receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes)
1894 {
1895 	u_int head, rx, n;
1896 	uint64_t b = 0;
1897 	u_int complete = 0;
1898 
1899 	if (bytes == NULL)
1900 		bytes = &b;
1901 
1902 	head = ring->head;
1903 	n = nm_ring_space(ring);
1904 	if (n < limit)
1905 		limit = n;
1906 	for (rx = 0; rx < limit; rx++) {
1907 		struct netmap_slot *slot = &ring->slot[head];
1908 		char *p = NETMAP_BUF(ring, slot->buf_idx);
1909 
1910 		*bytes += slot->len;
1911 		if (dump)
1912 			dump_payload(p, slot->len, ring, head);
1913 		if (!(slot->flags & NS_MOREFRAG))
1914 			complete++;
1915 
1916 		head = nm_ring_next(ring, head);
1917 	}
1918 	ring->head = ring->cur = head;
1919 
1920 	return (complete);
1921 }
1922 
1923 static void *
1924 receiver_body(void *data)
1925 {
1926 	struct targ *targ = (struct targ *) data;
1927 	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1928 	struct netmap_if *nifp;
1929 	struct netmap_ring *rxring;
1930 	int i;
1931 	struct my_ctrs cur;
1932 	uint64_t n = targ->g->npackets / targ->g->nthreads;
1933 
1934 	memset(&cur, 0, sizeof(cur));
1935 
1936 	if (setaffinity(targ->thread, targ->affinity))
1937 		goto quit;
1938 
1939 	D("reading from %s fd %d main_fd %d",
1940 		targ->g->ifname, targ->fd, targ->g->main_fd);
1941 	/* unbounded wait for the first packet. */
1942 	for (;!targ->cancel;) {
1943 		i = poll(&pfd, 1, 1000);
1944 		if (i > 0 && !(pfd.revents & POLLERR))
1945 			break;
1946 		if (i < 0) {
1947 			D("poll() error: %s", strerror(errno));
1948 			goto quit;
1949 		}
1950 		if (pfd.revents & POLLERR) {
1951 			D("fd error");
1952 			goto quit;
1953 		}
1954 		RD(1, "waiting for initial packets, poll returns %d %d",
1955 			i, pfd.revents);
1956 	}
1957 	/* main loop, exit after 1s silence */
1958 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1959     if (targ->g->dev_type == DEV_TAP) {
1960 	while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
1961 		char buf[MAX_BODYSIZE];
1962 		/* XXX should we poll ? */
1963 		i = read(targ->g->main_fd, buf, sizeof(buf));
1964 		if (i > 0) {
1965 			targ->ctr.pkts++;
1966 			targ->ctr.bytes += i;
1967 			targ->ctr.events++;
1968 		}
1969 	}
1970 #ifndef NO_PCAP
1971     } else if (targ->g->dev_type == DEV_PCAP) {
1972 	while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
1973 		/* XXX should we poll ? */
1974 		pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
1975 			(u_char *)&targ->ctr);
1976 		targ->ctr.events++;
1977 	}
1978 #endif /* !NO_PCAP */
1979     } else {
1980 	int dump = targ->g->options & OPT_DUMP;
1981 
1982 	nifp = targ->nmd->nifp;
1983 	while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
1984 		/* Once we have started to receive packets, wait at most 1 second
1985 		   before quitting. */
1986 #ifdef BUSYWAIT
1987 		if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
1988 			D("ioctl error on queue %d: %s", targ->me,
1989 					strerror(errno));
1990 			goto quit;
1991 		}
1992 #else /* !BUSYWAIT */
1993 		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1994 			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1995 			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1996 			goto out;
1997 		}
1998 
1999 		if (pfd.revents & POLLERR) {
2000 			D("poll err");
2001 			goto quit;
2002 		}
2003 #endif /* !BUSYWAIT */
2004 		uint64_t cur_space = 0;
2005 		for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
2006 			int m;
2007 
2008 			rxring = NETMAP_RXRING(nifp, i);
2009 			/* compute free space in the ring */
2010 			m = rxring->head + rxring->num_slots - rxring->tail;
2011 			if (m >= (int) rxring->num_slots)
2012 				m -= rxring->num_slots;
2013 			cur_space += m;
2014 			if (nm_ring_empty(rxring))
2015 				continue;
2016 
2017 			m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes);
2018 			cur.pkts += m;
2019 			if (m > 0)
2020 				cur.events++;
2021 		}
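		/*
		 * Track the minimum free space observed across the RX rings,
		 * reported as min_space: a rough indicator of how close the
		 * receiver gets to running out of slots.
		 */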
2022 		cur.min_space = targ->ctr.min_space;
2023 		if (cur_space < cur.min_space)
2024 			cur.min_space = cur_space;
2025 		targ->ctr = cur;
2026 	}
2027     }
2028 
2029 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2030 
2031 #if !defined(BUSYWAIT)
2032 out:
2033 #endif
2034 	targ->completed = 1;
2035 	targ->ctr = cur;
2036 
2037 quit:
2038 	/* reset the ``used`` flag. */
2039 	targ->used = 0;
2040 
2041 	return (NULL);
2042 }
2043 
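/*
 * txseq: like the tx function, but embed a 32-bit sequence number in the
 * UDP payload of every packet (single thread, first TX ring only), so that
 * the rxseq receiver can detect gaps and reordering.
 */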
2044 static void *
2045 txseq_body(void *data)
2046 {
2047 	struct targ *targ = (struct targ *) data;
2048 	struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
2049 	struct netmap_ring *ring;
2050 	int64_t sent = 0;
2051 	uint64_t event = 0;
2052 	int options = targ->g->options | OPT_COPY;
2053 	struct timespec nexttime = {0, 0};
2054 	int rate_limit = targ->g->tx_rate;
2055 	struct pkt *pkt = &targ->pkt;
2056 	int frags = targ->g->frags;
2057 	uint32_t sequence = 0;
2058 	int budget = 0;
2059 	void *frame;
2060 	int size;
2061 
2062 	if (targ->g->nthreads > 1) {
2063 		D("can only txseq with 1 thread");
2064 		return NULL;
2065 	}
2066 
2067 	if (targ->g->npackets > 0) {
2068 		D("Ignoring -n argument");
2069 	}
2070 
2071 	frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
2072 	size = targ->g->pkt_size + targ->g->virt_header;
2073 
2074 	D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
2075 	if (setaffinity(targ->thread, targ->affinity))
2076 		goto quit;
2077 
2078 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
2079 	if (rate_limit) {
2080 		targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
2081 		targ->tic.tv_nsec = 0;
2082 		wait_time(targ->tic);
2083 		nexttime = targ->tic;
2084 	}
2085 
2086 	/* Only use the first queue. */
2087 	ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring);
2088 
2089 	while (!targ->cancel) {
2090 		int64_t limit;
2091 		unsigned int space;
2092 		unsigned int head;
2093 		int fcnt;
2094 		uint16_t sum = 0;
2095 		int rv;
2096 
2097 		if (!rate_limit) {
2098 			budget = targ->g->burst;
2099 
2100 		} else if (budget <= 0) {
2101 			budget = targ->g->burst;
2102 			nexttime = timespec_add(nexttime, targ->g->tx_period);
2103 			wait_time(nexttime);
2104 		}
2105 
2106 		/* wait for available room in the send queue */
2107 #ifdef BUSYWAIT
2108 		(void)rv;
2109 		if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
2110 			D("ioctl error on queue %d: %s", targ->me,
2111 					strerror(errno));
2112 			goto quit;
2113 		}
2114 #else /* !BUSYWAIT */
2115 		if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
2116 			if (targ->cancel)
2117 				break;
2118 			D("poll error on queue %d: %s", targ->me,
2119 				rv ? strerror(errno) : "timeout");
2120 			// goto quit;
2121 		}
2122 		if (pfd.revents & POLLERR) {
2123 			D("poll error on %d ring %d-%d", pfd.fd,
2124 				targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
2125 			goto quit;
2126 		}
2127 #endif /* !BUSYWAIT */
2128 
2129 		/* If no room poll() again. */
2130 		space = nm_ring_space(ring);
2131 		if (!space) {
2132 			continue;
2133 		}
2134 
2135 		limit = budget;
2136 
2137 		if (space < limit) {
2138 			limit = space;
2139 		}
2140 
2141 		/* Cut off ``limit`` to make sure it is a multiple of ``frags``. */
2142 		if (frags > 1) {
2143 			limit = (limit / frags) * frags;
2144 		}
2145 
2146 		limit = sent + limit; /* Convert to absolute. */
2147 
2148 		for (fcnt = frags, head = ring->head;
2149 				sent < limit; sent++, sequence++) {
2150 			struct netmap_slot *slot = &ring->slot[head];
2151 			char *p = NETMAP_BUF(ring, slot->buf_idx);
2152 			uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t;
2153 
2154 			memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum));
2155 
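			/*
			 * Write the 32-bit sequence number into the first four
			 * payload bytes and patch the UDP checksum incrementally
			 * (fold out the old 16-bit words, fold in the new ones)
			 * instead of recomputing it over the whole packet.
			 */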
2156 			slot->flags = 0;
2157 			t = *w;
2158 			PKT(pkt, body, targ->g->af)[0] = sequence >> 24;
2159 			PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff;
2160 			sum = ~cksum_add(~sum, cksum_add(~t, *w));
2161 			t = *++w;
2162 			PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff;
2163 			PKT(pkt, body, targ->g->af)[3] = sequence & 0xff;
2164 			sum = ~cksum_add(~sum, cksum_add(~t, *w));
2165 			memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum));
2166 			nm_pkt_copy(frame, p, size);
2167 			if (fcnt == frags) {
2168 				update_addresses(pkt, targ);
2169 			}
2170 
2171 			if (options & OPT_DUMP) {
2172 				dump_payload(p, size, ring, head);
2173 			}
2174 
2175 			slot->len = size;
2176 
2177 			if (--fcnt > 0) {
2178 				slot->flags |= NS_MOREFRAG;
2179 			} else {
2180 				fcnt = frags;
2181 			}
2182 
2183 			if (sent == limit - 1) {
2184 				/* Make sure we don't push an incomplete
2185 				 * packet. */
2186 				assert(!(slot->flags & NS_MOREFRAG));
2187 				slot->flags |= NS_REPORT;
2188 			}
2189 
2190 			head = nm_ring_next(ring, head);
2191 			if (rate_limit) {
2192 				budget--;
2193 			}
2194 		}
2195 
2196 		ring->cur = ring->head = head;
2197 
2198 		event++;
2199 		targ->ctr.pkts = sent;
2200 		targ->ctr.bytes = sent * size;
2201 		targ->ctr.events = event;
2202 	}
2203 
2204 	/* flush any remaining packets */
2205 	D("flush tail %d head %d on thread %p",
2206 		ring->tail, ring->head,
2207 		(void *)pthread_self());
2208 	ioctl(pfd.fd, NIOCTXSYNC, NULL);
2209 
2210 	/* final part: wait for the TX queues to become empty. */
2211 	while (!targ->cancel && nm_tx_pending(ring)) {
2212 		RD(5, "pending tx tail %d head %d on ring %d",
2213 				ring->tail, ring->head, targ->nmd->first_tx_ring);
2214 		ioctl(pfd.fd, NIOCTXSYNC, NULL);
2215 		usleep(1); /* wait 1 tick */
2216 	}
2217 
2218 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2219 	targ->completed = 1;
2220 	targ->ctr.pkts = sent;
2221 	targ->ctr.bytes = sent * size;
2222 	targ->ctr.events = event;
2223 quit:
2224 	/* reset the ``used`` flag. */
2225 	targ->used = 0;
2226 
2227 	return (NULL);
2228 }
2229 
2230 
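/*
 * Format the (len, flags) pairs of 'nfrags' consecutive slots starting at
 * 'head' into strbuf, for diagnostic messages about multi-slot packets.
 */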
2231 static char *
2232 multi_slot_to_string(struct netmap_ring *ring, unsigned int head,
2233 		     unsigned int nfrags, char *strbuf, size_t strbuflen)
2234 {
2235 	unsigned int f;
2236 	char *ret = strbuf;
2237 
2238 	for (f = 0; f < nfrags; f++) {
2239 		struct netmap_slot *slot = &ring->slot[head];
2240 		int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len,
2241 				 slot->flags);
2242 		if (m >= (int)strbuflen) {
2243 			break;
2244 		}
2245 		strbuf += m;
2246 		strbuflen -= m;
2247 
2248 		head = nm_ring_next(ring, head);
2249 	}
2250 
2251 	return ret;
2252 }
2253 
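/*
 * rxseq: receive on all bound RX rings and check that the 32-bit sequence
 * numbers embedded by txseq are consecutive on each ring, reporting gaps,
 * reordering and unexpected fragment counts.
 */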
2254 static void *
2255 rxseq_body(void *data)
2256 {
2257 	struct targ *targ = (struct targ *) data;
2258 	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
2259 	int dump = targ->g->options & OPT_DUMP;
2260 	struct netmap_ring *ring;
2261 	unsigned int frags_exp = 1;
2262 	struct my_ctrs cur;
2263 	unsigned int frags = 0;
2264 	int first_packet = 1;
2265 	int first_slot = 1;
2266 	int i, j, af, nrings;
2267 	uint32_t seq, *seq_exp = NULL;
2268 
2269 	memset(&cur, 0, sizeof(cur));
2270 
2271 	if (setaffinity(targ->thread, targ->affinity))
2272 		goto quit;
2273 
2274 	nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1;
2275 	seq_exp = calloc(nrings, sizeof(uint32_t));
2276 	if (seq_exp == NULL) {
2277 		D("failed to allocate seq array");
2278 		goto quit;
2279 	}
2280 
2281 	D("reading from %s fd %d main_fd %d",
2282 		targ->g->ifname, targ->fd, targ->g->main_fd);
2283 	/* unbounded wait for the first packet. */
2284 	for (;!targ->cancel;) {
2285 		i = poll(&pfd, 1, 1000);
2286 		if (i > 0 && !(pfd.revents & POLLERR))
2287 			break;
2288 		RD(1, "waiting for initial packets, poll returns %d %d",
2289 			i, pfd.revents);
2290 	}
2291 
2292 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
2293 
2294 
2295 	while (!targ->cancel) {
2296 		unsigned int head;
2297 		int limit;
2298 
2299 #ifdef BUSYWAIT
2300 		if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
2301 			D("ioctl error on queue %d: %s", targ->me,
2302 					strerror(errno));
2303 			goto quit;
2304 		}
2305 #else /* !BUSYWAIT */
2306 		if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
2307 			clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2308 			targ->toc.tv_sec -= 1; /* Subtract timeout time. */
2309 			goto out;
2310 		}
2311 
2312 		if (pfd.revents & POLLERR) {
2313 			D("poll err");
2314 			goto quit;
2315 		}
2316 #endif /* !BUSYWAIT */
2317 
2318 		for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) {
2319 			ring = NETMAP_RXRING(targ->nmd->nifp, j);
2320 			if (nm_ring_empty(ring))
2321 				continue;
2322 
2323 			limit = nm_ring_space(ring);
2324 			if (limit > targ->g->burst)
2325 				limit = targ->g->burst;
2326 
2327 #if 0
2328 			/* Enable this if
2329 			 *     1) we remove the early-return optimization from
2330 			 *        the netmap poll implementation, or
2331 			 *     2) pipes get NS_MOREFRAG support.
2332 			 * With the current netmap implementation, an experiment like
2333 			 *    pkt-gen -i vale:1{1 -f txseq -F 9
2334 			 *    pkt-gen -i vale:1}1 -f rxseq
2335 			 * would get stuck as soon as we find nm_ring_space(ring) < 9,
2336 			 * since here limit is rounded to 0 and
2337 			 * pipe rxsync is not called anymore by the poll() of this loop.
2338 			 */
2339 			if (frags_exp > 1) {
2340 				int o = limit;
2341 				/* Cut off to the closest smaller multiple. */
2342 				limit = (limit / frags_exp) * frags_exp;
2343 				RD(2, "LIMIT %d --> %d", o, limit);
2344 			}
2345 #endif
2346 
2347 			for (head = ring->head, i = 0; i < limit; i++) {
2348 				struct netmap_slot *slot = &ring->slot[head];
2349 				char *p = NETMAP_BUF(ring, slot->buf_idx);
2350 				int len = slot->len;
2351 				struct pkt *pkt;
2352 
2353 				if (dump) {
2354 					dump_payload(p, slot->len, ring, head);
2355 				}
2356 
2357 				frags++;
2358 				if (!(slot->flags & NS_MOREFRAG)) {
2359 					if (first_packet) {
2360 						first_packet = 0;
2361 					} else if (frags != frags_exp) {
2362 						char prbuf[512];
2363 						RD(1, "Received packets with %u frags, "
2364 								"expected %u, '%s'", frags, frags_exp,
2365 								multi_slot_to_string(ring, head - frags + 1,
2366 									frags,
2367 									prbuf, sizeof(prbuf)));
2368 					}
2369 					first_packet = 0;
2370 					frags_exp = frags;
2371 					frags = 0;
2372 				}
2373 
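				/*
				 * Shift 'p' back so that the struct pkt overlay lines up:
				 * struct pkt begins with the full-size vh (virtio-net
				 * header) field, while the buffer only carries
				 * virt_header bytes of it.
				 */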
2374 				p -= sizeof(pkt->vh) - targ->g->virt_header;
2375 				len += sizeof(pkt->vh) - targ->g->virt_header;
2376 				pkt = (struct pkt *)p;
2377 				if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP)
2378 					af = AF_INET;
2379 				else
2380 					af = AF_INET6;
2381 
2382 				if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) +
2383 						sizeof(seq)) {
2384 					RD(1, "%s: packet too small (len=%u)", __func__,
2385 							slot->len);
2386 				} else {
2387 					seq = (PKT(pkt, body, af)[0] << 24) |
2388 						(PKT(pkt, body, af)[1] << 16) |
2389 						(PKT(pkt, body, af)[2] << 8) |
2390 						PKT(pkt, body, af)[3];
2391 					if (first_slot) {
2392 						/* Grab the first one, whatever it
2393 						   is. */
2394 						seq_exp[j] = seq;
2395 						first_slot = 0;
2396 					} else if (seq != seq_exp[j]) {
2397 						uint32_t delta = seq - seq_exp[j];
2398 
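						/*
						 * Unsigned 32-bit difference: values below 2^31 mean
						 * the sequence jumped ahead (a gap), larger values
						 * mean it went backwards (out of order).
						 */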
2399 						if (delta < (0xFFFFFFFF >> 1)) {
2400 							RD(2, "Sequence GAP: exp %u found %u",
2401 									seq_exp[j], seq);
2402 						} else {
2403 							RD(2, "Sequence OUT OF ORDER: "
2404 									"exp %u found %u", seq_exp[j], seq);
2405 						}
2406 						seq_exp[j] = seq;
2407 					}
2408 					seq_exp[j]++;
2409 				}
2410 
2411 				cur.bytes += slot->len;
2412 				head = nm_ring_next(ring, head);
2413 				cur.pkts++;
2414 			}
2415 
2416 			ring->cur = ring->head = head;
2417 
2418 			cur.events++;
2419 			targ->ctr = cur;
2420 		}
2421 	}
2422 	clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2423 
2424 #ifndef BUSYWAIT
2425 out:
2426 #endif /* !BUSYWAIT */
2427 	targ->completed = 1;
2428 	targ->ctr = cur;
2429 
2430 quit:
2431 	if (seq_exp != NULL)
2432 		free(seq_exp);
2433 	/* reset the ``used`` flag. */
2434 	targ->used = 0;
2435 
2436 	return (NULL);
2437 }
2438 
2439 
2440 static void
2441 tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg)
2442 {
2443 	double bw, raw_bw, pps, abs;
2444 	char b1[40], b2[80], b3[80];
2445 	int size;
2446 
2447 	if (cur->pkts == 0) {
2448 		printf("%s nothing.\n", msg);
2449 		return;
2450 	}
2451 
2452 	size = (int)(cur->bytes / cur->pkts);
2453 
2454 	printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n",
2455 		msg,
2456 		(unsigned long long)cur->pkts,
2457 		(unsigned long long)cur->bytes,
2458 		(unsigned long long)cur->events, size, delta);
2459 	if (delta == 0)
2460 		delta = 1e-6;
2461 	if (size < 60)		/* correct for min packet size */
2462 		size = 60;
2463 	pps = cur->pkts / delta;
2464 	bw = (8.0 * cur->bytes) / delta;
2465 	raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta;
2466 	abs = cur->pkts / (double)(cur->events);
2467 
2468 	printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n",
2469 		norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs);
2470 }
2471 
2472 static void
2473 usage(int errcode)
2474 {
2475 /* This usage text is generated from the pkt-gen man page:
2476  *   $ man pkt-gen > x
2477  * and pasted here, adding the string terminators and newlines with simple
2478  * regular expressions. */
2479 	const char *cmd = "pkt-gen";
2480 	fprintf(stderr,
2481 		"Usage:\n"
2482 		"%s arguments\n"
2483 "     -h      Show program usage and exit.\n"
2484 "\n"
2485 "     -i interface\n"
2486 "             Name of the network interface that pkt-gen operates on.  It can be a system network interface\n"
2487 "             (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n"
2488 "             monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n"
2489 "             mented in netmap(4) (NIOCREGIF section).\n"
2490 "\n"
2491 "     -f function\n"
2492 "             The function to be executed by pkt-gen.  Specify tx for transmission, rx for reception, ping\n"
2493 "             for client-side ping-pong operation, and pong for server-side ping-pong operation.\n"
2494 "\n"
2495 "     -n count\n"
2496 "             Number of iterations of the pkt-gen function (with 0 meaning infinite).  In case of tx or rx,\n"
2497 "             count is the number of packets to receive or transmit.  In case of ping or pong, count is the\n"
2498 "             number of ping-pong transactions.\n"
2499 "\n"
2500 "     -l pkt_size\n"
2501 "             Packet size in bytes excluding CRC.  If passed a second time, use random sizes greater than\n"
2502 "             or equal to the second value and lower than the first one.\n"
2503 "\n"
2504 "     -b burst_size\n"
2505 "             Transmit or receive up to burst_size packets at a time.\n"
2506 "\n"
2507 "     -4      Use IPv4 addresses.\n"
2508 "\n"
2509 "     -6      Use IPv6 addresses.\n"
2510 "\n"
2511 "     -d dst_ip[:port[-dst_ip:port]]\n"
2512 "             Destination IPv4/IPv6 address and port, single or range.\n"
2513 "\n"
2514 "     -s src_ip[:port[-src_ip:port]]\n"
2515 "             Source IPv4/IPv6 address and port, single or range.\n"
2516 "\n"
2517 "     -D dst_mac\n"
2518 "             Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n"
2519 "\n"
2520 "     -S src_mac\n"
2521 "             Source MAC address in colon notation.\n"
2522 "\n"
2523 "     -a cpu_id\n"
2524 "             Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3).  If more\n"
2525 "             threads are used, they are pinned to the subsequent CPUs, one per thread.\n"
2526 "\n"
2527 "     -c cpus\n"
2528 "             Maximum number of CPUs to use (0 means to use all the available ones).\n"
2529 "\n"
2530 "     -p threads\n"
2531 "             Number of threads to use.  By default, only a single thread is used to handle all the netmap\n"
2532 "             rings.  If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n"
2533 "             single RX ring (in rx mode), or a TX/RX ring pair.  The number of threads must be less than or\n"
2534 "             equal to the number of TX (or RX) rings available in the device specified by interface.\n"
2535 "\n"
2536 "     -T report_ms\n"
2537 "             Number of milliseconds between reports.\n"
2538 "\n"
2539 "     -w wait_for_link_time\n"
2540 "             Number of seconds to wait before starting the pkt-gen function, useful to make sure that the\n"
2541 "             network link is up.  A network device driver may take some time to enter netmap mode, or to\n"
2542 "             create a new transmit/receive ring pair when netmap(4) requests one.\n"
2543 "\n"
2544 "     -R rate\n"
2545 "             Packet transmission rate.  Not setting the packet transmission rate tells pkt-gen to transmit\n"
2546 "             packets as quickly as possible.  On servers from 2010 onward netmap(4) is able to com-\n"
2547 "             pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n"
2548 "             your intention is to saturate the link.\n"
2549 "\n"
2550 "     -X      Dump payload of each packet transmitted or received.\n"
2551 "\n"
2552 "     -H len  Add empty virtio-net-header with size 'len'.  Valid sizes are 0, 10 and 12.  This option is\n"
2553 "             only used with Virtual Machine technologies that use virtio as a network interface.\n"
2554 "\n"
2555 "     -P file\n"
2556 "             Load the packet to be transmitted from a pcap file rather than constructing it within\n"
2557 "             pkt-gen.\n"
2558 "\n"
2559 "     -z      Use random IPv4/IPv6 src address/port.\n"
2560 "\n"
2561 "     -Z      Use random IPv4/IPv6 dst address/port.\n"
2562 "\n"
2563 "     -N      Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n"
2564 "\n"
2565 "     -F num_frags\n"
2566 "             Send multi-slot packets, each one with num_frags fragments.  A multi-slot packet is repre-\n"
2567 "             sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n"
2568 "             last slot).  This is useful to transmit or receive packets larger than the netmap buffer\n"
2569 "             size.\n"
2570 "\n"
2571 "     -M frag_size\n"
2572 "             In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n"
2573 "             length divided by num_frags.\n"
2574 "\n"
2575 "     -I      Use indirect buffers.  It is only valid for transmitting on VALE ports, and it is implemented\n"
2576 "             by setting the NS_INDIRECT flag in the netmap slots.\n"
2577 "\n"
2578 "     -W      Exit immediately if all the RX rings are empty the first time they are examined.\n"
2579 "\n"
2580 "     -v      Increase the verbosity level.\n"
2581 "\n"
2582 "     -r      In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n"
2583 "             netmap buffers is (rubbish mode).\n"
2584 "\n"
2585 "     -A      Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n"
2586 "\n"
2587 "     -B      Take Ethernet framing and CRC into account when computing the average bps.  This adds 4 bytes\n"
2588 "             of CRC and 20 bytes of framing to each packet.\n"
2589 "\n"
2590 "     -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n"
2591 "             Configuration in terms of number of rings and slots to be used when opening the netmap port.\n"
2592 "             Such configuration has an effect on software ports created on the fly, such as VALE ports and\n"
2593 "             netmap pipes.  The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n"
2594 "             rx_slots, tx_rings, rx_rings.  Missing numbers or zeroes stand for default values.  As an\n"
2595 "             additional convenience, if exactly one number is specified, then this is assigned to both\n"
2596 "             tx_slots and rx_slots.  If there is no fourth number, then the third one is assigned to both\n"
2597 "             tx_rings and rx_rings.\n"
2598 "\n"
2599 "     -o options		data generation options (parsed using atoi)\n"
2600 "				OPT_PREFETCH	1\n"
2601 "				OPT_ACCESS	2\n"
2602 "				OPT_COPY	4\n"
2603 "				OPT_MEMCPY	8\n"
2604 "				OPT_TS		16 (add a timestamp)\n"
2605 "				OPT_INDIRECT	32 (use indirect buffers)\n"
2606 "				OPT_DUMP	64 (dump rx/tx traffic)\n"
2607 "				OPT_RUBBISH	256\n"
2608 "					(send whatever the buffers contain)\n"
2609 "				OPT_RANDOM_SRC  512\n"
2610 "				OPT_RANDOM_DST  1024\n"
2611 "				OPT_PPS_STATS   2048\n"
2612 "				OPT_UPDATE_CSUM 4096\n"
2613 		     "",
2614 		cmd);
2615 	exit(errcode);
2616 }
2617 
2618 static int
2619 start_threads(struct glob_arg *g) {
2620 	int i;
2621 
2622 	targs = calloc(g->nthreads, sizeof(*targs));
2623 	struct targ *t;
2624 	/*
2625 	 * Now create the desired number of threads, each one
2626 	 * using a single descriptor.
2627 	 */
2628 	for (i = 0; i < g->nthreads; i++) {
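		/* Give each thread its own seed state for nrand48(), used e.g.
		 * when randomizing packet sizes. */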
2629 		uint64_t seed = (uint64_t)time(0) | ((uint64_t)time(0) << 32);
2630 		t = &targs[i];
2631 
2632 		bzero(t, sizeof(*t));
2633 		t->fd = -1; /* default, with pcap */
2634 		t->g = g;
2635 		memcpy(t->seed, &seed, sizeof(t->seed));
2636 
2637 		if (g->dev_type == DEV_NETMAP) {
2638 			int m = -1;
2639 
2640 			/*
2641 			 * if the user wants both HW and SW rings, we need to
2642 			 * know when to switch from NR_REG_ONE_NIC to NR_REG_ONE_SW
2643 			 */
2644 			if (g->orig_mode == NR_REG_NIC_SW) {
2645 				m = (g->td_type == TD_TYPE_RECEIVER ?
2646 						g->nmd->reg.nr_rx_rings :
2647 						g->nmd->reg.nr_tx_rings);
2648 			}
2649 
2650 			if (i > 0) {
2651 				int j;
2652 				/* the first thread uses the fd opened by the main
2653 				 * thread, the other threads re-open /dev/netmap
2654 				 */
2655 				t->nmd = nmport_clone(g->nmd);
2656 				if (t->nmd == NULL)
2657 					return -1;
2658 
2659 				j = i;
2660 				if (m > 0 && j >= m) {
2661 					/* switch to the software rings */
2662 					t->nmd->reg.nr_mode = NR_REG_ONE_SW;
2663 					j -= m;
2664 				}
2665 				t->nmd->reg.nr_ringid = j & NETMAP_RING_MASK;
2666 				/* Only touch one of the rings (rx is already ok) */
2667 				if (g->td_type == TD_TYPE_RECEIVER)
2668 					t->nmd->reg.nr_flags |= NETMAP_NO_TX_POLL;
2669 
2670 				/* register interface. Override ifname and ringid etc. */
2671 				if (nmport_open_desc(t->nmd) < 0) {
2672 					nmport_undo_prepare(t->nmd);
2673 					t->nmd = NULL;
2674 					return -1;
2675 				}
2676 			} else {
2677 				t->nmd = g->nmd;
2678 			}
2679 			t->fd = t->nmd->fd;
2680 			t->frags = g->frags;
2681 		} else {
2682 			targs[i].fd = g->main_fd;
2683 		}
2684 		t->used = 1;
2685 		t->me = i;
2686 		if (g->affinity >= 0) {
2687 			t->affinity = (g->affinity + i) % g->cpus;
2688 		} else {
2689 			t->affinity = -1;
2690 		}
2691 		/* default, init packets */
2692 		initialize_packet(t);
2693 	}
2694 	/* Wait for PHY reset. */
2695 	D("Wait %d secs for phy reset", g->wait_link);
2696 	sleep(g->wait_link);
2697 	D("Ready...");
2698 
2699 	for (i = 0; i < g->nthreads; i++) {
2700 		t = &targs[i];
2701 		if ((errno = pthread_create(&t->thread, NULL, g->td_body, t)) != 0) {
2702 			D("Unable to create thread %d: %s", i, strerror(errno));
2703 			t->used = 0;
2704 		}
2705 	}
2706 	return 0;
2707 }
2708 
2709 static void
2710 main_thread(struct glob_arg *g)
2711 {
2712 	int i;
2713 
2714 	struct my_ctrs prev, cur;
2715 	double delta_t;
2716 	struct timeval tic, toc;
2717 
2718 	prev.pkts = prev.bytes = prev.events = 0;
2719 	gettimeofday(&prev.t, NULL);
2720 	for (;;) {
2721 		char b1[40], b2[40], b3[40], b4[100];
2722 		uint64_t pps, usec;
2723 		struct my_ctrs x;
2724 		double abs;
2725 		int done = 0;
2726 
2727 		usec = wait_for_next_report(&prev.t, &cur.t,
2728 				g->report_interval);
2729 
2730 		cur.pkts = cur.bytes = cur.events = 0;
2731 		cur.min_space = 0;
2732 		if (usec < 10000) /* too short to be meaningful */
2733 			continue;
2734 		/* accumulate counts for all threads */
2735 		for (i = 0; i < g->nthreads; i++) {
2736 			cur.pkts += targs[i].ctr.pkts;
2737 			cur.bytes += targs[i].ctr.bytes;
2738 			cur.events += targs[i].ctr.events;
2739 			cur.min_space += targs[i].ctr.min_space;
2740 			targs[i].ctr.min_space = 99999;
2741 			if (targs[i].used == 0)
2742 				done++;
2743 		}
2744 		x.pkts = cur.pkts - prev.pkts;
2745 		x.bytes = cur.bytes - prev.bytes;
2746 		x.events = cur.events - prev.events;
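		/* packets per second over the elapsed interval, rounded to the
		 * nearest integer (hence the +usec/2). */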
2747 		pps = (x.pkts*1000000 + usec/2) / usec;
2748 		abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0;
2749 
2750 		if (!(g->options & OPT_PPS_STATS)) {
2751 			strcpy(b4, "");
2752 		} else {
2753 			/* Compute some pps stats using a sliding window. */
2754 			double ppsavg = 0.0, ppsdev = 0.0;
2755 			int nsamples = 0;
2756 
2757 			g->win[g->win_idx] = pps;
2758 			g->win_idx = (g->win_idx + 1) % STATS_WIN;
2759 
2760 			for (i = 0; i < STATS_WIN; i++) {
2761 				ppsavg += g->win[i];
2762 				if (g->win[i]) {
2763 					nsamples++;
2764 				}
2765 			}
2766 			ppsavg /= nsamples;
2767 
2768 			for (i = 0; i < STATS_WIN; i++) {
2769 				if (g->win[i] == 0) {
2770 					continue;
2771 				}
2772 				ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg);
2773 			}
2774 			ppsdev /= nsamples;
2775 			ppsdev = sqrt(ppsdev);
2776 
2777 			snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]",
2778 				 norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize));
2779 		}
2780 
2781 		D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space",
2782 			norm(b1, pps, normalize), b4,
2783 			norm(b2, (double)x.pkts, normalize),
2784 			norm(b3, 1000000*((double)x.bytes*8+(double)x.pkts*g->framing)/usec, normalize),
2785 			(unsigned long long)usec,
2786 			abs, (int)cur.min_space);
2787 		prev = cur;
2788 
2789 		if (done == g->nthreads)
2790 			break;
2791 	}
2792 
2793 	timerclear(&tic);
2794 	timerclear(&toc);
2795 	cur.pkts = cur.bytes = cur.events = 0;
2796 	/* final round */
2797 	for (i = 0; i < g->nthreads; i++) {
2798 		struct timespec t_tic, t_toc;
2799 		/*
2800 		 * Join active threads, unregister interfaces and close
2801 		 * file descriptors.
2802 		 */
2803 		if (targs[i].used)
2804 			pthread_join(targs[i].thread, NULL); /* blocking */
2805 		if (g->dev_type == DEV_NETMAP) {
2806 			nmport_close(targs[i].nmd);
2807 			targs[i].nmd = NULL;
2808 		} else {
2809 			close(targs[i].fd);
2810 		}
2811 
2812 		if (targs[i].completed == 0)
2813 			D("ouch, thread %d exited with error", i);
2814 
2815 		/*
2816 		 * Collect threads output and extract information about
2817 		 * how long it took to send all the packets.
2818 		 */
2819 		cur.pkts += targs[i].ctr.pkts;
2820 		cur.bytes += targs[i].ctr.bytes;
2821 		cur.events += targs[i].ctr.events;
2822 		/* collect the largest start (tic) and end (toc) times,
2823 		 * XXX maybe we should do the earliest tic, or do a weighted
2824 		 * average ?
2825 		 */
2826 		t_tic = timeval2spec(&tic);
2827 		t_toc = timeval2spec(&toc);
2828 		if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
2829 			tic = timespec2val(&targs[i].tic);
2830 		if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
2831 			toc = timespec2val(&targs[i].toc);
2832 	}
2833 
2834 	/* print output. */
2835 	timersub(&toc, &tic, &toc);
2836 	delta_t = toc.tv_sec + 1e-6 * toc.tv_usec;
2837 	if (g->td_type == TD_TYPE_SENDER)
2838 		tx_output(g, &cur, delta_t, "Sent");
2839 	else if (g->td_type == TD_TYPE_RECEIVER)
2840 		tx_output(g, &cur, delta_t, "Received");
2841 }
2842 
2843 struct td_desc {
2844 	int ty;
2845 	const char *key;
2846 	void *f;
2847 	int default_burst;
2848 };
2849 
2850 static struct td_desc func[] = {
2851 	{ TD_TYPE_RECEIVER,	"rx",		receiver_body,	512},	/* default */
2852 	{ TD_TYPE_SENDER,	"tx",		sender_body,	512 },
2853 	{ TD_TYPE_OTHER,	"ping",		ping_body,	1 },
2854 	{ TD_TYPE_OTHER,	"pong",		pong_body,	1 },
2855 	{ TD_TYPE_SENDER,	"txseq",	txseq_body,	512 },
2856 	{ TD_TYPE_RECEIVER,	"rxseq",	rxseq_body,	512 },
2857 	{ 0,			NULL,		NULL, 		0 }
2858 };
2859 
2860 static int
2861 tap_alloc(char *dev)
2862 {
2863 	struct ifreq ifr;
2864 	int fd, err;
2865 	const char *clonedev = TAP_CLONEDEV;
2866 
2867 	(void)err;
2868 	(void)dev;
2869 	/* Argument taken by the function:
2870 	 *
2871 	 * char *dev: the name of an interface (or '\0'). MUST have enough
2872 	 *   space to hold the interface name if '\0' is passed.
2873 	 *   (on Linux the interface flags IFF_TAP | IFF_NO_PI are set below)
2874 	 */
2875 
2876 #ifdef __FreeBSD__
2877 	if (dev[3]) { /* tapSomething */
2878 		static char buf[128];
2879 		snprintf(buf, sizeof(buf), "/dev/%s", dev);
2880 		clonedev = buf;
2881 	}
2882 #endif
2883 	/* open the device */
2884 	if( (fd = open(clonedev, O_RDWR)) < 0 ) {
2885 		return fd;
2886 	}
2887 	D("%s open successful", clonedev);
2888 
2889 	/* preparation of the struct ifr, of type "struct ifreq" */
2890 	memset(&ifr, 0, sizeof(ifr));
2891 
2892 #ifdef linux
2893 	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
2894 
2895 	if (*dev) {
2896 		/* if a device name was specified, put it in the structure; otherwise,
2897 		* the kernel will try to allocate the "next" device of the
2898 		* specified type */
2899 		size_t len = strlen(dev);
2900 		if (len >= IFNAMSIZ) {
2901 			D("%s too long", dev);
2902 			return -1;
2903 		}
2904 		memcpy(ifr.ifr_name, dev, len);
2905 	}
2906 
2907 	/* try to create the device */
2908 	if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
2909 		D("failed to do a TUNSETIFF: %s", strerror(errno));
2910 		close(fd);
2911 		return err;
2912 	}
2913 
2914 	/* if the operation was successful, write back the name of the
2915 	* interface to the variable "dev", so the caller can know
2916 	* it. Note that the caller MUST reserve space in *dev (see calling
2917 	* code below) */
2918 	strcpy(dev, ifr.ifr_name);
2919 	D("new name is %s", dev);
2920 #endif /* linux */
2921 
2922 	/* this is the special file descriptor that the caller will use to talk
2923 	 * with the virtual interface */
2924 	return fd;
2925 }
2926 
2927 int
2928 main(int arc, char **argv)
2929 {
2930 	int i;
2931 	struct sigaction sa;
2932 	sigset_t ss;
2933 
2934 	struct glob_arg g;
2935 
2936 	int ch;
2937 	int devqueues = 1;	/* how many device queues */
2938 	int wait_link_arg = 0;
2939 
2940 	int pkt_size_done = 0;
2941 
2942 	struct td_desc *fn = func;
2943 
2944 	bzero(&g, sizeof(g));
2945 
2946 	g.main_fd = -1;
2947 	g.td_body = fn->f;
2948 	g.td_type = fn->ty;
2949 	g.report_interval = 1000;	/* report interval */
2950 	g.affinity = -1;
2951 	/* ip addresses can also be a range x.x.x.x-x.x.x.y */
2952 	g.af = AF_INET;		/* default */
2953 	g.src_ip.name = "10.0.0.1";
2954 	g.dst_ip.name = "10.1.0.1";
2955 	g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
2956 	g.src_mac.name = NULL;
2957 	g.pkt_size = 60;
2958 	g.pkt_min_size = 0;
2959 	g.nthreads = 1;
2960 	g.cpus = 1;		/* default */
2961 	g.forever = 1;
2962 	g.tx_rate = 0;
2963 	g.frags = 1;
2964 	g.frag_size = (u_int)-1;	/* use the netmap buffer size by default */
2965 	g.nmr_config = "";
2966 	g.virt_header = 0;
2967 	g.wait_link = 2;	/* wait 2 seconds for physical ports */
2968 
2969 	while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:"
2970 	    "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) {
2971 
2972 		switch(ch) {
2973 		default:
2974 			D("bad option %c %s", ch, optarg);
2975 			usage(-1);
2976 			break;
2977 
2978 		case 'h':
2979 			usage(0);
2980 			break;
2981 
2982 		case '4':
2983 			g.af = AF_INET;
2984 			break;
2985 
2986 		case '6':
2987 			g.af = AF_INET6;
2988 			break;
2989 
2990 		case 'N':
2991 			normalize = 0;
2992 			break;
2993 
2994 		case 'n':
2995 			g.npackets = strtoull(optarg, NULL, 10);
2996 			break;
2997 
2998 		case 'F':
2999 			i = atoi(optarg);
3000 			if (i < 1 || i > 63) {
3001 				D("invalid frags %d [1..63], ignore", i);
3002 				break;
3003 			}
3004 			g.frags = i;
3005 			break;
3006 
3007 		case 'M':
3008 			g.frag_size = atoi(optarg);
3009 			break;
3010 
3011 		case 'f':
3012 			for (fn = func; fn->key; fn++) {
3013 				if (!strcmp(fn->key, optarg))
3014 					break;
3015 			}
3016 			if (fn->key) {
3017 				g.td_body = fn->f;
3018 				g.td_type = fn->ty;
3019 			} else {
3020 				D("unrecognised function %s", optarg);
3021 			}
3022 			break;
3023 
3024 		case 'o':	/* data generation options */
3025 			g.options |= atoi(optarg);
3026 			break;
3027 
3028 		case 'a':       /* force affinity */
3029 			g.affinity = atoi(optarg);
3030 			break;
3031 
3032 		case 'i':	/* interface */
3033 			/* a prefix of tap:, netmap: or pcap: forces the mode;
3034 			 * otherwise we guess.
3035 			 */
3036 			D("interface is %s", optarg);
3037 			if (strlen(optarg) > MAX_IFNAMELEN - 8) {
3038 				D("ifname too long %s", optarg);
3039 				break;
3040 			}
3041 			strcpy(g.ifname, optarg);
3042 			if (!strcmp(optarg, "null")) {
3043 				g.dev_type = DEV_NETMAP;
3044 				g.dummy_send = 1;
3045 			} else if (!strncmp(optarg, "tap:", 4)) {
3046 				g.dev_type = DEV_TAP;
3047 				strcpy(g.ifname, optarg + 4);
3048 			} else if (!strncmp(optarg, "pcap:", 5)) {
3049 				g.dev_type = DEV_PCAP;
3050 				strcpy(g.ifname, optarg + 5);
3051 			} else if (!strncmp(optarg, "netmap:", 7) ||
3052 				   !strncmp(optarg, "vale", 4)) {
3053 				g.dev_type = DEV_NETMAP;
3054 			} else if (!strncmp(optarg, "tap", 3)) {
3055 				g.dev_type = DEV_TAP;
3056 			} else { /* prepend netmap: */
3057 				g.dev_type = DEV_NETMAP;
3058 				sprintf(g.ifname, "netmap:%s", optarg);
3059 			}
3060 			break;
3061 
3062 		case 'I':
3063 			g.options |= OPT_INDIRECT;	/* use indirect buffers */
3064 			break;
3065 
3066 		case 'l':	/* pkt_size */
3067 			if (pkt_size_done) {
3068 				g.pkt_min_size = atoi(optarg);
3069 			} else {
3070 				g.pkt_size = atoi(optarg);
3071 				pkt_size_done = 1;
3072 			}
3073 			break;
3074 
3075 		case 'd':
3076 			g.dst_ip.name = optarg;
3077 			break;
3078 
3079 		case 's':
3080 			g.src_ip.name = optarg;
3081 			break;
3082 
3083 		case 'T':	/* report interval */
3084 			g.report_interval = atoi(optarg);
3085 			break;
3086 
3087 		case 'w':
3088 			g.wait_link = atoi(optarg);
3089 			wait_link_arg = 1;
3090 			break;
3091 
3092 		case 'W':
3093 			g.forever = 0; /* exit RX with no traffic */
3094 			break;
3095 
3096 		case 'b':	/* burst */
3097 			g.burst = atoi(optarg);
3098 			break;
3099 		case 'c':
3100 			g.cpus = atoi(optarg);
3101 			break;
3102 		case 'p':
3103 			g.nthreads = atoi(optarg);
3104 			break;
3105 
3106 		case 'D': /* destination mac */
3107 			g.dst_mac.name = optarg;
3108 			break;
3109 
3110 		case 'S': /* source mac */
3111 			g.src_mac.name = optarg;
3112 			break;
3113 		case 'v':
3114 			verbose++;
3115 			break;
3116 		case 'R':
3117 			g.tx_rate = atoi(optarg);
3118 			break;
3119 		case 'X':
3120 			g.options |= OPT_DUMP;
3121 			break;
3122 		case 'C':
3123 			D("WARNING: the 'C' option is deprecated, use the '+conf:' libnetmap option instead");
3124 			g.nmr_config = strdup(optarg);
3125 			break;
3126 		case 'H':
3127 			g.virt_header = atoi(optarg);
3128 			break;
3129 		case 'P':
3130 			g.packet_file = strdup(optarg);
3131 			break;
3132 		case 'r':
3133 			g.options |= OPT_RUBBISH;
3134 			break;
3135 		case 'z':
3136 			g.options |= OPT_RANDOM_SRC;
3137 			break;
3138 		case 'Z':
3139 			g.options |= OPT_RANDOM_DST;
3140 			break;
3141 		case 'A':
3142 			g.options |= OPT_PPS_STATS;
3143 			break;
3144 		case 'B':
3145 			/* raw packets have 4 bytes CRC + 20 bytes framing */
3146 			// XXX maybe add an option to pass the IFG
3147 			g.framing = 24 * 8;
3148 			break;
3149 		}
3150 	}
3151 
3152 	if (strlen(g.ifname) == 0) {
3153 		D("missing ifname");
3154 		usage(-1);
3155 	}
3156 
3157 	if (g.burst == 0) {
3158 		g.burst = fn->default_burst;
3159 		D("using default burst size: %d", g.burst);
3160 	}
3161 
3162 	g.system_cpus = i = system_ncpus();
3163 	if (g.cpus < 0 || g.cpus > i) {
3164 		D("%d cpus is too high, have only %d cpus", g.cpus, i);
3165 		usage(-1);
3166 	}
3167 	D("running on %d cpus (have %d)", g.cpus, i);
3168 	if (g.cpus == 0)
3169 		g.cpus = i;
3170 
3171 	if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) {
3172 		g.wait_link = 0;
3173 	}
3174 
3175 	if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
3176 		D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
3177 		usage(-1);
3178 	}
3179 
3180 	if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) {
3181 		D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size);
3182 		usage(-1);
3183 	}
3184 
3185 	if (g.src_mac.name == NULL) {
3186 		static char mybuf[20] = "00:00:00:00:00:00";
3187 		/* retrieve source mac address. */
3188 		if (source_hwaddr(g.ifname, mybuf) == -1) {
3189 			D("Unable to retrieve source mac");
3190 			// continue, fail later
3191 		}
3192 		g.src_mac.name = mybuf;
3193 	}
3194 	/* extract address ranges */
3195 	if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac))
3196 		usage(-1);
3197 	g.options |= extract_ip_range(&g.src_ip, g.af);
3198 	g.options |= extract_ip_range(&g.dst_ip, g.af);
3199 
3200 	if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
3201 			&& g.virt_header != VIRT_HDR_2) {
3202 		D("bad virtio-net-header length");
3203 		usage(-1);
3204 	}
3205 
3206     if (g.dev_type == DEV_TAP) {
3207 	D("want to use tap %s", g.ifname);
3208 	g.main_fd = tap_alloc(g.ifname);
3209 	if (g.main_fd < 0) {
3210 		D("cannot open tap %s", g.ifname);
3211 		usage(-1);
3212 	}
3213 #ifndef NO_PCAP
3214     } else if (g.dev_type == DEV_PCAP) {
3215 	char pcap_errbuf[PCAP_ERRBUF_SIZE];
3216 
3217 	pcap_errbuf[0] = '\0'; // init the buffer
3218 	g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
3219 	if (g.p == NULL) {
3220 		D("cannot open pcap on %s", g.ifname);
3221 		usage(-1);
3222 	}
3223 	g.main_fd = pcap_fileno(g.p);
3224 	D("using pcap on %s fileno %d", g.ifname, g.main_fd);
3225 #endif /* !NO_PCAP */
3226     } else if (g.dummy_send) { /* but DEV_NETMAP */
3227 	D("using a dummy send routine");
3228     } else {
3229 	g.nmd = nmport_prepare(g.ifname);
3230 	if (g.nmd == NULL)
3231 		goto out;
3232 
3233 	parse_nmr_config(g.nmr_config, &g.nmd->reg);
3234 
3235 	g.nmd->reg.nr_flags |= NR_ACCEPT_VNET_HDR;
3236 
3237 	/*
3238 	 * Open the netmap device using nmport_open_desc(). Binding the
3239 	 * port detaches the interface from the host protocol stack
3240 	 * and may cause a reset of the card,
3241 	 * which in turn may take some time for the PHY to
3242 	 * reconfigure. We do the open here to have time to reset.
3243 	 */
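	/*
	 * With more than one thread, register a single ring per thread
	 * instead of the whole interface; remember the original mode so
	 * that start_threads() knows when to switch a thread from the
	 * NIC rings to the host (SW) rings.
	 */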
3244 	g.orig_mode = g.nmd->reg.nr_mode;
3245 	if (g.nthreads > 1) {
3246 		switch (g.orig_mode) {
3247 		case NR_REG_ALL_NIC:
3248 		case NR_REG_NIC_SW:
3249 			g.nmd->reg.nr_mode = NR_REG_ONE_NIC;
3250 			break;
3251 		case NR_REG_SW:
3252 			g.nmd->reg.nr_mode = NR_REG_ONE_SW;
3253 			break;
3254 		default:
3255 			break;
3256 		}
3257 		g.nmd->reg.nr_ringid = 0;
3258 	}
3259 	if (nmport_open_desc(g.nmd) < 0)
3260 		goto out;
3261 	g.main_fd = g.nmd->fd;
3262 	ND("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10),
3263 				g.nmd->mem);
3264 
3265 	if (g.virt_header) {
3266 		/* Set the virtio-net header length, since the user asked
3267 		 * for it explicitly. */
3268 		set_vnet_hdr_len(&g);
3269 	} else {
3270 		/* Check whether the netmap port we opened requires us to send
3271 		 * and receive frames with virtio-net header. */
3272 		get_vnet_hdr_len(&g);
3273 	}
3274 
3275 	/* get num of queues in tx or rx */
3276 	if (g.td_type == TD_TYPE_SENDER)
3277 		devqueues = g.nmd->reg.nr_tx_rings + g.nmd->reg.nr_host_tx_rings;
3278 	else
3279 		devqueues = g.nmd->reg.nr_rx_rings + g.nmd->reg.nr_host_rx_rings;
3280 
3281 	/* validate provided nthreads. */
3282 	if (g.nthreads < 1 || g.nthreads > devqueues) {
3283 		D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
3284 		// continue, fail later
3285 	}
3286 
3287 	if (g.td_type == TD_TYPE_SENDER) {
3288 		int mtu = get_if_mtu(&g);
3289 
3290 		if (mtu > 0 && g.pkt_size > mtu) {
3291 			D("pkt_size (%d) must be <= mtu (%d)",
3292 				g.pkt_size, mtu);
3293 			return -1;
3294 		}
3295 	}
3296 
3297 	if (verbose) {
3298 		struct netmap_if *nifp = g.nmd->nifp;
3299 		struct nmreq_register *req = &g.nmd->reg;
3300 
3301 		D("nifp at offset %"PRIu64" ntxqs %d nrxqs %d memid %d",
3302 		    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
3303 		    req->nr_mem_id);
3304 		for (i = 0; i < req->nr_tx_rings + req->nr_host_tx_rings; i++) {
3305 			struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
3306 			D("   TX%d at offset %p slots %d", i,
3307 			    (void *)((char *)ring - (char *)nifp), ring->num_slots);
3308 		}
3309 		for (i = 0; i < req->nr_rx_rings + req->nr_host_rx_rings; i++) {
3310 			struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
3311 			D("   RX%d at offset %p slots %d", i,
3312 			    (void *)((char *)ring - (char *)nifp), ring->num_slots);
3313 		}
3314 	}
3315 
3316 	/* Print some debug information. */
3317 	fprintf(stdout,
3318 		"%s %s: %d queues, %d threads and %d cpus.\n",
3319 		(g.td_type == TD_TYPE_SENDER) ? "Sending on" :
3320 			((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" :
3321 			"Working on"),
3322 		g.ifname,
3323 		devqueues,
3324 		g.nthreads,
3325 		g.cpus);
3326 	if (g.td_type == TD_TYPE_SENDER) {
3327 		fprintf(stdout, "%s -> %s (%s -> %s)\n",
3328 			g.src_ip.name, g.dst_ip.name,
3329 			g.src_mac.name, g.dst_mac.name);
3330 	}
3331 
3332 out:
3333 	/* Exit if something went wrong. */
3334 	if (g.main_fd < 0) {
3335 		D("aborting");
3336 		usage(-1);
3337 	}
3338     }
3339 
3340 
3341 	if (g.options) {
3342 		D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n",
3343 			g.options & OPT_PREFETCH ? " prefetch" : "",
3344 			g.options & OPT_ACCESS ? " access" : "",
3345 			g.options & OPT_MEMCPY ? " memcpy" : "",
3346 			g.options & OPT_INDIRECT ? " indirect" : "",
3347 			g.options & OPT_COPY ? " copy" : "",
3348 			g.options & OPT_RUBBISH ? " rubbish " : "");
3349 	}
3350 
3351 	g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
3352 	if (g.tx_rate > 0) {
3353 		/* try to have at least something every second,
3354 		 * reducing the burst size to some 0.01s worth of data
3355 		 * (but no less than one full set of fragments)
3356 		 */
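		/*
		 * Example: -R 10000 clamps the default tx burst of 512 down to
		 * 10000/300 = 33 packets, giving a period of
		 * 1e9 * 33 / 10000 ns, i.e. about 3.3 ms between bursts.
		 */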
3357 		uint64_t x;
3358 		int lim = (g.tx_rate)/300;
3359 		if (g.burst > lim)
3360 			g.burst = lim;
3361 		if (g.burst == 0)
3362 			g.burst = 1;
3363 		x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
3364 		g.tx_period.tv_nsec = x;
3365 		g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
3366 		g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
3367 	}
3368 	if (g.td_type == TD_TYPE_SENDER)
3369 	    D("Sending %d packets every %jd.%09ld s",
3370 			g.burst, (intmax_t)g.tx_period.tv_sec, g.tx_period.tv_nsec);
3371 	/* Install ^C handler. */
3372 	global_nthreads = g.nthreads;
3373 	sigemptyset(&ss);
3374 	sigaddset(&ss, SIGINT);
3375 	/* block SIGINT now, so that all created threads will inherit the mask */
3376 	if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) {
3377 		D("failed to block SIGINT: %s", strerror(errno));
3378 	}
3379 	if (start_threads(&g) < 0)
3380 		return 1;
3381 	/* Install the handler and re-enable SIGINT for the main thread */
3382 	memset(&sa, 0, sizeof(sa));
3383 	sa.sa_handler = sigint_h;
3384 	if (sigaction(SIGINT, &sa, NULL) < 0) {
3385 		D("failed to install ^C handler: %s", strerror(errno));
3386 	}
3387 
3388 	if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) {
3389 		D("failed to re-enable SIGINT: %s", strerror(errno));
3390 	}
3391 	main_thread(&g);
3392 	free(targs);
3393 	return 0;
3394 }
3395 
3396 /* end of file */
3397