xref: /dragonfly/contrib/libpcap/pcap-dpdk.c (revision c090269b)
1 /*
2  * Copyright (C) 2018 jingle YANG. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28 Date: Dec 16, 2018
29 
30 Description:
31 1. Pcap-dpdk provides libpcap the ability to use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
32 2. DPDK is a set of libraries and drivers for fast packet processing. (https://www.dpdk.org/)
33 3. The testprogs/capturetest provides 6.4Gbps/800,000 pps on Intel 10-Gigabit X540-AT2 with DPDK 18.11.
34 
35 Limitations:
36 1. DPDK support will be on if DPDK is available. Please set DIR for --with-dpdk[=DIR] with ./configure or -DDPDK_DIR[=DIR] with cmake if DPDK is installed manually.
37 2. Only support link libdpdk.so dynamically, because the libdpdk.a will not work correctly.
38 3. Only support read operation, and packet injection has not been supported yet.
39 
40 Usage:
41 1. Compile DPDK as shared library and install.(https://github.com/DPDK/dpdk.git)
42 
43 You shall modify the file $RTE_SDK/$RTE_TARGET/.config and set:
44 CONFIG_RTE_BUILD_SHARED_LIB=y
45 By the following command:
46 sed -i 's/CONFIG_RTE_BUILD_SHARED_LIB=n/CONFIG_RTE_BUILD_SHARED_LIB=y/' $RTE_SDK/$RTE_TARGET/.config
47 
48 2. Launch l2fwd that is one of DPDK examples correctly, and get device information.
49 
50 You shall learn how to bind nic with DPDK-compatible driver by $RTE_SDK/usertools/dpdk-devbind.py, such as igb_uio.
51 And enable hugepages by dpdk-setup.sh
52 
53 Then launch the l2fwd with dynamic driver support. For example:
54 $RTE_SDK/examples/l2fwd/$RTE_TARGET/l2fwd -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so -- -p 0x1
55 
56 3. Compile libpcap with dpdk options.
57 
58 If DPDK has not been found automatically, you shall export DPDK environment variable which are used for compiling DPDK. And then pass $RTE_SDK/$RTE_TARGET to --with-dpdk or -DDPDK_DIR
59 
60 export RTE_SDK={your DPDK base directory}
61 export RTE_TARGET={your target name}
62 
63 3.1 With configure
64 
65 ./configure --with-dpdk=$RTE_SDK/$RTE_TARGET && make -s all && make -s testprogs && make install
66 
67 3.2 With cmake
68 
69 mkdir -p build && cd build && cmake -DDPDK_DIR=$RTE_SDK/$RTE_TARGET ../ && make -s all && make -s testprogs && make install
70 
71 4. Link your own program with libpcap, and use DPDK with the device name as dpdk:{portid}, such as dpdk:0.
72 And you shall set DPDK configure options by environment variable DPDK_CFG
73 For example, the testprogs/capturetest could be launched by:
74 
75 env DPDK_CFG="--log-level=debug -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" ./capturetest -i dpdk:0
76 */
77 
78 #ifdef HAVE_CONFIG_H
79 #include <config.h>
80 #endif
81 
82 #include <errno.h>
83 #include <netdb.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 #include <string.h>
87 #include <unistd.h>
88 #include <time.h>
89 
90 #include <sys/time.h>
91 
92 //header for calling dpdk
93 #include <rte_config.h>
94 #include <rte_common.h>
95 #include <rte_errno.h>
96 #include <rte_log.h>
97 #include <rte_malloc.h>
98 #include <rte_memory.h>
99 #include <rte_eal.h>
100 #include <rte_launch.h>
101 #include <rte_atomic.h>
102 #include <rte_cycles.h>
103 #include <rte_lcore.h>
104 #include <rte_per_lcore.h>
105 #include <rte_branch_prediction.h>
106 #include <rte_interrupts.h>
107 #include <rte_random.h>
108 #include <rte_debug.h>
109 #include <rte_ether.h>
110 #include <rte_ethdev.h>
111 #include <rte_mempool.h>
112 #include <rte_mbuf.h>
113 #include <rte_bus.h>
114 
115 #include "pcap-int.h"
116 #include "pcap-dpdk.h"
117 
/*
 * Deal with API changes that break source compatibility: the name of
 * the Ethernet address structure depends on what the build detected.
 */
#ifdef HAVE_STRUCT_RTE_ETHER_ADDR
#define ETHER_ADDR_TYPE	struct rte_ether_addr
#else
#define ETHER_ADDR_TYPE	struct ether_addr
#endif

/* Global log level installed before the EAL is initialized. */
#define DPDK_DEF_LOG_LEV RTE_LOG_ERR

/*
 * EAL initialization state:
 *    0  - rte_eal_init() has not been attempted yet;
 *    1  - rte_eal_init() succeeded;
 *   <0  - rte_eal_init() failed; the value is the negated rte_errno.
 */
static int is_dpdk_pre_inited=0;

/* Identification and message strings. */
#define DPDK_LIB_NAME "libpcap_dpdk"
#define DPDK_DESC "Data Plane Development Kit (DPDK) Interface"
#define DPDK_ERR_PERM_MSG "permission denied, DPDK needs root permission"
#define DPDK_PREFIX "dpdk:"

/* EAL command line, built from the DPDK_CFG environment variable. */
#define DPDK_CFG_ENV_NAME "DPDK_CFG"
#define DPDK_DEF_CFG "--log-level=error -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so"
#define DPDK_ARGC_MAX 64
#define DPDK_CFG_MAX_LEN 1024
static char dpdk_cfg_buf[DPDK_CFG_MAX_LEN];

/* Sizes of the various name/description/address buffers. */
#define DPDK_DEV_NAME_MAX 32
#define DPDK_DEV_DESC_MAX 512
#define DPDK_MAC_ADDR_SIZE 32
#define DPDK_DEF_MAC_ADDR "00:00:00:00:00:00"
#define DPDK_PCI_ADDR_SIZE 16

/* Also used as the "invalid port" marker by the device-name parser. */
#define DPDK_PORTID_MAX 65535U

/* Granularity, in milliseconds, of the polling sleep in blocking reads. */
#define DPDK_DEF_MIN_SLEEP_MS 1

/* Memory pool and burst parameters. */
#define MBUF_POOL_NAME "mbuf_pool"
#define DPDK_TX_BUF_NAME "tx_buffer"
/* The number of elements in the mbuf pool. */
#define DPDK_NB_MBUFS 8192U
#define MEMPOOL_CACHE_SIZE 256
#define MAX_PKT_BURST 32

/* Configurable number of RX/TX ring descriptors. */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 1024

static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;

/* Size of the per-handle reassembly buffer; the macro name varies by DPDK version. */
#ifdef RTE_ETHER_MAX_JUMBO_FRAME_LEN
#define RTE_ETH_PCAP_SNAPLEN RTE_ETHER_MAX_JUMBO_FRAME_LEN
#else
#define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN
#endif

static struct rte_eth_dev_tx_buffer *tx_buffer;
172 
/*
 * Reference point used to turn timer cycle counts into wall-clock
 * timestamps.
 */
struct dpdk_ts_helper {
	struct timeval start_time;	/* wall-clock time taken at initialization */
	uint64_t start_cycles;		/* timer cycle count taken at initialization */
	uint64_t hz;			/* timer frequency, in cycles per second */
};
178 struct pcap_dpdk{
179 	pcap_t * orig;
180 	uint16_t portid; // portid of DPDK
181 	int must_clear_promisc;
182 	uint64_t bpf_drop;
183 	int nonblock;
184 	struct timeval required_select_timeout;
185 	struct timeval prev_ts;
186 	struct rte_eth_stats prev_stats;
187 	struct timeval curr_ts;
188 	struct rte_eth_stats curr_stats;
189 	uint64_t pps;
190 	uint64_t bps;
191 	struct rte_mempool * pktmbuf_pool;
192 	struct dpdk_ts_helper ts_helper;
193 	ETHER_ADDR_TYPE eth_addr;
194 	char mac_addr[DPDK_MAC_ADDR_SIZE];
195 	char pci_addr[DPDK_PCI_ADDR_SIZE];
196 	unsigned char pcap_tmp_buf[RTE_ETH_PCAP_SNAPLEN];
197 };
198 
199 static struct rte_eth_conf port_conf = {
200 	.rxmode = {
201 		.split_hdr_size = 0,
202 	},
203 	.txmode = {
204 		.mq_mode = ETH_MQ_TX_NONE,
205 	},
206 };
207 
208 static void	dpdk_fmt_errmsg_for_rte_errno(char *, size_t, int,
209     PCAP_FORMAT_STRING(const char *), ...) PCAP_PRINTFLIKE(4, 5);
210 
211 /*
212  * Generate an error message based on a format, arguments, and an
213  * rte_errno, with a message for the rte_errno after the formatted output.
214  */
215 static void dpdk_fmt_errmsg_for_rte_errno(char *errbuf, size_t errbuflen,
216     int errnum, const char *fmt, ...)
217 {
218 	va_list ap;
219 	size_t msglen;
220 	char *p;
221 	size_t errbuflen_remaining;
222 
223 	va_start(ap, fmt);
224 	vsnprintf(errbuf, errbuflen, fmt, ap);
225 	va_end(ap);
226 	msglen = strlen(errbuf);
227 
228 	/*
229 	 * Do we have enough space to append ": "?
230 	 * Including the terminating '\0', that's 3 bytes.
231 	 */
232 	if (msglen + 3 > errbuflen) {
233 		/* No - just give them what we've produced. */
234 		return;
235 	}
236 	p = errbuf + msglen;
237 	errbuflen_remaining = errbuflen - msglen;
238 	*p++ = ':';
239 	*p++ = ' ';
240 	*p = '\0';
241 	msglen += 2;
242 	errbuflen_remaining -= 2;
243 
244 	/*
245 	 * Now append the string for the error code.
246 	 * rte_strerror() is thread-safe, at least as of dpdk 18.11,
247 	 * unlike strerror() - it uses strerror_r() rather than strerror()
248 	 * for UN*X errno values, and prints to what I assume is a per-thread
249 	 * buffer (based on the "PER_LCORE" in "RTE_DEFINE_PER_LCORE" used
250 	 * to declare the buffers statically) for DPDK errors.
251 	 */
252 	snprintf(p, errbuflen_remaining, "%s", rte_strerror(errnum));
253 }
254 
255 static int dpdk_init_timer(struct pcap_dpdk *pd){
256 	gettimeofday(&(pd->ts_helper.start_time),NULL);
257 	pd->ts_helper.start_cycles = rte_get_timer_cycles();
258 	pd->ts_helper.hz = rte_get_timer_hz();
259 	if (pd->ts_helper.hz == 0){
260 		return -1;
261 	}
262 	return 0;
263 }
264 static inline void calculate_timestamp(struct dpdk_ts_helper *helper,struct timeval *ts)
265 {
266 	uint64_t cycles;
267 	// delta
268 	struct timeval cur_time;
269 	cycles = rte_get_timer_cycles() - helper->start_cycles;
270 	cur_time.tv_sec = (time_t)(cycles/helper->hz);
271 	cur_time.tv_usec = (suseconds_t)((cycles%helper->hz)*1e6/helper->hz);
272 	timeradd(&(helper->start_time), &cur_time, ts);
273 }
274 
275 static uint32_t dpdk_gather_data(unsigned char *data, uint32_t len, struct rte_mbuf *mbuf)
276 {
277 	uint32_t total_len = 0;
278 	while (mbuf && (total_len+mbuf->data_len) < len ){
279 		rte_memcpy(data+total_len, rte_pktmbuf_mtod(mbuf,void *),mbuf->data_len);
280 		total_len+=mbuf->data_len;
281 		mbuf=mbuf->next;
282 	}
283 	return total_len;
284 }
285 
286 
287 static int dpdk_read_with_timeout(pcap_t *p, struct rte_mbuf **pkts_burst, const uint16_t burst_cnt){
288 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
289 	int nb_rx = 0;
290 	int timeout_ms = p->opt.timeout;
291 	int sleep_ms = 0;
292 	if (pd->nonblock){
293 		// In non-blocking mode, just read once, no matter how many packets are captured.
294 		nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
295 	}else{
296 		// In blocking mode, read many times until packets are captured or timeout or break_loop is set.
297 		// if timeout_ms == 0, it may be blocked forever.
298 		while (timeout_ms == 0 || sleep_ms < timeout_ms){
299 			nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt);
300 			if (nb_rx){ // got packets within timeout_ms
301 				break;
302 			}else{ // no packet arrives at this round.
303 				if (p->break_loop){
304 					break;
305 				}
306 				// sleep for a very short while.
307 				// block sleep is the only choice, since usleep() will impact performance dramatically.
308 				rte_delay_us_block(DPDK_DEF_MIN_SLEEP_MS*1000);
309 				sleep_ms += DPDK_DEF_MIN_SLEEP_MS;
310 			}
311 		}
312 	}
313 	return nb_rx;
314 }
315 
316 static int pcap_dpdk_dispatch(pcap_t *p, int max_cnt, pcap_handler cb, u_char *cb_arg)
317 {
318 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
319 	int burst_cnt = 0;
320 	int nb_rx = 0;
321 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
322 	struct rte_mbuf *m;
323 	struct pcap_pkthdr pcap_header;
324 	// In DPDK, pkt_len is sum of lengths for all segments. And data_len is for one segment
325 	uint32_t pkt_len = 0;
326 	uint32_t caplen = 0;
327 	u_char *bp = NULL;
328 	int i=0;
329 	unsigned int gather_len =0;
330 	int pkt_cnt = 0;
331 	u_char *large_buffer=NULL;
332 	int timeout_ms = p->opt.timeout;
333 
334 	if ( !PACKET_COUNT_IS_UNLIMITED(max_cnt) && max_cnt < MAX_PKT_BURST){
335 		burst_cnt = max_cnt;
336 	}else{
337 		burst_cnt = MAX_PKT_BURST;
338 	}
339 
340 	while( PACKET_COUNT_IS_UNLIMITED(max_cnt) || pkt_cnt < max_cnt){
341 		if (p->break_loop){
342 			p->break_loop = 0;
343 			return PCAP_ERROR_BREAK;
344 		}
345 		// read once in non-blocking mode, or try many times waiting for timeout_ms.
346 		// if timeout_ms == 0, it will be blocked until one packet arrives or break_loop is set.
347 		nb_rx = dpdk_read_with_timeout(p, pkts_burst, burst_cnt);
348 		if (nb_rx == 0){
349 			if (pd->nonblock){
350 				RTE_LOG(DEBUG, USER1, "dpdk: no packets available in non-blocking mode.\n");
351 			}else{
352 				if (p->break_loop){
353 					RTE_LOG(DEBUG, USER1, "dpdk: no packets available and break_loop is set in blocking mode.\n");
354 					p->break_loop = 0;
355 					return PCAP_ERROR_BREAK;
356 
357 				}
358 				RTE_LOG(DEBUG, USER1, "dpdk: no packets available for timeout %d ms in blocking mode.\n", timeout_ms);
359 			}
360 			// break if dpdk reads 0 packet, no matter in blocking(timeout) or non-blocking mode.
361 			break;
362 		}
363 		pkt_cnt += nb_rx;
364 		for ( i = 0; i < nb_rx; i++) {
365 			m = pkts_burst[i];
366 			calculate_timestamp(&(pd->ts_helper),&(pcap_header.ts));
367 			pkt_len = rte_pktmbuf_pkt_len(m);
368 			// caplen = min(pkt_len, p->snapshot);
369 			// caplen will not be changed, no matter how long the rte_pktmbuf
370 			caplen = pkt_len < (uint32_t)p->snapshot ? pkt_len: (uint32_t)p->snapshot;
371 			pcap_header.caplen = caplen;
372 			pcap_header.len = pkt_len;
373 			// volatile prefetch
374 			rte_prefetch0(rte_pktmbuf_mtod(m, void *));
375 			bp = NULL;
376 			if (m->nb_segs == 1)
377 			{
378 				bp = rte_pktmbuf_mtod(m, u_char *);
379 			}else{
380 				// use fast buffer pcap_tmp_buf if pkt_len is small, no need to call malloc and free
381 				if ( pkt_len <= RTE_ETH_PCAP_SNAPLEN)
382 				{
383 					gather_len = dpdk_gather_data(pd->pcap_tmp_buf, RTE_ETH_PCAP_SNAPLEN, m);
384 					bp = pd->pcap_tmp_buf;
385 				}else{
386 					// need call free later
387 					large_buffer = (u_char *)malloc(caplen*sizeof(u_char));
388 					gather_len = dpdk_gather_data(large_buffer, caplen, m);
389 					bp = large_buffer;
390 				}
391 
392 			}
393 			if (bp){
394 				if (p->fcode.bf_insns==NULL || pcap_filter(p->fcode.bf_insns, bp, pcap_header.len, pcap_header.caplen)){
395 					cb(cb_arg, &pcap_header, bp);
396 				}else{
397 					pd->bpf_drop++;
398 				}
399 			}
400 			//free all pktmbuf
401 			rte_pktmbuf_free(m);
402 			if (large_buffer){
403 				free(large_buffer);
404 				large_buffer=NULL;
405 			}
406 		}
407 	}
408 	return pkt_cnt;
409 }
410 
411 static int pcap_dpdk_inject(pcap_t *p, const void *buf _U_, int size _U_)
412 {
413 	//not implemented yet
414 	pcap_strlcpy(p->errbuf,
415 	    "dpdk error: Inject function has not been implemented yet",
416 	    PCAP_ERRBUF_SIZE);
417 	return PCAP_ERROR;
418 }
419 
420 static void pcap_dpdk_close(pcap_t *p)
421 {
422 	struct pcap_dpdk *pd = p->priv;
423 	if (pd==NULL)
424 	{
425 		return;
426 	}
427 	if (pd->must_clear_promisc)
428 	{
429 		rte_eth_promiscuous_disable(pd->portid);
430 	}
431 	rte_eth_dev_stop(pd->portid);
432 	rte_eth_dev_close(pd->portid);
433 	pcap_cleanup_live_common(p);
434 }
435 
436 static void nic_stats_display(struct pcap_dpdk *pd)
437 {
438 	uint16_t portid = pd->portid;
439 	struct rte_eth_stats stats;
440 	rte_eth_stats_get(portid, &stats);
441 	RTE_LOG(INFO,USER1, "portid:%d, RX-packets: %-10"PRIu64"  RX-errors:  %-10"PRIu64
442 	       "  RX-bytes:  %-10"PRIu64"  RX-Imissed:  %-10"PRIu64"\n", portid, stats.ipackets, stats.ierrors,
443 	       stats.ibytes,stats.imissed);
444 	RTE_LOG(INFO,USER1, "portid:%d, RX-PPS: %-10"PRIu64" RX-Mbps: %.2lf\n", portid, pd->pps, pd->bps/1e6f );
445 }
446 
447 static int pcap_dpdk_stats(pcap_t *p, struct pcap_stat *ps)
448 {
449 	struct pcap_dpdk *pd = p->priv;
450 	calculate_timestamp(&(pd->ts_helper), &(pd->curr_ts));
451 	rte_eth_stats_get(pd->portid,&(pd->curr_stats));
452 	if (ps){
453 		ps->ps_recv = pd->curr_stats.ipackets;
454 		ps->ps_drop = pd->curr_stats.ierrors;
455 		ps->ps_drop += pd->bpf_drop;
456 		ps->ps_ifdrop = pd->curr_stats.imissed;
457 	}
458 	uint64_t delta_pkt = pd->curr_stats.ipackets - pd->prev_stats.ipackets;
459 	struct timeval delta_tm;
460 	timersub(&(pd->curr_ts),&(pd->prev_ts), &delta_tm);
461 	uint64_t delta_usec = delta_tm.tv_sec*1e6+delta_tm.tv_usec;
462 	uint64_t delta_bit = (pd->curr_stats.ibytes-pd->prev_stats.ibytes)*8;
463 	RTE_LOG(DEBUG, USER1, "delta_usec: %-10"PRIu64" delta_pkt: %-10"PRIu64" delta_bit: %-10"PRIu64"\n", delta_usec, delta_pkt, delta_bit);
464 	pd->pps = (uint64_t)(delta_pkt*1e6f/delta_usec);
465 	pd->bps = (uint64_t)(delta_bit*1e6f/delta_usec);
466 	nic_stats_display(pd);
467 	pd->prev_stats = pd->curr_stats;
468 	pd->prev_ts = pd->curr_ts;
469 	return 0;
470 }
471 
472 static int pcap_dpdk_setnonblock(pcap_t *p, int nonblock){
473 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
474 	pd->nonblock = nonblock;
475 	return 0;
476 }
477 
478 static int pcap_dpdk_getnonblock(pcap_t *p){
479 	struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv);
480 	return pd->nonblock;
481 }
482 static int check_link_status(uint16_t portid, struct rte_eth_link *plink)
483 {
484 	// wait up to 9 seconds to get link status
485 	rte_eth_link_get(portid, plink);
486 	return plink->link_status == ETH_LINK_UP;
487 }
488 static void eth_addr_str(ETHER_ADDR_TYPE *addrp, char* mac_str, int len)
489 {
490 	int offset=0;
491 	if (addrp == NULL){
492 		snprintf(mac_str, len-1, DPDK_DEF_MAC_ADDR);
493 		return;
494 	}
495 	for (int i=0; i<6; i++)
496 	{
497 		if (offset >= len)
498 		{ // buffer overflow
499 			return;
500 		}
501 		if (i==0)
502 		{
503 			snprintf(mac_str+offset, len-1-offset, "%02X",addrp->addr_bytes[i]);
504 			offset+=2; // FF
505 		}else{
506 			snprintf(mac_str+offset, len-1-offset, ":%02X", addrp->addr_bytes[i]);
507 			offset+=3; // :FF
508 		}
509 	}
510 	return;
511 }
512 // return portid by device name, otherwise return -1
513 static uint16_t portid_by_device(char * device)
514 {
515 	uint16_t ret = DPDK_PORTID_MAX;
516 	int len = strlen(device);
517 	int prefix_len = strlen(DPDK_PREFIX);
518 	unsigned long ret_ul = 0L;
519 	char *pEnd;
520 	if (len<=prefix_len || strncmp(device, DPDK_PREFIX, prefix_len)) // check prefix dpdk:
521 	{
522 		return ret;
523 	}
524 	//check all chars are digital
525 	for (int i=prefix_len; device[i]; i++){
526 		if (device[i]<'0' || device[i]>'9'){
527 			return ret;
528 		}
529 	}
530 	ret_ul = strtoul(&(device[prefix_len]), &pEnd, 10);
531 	if (pEnd == &(device[prefix_len]) || *pEnd != '\0'){
532 		return ret;
533 	}
534 	// too large for portid
535 	if (ret_ul >= DPDK_PORTID_MAX){
536 		return ret;
537 	}
538 	ret = (uint16_t)ret_ul;
539 	return ret;
540 }
541 
542 static int parse_dpdk_cfg(char* dpdk_cfg,char** dargv)
543 {
544 	int cnt=0;
545 	memset(dargv,0,sizeof(dargv[0])*DPDK_ARGC_MAX);
546 	//current process name
547 	int skip_space = 1;
548 	int i=0;
549 	RTE_LOG(INFO, USER1,"dpdk cfg: %s\n",dpdk_cfg);
550 	// find first non space char
551 	// The last opt is NULL
552 	for (i=0;dpdk_cfg[i] && cnt<DPDK_ARGC_MAX-1;i++){
553 		if (skip_space && dpdk_cfg[i]!=' '){ // not space
554 			skip_space=!skip_space; // skip normal char
555 			dargv[cnt++] = dpdk_cfg+i;
556 		}
557 		if (!skip_space && dpdk_cfg[i]==' '){ // fint a space
558 			dpdk_cfg[i]=0x00; // end of this opt
559 			skip_space=!skip_space; // skip space char
560 		}
561 	}
562 	dargv[cnt]=NULL;
563 	return cnt;
564 }
565 
566 // only called once
567 // Returns:
568 //
569 //    1 on success;
570 //
571 //    0 if "the EAL cannot initialize on this system", which we treat as
572 //    meaning "DPDK isn't available";
573 //
574 //    a PCAP_ERROR_ code for other errors.
575 //
576 // If eaccess_not_fatal is non-zero, treat "a permissions issue" the way
577 // we treat "the EAL cannot initialize on this system".  We use that
578 // when trying to find DPDK devices, as we don't want to fail to return
579 // *any* devices just because we can't support DPDK; when we're trying
580 // to open a device, we need to return a permissions error in that case.
581 static int dpdk_pre_init(char * ebuf, int eaccess_not_fatal)
582 {
583 	int dargv_cnt=0;
584 	char *dargv[DPDK_ARGC_MAX];
585 	char *ptr_dpdk_cfg = NULL;
586 	int ret;
587 	// globale var
588 	if (is_dpdk_pre_inited != 0)
589 	{
590 		// already inited; did that succeed?
591 		if (is_dpdk_pre_inited < 0)
592 		{
593 			// failed
594 			goto error;
595 		}
596 		else
597 		{
598 			// succeeded
599 			return 1;
600 		}
601 	}
602 	// init EAL
603 	ptr_dpdk_cfg = getenv(DPDK_CFG_ENV_NAME);
604 	// set default log level to debug
605 	rte_log_set_global_level(DPDK_DEF_LOG_LEV);
606 	if (ptr_dpdk_cfg == NULL)
607 	{
608 		RTE_LOG(INFO,USER1,"env $DPDK_CFG is unset, so using default: %s\n",DPDK_DEF_CFG);
609 		ptr_dpdk_cfg = DPDK_DEF_CFG;
610 	}
611 	memset(dpdk_cfg_buf,0,sizeof(dpdk_cfg_buf));
612 	snprintf(dpdk_cfg_buf,DPDK_CFG_MAX_LEN-1,"%s %s",DPDK_LIB_NAME,ptr_dpdk_cfg);
613 	dargv_cnt = parse_dpdk_cfg(dpdk_cfg_buf,dargv);
614 	ret = rte_eal_init(dargv_cnt,dargv);
615 	if (ret == -1)
616 	{
617 		// Indicate that we've called rte_eal_init() by setting
618 		// is_dpdk_pre_inited to the negative of the error code,
619 		// and process the error.
620 		is_dpdk_pre_inited = -rte_errno;
621 		goto error;
622 	}
623 	// init succeeded, so we do not need to do it again later.
624 	is_dpdk_pre_inited = 1;
625 	return 1;
626 
627 error:
628 	switch (-is_dpdk_pre_inited)
629 	{
630 		case EACCES:
631 			// This "indicates a permissions issue.".
632 			RTE_LOG(ERR, USER1, "%s\n", DPDK_ERR_PERM_MSG);
633 			// If we were told to treat this as just meaning
634 			// DPDK isn't available, do so.
635 			if (eaccess_not_fatal)
636 				return 0;
637 			// Otherwise report a fatal error.
638 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
639 			    "DPDK requires that it run as root");
640 			return PCAP_ERROR_PERM_DENIED;
641 
642 		case EAGAIN:
643 			// This "indicates either a bus or system
644 			// resource was not available, setup may
645 			// be attempted again."
646 			// There's no such error in pcap, so I'm
647 			// not sure what we should do here.
648 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
649 			    "Bus or system resource was not available");
650 			break;
651 
652 		case EALREADY:
653 			// This "indicates that the rte_eal_init
654 			// function has already been called, and
655 			// cannot be called again."
656 			// That's not an error; set the "we've
657 			// been here before" flag and return
658 			// success.
659 			is_dpdk_pre_inited = 1;
660 			return 1;
661 
662 		case EFAULT:
663 			// This "indicates the tailq configuration
664 			// name was not found in memory configuration."
665 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
666 			    "The tailq configuration name was not found in the memory configuration");
667 			return PCAP_ERROR;
668 
669 		case EINVAL:
670 			// This "indicates invalid parameters were
671 			// passed as argv/argc."  Those came from
672 			// the configuration file.
673 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
674 			    "The configuration file has invalid parameters");
675 			break;
676 
677 		case ENOMEM:
678 			// This "indicates failure likely caused by
679 			// an out-of-memory condition."
680 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
681 			    "Out of memory");
682 			break;
683 
684 		case ENODEV:
685 			// This "indicates memory setup issues."
686 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
687 			    "An error occurred setting up memory");
688 			break;
689 
690 		case ENOTSUP:
691 			// This "indicates that the EAL cannot
692 			// initialize on this system."  We treat
693 			// that as meaning DPDK isn't available
694 			// on this machine, rather than as a
695 			// fatal error, and let our caller decide
696 			// whether that's a fatal error (if trying
697 			// to activate a DPDK device) or not (if
698 			// trying to enumerate devices).
699 			return 0;
700 
701 		case EPROTO:
702 			// This "indicates that the PCI bus is
703 			// either not present, or is not readable
704 			// by the eal."  Does "the PCI bus is not
705 			// present" mean "this machine has no PCI
706 			// bus", which strikes me as a "not available"
707 			// case?  If so, should "is not readable by
708 			// the EAL" also something we should treat
709 			// as a "not available" case?  If not, we
710 			// can't distinguish between the two, so
711 			// we're stuck.
712 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
713 			    "PCI bus is not present or not readable by the EAL");
714 			break;
715 
716 		case ENOEXEC:
717 			// This "indicates that a service core
718 			// failed to launch successfully."
719 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
720 			    "A service core failed to launch successfully");
721 			break;
722 
723 		default:
724 			//
725 			// That's not in the list of errors in
726 			// the documentation; let it be reported
727 			// as an error.
728 			//
729 			dpdk_fmt_errmsg_for_rte_errno(ebuf,
730 			    PCAP_ERRBUF_SIZE, -is_dpdk_pre_inited,
731 			    "dpdk error: dpdk_pre_init failed");
732 			break;
733 	}
734 	// Error.
735 	return PCAP_ERROR;
736 }
737 
738 static int pcap_dpdk_activate(pcap_t *p)
739 {
740 	struct pcap_dpdk *pd = p->priv;
741 	pd->orig = p;
742 	int ret = PCAP_ERROR;
743 	uint16_t nb_ports=0;
744 	uint16_t portid= DPDK_PORTID_MAX;
745 	unsigned nb_mbufs = DPDK_NB_MBUFS;
746 	struct rte_eth_rxconf rxq_conf;
747 	struct rte_eth_txconf txq_conf;
748 	struct rte_eth_conf local_port_conf = port_conf;
749 	struct rte_eth_dev_info dev_info;
750 	int is_port_up = 0;
751 	struct rte_eth_link link;
752 	do{
753 		//init EAL; fail if we have insufficient permission
754 		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
755 		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 0);
756 		if (ret < 0)
757 		{
758 			// This returns a negative value on an error.
759 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
760 			    "Can't open device %s: %s",
761 			    p->opt.device, dpdk_pre_init_errbuf);
762 			// ret is set to the correct error
763 			break;
764 		}
765 		if (ret == 0)
766 		{
767 			// This means DPDK isn't available on this machine.
768 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
769 			    "Can't open device %s: DPDK is not available on this machine",
770 			    p->opt.device);
771 			return PCAP_ERROR_NO_SUCH_DEVICE;
772 		}
773 
774 		ret = dpdk_init_timer(pd);
775 		if (ret<0)
776 		{
777 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
778 				"dpdk error: Init timer is zero with device %s",
779 				p->opt.device);
780 			ret = PCAP_ERROR;
781 			break;
782 		}
783 
784 		nb_ports = rte_eth_dev_count_avail();
785 		if (nb_ports == 0)
786 		{
787 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
788 			    "dpdk error: No Ethernet ports");
789 			ret = PCAP_ERROR;
790 			break;
791 		}
792 
793 		portid = portid_by_device(p->opt.device);
794 		if (portid == DPDK_PORTID_MAX){
795 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
796 			    "dpdk error: portid is invalid. device %s",
797 			    p->opt.device);
798 			ret = PCAP_ERROR_NO_SUCH_DEVICE;
799 			break;
800 		}
801 
802 		pd->portid = portid;
803 
804 		if (p->snapshot <= 0 || p->snapshot > MAXIMUM_SNAPLEN)
805 		{
806 			p->snapshot = MAXIMUM_SNAPLEN;
807 		}
808 		// create the mbuf pool
809 		pd->pktmbuf_pool = rte_pktmbuf_pool_create(MBUF_POOL_NAME, nb_mbufs,
810 			MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
811 			rte_socket_id());
812 		if (pd->pktmbuf_pool == NULL)
813 		{
814 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
815 			    PCAP_ERRBUF_SIZE, rte_errno,
816 			    "dpdk error: Cannot init mbuf pool");
817 			ret = PCAP_ERROR;
818 			break;
819 		}
820 		// config dev
821 		rte_eth_dev_info_get(portid, &dev_info);
822 		if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
823 		{
824 			local_port_conf.txmode.offloads |=DEV_TX_OFFLOAD_MBUF_FAST_FREE;
825 		}
826 		// only support 1 queue
827 		ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
828 		if (ret < 0)
829 		{
830 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
831 			    PCAP_ERRBUF_SIZE, -ret,
832 			    "dpdk error: Cannot configure device: port=%u",
833 			    portid);
834 			ret = PCAP_ERROR;
835 			break;
836 		}
837 		// adjust rx tx
838 		ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
839 		if (ret < 0)
840 		{
841 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
842 			    PCAP_ERRBUF_SIZE, -ret,
843 			    "dpdk error: Cannot adjust number of descriptors: port=%u",
844 			    portid);
845 			ret = PCAP_ERROR;
846 			break;
847 		}
848 		// get MAC addr
849 		rte_eth_macaddr_get(portid, &(pd->eth_addr));
850 		eth_addr_str(&(pd->eth_addr), pd->mac_addr, DPDK_MAC_ADDR_SIZE-1);
851 
852 		// init one RX queue
853 		rxq_conf = dev_info.default_rxconf;
854 		rxq_conf.offloads = local_port_conf.rxmode.offloads;
855 		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
856 					     rte_eth_dev_socket_id(portid),
857 					     &rxq_conf,
858 					     pd->pktmbuf_pool);
859 		if (ret < 0)
860 		{
861 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
862 			    PCAP_ERRBUF_SIZE, -ret,
863 			    "dpdk error: rte_eth_rx_queue_setup:port=%u",
864 			    portid);
865 			ret = PCAP_ERROR;
866 			break;
867 		}
868 
869 		// init one TX queue
870 		txq_conf = dev_info.default_txconf;
871 		txq_conf.offloads = local_port_conf.txmode.offloads;
872 		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
873 				rte_eth_dev_socket_id(portid),
874 				&txq_conf);
875 		if (ret < 0)
876 		{
877 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
878 			    PCAP_ERRBUF_SIZE, -ret,
879 			    "dpdk error: rte_eth_tx_queue_setup:port=%u",
880 			    portid);
881 			ret = PCAP_ERROR;
882 			break;
883 		}
884 		// Initialize TX buffers
885 		tx_buffer = rte_zmalloc_socket(DPDK_TX_BUF_NAME,
886 				RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
887 				rte_eth_dev_socket_id(portid));
888 		if (tx_buffer == NULL)
889 		{
890 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
891 			    "dpdk error: Cannot allocate buffer for tx on port %u", portid);
892 			ret = PCAP_ERROR;
893 			break;
894 		}
895 		rte_eth_tx_buffer_init(tx_buffer, MAX_PKT_BURST);
896 		// Start device
897 		ret = rte_eth_dev_start(portid);
898 		if (ret < 0)
899 		{
900 			dpdk_fmt_errmsg_for_rte_errno(p->errbuf,
901 			    PCAP_ERRBUF_SIZE, -ret,
902 			    "dpdk error: rte_eth_dev_start:port=%u",
903 			    portid);
904 			ret = PCAP_ERROR;
905 			break;
906 		}
907 		// set promiscuous mode
908 		if (p->opt.promisc){
909 			pd->must_clear_promisc=1;
910 			rte_eth_promiscuous_enable(portid);
911 		}
912 		// check link status
913 		is_port_up = check_link_status(portid, &link);
914 		if (!is_port_up){
915 			snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
916 			    "dpdk error: link is down, port=%u",portid);
917 			ret = PCAP_ERROR_IFACE_NOT_UP;
918 			break;
919 		}
920 		// reset statistics
921 		rte_eth_stats_reset(pd->portid);
922 		calculate_timestamp(&(pd->ts_helper), &(pd->prev_ts));
923 		rte_eth_stats_get(pd->portid,&(pd->prev_stats));
924 		// format pcap_t
925 		pd->portid = portid;
926 		p->fd = pd->portid;
927 		if (p->snapshot <=0 || p->snapshot> MAXIMUM_SNAPLEN)
928 		{
929 			p->snapshot = MAXIMUM_SNAPLEN;
930 		}
931 		p->linktype = DLT_EN10MB; // Ethernet, the 10MB is historical.
932 		p->selectable_fd = p->fd;
933 		p->read_op = pcap_dpdk_dispatch;
934 		p->inject_op = pcap_dpdk_inject;
935 		// using pcap_filter currently, though DPDK provides their own BPF function. Because DPDK BPF needs load a ELF file as a filter.
936 		p->setfilter_op = install_bpf_program;
937 		p->setdirection_op = NULL;
938 		p->set_datalink_op = NULL;
939 		p->getnonblock_op = pcap_dpdk_getnonblock;
940 		p->setnonblock_op = pcap_dpdk_setnonblock;
941 		p->stats_op = pcap_dpdk_stats;
942 		p->cleanup_op = pcap_dpdk_close;
943 		p->breakloop_op = pcap_breakloop_common;
944 		// set default timeout
945 		pd->required_select_timeout.tv_sec = 0;
946 		pd->required_select_timeout.tv_usec = DPDK_DEF_MIN_SLEEP_MS*1000;
947 		p->required_select_timeout = &pd->required_select_timeout;
948 		ret = 0; // OK
949 	}while(0);
950 
951 	if (ret <= PCAP_ERROR) // all kinds of error code
952 	{
953 		pcap_cleanup_live_common(p);
954 	}else{
955 		rte_eth_dev_get_name_by_port(portid,pd->pci_addr);
956 		RTE_LOG(INFO, USER1,"Port %d device: %s, MAC:%s, PCI:%s\n", portid, p->opt.device, pd->mac_addr, pd->pci_addr);
957 		RTE_LOG(INFO, USER1,"Port %d Link Up. Speed %u Mbps - %s\n",
958 							portid, link.link_speed,
959 					(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
960 						("full-duplex") : ("half-duplex\n"));
961 	}
962 	return ret;
963 }
964 
965 // device name for dpdk should be in the form as dpdk:number, such as dpdk:0
966 pcap_t * pcap_dpdk_create(const char *device, char *ebuf, int *is_ours)
967 {
968 	pcap_t *p=NULL;
969 	*is_ours = 0;
970 
971 	*is_ours = !strncmp(device, "dpdk:", 5);
972 	if (! *is_ours)
973 		return NULL;
974 	//memset will happen
975 	p = PCAP_CREATE_COMMON(ebuf, struct pcap_dpdk);
976 
977 	if (p == NULL)
978 		return NULL;
979 	p->activate_op = pcap_dpdk_activate;
980 	return p;
981 }
982 
983 int pcap_dpdk_findalldevs(pcap_if_list_t *devlistp, char *ebuf)
984 {
985 	int ret=0;
986 	unsigned int nb_ports = 0;
987 	char dpdk_name[DPDK_DEV_NAME_MAX];
988 	char dpdk_desc[DPDK_DEV_DESC_MAX];
989 	ETHER_ADDR_TYPE eth_addr;
990 	char mac_addr[DPDK_MAC_ADDR_SIZE];
991 	char pci_addr[DPDK_PCI_ADDR_SIZE];
992 	do{
993 		// init EAL; return "DPDK not available" if we
994 		// have insufficient permission
995 		char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE];
996 		ret = dpdk_pre_init(dpdk_pre_init_errbuf, 1);
997 		if (ret < 0)
998 		{
999 			// This returns a negative value on an error.
1000 			snprintf(ebuf, PCAP_ERRBUF_SIZE,
1001 			    "Can't look for DPDK devices: %s",
1002 			    dpdk_pre_init_errbuf);
1003 			ret = PCAP_ERROR;
1004 			break;
1005 		}
1006 		if (ret == 0)
1007 		{
1008 			// This means DPDK isn't available on this machine.
1009 			// That just means "don't return any devices".
1010 			break;
1011 		}
1012 		nb_ports = rte_eth_dev_count_avail();
1013 		if (nb_ports == 0)
1014 		{
1015 			// That just means "don't return any devices".
1016 			ret = 0;
1017 			break;
1018 		}
1019 		for (unsigned int i=0; i<nb_ports; i++){
1020 			snprintf(dpdk_name, DPDK_DEV_NAME_MAX-1,
1021 			    "%s%u", DPDK_PREFIX, i);
1022 			// mac addr
1023 			rte_eth_macaddr_get(i, &eth_addr);
1024 			eth_addr_str(&eth_addr,mac_addr,DPDK_MAC_ADDR_SIZE);
1025 			// PCI addr
1026 			rte_eth_dev_get_name_by_port(i,pci_addr);
1027 			snprintf(dpdk_desc,DPDK_DEV_DESC_MAX-1,"%s %s, MAC:%s, PCI:%s", DPDK_DESC, dpdk_name, mac_addr, pci_addr);
1028 			if (add_dev(devlistp, dpdk_name, 0, dpdk_desc, ebuf)==NULL){
1029 				ret = PCAP_ERROR;
1030 				break;
1031 			}
1032 		}
1033 	}while(0);
1034 	return ret;
1035 }
1036 
1037 #ifdef DPDK_ONLY
1038 /*
1039  * This libpcap build supports only DPDK, not regular network interfaces.
1040  */
1041 
1042 /*
1043  * There are no regular interfaces, just DPDK interfaces.
1044  */
1045 int
1046 pcap_platform_finddevs(pcap_if_list_t *devlistp _U_, char *errbuf)
1047 {
1048 	return (0);
1049 }
1050 
1051 /*
1052  * Attempts to open a regular interface fail.
1053  */
1054 pcap_t *
1055 pcap_create_interface(const char *device, char *errbuf)
1056 {
1057 	snprintf(errbuf, PCAP_ERRBUF_SIZE,
1058 	    "This version of libpcap only supports DPDK");
1059 	return NULL;
1060 }
1061 
1062 /*
1063  * Libpcap version string.
1064  */
1065 const char *
1066 pcap_lib_version(void)
1067 {
1068 	return (PCAP_VERSION_STRING " (DPDK-only)");
1069 }
1070 #endif
1071