diff options
Diffstat (limited to 'contrib/libpcap/pcap-dpdk.c')
-rw-r--r-- | contrib/libpcap/pcap-dpdk.c | 1086 |
1 files changed, 1086 insertions, 0 deletions
diff --git a/contrib/libpcap/pcap-dpdk.c b/contrib/libpcap/pcap-dpdk.c new file mode 100644 index 000000000000..025a67482c0f --- /dev/null +++ b/contrib/libpcap/pcap-dpdk.c @@ -0,0 +1,1086 @@ +/* + * Copyright (C) 2018 jingle YANG. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* +Date: Dec 16, 2018 + +Description: +1. Pcap-dpdk provides libpcap the ability to use DPDK with the device name as dpdk:{portid}, such as dpdk:0. +2. DPDK is a set of libraries and drivers for fast packet processing. (https://www.dpdk.org/) +3. The testprogs/capturetest provides 6.4Gbps/800,000 pps on Intel 10-Gigabit X540-AT2 with DPDK 18.11. + +Limitations: +1. DPDK support will be on if DPDK is available. Please set DIR for --with-dpdk[=DIR] with ./configure or -DDPDK_DIR[=DIR] with cmake if DPDK is installed manually. +2. Only support link libdpdk.so dynamically, because the libdpdk.a will not work correctly. +3. Only support read operation, and packet injection has not been supported yet. + +Usage: +1. Compile DPDK as shared library and install.(https://github.com/DPDK/dpdk.git) + +You shall modify the file $RTE_SDK/$RTE_TARGET/.config and set: +CONFIG_RTE_BUILD_SHARED_LIB=y +By the following command: +sed -i 's/CONFIG_RTE_BUILD_SHARED_LIB=n/CONFIG_RTE_BUILD_SHARED_LIB=y/' $RTE_SDK/$RTE_TARGET/.config + +2. Launch l2fwd that is one of DPDK examples correctly, and get device information. + +You shall learn how to bind nic with DPDK-compatible driver by $RTE_SDK/usertools/dpdk-devbind.py, such as igb_uio. +And enable hugepages by dpdk-setup.sh + +Then launch the l2fwd with dynamic driver support. For example: +$RTE_SDK/examples/l2fwd/$RTE_TARGET/l2fwd -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so -- -p 0x1 + +3. Compile libpcap with dpdk options. + +If DPDK has not been found automatically, you shall export DPDK environment variable which are used for compiling DPDK. And then pass $RTE_SDK/$RTE_TARGET to --with-dpdk or -DDPDK_DIR + +export RTE_SDK={your DPDK base directory} +export RTE_TARGET={your target name} + +3.1 With configure + +./configure --with-dpdk=$RTE_SDK/$RTE_TARGET && make -s all && make -s testprogs && make install + +3.2 With cmake + +mkdir -p build && cd build && cmake -DDPDK_DIR=$RTE_SDK/$RTE_TARGET ../ && make -s all && make -s testprogs && make install + +4. Link your own program with libpcap, and use DPDK with the device name as dpdk:{portid}, such as dpdk:0. +And you shall set DPDK configure options by environment variable DPDK_CFG +For example, the testprogs/capturetest could be lanched by: + +env DPDK_CFG="--log-level=debug -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" ./capturetest -i dpdk:0 +*/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <errno.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <limits.h> /* for INT_MAX */ +#include <time.h> + +#include <sys/time.h> + +//header for calling dpdk +#include <rte_config.h> +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_branch_prediction.h> +#include <rte_interrupts.h> +#include <rte_random.h> +#include <rte_debug.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> +#include <rte_bus.h> + +#include "pcap-int.h" +#include "pcap-dpdk.h" + +/* + * Deal with API changes that break source compatibility. + */ + +#ifdef HAVE_STRUCT_RTE_ETHER_ADDR +#define ETHER_ADDR_TYPE struct rte_ether_addr +#else +#define ETHER_ADDR_TYPE struct ether_addr +#endif + +#define DPDK_DEF_LOG_LEV RTE_LOG_ERR +// +// This is set to 0 if we haven't initialized DPDK yet, 1 if we've +// successfully initialized it, a negative value, which is the negative +// of the rte_errno from rte_eal_init(), if we tried to initialize it +// and got an error. +// +static int is_dpdk_pre_inited=0; +#define DPDK_LIB_NAME "libpcap_dpdk" +#define DPDK_DESC "Data Plane Development Kit (DPDK) Interface" +#define DPDK_ERR_PERM_MSG "permission denied, DPDK needs root permission" +#define DPDK_ARGC_MAX 64 +#define DPDK_CFG_MAX_LEN 1024 +#define DPDK_DEV_NAME_MAX 32 +#define DPDK_DEV_DESC_MAX 512 +#define DPDK_CFG_ENV_NAME "DPDK_CFG" +#define DPDK_DEF_MIN_SLEEP_MS 1 +static char dpdk_cfg_buf[DPDK_CFG_MAX_LEN]; +#define DPDK_MAC_ADDR_SIZE 32 +#define DPDK_DEF_MAC_ADDR "00:00:00:00:00:00" +#define DPDK_PCI_ADDR_SIZE 16 +#define DPDK_DEF_CFG "--log-level=error -l0 -dlibrte_pmd_e1000.so -dlibrte_pmd_ixgbe.so -dlibrte_mempool_ring.so" +#define DPDK_PREFIX "dpdk:" +#define DPDK_PORTID_MAX 65535U +#define MBUF_POOL_NAME "mbuf_pool" +#define DPDK_TX_BUF_NAME "tx_buffer" +//The number of elements in the mbuf pool. +#define DPDK_NB_MBUFS 8192U +#define MEMPOOL_CACHE_SIZE 256 +#define MAX_PKT_BURST 32 +// Configurable number of RX/TX ring descriptors +#define RTE_TEST_RX_DESC_DEFAULT 1024 +#define RTE_TEST_TX_DESC_DEFAULT 1024 + +static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; +static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; + +#ifdef RTE_ETHER_MAX_JUMBO_FRAME_LEN +#define RTE_ETH_PCAP_SNAPLEN RTE_ETHER_MAX_JUMBO_FRAME_LEN +#else +#define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN +#endif + +static struct rte_eth_dev_tx_buffer *tx_buffer; + +struct dpdk_ts_helper{ + struct timeval start_time; + uint64_t start_cycles; + uint64_t hz; +}; +struct pcap_dpdk{ + pcap_t * orig; + uint16_t portid; // portid of DPDK + int must_clear_promisc; + uint64_t bpf_drop; + int nonblock; + struct timeval required_select_timeout; + struct timeval prev_ts; + struct rte_eth_stats prev_stats; + struct timeval curr_ts; + struct rte_eth_stats curr_stats; + uint64_t pps; + uint64_t bps; + struct rte_mempool * pktmbuf_pool; + struct dpdk_ts_helper ts_helper; + ETHER_ADDR_TYPE eth_addr; + char mac_addr[DPDK_MAC_ADDR_SIZE]; + char pci_addr[DPDK_PCI_ADDR_SIZE]; + unsigned char pcap_tmp_buf[RTE_ETH_PCAP_SNAPLEN]; +}; + +static struct rte_eth_conf port_conf = { + .rxmode = { + .split_hdr_size = 0, + }, + .txmode = { + .mq_mode = ETH_MQ_TX_NONE, + }, +}; + +static void dpdk_fmt_errmsg_for_rte_errno(char *, size_t, int, + PCAP_FORMAT_STRING(const char *), ...) PCAP_PRINTFLIKE(4, 5); + +/* + * Generate an error message based on a format, arguments, and an + * rte_errno, with a message for the rte_errno after the formatted output. + */ +static void dpdk_fmt_errmsg_for_rte_errno(char *errbuf, size_t errbuflen, + int errnum, const char *fmt, ...) +{ + va_list ap; + size_t msglen; + char *p; + size_t errbuflen_remaining; + + va_start(ap, fmt); + vsnprintf(errbuf, errbuflen, fmt, ap); + va_end(ap); + msglen = strlen(errbuf); + + /* + * Do we have enough space to append ": "? + * Including the terminating '\0', that's 3 bytes. + */ + if (msglen + 3 > errbuflen) { + /* No - just give them what we've produced. */ + return; + } + p = errbuf + msglen; + errbuflen_remaining = errbuflen - msglen; + *p++ = ':'; + *p++ = ' '; + *p = '\0'; + msglen += 2; + errbuflen_remaining -= 2; + + /* + * Now append the string for the error code. + * rte_strerror() is thread-safe, at least as of dpdk 18.11, + * unlike strerror() - it uses strerror_r() rather than strerror() + * for UN*X errno values, and prints to what I assume is a per-thread + * buffer (based on the "PER_LCORE" in "RTE_DEFINE_PER_LCORE" used + * to declare the buffers statically) for DPDK errors. + */ + snprintf(p, errbuflen_remaining, "%s", rte_strerror(errnum)); +} + +static int dpdk_init_timer(struct pcap_dpdk *pd){ + gettimeofday(&(pd->ts_helper.start_time),NULL); + pd->ts_helper.start_cycles = rte_get_timer_cycles(); + pd->ts_helper.hz = rte_get_timer_hz(); + if (pd->ts_helper.hz == 0){ + return -1; + } + return 0; +} +static inline void calculate_timestamp(struct dpdk_ts_helper *helper,struct timeval *ts) +{ + uint64_t cycles; + // delta + struct timeval cur_time; + cycles = rte_get_timer_cycles() - helper->start_cycles; + cur_time.tv_sec = (time_t)(cycles/helper->hz); + cur_time.tv_usec = (suseconds_t)((cycles%helper->hz)*1e6/helper->hz); + timeradd(&(helper->start_time), &cur_time, ts); +} + +static uint32_t dpdk_gather_data(unsigned char *data, uint32_t len, struct rte_mbuf *mbuf) +{ + uint32_t total_len = 0; + while (mbuf && (total_len+mbuf->data_len) < len ){ + rte_memcpy(data+total_len, rte_pktmbuf_mtod(mbuf,void *),mbuf->data_len); + total_len+=mbuf->data_len; + mbuf=mbuf->next; + } + return total_len; +} + + +static int dpdk_read_with_timeout(pcap_t *p, struct rte_mbuf **pkts_burst, const uint16_t burst_cnt){ + struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv); + int nb_rx = 0; + int timeout_ms = p->opt.timeout; + int sleep_ms = 0; + if (pd->nonblock){ + // In non-blocking mode, just read once, no matter how many packets are captured. + nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt); + }else{ + // In blocking mode, read many times until packets are captured or timeout or break_loop is set. + // if timeout_ms == 0, it may be blocked forever. + while (timeout_ms == 0 || sleep_ms < timeout_ms){ + nb_rx = (int)rte_eth_rx_burst(pd->portid, 0, pkts_burst, burst_cnt); + if (nb_rx){ // got packets within timeout_ms + break; + }else{ // no packet arrives at this round. + if (p->break_loop){ + break; + } + // sleep for a very short while. + // block sleep is the only choice, since usleep() will impact performance dramatically. + rte_delay_us_block(DPDK_DEF_MIN_SLEEP_MS*1000); + sleep_ms += DPDK_DEF_MIN_SLEEP_MS; + } + } + } + return nb_rx; +} + +static int pcap_dpdk_dispatch(pcap_t *p, int max_cnt, pcap_handler cb, u_char *cb_arg) +{ + struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv); + int burst_cnt = 0; + int nb_rx = 0; + struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; + struct rte_mbuf *m; + struct pcap_pkthdr pcap_header; + // In DPDK, pkt_len is sum of lengths for all segments. And data_len is for one segment + uint32_t pkt_len = 0; + uint32_t caplen = 0; + u_char *bp = NULL; + int i=0; + unsigned int gather_len =0; + int pkt_cnt = 0; + u_char *large_buffer=NULL; + int timeout_ms = p->opt.timeout; + + /* + * This can conceivably process more than INT_MAX packets, + * which would overflow the packet count, causing it either + * to look like a negative number, and thus cause us to + * return a value that looks like an error, or overflow + * back into positive territory, and thus cause us to + * return a too-low count. + * + * Therefore, if the packet count is unlimited, we clip + * it at INT_MAX; this routine is not expected to + * process packets indefinitely, so that's not an issue. + */ + if (PACKET_COUNT_IS_UNLIMITED(max_cnt)) + max_cnt = INT_MAX; + + if (max_cnt < MAX_PKT_BURST){ + burst_cnt = max_cnt; + }else{ + burst_cnt = MAX_PKT_BURST; + } + + while( pkt_cnt < max_cnt){ + if (p->break_loop){ + p->break_loop = 0; + return PCAP_ERROR_BREAK; + } + // read once in non-blocking mode, or try many times waiting for timeout_ms. + // if timeout_ms == 0, it will be blocked until one packet arrives or break_loop is set. + nb_rx = dpdk_read_with_timeout(p, pkts_burst, burst_cnt); + if (nb_rx == 0){ + if (pd->nonblock){ + RTE_LOG(DEBUG, USER1, "dpdk: no packets available in non-blocking mode.\n"); + }else{ + if (p->break_loop){ + RTE_LOG(DEBUG, USER1, "dpdk: no packets available and break_loop is set in blocking mode.\n"); + p->break_loop = 0; + return PCAP_ERROR_BREAK; + + } + RTE_LOG(DEBUG, USER1, "dpdk: no packets available for timeout %d ms in blocking mode.\n", timeout_ms); + } + // break if dpdk reads 0 packet, no matter in blocking(timeout) or non-blocking mode. + break; + } + pkt_cnt += nb_rx; + for ( i = 0; i < nb_rx; i++) { + m = pkts_burst[i]; + calculate_timestamp(&(pd->ts_helper),&(pcap_header.ts)); + pkt_len = rte_pktmbuf_pkt_len(m); + // caplen = min(pkt_len, p->snapshot); + // caplen will not be changed, no matter how long the rte_pktmbuf + caplen = pkt_len < (uint32_t)p->snapshot ? pkt_len: (uint32_t)p->snapshot; + pcap_header.caplen = caplen; + pcap_header.len = pkt_len; + // volatile prefetch + rte_prefetch0(rte_pktmbuf_mtod(m, void *)); + bp = NULL; + if (m->nb_segs == 1) + { + bp = rte_pktmbuf_mtod(m, u_char *); + }else{ + // use fast buffer pcap_tmp_buf if pkt_len is small, no need to call malloc and free + if ( pkt_len <= RTE_ETH_PCAP_SNAPLEN) + { + gather_len = dpdk_gather_data(pd->pcap_tmp_buf, RTE_ETH_PCAP_SNAPLEN, m); + bp = pd->pcap_tmp_buf; + }else{ + // need call free later + large_buffer = (u_char *)malloc(caplen*sizeof(u_char)); + gather_len = dpdk_gather_data(large_buffer, caplen, m); + bp = large_buffer; + } + + } + if (bp){ + if (p->fcode.bf_insns==NULL || pcap_filter(p->fcode.bf_insns, bp, pcap_header.len, pcap_header.caplen)){ + cb(cb_arg, &pcap_header, bp); + }else{ + pd->bpf_drop++; + } + } + //free all pktmbuf + rte_pktmbuf_free(m); + if (large_buffer){ + free(large_buffer); + large_buffer=NULL; + } + } + } + return pkt_cnt; +} + +static int pcap_dpdk_inject(pcap_t *p, const void *buf _U_, int size _U_) +{ + //not implemented yet + pcap_strlcpy(p->errbuf, + "dpdk error: Inject function has not been implemented yet", + PCAP_ERRBUF_SIZE); + return PCAP_ERROR; +} + +static void pcap_dpdk_close(pcap_t *p) +{ + struct pcap_dpdk *pd = p->priv; + if (pd==NULL) + { + return; + } + if (pd->must_clear_promisc) + { + rte_eth_promiscuous_disable(pd->portid); + } + rte_eth_dev_stop(pd->portid); + rte_eth_dev_close(pd->portid); + pcap_cleanup_live_common(p); +} + +static void nic_stats_display(struct pcap_dpdk *pd) +{ + uint16_t portid = pd->portid; + struct rte_eth_stats stats; + rte_eth_stats_get(portid, &stats); + RTE_LOG(INFO,USER1, "portid:%d, RX-packets: %-10"PRIu64" RX-errors: %-10"PRIu64 + " RX-bytes: %-10"PRIu64" RX-Imissed: %-10"PRIu64"\n", portid, stats.ipackets, stats.ierrors, + stats.ibytes,stats.imissed); + RTE_LOG(INFO,USER1, "portid:%d, RX-PPS: %-10"PRIu64" RX-Mbps: %.2lf\n", portid, pd->pps, pd->bps/1e6f ); +} + +static int pcap_dpdk_stats(pcap_t *p, struct pcap_stat *ps) +{ + struct pcap_dpdk *pd = p->priv; + calculate_timestamp(&(pd->ts_helper), &(pd->curr_ts)); + rte_eth_stats_get(pd->portid,&(pd->curr_stats)); + if (ps){ + ps->ps_recv = pd->curr_stats.ipackets; + ps->ps_drop = pd->curr_stats.ierrors; + ps->ps_drop += pd->bpf_drop; + ps->ps_ifdrop = pd->curr_stats.imissed; + } + uint64_t delta_pkt = pd->curr_stats.ipackets - pd->prev_stats.ipackets; + struct timeval delta_tm; + timersub(&(pd->curr_ts),&(pd->prev_ts), &delta_tm); + uint64_t delta_usec = delta_tm.tv_sec*1e6+delta_tm.tv_usec; + uint64_t delta_bit = (pd->curr_stats.ibytes-pd->prev_stats.ibytes)*8; + RTE_LOG(DEBUG, USER1, "delta_usec: %-10"PRIu64" delta_pkt: %-10"PRIu64" delta_bit: %-10"PRIu64"\n", delta_usec, delta_pkt, delta_bit); + pd->pps = (uint64_t)(delta_pkt*1e6f/delta_usec); + pd->bps = (uint64_t)(delta_bit*1e6f/delta_usec); + nic_stats_display(pd); + pd->prev_stats = pd->curr_stats; + pd->prev_ts = pd->curr_ts; + return 0; +} + +static int pcap_dpdk_setnonblock(pcap_t *p, int nonblock){ + struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv); + pd->nonblock = nonblock; + return 0; +} + +static int pcap_dpdk_getnonblock(pcap_t *p){ + struct pcap_dpdk *pd = (struct pcap_dpdk*)(p->priv); + return pd->nonblock; +} +static int check_link_status(uint16_t portid, struct rte_eth_link *plink) +{ + // wait up to 9 seconds to get link status + rte_eth_link_get(portid, plink); + return plink->link_status == ETH_LINK_UP; +} +static void eth_addr_str(ETHER_ADDR_TYPE *addrp, char* mac_str, int len) +{ + int offset=0; + if (addrp == NULL){ + snprintf(mac_str, len-1, DPDK_DEF_MAC_ADDR); + return; + } + for (int i=0; i<6; i++) + { + if (offset >= len) + { // buffer overflow + return; + } + if (i==0) + { + snprintf(mac_str+offset, len-1-offset, "%02X",addrp->addr_bytes[i]); + offset+=2; // FF + }else{ + snprintf(mac_str+offset, len-1-offset, ":%02X", addrp->addr_bytes[i]); + offset+=3; // :FF + } + } + return; +} +// return portid by device name, otherwise return -1 +static uint16_t portid_by_device(char * device) +{ + uint16_t ret = DPDK_PORTID_MAX; + int len = strlen(device); + int prefix_len = strlen(DPDK_PREFIX); + unsigned long ret_ul = 0L; + char *pEnd; + if (len<=prefix_len || strncmp(device, DPDK_PREFIX, prefix_len)) // check prefix dpdk: + { + return ret; + } + //check all chars are digital + for (int i=prefix_len; device[i]; i++){ + if (device[i]<'0' || device[i]>'9'){ + return ret; + } + } + ret_ul = strtoul(&(device[prefix_len]), &pEnd, 10); + if (pEnd == &(device[prefix_len]) || *pEnd != '\0'){ + return ret; + } + // too large for portid + if (ret_ul >= DPDK_PORTID_MAX){ + return ret; + } + ret = (uint16_t)ret_ul; + return ret; +} + +static int parse_dpdk_cfg(char* dpdk_cfg,char** dargv) +{ + int cnt=0; + memset(dargv,0,sizeof(dargv[0])*DPDK_ARGC_MAX); + //current process name + int skip_space = 1; + int i=0; + RTE_LOG(INFO, USER1,"dpdk cfg: %s\n",dpdk_cfg); + // find first non space char + // The last opt is NULL + for (i=0;dpdk_cfg[i] && cnt<DPDK_ARGC_MAX-1;i++){ + if (skip_space && dpdk_cfg[i]!=' '){ // not space + skip_space=!skip_space; // skip normal char + dargv[cnt++] = dpdk_cfg+i; + } + if (!skip_space && dpdk_cfg[i]==' '){ // fint a space + dpdk_cfg[i]=0x00; // end of this opt + skip_space=!skip_space; // skip space char + } + } + dargv[cnt]=NULL; + return cnt; +} + +// only called once +// Returns: +// +// 1 on success; +// +// 0 if "the EAL cannot initialize on this system", which we treat as +// meaning "DPDK isn't available"; +// +// a PCAP_ERROR_ code for other errors. +// +// If eaccess_not_fatal is non-zero, treat "a permissions issue" the way +// we treat "the EAL cannot initialize on this system". We use that +// when trying to find DPDK devices, as we don't want to fail to return +// *any* devices just because we can't support DPDK; when we're trying +// to open a device, we need to return a permissions error in that case. +static int dpdk_pre_init(char * ebuf, int eaccess_not_fatal) +{ + int dargv_cnt=0; + char *dargv[DPDK_ARGC_MAX]; + char *ptr_dpdk_cfg = NULL; + int ret; + // globale var + if (is_dpdk_pre_inited != 0) + { + // already inited; did that succeed? + if (is_dpdk_pre_inited < 0) + { + // failed + goto error; + } + else + { + // succeeded + return 1; + } + } + // init EAL + ptr_dpdk_cfg = getenv(DPDK_CFG_ENV_NAME); + // set default log level to debug + rte_log_set_global_level(DPDK_DEF_LOG_LEV); + if (ptr_dpdk_cfg == NULL) + { + RTE_LOG(INFO,USER1,"env $DPDK_CFG is unset, so using default: %s\n",DPDK_DEF_CFG); + ptr_dpdk_cfg = DPDK_DEF_CFG; + } + memset(dpdk_cfg_buf,0,sizeof(dpdk_cfg_buf)); + snprintf(dpdk_cfg_buf,DPDK_CFG_MAX_LEN-1,"%s %s",DPDK_LIB_NAME,ptr_dpdk_cfg); + dargv_cnt = parse_dpdk_cfg(dpdk_cfg_buf,dargv); + ret = rte_eal_init(dargv_cnt,dargv); + if (ret == -1) + { + // Indicate that we've called rte_eal_init() by setting + // is_dpdk_pre_inited to the negative of the error code, + // and process the error. + is_dpdk_pre_inited = -rte_errno; + goto error; + } + // init succeeded, so we do not need to do it again later. + is_dpdk_pre_inited = 1; + return 1; + +error: + switch (-is_dpdk_pre_inited) + { + case EACCES: + // This "indicates a permissions issue.". + RTE_LOG(ERR, USER1, "%s\n", DPDK_ERR_PERM_MSG); + // If we were told to treat this as just meaning + // DPDK isn't available, do so. + if (eaccess_not_fatal) + return 0; + // Otherwise report a fatal error. + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "DPDK requires that it run as root"); + return PCAP_ERROR_PERM_DENIED; + + case EAGAIN: + // This "indicates either a bus or system + // resource was not available, setup may + // be attempted again." + // There's no such error in pcap, so I'm + // not sure what we should do here. + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "Bus or system resource was not available"); + break; + + case EALREADY: + // This "indicates that the rte_eal_init + // function has already been called, and + // cannot be called again." + // That's not an error; set the "we've + // been here before" flag and return + // success. + is_dpdk_pre_inited = 1; + return 1; + + case EFAULT: + // This "indicates the tailq configuration + // name was not found in memory configuration." + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "The tailq configuration name was not found in the memory configuration"); + return PCAP_ERROR; + + case EINVAL: + // This "indicates invalid parameters were + // passed as argv/argc." Those came from + // the configuration file. + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "The configuration file has invalid parameters"); + break; + + case ENOMEM: + // This "indicates failure likely caused by + // an out-of-memory condition." + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "Out of memory"); + break; + + case ENODEV: + // This "indicates memory setup issues." + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "An error occurred setting up memory"); + break; + + case ENOTSUP: + // This "indicates that the EAL cannot + // initialize on this system." We treat + // that as meaning DPDK isn't available + // on this machine, rather than as a + // fatal error, and let our caller decide + // whether that's a fatal error (if trying + // to activate a DPDK device) or not (if + // trying to enumerate devices). + return 0; + + case EPROTO: + // This "indicates that the PCI bus is + // either not present, or is not readable + // by the eal." Does "the PCI bus is not + // present" mean "this machine has no PCI + // bus", which strikes me as a "not available" + // case? If so, should "is not readable by + // the EAL" also something we should treat + // as a "not available" case? If not, we + // can't distinguish between the two, so + // we're stuck. + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "PCI bus is not present or not readable by the EAL"); + break; + + case ENOEXEC: + // This "indicates that a service core + // failed to launch successfully." + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "A service core failed to launch successfully"); + break; + + default: + // + // That's not in the list of errors in + // the documentation; let it be reported + // as an error. + // + dpdk_fmt_errmsg_for_rte_errno(ebuf, + PCAP_ERRBUF_SIZE, -is_dpdk_pre_inited, + "dpdk error: dpdk_pre_init failed"); + break; + } + // Error. + return PCAP_ERROR; +} + +static int pcap_dpdk_activate(pcap_t *p) +{ + struct pcap_dpdk *pd = p->priv; + pd->orig = p; + int ret = PCAP_ERROR; + uint16_t nb_ports=0; + uint16_t portid= DPDK_PORTID_MAX; + unsigned nb_mbufs = DPDK_NB_MBUFS; + struct rte_eth_rxconf rxq_conf; + struct rte_eth_txconf txq_conf; + struct rte_eth_conf local_port_conf = port_conf; + struct rte_eth_dev_info dev_info; + int is_port_up = 0; + struct rte_eth_link link; + do{ + //init EAL; fail if we have insufficient permission + char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE]; + ret = dpdk_pre_init(dpdk_pre_init_errbuf, 0); + if (ret < 0) + { + // This returns a negative value on an error. + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "Can't open device %s: %s", + p->opt.device, dpdk_pre_init_errbuf); + // ret is set to the correct error + break; + } + if (ret == 0) + { + // This means DPDK isn't available on this machine. + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "Can't open device %s: DPDK is not available on this machine", + p->opt.device); + return PCAP_ERROR_NO_SUCH_DEVICE; + } + + ret = dpdk_init_timer(pd); + if (ret<0) + { + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "dpdk error: Init timer is zero with device %s", + p->opt.device); + ret = PCAP_ERROR; + break; + } + + nb_ports = rte_eth_dev_count_avail(); + if (nb_ports == 0) + { + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "dpdk error: No Ethernet ports"); + ret = PCAP_ERROR; + break; + } + + portid = portid_by_device(p->opt.device); + if (portid == DPDK_PORTID_MAX){ + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "dpdk error: portid is invalid. device %s", + p->opt.device); + ret = PCAP_ERROR_NO_SUCH_DEVICE; + break; + } + + pd->portid = portid; + + if (p->snapshot <= 0 || p->snapshot > MAXIMUM_SNAPLEN) + { + p->snapshot = MAXIMUM_SNAPLEN; + } + // create the mbuf pool + pd->pktmbuf_pool = rte_pktmbuf_pool_create(MBUF_POOL_NAME, nb_mbufs, + MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, + rte_socket_id()); + if (pd->pktmbuf_pool == NULL) + { + dpdk_fmt_errmsg_for_rte_errno(p->errbuf, + PCAP_ERRBUF_SIZE, rte_errno, + "dpdk error: Cannot init mbuf pool"); + ret = PCAP_ERROR; + break; + } + // config dev + rte_eth_dev_info_get(portid, &dev_info); + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) + { + local_port_conf.txmode.offloads |=DEV_TX_OFFLOAD_MBUF_FAST_FREE; + } + // only support 1 queue + ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf); + if (ret < 0) + { + dpdk_fmt_errmsg_for_rte_errno(p->errbuf, + PCAP_ERRBUF_SIZE, -ret, + "dpdk error: Cannot configure device: port=%u", + portid); + ret = PCAP_ERROR; + break; + } + // adjust rx tx + ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd); + if (ret < 0) + { + dpdk_fmt_errmsg_for_rte_errno(p->errbuf, + PCAP_ERRBUF_SIZE, -ret, + "dpdk error: Cannot adjust number of descriptors: port=%u", + portid); + ret = PCAP_ERROR; + break; + } + // get MAC addr + rte_eth_macaddr_get(portid, &(pd->eth_addr)); + eth_addr_str(&(pd->eth_addr), pd->mac_addr, DPDK_MAC_ADDR_SIZE-1); + + // init one RX queue + rxq_conf = dev_info.default_rxconf; + rxq_conf.offloads = local_port_conf.rxmode.offloads; + ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd, + rte_eth_dev_socket_id(portid), + &rxq_conf, + pd->pktmbuf_pool); + if (ret < 0) + { + dpdk_fmt_errmsg_for_rte_errno(p->errbuf, + PCAP_ERRBUF_SIZE, -ret, + "dpdk error: rte_eth_rx_queue_setup:port=%u", + portid); + ret = PCAP_ERROR; + break; + } + + // init one TX queue + txq_conf = dev_info.default_txconf; + txq_conf.offloads = local_port_conf.txmode.offloads; + ret = rte_eth_tx_queue_setup(portid, 0, nb_txd, + rte_eth_dev_socket_id(portid), + &txq_conf); + if (ret < 0) + { + dpdk_fmt_errmsg_for_rte_errno(p->errbuf, + PCAP_ERRBUF_SIZE, -ret, + "dpdk error: rte_eth_tx_queue_setup:port=%u", + portid); + ret = PCAP_ERROR; + break; + } + // Initialize TX buffers + tx_buffer = rte_zmalloc_socket(DPDK_TX_BUF_NAME, + RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0, + rte_eth_dev_socket_id(portid)); + if (tx_buffer == NULL) + { + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "dpdk error: Cannot allocate buffer for tx on port %u", portid); + ret = PCAP_ERROR; + break; + } + rte_eth_tx_buffer_init(tx_buffer, MAX_PKT_BURST); + // Start device + ret = rte_eth_dev_start(portid); + if (ret < 0) + { + dpdk_fmt_errmsg_for_rte_errno(p->errbuf, + PCAP_ERRBUF_SIZE, -ret, + "dpdk error: rte_eth_dev_start:port=%u", + portid); + ret = PCAP_ERROR; + break; + } + // set promiscuous mode + if (p->opt.promisc){ + pd->must_clear_promisc=1; + rte_eth_promiscuous_enable(portid); + } + // check link status + is_port_up = check_link_status(portid, &link); + if (!is_port_up){ + snprintf(p->errbuf, PCAP_ERRBUF_SIZE, + "dpdk error: link is down, port=%u",portid); + ret = PCAP_ERROR_IFACE_NOT_UP; + break; + } + // reset statistics + rte_eth_stats_reset(pd->portid); + calculate_timestamp(&(pd->ts_helper), &(pd->prev_ts)); + rte_eth_stats_get(pd->portid,&(pd->prev_stats)); + // format pcap_t + pd->portid = portid; + p->fd = pd->portid; + if (p->snapshot <=0 || p->snapshot> MAXIMUM_SNAPLEN) + { + p->snapshot = MAXIMUM_SNAPLEN; + } + p->linktype = DLT_EN10MB; // Ethernet, the 10MB is historical. + p->selectable_fd = p->fd; + p->read_op = pcap_dpdk_dispatch; + p->inject_op = pcap_dpdk_inject; + // using pcap_filter currently, though DPDK provides their own BPF function. Because DPDK BPF needs load a ELF file as a filter. + p->setfilter_op = install_bpf_program; + p->setdirection_op = NULL; + p->set_datalink_op = NULL; + p->getnonblock_op = pcap_dpdk_getnonblock; + p->setnonblock_op = pcap_dpdk_setnonblock; + p->stats_op = pcap_dpdk_stats; + p->cleanup_op = pcap_dpdk_close; + p->breakloop_op = pcap_breakloop_common; + // set default timeout + pd->required_select_timeout.tv_sec = 0; + pd->required_select_timeout.tv_usec = DPDK_DEF_MIN_SLEEP_MS*1000; + p->required_select_timeout = &pd->required_select_timeout; + ret = 0; // OK + }while(0); + + if (ret <= PCAP_ERROR) // all kinds of error code + { + pcap_cleanup_live_common(p); + }else{ + rte_eth_dev_get_name_by_port(portid,pd->pci_addr); + RTE_LOG(INFO, USER1,"Port %d device: %s, MAC:%s, PCI:%s\n", portid, p->opt.device, pd->mac_addr, pd->pci_addr); + RTE_LOG(INFO, USER1,"Port %d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? + ("full-duplex") : ("half-duplex\n")); + } + return ret; +} + +// device name for dpdk should be in the form as dpdk:number, such as dpdk:0 +pcap_t * pcap_dpdk_create(const char *device, char *ebuf, int *is_ours) +{ + pcap_t *p=NULL; + *is_ours = 0; + + *is_ours = !strncmp(device, "dpdk:", 5); + if (! *is_ours) + return NULL; + //memset will happen + p = PCAP_CREATE_COMMON(ebuf, struct pcap_dpdk); + + if (p == NULL) + return NULL; + p->activate_op = pcap_dpdk_activate; + return p; +} + +int pcap_dpdk_findalldevs(pcap_if_list_t *devlistp, char *ebuf) +{ + int ret=0; + unsigned int nb_ports = 0; + char dpdk_name[DPDK_DEV_NAME_MAX]; + char dpdk_desc[DPDK_DEV_DESC_MAX]; + ETHER_ADDR_TYPE eth_addr; + char mac_addr[DPDK_MAC_ADDR_SIZE]; + char pci_addr[DPDK_PCI_ADDR_SIZE]; + do{ + // init EAL; return "DPDK not available" if we + // have insufficient permission + char dpdk_pre_init_errbuf[PCAP_ERRBUF_SIZE]; + ret = dpdk_pre_init(dpdk_pre_init_errbuf, 1); + if (ret < 0) + { + // This returns a negative value on an error. + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "Can't look for DPDK devices: %s", + dpdk_pre_init_errbuf); + ret = PCAP_ERROR; + break; + } + if (ret == 0) + { + // This means DPDK isn't available on this machine. + // That just means "don't return any devices". + break; + } + nb_ports = rte_eth_dev_count_avail(); + if (nb_ports == 0) + { + // That just means "don't return any devices". + ret = 0; + break; + } + for (unsigned int i=0; i<nb_ports; i++){ + snprintf(dpdk_name, DPDK_DEV_NAME_MAX-1, + "%s%u", DPDK_PREFIX, i); + // mac addr + rte_eth_macaddr_get(i, ð_addr); + eth_addr_str(ð_addr,mac_addr,DPDK_MAC_ADDR_SIZE); + // PCI addr + rte_eth_dev_get_name_by_port(i,pci_addr); + snprintf(dpdk_desc,DPDK_DEV_DESC_MAX-1,"%s %s, MAC:%s, PCI:%s", DPDK_DESC, dpdk_name, mac_addr, pci_addr); + if (add_dev(devlistp, dpdk_name, 0, dpdk_desc, ebuf)==NULL){ + ret = PCAP_ERROR; + break; + } + } + }while(0); + return ret; +} + +#ifdef DPDK_ONLY +/* + * This libpcap build supports only DPDK, not regular network interfaces. + */ + +/* + * There are no regular interfaces, just DPDK interfaces. + */ +int +pcap_platform_finddevs(pcap_if_list_t *devlistp _U_, char *errbuf) +{ + return (0); +} + +/* + * Attempts to open a regular interface fail. + */ +pcap_t * +pcap_create_interface(const char *device, char *errbuf) +{ + snprintf(errbuf, PCAP_ERRBUF_SIZE, + "This version of libpcap only supports DPDK"); + return NULL; +} + +/* + * Libpcap version string. + */ +const char * +pcap_lib_version(void) +{ + return (PCAP_VERSION_STRING " (DPDK-only)"); +} +#endif |