diff options
Diffstat (limited to 'sys/dev/e1000/if_em.c')
-rw-r--r-- | sys/dev/e1000/if_em.c | 702 |
1 files changed, 473 insertions, 229 deletions
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index ab39c4355782..8032345d09ae 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -32,6 +32,8 @@ ******************************************************************************/ /*$FreeBSD$*/ +#include "opt_em.h" +#include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" @@ -41,6 +43,10 @@ #include <sys/param.h> #include <sys/systm.h> +#ifdef DDB +#include <sys/types.h> +#include <ddb/ddb.h> +#endif #if __FreeBSD_version >= 800000 #include <sys/buf_ring.h> #endif @@ -52,6 +58,7 @@ #include <sys/mbuf.h> #include <sys/module.h> #include <sys/rman.h> +#include <sys/smp.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> @@ -208,7 +215,7 @@ static int em_resume(device_t); #ifdef EM_MULTIQUEUE static int em_mq_start(if_t, struct mbuf *); static int em_mq_start_locked(if_t, - struct tx_ring *, struct mbuf *); + struct tx_ring *); static void em_qflush(if_t); #else static void em_start(if_t); @@ -299,6 +306,10 @@ static void em_handle_tx(void *context, int pending); static void em_handle_rx(void *context, int pending); static void em_handle_link(void *context, int pending); +#ifdef EM_MULTIQUEUE +static void em_enable_vectors_82574(struct adapter *); +#endif + static void em_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); static int em_set_flowcntl(SYSCTL_HANDLER_ARGS); @@ -388,6 +399,19 @@ static int em_enable_msix = TRUE; SYSCTL_INT(_hw_em, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &em_enable_msix, 0, "Enable MSI-X interrupts"); +#ifdef EM_MULTIQUEUE +static int em_num_queues = 1; +SYSCTL_INT(_hw_em, OID_AUTO, num_queues, CTLFLAG_RDTUN, &em_num_queues, 0, + "82574 only: Number of queues to configure, 0 indicates autoconfigure"); +#endif + +/* +** Global variable to store last used CPU when binding queues +** to CPUs in igb_allocate_msix. Starts at CPU_FIRST and increments when a +** queue is bound to a cpu. +*/ +static int em_last_bind_cpu = -1; + /* How many packets rxeof tries to clean at a time */ static int em_rx_process_limit = 100; SYSCTL_INT(_hw_em, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, @@ -420,10 +444,10 @@ static int em_probe(device_t dev) { char adapter_name[60]; - u16 pci_vendor_id = 0; - u16 pci_device_id = 0; - u16 pci_subvendor_id = 0; - u16 pci_subdevice_id = 0; + uint16_t pci_vendor_id = 0; + uint16_t pci_device_id = 0; + uint16_t pci_subvendor_id = 0; + uint16_t pci_subdevice_id = 0; em_vendor_info_t *ent; INIT_DEBUGOUT("em_probe: begin"); @@ -550,6 +574,11 @@ em_attach(device_t dev) goto err_pci; } + /* + * Setup MSI/X or MSI if PCI Express + */ + adapter->msix = em_setup_msix(adapter); + e1000_get_bus_info(hw); /* Set up some sysctls for the tunable interrupt delays */ @@ -880,7 +909,7 @@ em_resume(device_t dev) EM_TX_LOCK(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -894,107 +923,7 @@ em_resume(device_t dev) } -#ifdef EM_MULTIQUEUE -/********************************************************************* - * Multiqueue Transmit routines - * - * em_mq_start is called by the stack to initiate a transmit. - * however, if busy the driver can queue the request rather - * than do an immediate send. It is this that is an advantage - * in this driver, rather than also having multiple tx queues. - **********************************************************************/ -static int -em_mq_start_locked(if_t ifp, struct tx_ring *txr, struct mbuf *m) -{ - struct adapter *adapter = txr->adapter; - struct mbuf *next; - int err = 0, enq = 0; - - if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING || adapter->link_active == 0) { - if (m != NULL) - err = drbr_enqueue(ifp, txr->br, m); - return (err); - } - - enq = 0; - if (m != NULL) { - err = drbr_enqueue(ifp, txr->br, m); - if (err) - return (err); - } - - /* Process the queue */ - while ((next = drbr_peek(ifp, txr->br)) != NULL) { - if ((err = em_xmit(txr, &next)) != 0) { - if (next == NULL) - drbr_advance(ifp, txr->br); - else - drbr_putback(ifp, txr->br, next); - break; - } - drbr_advance(ifp, txr->br); - enq++; - if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); - if (next->m_flags & M_MCAST) - if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); - if_etherbpfmtap(ifp, next); - if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) - break; - } - - if (enq > 0) { - /* Set the watchdog */ - txr->queue_status = EM_QUEUE_WORKING; - txr->watchdog_time = ticks; - } - - if (txr->tx_avail < EM_MAX_SCATTER) - em_txeof(txr); - if (txr->tx_avail < EM_MAX_SCATTER) - if_setdrvflagbits(ifp, IFF_DRV_OACTIVE,0); - return (err); -} - -/* -** Multiqueue capable stack interface -*/ -static int -em_mq_start(if_t ifp, struct mbuf *m) -{ - struct adapter *adapter = if_getsoftc(ifp); - struct tx_ring *txr = adapter->tx_rings; - int error; - - if (EM_TX_TRYLOCK(txr)) { - error = em_mq_start_locked(ifp, txr, m); - EM_TX_UNLOCK(txr); - } else - error = drbr_enqueue(ifp, txr->br, m); - - return (error); -} - -/* -** Flush all ring buffers -*/ -static void -em_qflush(if_t ifp) -{ - struct adapter *adapter = if_getsoftc(ifp); - struct tx_ring *txr = adapter->tx_rings; - struct mbuf *m; - - for (int i = 0; i < adapter->num_queues; i++, txr++) { - EM_TX_LOCK(txr); - while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) - m_freem(m); - EM_TX_UNLOCK(txr); - } - if_qflush(ifp); -} -#else /* !EM_MULTIQUEUE */ - +#ifndef EM_MULTIQUEUE static void em_start_locked(if_t ifp, struct tx_ring *txr) { @@ -1032,12 +961,13 @@ em_start_locked(if_t ifp, struct tx_ring *txr) break; } + /* Mark the queue as having work */ + if (txr->busy == EM_TX_IDLE) + txr->busy = EM_TX_BUSY; + /* Send a copy of the frame to the BPF listener */ - if_etherbpfmtap(ifp, m_head); + ETHER_BPF_MTAP(ifp, m_head); - /* Set timeout in case hardware has problems transmitting. */ - txr->watchdog_time = ticks; - txr->queue_status = EM_QUEUE_WORKING; } return; @@ -1056,6 +986,115 @@ em_start(if_t ifp) } return; } +#else /* EM_MULTIQUEUE */ +/********************************************************************* + * Multiqueue Transmit routines + * + * em_mq_start is called by the stack to initiate a transmit. + * however, if busy the driver can queue the request rather + * than do an immediate send. It is this that is an advantage + * in this driver, rather than also having multiple tx queues. + **********************************************************************/ +/* +** Multiqueue capable stack interface +*/ +static int +em_mq_start(if_t ifp, struct mbuf *m) +{ + struct adapter *adapter = if_getsoftc(ifp); + struct tx_ring *txr = adapter->tx_rings; + unsigned int i, error; + + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + i = m->m_pkthdr.flowid % adapter->num_queues; + else + i = curcpu % adapter->num_queues; + + txr = &adapter->tx_rings[i]; + + error = drbr_enqueue(ifp, txr->br, m); + if (error) + return (error); + + if (EM_TX_TRYLOCK(txr)) { + em_mq_start_locked(ifp, txr); + EM_TX_UNLOCK(txr); + } else + taskqueue_enqueue(txr->tq, &txr->tx_task); + + return (0); +} + +static int +em_mq_start_locked(if_t ifp, struct tx_ring *txr) +{ + struct adapter *adapter = txr->adapter; + struct mbuf *next; + int err = 0, enq = 0; + + EM_TX_LOCK_ASSERT(txr); + + if (((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) || + adapter->link_active == 0) { + return (ENETDOWN); + } + + /* Process the queue */ + while ((next = drbr_peek(ifp, txr->br)) != NULL) { + if ((err = em_xmit(txr, &next)) != 0) { + if (next == NULL) { + /* It was freed, move forward */ + drbr_advance(ifp, txr->br); + } else { + /* + * Still have one left, it may not be + * the same since the transmit function + * may have changed it. + */ + drbr_putback(ifp, txr->br, next); + } + break; + } + drbr_advance(ifp, txr->br); + enq++; + if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); + if (next->m_flags & M_MCAST) + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); + ETHER_BPF_MTAP(ifp, next); + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) + break; + } + + /* Mark the queue as having work */ + if ((enq > 0) && (txr->busy == EM_TX_IDLE)) + txr->busy = EM_TX_BUSY; + + if (txr->tx_avail < EM_MAX_SCATTER) + em_txeof(txr); + if (txr->tx_avail < EM_MAX_SCATTER) { + if_setdrvflagbits(ifp, IFF_DRV_OACTIVE,0); + } + return (err); +} + +/* +** Flush all ring buffers +*/ +static void +em_qflush(if_t ifp) +{ + struct adapter *adapter = if_getsoftc(ifp); + struct tx_ring *txr = adapter->tx_rings; + struct mbuf *m; + + for (int i = 0; i < adapter->num_queues; i++, txr++) { + EM_TX_LOCK(txr); + while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) + m_freem(m); + EM_TX_UNLOCK(txr); + } + if_qflush(ifp); +} #endif /* EM_MULTIQUEUE */ /********************************************************************* @@ -1451,7 +1490,7 @@ em_poll(if_t ifp, enum poll_cmd cmd, int count) em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -1518,14 +1557,14 @@ em_handle_que(void *context, int pending) struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; - if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { bool more = em_rxeof(rxr, adapter->rx_process_limit, NULL); + EM_TX_LOCK(txr); em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -1559,11 +1598,12 @@ em_msix_tx(void *arg) em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); #endif + /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, txr->ims); EM_TX_UNLOCK(txr); @@ -1589,9 +1629,10 @@ em_msix_rx(void *arg) more = em_rxeof(rxr, adapter->rx_process_limit, NULL); if (more) taskqueue_enqueue(rxr->tq, &rxr->rx_task); - else + else { /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims); + } return; } @@ -1618,6 +1659,16 @@ em_msix_link(void *arg) } else E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | E1000_IMS_LSC); + /* + ** Because we must read the ICR for this interrupt + ** it may clear other causes using autoclear, for + ** this reason we simply create a soft interrupt + ** for all these vectors. + */ + if (reg_icr) { + E1000_WRITE_REG(&adapter->hw, + E1000_ICS, adapter->ims); + } return; } @@ -1631,9 +1682,10 @@ em_handle_rx(void *context, int pending) more = em_rxeof(rxr, adapter->rx_process_limit, NULL); if (more) taskqueue_enqueue(rxr->tq, &rxr->rx_task); - else + else { /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims); + } } static void @@ -1647,7 +1699,7 @@ em_handle_tx(void *context, int pending) em_txeof(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (!if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -1677,7 +1729,7 @@ em_handle_link(void *context, int pending) EM_TX_LOCK(txr); #ifdef EM_MULTIQUEUE if (!drbr_empty(ifp, txr->br)) - em_mq_start_locked(ifp, txr, NULL); + em_mq_start_locked(ifp, txr); #else if (if_sendq_empty(ifp)) em_start_locked(ifp, txr); @@ -2102,8 +2154,6 @@ retry: */ tx_buffer = &txr->tx_buffers[first]; tx_buffer->next_eop = last; - /* Update the watchdog time early and often */ - txr->watchdog_time = ticks; /* * Advance the Transmit Descriptor Tail (TDT), this tells the E1000 @@ -2223,7 +2273,7 @@ em_local_timer(void *arg) if_t ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; - u32 trigger; + u32 trigger = 0; EM_CORE_LOCK_ASSERT(adapter); @@ -2236,9 +2286,11 @@ em_local_timer(void *arg) e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); /* Mask to use in the irq trigger */ - if (adapter->msix_mem) - trigger = rxr->ims; - else + if (adapter->msix_mem) { + for (int i = 0; i < adapter->num_queues; i++, rxr++) + trigger |= rxr->ims; + rxr = adapter->rx_rings; + } else trigger = E1000_ICS_RXDMT0; /* @@ -2247,15 +2299,15 @@ em_local_timer(void *arg) ** and the HUNG state will be static if set. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { - if ((txr->queue_status == EM_QUEUE_HUNG) && - (adapter->pause_frames == 0)) + if (txr->busy == EM_TX_HUNG) goto hung; + if (txr->busy >= EM_TX_MAXTRIES) + txr->busy = EM_TX_HUNG; /* Schedule a TX tasklet if needed */ if (txr->tx_avail <= EM_MAX_SCATTER) taskqueue_enqueue(txr->tq, &txr->tx_task); } - adapter->pause_frames = 0; callout_reset(&adapter->timer, hz, em_local_timer, adapter); #ifndef DEVICE_POLLING /* Trigger an RX interrupt to guarantee mbuf refresh */ @@ -2264,17 +2316,11 @@ em_local_timer(void *arg) return; hung: /* Looks like we're hung */ - device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); - device_printf(adapter->dev, - "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, - E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)), - E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me))); - device_printf(adapter->dev,"TX(%d) desc avail = %d," - "Next TX to Clean = %d\n", - txr->me, txr->tx_avail, txr->next_to_clean); + device_printf(adapter->dev, "Watchdog timeout Queue[%d]-- resetting\n", + txr->me); + em_print_debug_info(adapter); if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); adapter->watchdog_events++; - adapter->pause_frames = 0; em_init_locked(adapter); } @@ -2324,7 +2370,7 @@ em_update_link_status(struct adapter *adapter) (hw->mac.type == e1000_82572))) { int tarc0; tarc0 = E1000_READ_REG(hw, E1000_TARC(0)); - tarc0 &= ~SPEED_MODE_BIT; + tarc0 &= ~TARC_SPEED_MODE_BIT; E1000_WRITE_REG(hw, E1000_TARC(0), tarc0); } if (bootverbose) @@ -2343,9 +2389,9 @@ em_update_link_status(struct adapter *adapter) if (bootverbose) device_printf(dev, "Link is Down\n"); adapter->link_active = 0; - /* Link down, disable watchdog */ + /* Link down, disable hang detection */ for (int i = 0; i < adapter->num_queues; i++, txr++) - txr->queue_status = EM_QUEUE_IDLE; + txr->busy = EM_TX_IDLE; if_link_state_change(ifp, LINK_STATE_DOWN); } } @@ -2376,10 +2422,10 @@ em_stop(void *arg) /* Tell the stack that the interface is no longer active */ if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); - /* Unarm watchdog timer. */ + /* Disarm Hang Detection. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { EM_TX_LOCK(txr); - txr->queue_status = EM_QUEUE_IDLE; + txr->busy = EM_TX_IDLE; EM_TX_UNLOCK(txr); } @@ -2440,14 +2486,6 @@ em_allocate_pci_resources(struct adapter *adapter) rman_get_bushandle(adapter->memory); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; - /* Default to a single queue */ - adapter->num_queues = 1; - - /* - * Setup MSI/X or MSI if PCI Express - */ - adapter->msix = em_setup_msix(adapter); - adapter->hw.back = &adapter->osdep; return (0); @@ -2522,13 +2560,14 @@ em_allocate_msix(struct adapter *adapter) struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; int error, rid, vector = 0; + int cpu_id = 0; /* Make sure all interrupts are disabled */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); /* First set up ring resources */ - for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + for (int i = 0; i < adapter->num_queues; i++, rxr++, vector++) { /* RX ring */ rid = vector + 1; @@ -2548,14 +2587,20 @@ em_allocate_msix(struct adapter *adapter) return (error); } #if __FreeBSD_version >= 800504 - bus_describe_intr(dev, rxr->res, rxr->tag, "rx %d", i); + bus_describe_intr(dev, rxr->res, rxr->tag, "rx%d", i); #endif - rxr->msix = vector++; /* NOTE increment vector for TX */ + rxr->msix = vector; + + if (em_last_bind_cpu < 0) + em_last_bind_cpu = CPU_FIRST(); + cpu_id = em_last_bind_cpu; + bus_bind_intr(dev, rxr->res, cpu_id); + TASK_INIT(&rxr->rx_task, 0, em_handle_rx, rxr); rxr->tq = taskqueue_create_fast("em_rxq", M_NOWAIT, taskqueue_thread_enqueue, &rxr->tq); - taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq", - device_get_nameunit(adapter->dev)); + taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq (cpuid %d)", + device_get_nameunit(adapter->dev), cpu_id); /* ** Set the bit to enable interrupt ** in E1000_IMS -- bits 20 and 21 @@ -2563,8 +2608,13 @@ em_allocate_msix(struct adapter *adapter) ** NOTHING to do with the MSIX vector */ rxr->ims = 1 << (20 + i); + adapter->ims |= rxr->ims; adapter->ivars |= (8 | rxr->msix) << (i * 4); + em_last_bind_cpu = CPU_NEXT(em_last_bind_cpu); + } + + for (int i = 0; i < adapter->num_queues; i++, txr++, vector++) { /* TX ring */ rid = vector + 1; txr->res = bus_alloc_resource_any(dev, @@ -2582,14 +2632,20 @@ em_allocate_msix(struct adapter *adapter) return (error); } #if __FreeBSD_version >= 800504 - bus_describe_intr(dev, txr->res, txr->tag, "tx %d", i); + bus_describe_intr(dev, txr->res, txr->tag, "tx%d", i); #endif - txr->msix = vector++; /* Increment vector for next pass */ + txr->msix = vector; + + if (em_last_bind_cpu < 0) + em_last_bind_cpu = CPU_FIRST(); + cpu_id = em_last_bind_cpu; + bus_bind_intr(dev, txr->res, cpu_id); + TASK_INIT(&txr->tx_task, 0, em_handle_tx, txr); txr->tq = taskqueue_create_fast("em_txq", M_NOWAIT, taskqueue_thread_enqueue, &txr->tq); - taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq", - device_get_nameunit(adapter->dev)); + taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq (cpuid %d)", + device_get_nameunit(adapter->dev), cpu_id); /* ** Set the bit to enable interrupt ** in E1000_IMS -- bits 22 and 23 @@ -2597,13 +2653,16 @@ em_allocate_msix(struct adapter *adapter) ** NOTHING to do with the MSIX vector */ txr->ims = 1 << (22 + i); + adapter->ims |= txr->ims; adapter->ivars |= (8 | txr->msix) << (8 + (i * 4)); + + em_last_bind_cpu = CPU_NEXT(em_last_bind_cpu); } /* Link interrupt */ - ++rid; + rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, - SYS_RES_IRQ, &rid, RF_ACTIVE); + SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate " "bus resource: Link interrupt [%d]\n", rid); @@ -2619,7 +2678,7 @@ em_allocate_msix(struct adapter *adapter) return (error); } #if __FreeBSD_version >= 800504 - bus_describe_intr(dev, adapter->res, adapter->tag, "link"); + bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; adapter->ivars |= (8 | vector) << 16; @@ -2643,9 +2702,8 @@ em_free_pci_resources(struct adapter *adapter) */ for (int i = 0; i < adapter->num_queues; i++) { txr = &adapter->tx_rings[i]; - rxr = &adapter->rx_rings[i]; /* an early abort? */ - if ((txr == NULL) || (rxr == NULL)) + if (txr == NULL) break; rid = txr->msix +1; if (txr->tag != NULL) { @@ -2655,6 +2713,11 @@ em_free_pci_resources(struct adapter *adapter) if (txr->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, txr->res); + + rxr = &adapter->rx_rings[i]; + /* an early abort? */ + if (rxr == NULL) + break; rid = rxr->msix +1; if (rxr->tag != NULL) { bus_teardown_intr(dev, rxr->res, rxr->tag); @@ -2704,14 +2767,19 @@ em_setup_msix(struct adapter *adapter) device_t dev = adapter->dev; int val; + /* Nearly always going to use one queue */ + adapter->num_queues = 1; + /* - ** Setup MSI/X for Hartwell: tests have shown - ** use of two queues to be unstable, and to - ** provide no great gain anyway, so we simply - ** seperate the interrupts and use a single queue. + ** Try using MSI-X for Hartwell adapters */ if ((adapter->hw.mac.type == e1000_82574) && (em_enable_msix == TRUE)) { +#ifdef EM_MULTIQUEUE + adapter->num_queues = (em_num_queues == 1) ? 1 : 2; + if (adapter->num_queues > 1) + em_enable_vectors_82574(adapter); +#endif /* Map the MSIX BAR */ int rid = PCIR_BAR(EM_MSIX_BAR); adapter->msix_mem = bus_alloc_resource_any(dev, @@ -2723,16 +2791,34 @@ em_setup_msix(struct adapter *adapter) goto msi; } val = pci_msix_count(dev); - /* We only need/want 3 vectors */ - if (val >= 3) - val = 3; - else { - device_printf(adapter->dev, - "MSIX: insufficient vectors, using MSI\n"); - goto msi; + +#ifdef EM_MULTIQUEUE + /* We need 5 vectors in the multiqueue case */ + if (adapter->num_queues > 1 ) { + if (val >= 5) + val = 5; + else { + adapter->num_queues = 1; + device_printf(adapter->dev, + "Insufficient MSIX vectors for >1 queue, " + "using single queue...\n"); + goto msix_one; + } + } else { +msix_one: +#endif + if (val >= 3) + val = 3; + else { + device_printf(adapter->dev, + "Insufficient MSIX vectors, using MSI\n"); + goto msi; + } +#ifdef EM_MULTIQUEUE } +#endif - if ((pci_alloc_msix(dev, &val) == 0) && (val == 3)) { + if ((pci_alloc_msix(dev, &val) == 0)) { device_printf(adapter->dev, "Using MSIX interrupts " "with %d vectors\n", val); @@ -2753,7 +2839,7 @@ msi: } val = 1; if (pci_alloc_msi(dev, &val) == 0) { - device_printf(adapter->dev,"Using an MSI interrupt\n"); + device_printf(adapter->dev, "Using an MSI interrupt\n"); return (val); } /* Should only happen due to manual configuration */ @@ -3358,7 +3444,7 @@ em_setup_transmit_ring(struct tx_ring *txr) /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; - txr->queue_status = EM_QUEUE_IDLE; + txr->busy = EM_TX_IDLE; /* Clear checksum offload context. */ txr->last_hw_offload = 0; @@ -3398,7 +3484,7 @@ em_initialize_transmit_unit(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct e1000_hw *hw = &adapter->hw; - u32 tctl, tarc, tipg = 0; + u32 tctl, txdctl = 0, tarc, tipg = 0; INIT_DEBUGOUT("em_initialize_transmit_unit: begin"); @@ -3419,7 +3505,16 @@ em_initialize_transmit_unit(struct adapter *adapter) E1000_READ_REG(&adapter->hw, E1000_TDBAL(i)), E1000_READ_REG(&adapter->hw, E1000_TDLEN(i))); - txr->queue_status = EM_QUEUE_IDLE; + txr->busy = EM_TX_IDLE; + txdctl = 0; /* clear txdctl */ + txdctl |= 0x1f; /* PTHRESH */ + txdctl |= 1 << 8; /* HTHRESH */ + txdctl |= 1 << 16;/* WTHRESH */ + txdctl |= 1 << 22; /* Reserved bit 22 must always be 1 */ + txdctl |= E1000_TXDCTL_GRAN; + txdctl |= 1 << 25; /* LWTHRESH */ + + E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl); } /* Set the default values for the Tx Inter Packet Gap timer */ @@ -3450,15 +3545,25 @@ em_initialize_transmit_unit(struct adapter *adapter) if ((adapter->hw.mac.type == e1000_82571) || (adapter->hw.mac.type == e1000_82572)) { tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); - tarc |= SPEED_MODE_BIT; + tarc |= TARC_SPEED_MODE_BIT; E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); } else if (adapter->hw.mac.type == e1000_80003es2lan) { + /* errata: program both queues to unweighted RR */ tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); tarc |= 1; E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(1)); tarc |= 1; E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc); + } else if (adapter->hw.mac.type == e1000_82574) { + tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0)); + tarc |= TARC_ERRATA_BIT; + if ( adapter->num_queues > 1) { + tarc |= (TARC_COMPENSATION_MODE | TARC_MQ_FIX); + E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); + E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc); + } else + E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc); } adapter->txd_cmd = E1000_TXD_CMD_IFCS; @@ -3802,9 +3907,9 @@ em_txeof(struct tx_ring *txr) return; #endif /* DEV_NETMAP */ - /* No work, make sure watchdog is off */ + /* No work, make sure hang detection is disabled */ if (txr->tx_avail == adapter->num_tx_desc) { - txr->queue_status = EM_QUEUE_IDLE; + txr->busy = EM_TX_IDLE; return; } @@ -3847,7 +3952,6 @@ em_txeof(struct tx_ring *txr) tx_buffer->m_head = NULL; } tx_buffer->next_eop = -1; - txr->watchdog_time = ticks; if (++first == adapter->num_tx_desc) first = 0; @@ -3872,14 +3976,16 @@ em_txeof(struct tx_ring *txr) txr->next_to_clean = first; /* - ** Watchdog calculation, we know there's - ** work outstanding or the first return - ** would have been taken, so none processed - ** for too long indicates a hang. local timer - ** will examine this and do a reset if needed. + ** Hang detection: we know there's work outstanding + ** or the entry return would have been taken, so no + ** descriptor processed here indicates a potential hang. + ** The local timer will examine this and do a reset if needed. */ - if ((!processed) && ((ticks - txr->watchdog_time) > EM_WATCHDOG)) - txr->queue_status = EM_QUEUE_HUNG; + if (processed == 0) { + if (txr->busy != EM_TX_HUNG) + ++txr->busy; + } else /* At least one descriptor was cleaned */ + txr->busy = EM_TX_BUSY; /* note this clears HUNG */ /* * If we have a minimum free, clear IFF_DRV_OACTIVE @@ -3888,13 +3994,13 @@ em_txeof(struct tx_ring *txr) * TX lock which, with a single queue, guarantees * sanity. */ - if (txr->tx_avail >= EM_MAX_SCATTER) + if (txr->tx_avail >= EM_MAX_SCATTER) { if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); + } - /* Disable watchdog if all clean */ - if (txr->tx_avail == adapter->num_tx_desc) { - txr->queue_status = EM_QUEUE_IDLE; - } + /* Disable hang detection if all clean */ + if (txr->tx_avail == adapter->num_tx_desc) + txr->busy = EM_TX_IDLE; } @@ -4262,6 +4368,9 @@ em_initialize_receive_unit(struct adapter *adapter) E1000_WRITE_REG(&adapter->hw, E1000_RADV, adapter->rx_abs_int_delay.value); + + E1000_WRITE_REG(&adapter->hw, E1000_RDTR, + adapter->rx_int_delay.value); /* * Set the interrupt throttling rate. Value is calculated * as DEFAULT_ITR = 1/(MAX_INTS_PER_SEC * 256ns) @@ -4273,20 +4382,65 @@ em_initialize_receive_unit(struct adapter *adapter) ** using the EITR register (82574 only) */ if (hw->mac.type == e1000_82574) { + u32 rfctl; for (int i = 0; i < 4; i++) E1000_WRITE_REG(hw, E1000_EITR_82574(i), DEFAULT_ITR); /* Disable accelerated acknowledge */ - E1000_WRITE_REG(hw, E1000_RFCTL, E1000_RFCTL_ACK_DIS); + rfctl = E1000_READ_REG(hw, E1000_RFCTL); + rfctl |= E1000_RFCTL_ACK_DIS; + E1000_WRITE_REG(hw, E1000_RFCTL, rfctl); } rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); - if (if_getcapenable(ifp) & IFCAP_RXCSUM) + if (if_getcapenable(ifp) & IFCAP_RXCSUM) { +#ifdef EM_MULTIQUEUE + rxcsum |= E1000_RXCSUM_TUOFL | + E1000_RXCSUM_IPOFL | + E1000_RXCSUM_PCSD; +#else rxcsum |= E1000_RXCSUM_TUOFL; - else +#endif + } else rxcsum &= ~E1000_RXCSUM_TUOFL; + E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); +#ifdef EM_MULTIQUEUE + if (adapter->num_queues > 1) { + uint32_t rss_key[10]; + uint32_t reta; + int i; + + /* + * Configure RSS key + */ + arc4rand(rss_key, sizeof(rss_key), 0); + for (i = 0; i < 10; ++i) + E1000_WRITE_REG_ARRAY(hw,E1000_RSSRK(0), i, rss_key[i]); + + /* + * Configure RSS redirect table in following fashion: + * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)] + */ + reta = 0; + for (i = 0; i < 4; ++i) { + uint32_t q; + q = (i % adapter->num_queues) << 7; + reta |= q << (8 * i); + } + for (i = 0; i < 32; ++i) + E1000_WRITE_REG(hw, E1000_RETA(i), reta); + + E1000_WRITE_REG(hw, E1000_MRQC, E1000_MRQC_RSS_ENABLE_2Q | + E1000_MRQC_RSS_FIELD_IPV4_TCP | + E1000_MRQC_RSS_FIELD_IPV4 | + E1000_MRQC_RSS_FIELD_IPV6_TCP_EX | + E1000_MRQC_RSS_FIELD_IPV6_EX | + E1000_MRQC_RSS_FIELD_IPV6 | + E1000_MRQC_RSS_FIELD_IPV6_TCP); + } +#endif /* ** XXX TEMPORARY WORKAROUND: on some systems with 82573 ** long latencies are observed, like Lenovo X60. This @@ -4321,13 +4475,30 @@ em_initialize_receive_unit(struct adapter *adapter) E1000_WRITE_REG(hw, E1000_RDT(i), rdt); } - /* Set PTHRESH for improved jumbo performance */ + /* + * Set PTHRESH for improved jumbo performance + * According to 10.2.5.11 of Intel 82574 Datasheet, + * RXDCTL(1) is written whenever RXDCTL(0) is written. + * Only write to RXDCTL(1) if there is a need for different + * settings. + */ if (((adapter->hw.mac.type == e1000_ich9lan) || (adapter->hw.mac.type == e1000_pch2lan) || (adapter->hw.mac.type == e1000_ich10lan)) && (if_getmtu(ifp) > ETHERMTU)) { u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0)); E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3); + } else if ((adapter->hw.mac.type == e1000_82574) && + (if_getmtu(ifp) > ETHERMTU)) { + for (int i = 0; i < adapter->num_queues; i++) { + u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i)); + + rxdctl |= 0x20; /* PTHRESH */ + rxdctl |= 4 << 8; /* HTHRESH */ + rxdctl |= 4 << 16;/* WTHRESH */ + rxdctl |= 1 << 24; /* Switch to granularity */ + E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl); + } } if (adapter->hw.mac.type >= e1000_pch2lan) { @@ -4394,6 +4565,11 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) EM_RX_LOCK(rxr); + /* Sync the ring */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxr->me, &processed)) { EM_RX_UNLOCK(rxr); @@ -4406,9 +4582,6 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) break; - bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); - cur = &rxr->rx_base[i]; status = cur->status; mp = sendmp = NULL; @@ -4474,6 +4647,10 @@ skip: rxr->fmp = rxr->lmp = NULL; } next_desc: + /* Sync the ring */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + /* Zero out the receive descriptors status. */ cur->status = 0; ++rxdone; /* cumulative for POLL */ @@ -5130,12 +5307,7 @@ em_update_stats_counters(struct adapter *adapter) adapter->stats.rlec += E1000_READ_REG(&adapter->hw, E1000_RLEC); adapter->stats.xonrxc += E1000_READ_REG(&adapter->hw, E1000_XONRXC); adapter->stats.xontxc += E1000_READ_REG(&adapter->hw, E1000_XONTXC); - /* - ** For watchdog management we need to know if we have been - ** paused during the last interval, so capture that here. - */ - adapter->pause_frames = E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); - adapter->stats.xoffrxc += adapter->pause_frames; + adapter->stats.xoffrxc += E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); adapter->stats.xofftxc += E1000_READ_REG(&adapter->hw, E1000_XOFFTXC); adapter->stats.fcruc += E1000_READ_REG(&adapter->hw, E1000_FCRUC); adapter->stats.prc64 += E1000_READ_REG(&adapter->hw, E1000_PRC64); @@ -5300,10 +5472,10 @@ em_add_hw_stats(struct adapter *adapter) CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); - for (int i = 0; i < adapter->num_queues; i++, rxr++, txr++) { - snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); + for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + snprintf(namebuf, QUEUE_NAME_LEN, "queue_tx_%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, - CTLFLAG_RD, NULL, "Queue Name"); + CTLFLAG_RD, NULL, "TX Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", @@ -5322,7 +5494,12 @@ em_add_hw_stats(struct adapter *adapter) SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); - + + snprintf(namebuf, QUEUE_NAME_LEN, "queue_rx_%d", i); + queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD, NULL, "RX Queue Name"); + queue_list = SYSCTL_CHILDREN(queue_node); + SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDH(rxr->me), @@ -5756,19 +5933,86 @@ em_print_debug_info(struct adapter *adapter) else printf("and ACTIVE\n"); - device_printf(dev, "hw tdh = %d, hw tdt = %d\n", - E1000_READ_REG(&adapter->hw, E1000_TDH(0)), - E1000_READ_REG(&adapter->hw, E1000_TDT(0))); - device_printf(dev, "hw rdh = %d, hw rdt = %d\n", - E1000_READ_REG(&adapter->hw, E1000_RDH(0)), - E1000_READ_REG(&adapter->hw, E1000_RDT(0))); - device_printf(dev, "Tx Queue Status = %d\n", txr->queue_status); - device_printf(dev, "TX descriptors avail = %d\n", - txr->tx_avail); - device_printf(dev, "Tx Descriptors avail failure = %ld\n", - txr->no_desc_avail); - device_printf(dev, "RX discarded packets = %ld\n", - rxr->rx_discarded); - device_printf(dev, "RX Next to Check = %d\n", rxr->next_to_check); - device_printf(dev, "RX Next to Refresh = %d\n", rxr->next_to_refresh); + for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + device_printf(dev, "TX Queue %d ------\n", i); + device_printf(dev, "hw tdh = %d, hw tdt = %d\n", + E1000_READ_REG(&adapter->hw, E1000_TDH(i)), + E1000_READ_REG(&adapter->hw, E1000_TDT(i))); + device_printf(dev, "Tx Queue Status = %d\n", txr->busy); + device_printf(dev, "TX descriptors avail = %d\n", + txr->tx_avail); + device_printf(dev, "Tx Descriptors avail failure = %ld\n", + txr->no_desc_avail); + device_printf(dev, "RX Queue %d ------\n", i); + device_printf(dev, "hw rdh = %d, hw rdt = %d\n", + E1000_READ_REG(&adapter->hw, E1000_RDH(i)), + E1000_READ_REG(&adapter->hw, E1000_RDT(i))); + device_printf(dev, "RX discarded packets = %ld\n", + rxr->rx_discarded); + device_printf(dev, "RX Next to Check = %d\n", rxr->next_to_check); + device_printf(dev, "RX Next to Refresh = %d\n", rxr->next_to_refresh); + } +} + +#ifdef EM_MULTIQUEUE +/* + * 82574 only: + * Write a new value to the EEPROM increasing the number of MSIX + * vectors from 3 to 5, for proper multiqueue support. + */ +static void +em_enable_vectors_82574(struct adapter *adapter) +{ + struct e1000_hw *hw = &adapter->hw; + device_t dev = adapter->dev; + u16 edata; + + e1000_read_nvm(hw, EM_NVM_PCIE_CTRL, 1, &edata); + printf("Current cap: %#06x\n", edata); + if (((edata & EM_NVM_MSIX_N_MASK) >> EM_NVM_MSIX_N_SHIFT) != 4) { + device_printf(dev, "Writing to eeprom: increasing " + "reported MSIX vectors from 3 to 5...\n"); + edata &= ~(EM_NVM_MSIX_N_MASK); + edata |= 4 << EM_NVM_MSIX_N_SHIFT; + e1000_write_nvm(hw, EM_NVM_PCIE_CTRL, 1, &edata); + e1000_update_nvm_checksum(hw); + device_printf(dev, "Writing to eeprom: done\n"); + } +} +#endif + +#ifdef DDB +DB_COMMAND(em_reset_dev, em_ddb_reset_dev) +{ + devclass_t dc; + int max_em; + + dc = devclass_find("em"); + max_em = devclass_get_maxunit(dc); + + for (int index = 0; index < (max_em - 1); index++) { + device_t dev; + dev = devclass_get_device(dc, index); + if (device_get_driver(dev) == &em_driver) { + struct adapter *adapter = device_get_softc(dev); + em_init_locked(adapter); + } + } +} +DB_COMMAND(em_dump_queue, em_ddb_dump_queue) +{ + devclass_t dc; + int max_em; + + dc = devclass_find("em"); + max_em = devclass_get_maxunit(dc); + + for (int index = 0; index < (max_em - 1); index++) { + device_t dev; + dev = devclass_get_device(dc, index); + if (device_get_driver(dev) == &em_driver) + em_print_debug_info(device_get_softc(dev)); + } + } +#endif |