diff options
author | Shailend Chand <shailend@google.com> | 2023-06-02 18:58:24 +0000 |
---|---|---|
committer | Xin LI <delphij@FreeBSD.org> | 2023-06-02 21:31:54 +0000 |
commit | 54dfc97b0bd99f1c3bcbb37357cf28cd81a7cf00 (patch) | |
tree | 78c4a459c86c99bfe3e1e9faef6f9c61e6aeb587 | |
parent | 0981275c75d5678172eb9dd8fbc89ef61c05c979 (diff) | |
download | src-54dfc97b0bd9.tar.gz src-54dfc97b0bd9.zip |
Add gve, the driver for Google Virtual NIC (gVNIC)
gVNIC is a virtual network interface designed specifically for
Google Compute Engine (GCE). It is required to support per-VM Tier_1
networking performance, and for using certain VM shapes on GCE.
The NIC supports TSO, Rx and Tx checksum offloads, and RSS.
It does not currently do hardware LRO, and thus the software-LRO
in the host is used instead. It also supports jumbo frames.
For each queue, the driver negotiates a set of pages with the NIC to
serve as a fixed bounce buffer; this precludes the use of iflib.
Reviewed-by: markj
MFC-after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D39873
-rw-r--r-- | share/man/man4/Makefile | 5 | ||||
-rw-r--r-- | share/man/man4/gve.4 | 215 | ||||
-rw-r--r-- | sys/conf/files | 7 | ||||
-rw-r--r-- | sys/dev/gve/gve.h | 459 | ||||
-rw-r--r-- | sys/dev/gve/gve_adminq.c | 803 | ||||
-rw-r--r-- | sys/dev/gve/gve_adminq.h | 394 | ||||
-rw-r--r-- | sys/dev/gve/gve_desc.h | 151 | ||||
-rw-r--r-- | sys/dev/gve/gve_main.c | 853 | ||||
-rw-r--r-- | sys/dev/gve/gve_plat.h | 94 | ||||
-rw-r--r-- | sys/dev/gve/gve_qpl.c | 284 | ||||
-rw-r--r-- | sys/dev/gve/gve_register.h | 54 | ||||
-rw-r--r-- | sys/dev/gve/gve_rx.c | 684 | ||||
-rw-r--r-- | sys/dev/gve/gve_sysctl.c | 261 | ||||
-rw-r--r-- | sys/dev/gve/gve_tx.c | 806 | ||||
-rw-r--r-- | sys/dev/gve/gve_utils.c | 405 | ||||
-rw-r--r-- | sys/modules/Makefile | 5 | ||||
-rw-r--r-- | sys/modules/gve/Makefile | 36 |
17 files changed, 5516 insertions, 0 deletions
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index 71d12055f4db..9149a562383f 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -176,6 +176,7 @@ MAN= aac.4 \ geom_map.4 \ geom_uzip.4 \ gif.4 \ + ${_gve.4} \ gpio.4 \ gpioiic.4 \ gpiokeys.4 \ @@ -895,6 +896,10 @@ _linux.4= linux.4 _ossl.4= ossl.4 .endif +.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64" +_gve.4= gve.4 +.endif + .if ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "aarch64" || \ ${MACHINE_CPUARCH} == "riscv" _cgem.4= cgem.4 diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4 new file mode 100644 index 000000000000..9bb1be1b9a53 --- /dev/null +++ b/share/man/man4/gve.4 @@ -0,0 +1,215 @@ +.\" SPDX-License-Identifier: BSD-3-Clause +.\" +.\" Copyright (c) 2023 Google LLC +.\" +.\" Redistribution and use in source and binary forms, with or without modification, +.\" are permitted provided that the following conditions are met: +.\" +.\" 1. Redistributions of source code must retain the above copyright notice, this +.\" list of conditions and the following disclaimer. +.\" +.\" 2. Redistributions in binary form must reproduce the above copyright notice, +.\" this list of conditions and the following disclaimer in the documentation +.\" and/or other materials provided with the distribution. +.\" +.\" 3. Neither the name of the copyright holder nor the names of its contributors +.\" may be used to endorse or promote products derived from this software without +.\" specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +.\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +.\" DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +.\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +.\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +.\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +.\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +.Dd April 26, 2023 +.Dt GVE 4 +.Os +.Sh NAME +.Nm gve +.Nd "Ethernet driver for Google Virtual NIC (gVNIC)" +.Sh SYNOPSIS +To compile this driver into the kernel, +place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device gve" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +if_gve_load="YES" +.Ed +.Sh DESCRIPTION +gVNIC is a virtual network interface designed specifically for Google Compute Engine (GCE). +It is required to support per-VM Tier-1 networking performance, and for using certain VM shapes on GCE. +.Pp +.Nm +is the driver for gVNIC. +It supports the following features: +.Pp +.Bl -bullet -compact +.It +RX checksum offload +.It +TX checksum offload +.It +TCP Segmentation Offload (TSO) +.It +Large Receive Offload (LRO) in software +.It +Jumbo frames +.It +Receive Side Scaling (RSS) +.El +.Pp +For more information on configuring this device, see +.Xr ifconfig 8 .
+.Sh HARDWARE +.Nm +binds to a single PCI device ID presented by gVNIC: +.Pp +.Bl -bullet -compact +.It +0x1AE0:0x0042 +.El +.Sh DIAGNOSTICS +The following messages are recorded during driver initialization: +.Bl -diag +.It "Enabled MSIX with %d vectors" +.It "Configured device resources" +.It "Successfully attached %s" +.It "Deconfigured device resources" +.El +.Pp +These messages are seen if driver initialization fails. +Global (across-queues) allocation failures: +.Bl -diag +.It "Failed to configure device resources: err=%d" +.It "No compatible queue formats" +.It "Failed to allocate ifnet struct" +.It "Failed to allocate admin queue mem" +.It "Failed to alloc DMA mem for DescribeDevice" +.It "Failed to allocate QPL page" +.El +.Pp +IRQ and BAR allocation failures: +.Bl -diag +.It "Failed to acquire any msix vectors" +.It "Tried to acquire %d msix vectors, got only %d" +.It "Failed to setup irq %d for Tx queue %d " +.It "Failed to setup irq %d for Rx queue %d " +.It "Failed to allocate irq %d for mgmnt queue" +.It "Failed to setup irq %d for mgmnt queue, err: %d" +.It "Failed to allocate BAR0" +.It "Failed to allocate BAR2" +.It "Failed to allocate msix table" +.El +.Pp +Rx queue-specific allocation failures: +.Bl -diag +.It "No QPL left for rx ring %d" +.It "Failed to alloc queue resources for rx ring %d" +.It "Failed to alloc desc ring for rx ring %d" +.It "Failed to alloc data ring for rx ring %d" +.El +.Pp +Tx queue-specific allocation failures: +.Bl -diag +.It "No QPL left for tx ring %d" +.It "Failed to alloc queue resources for tx ring %d" +.It "Failed to alloc desc ring for tx ring %d" +.It "Failed to vmap fifo, qpl_id = %d" +.El +.Pp +The following messages are recorded when the interface detach fails: +.Bl -diag +.It "Failed to deconfigure device resources: err=%d" +.El +.Pp +If bootverbose is on, the following messages are recorded when the interface is being brought up: +.Bl -diag +.It "Created %d rx queues" +.It "Created %d tx queues" +.It "MTU
set to %d" +.El +.Pp +The following messages are recorded when the interface is being brought down: +.Bl -diag +.It "Destroyed %d rx queues" +.It "Destroyed %d tx queues" +.El +.Pp +These messages are seen if errors are encountered when bringing the interface up or down: +.Bl -diag +.It "Failed to destroy rxq %d, err: %d" +.It "Failed to destroy txq %d, err: %d" +.It "Failed to create rxq %d, err: %d" +.It "Failed to create txq %d, err: %d" +.It "Failed to set MTU to %d" +.It "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d" +.It "Cannot bring the iface up when detached" +.It "Reached max number of registered pages %lu > %lu" +.It "Failed to init lro for rx ring %d" +.El +.Pp +These messages are seen if any admin queue command fails: +.Bl -diag +.It "AQ command(%u): failed with status %d" +.It "AQ command(%u): unknown status code %d" +.It "AQ commands timed out, need to reset AQ" +.It "Unknown AQ command opcode %d" +.El +.Pp +These messages are recorded when the device is being reset due to an error: +.Bl -diag +.It "Scheduling reset task!" +.It "Waiting until admin queue is released." +.It "Admin queue released" +.El +.Pp +If it was the NIC that requested the reset, this message is recorded: +.Bl -diag +.It "Device requested reset" +.El +.Pp +If the reset fails during the reinitialization phase, this message is recorded: +.Bl -diag +.It "Restore failed!" +.El +.Pp +These two messages correspond to the NIC alerting the driver to link state changes: +.Bl -diag +.It "Device link is up." +.It "Device link is down." +.El +.Pp +Apart from these messages, the driver exposes per-queue packet and error counters as sysctl nodes. +Global (across queues) counters can be read using +.Xr netstat 8 . +.Sh LIMITATIONS +.Nm +does not support the transmission of VLAN-tagged packets. +All VLAN-tagged traffic is dropped. +.Sh SUPPORT +Please email gvnic-drivers@google.com with the specifics of the issue encountered.
+.El +.Sh SEE ALSO +.Xr ifconfig 8 , +.Xr netstat 8 +.Sh HISTORY +The +.Nm +device driver first appeared in +.Fx 14.0 . +.Sh AUTHORS +The +.Nm +driver was written by Google. diff --git a/sys/conf/files b/sys/conf/files index c0728504da5a..a5f55b49451c 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1748,6 +1748,13 @@ dev/fxp/if_fxp.c optional fxp dev/fxp/inphy.c optional fxp dev/gem/if_gem.c optional gem dev/gem/if_gem_pci.c optional gem pci +dev/gve/gve_adminq.c optional gve +dev/gve/gve_main.c optional gve +dev/gve/gve_qpl.c optional gve +dev/gve/gve_rx.c optional gve +dev/gve/gve_sysctl.c optional gve +dev/gve/gve_tx.c optional gve +dev/gve/gve_utils.c optional gve dev/goldfish/goldfish_rtc.c optional goldfish_rtc fdt dev/gpio/dwgpio/dwgpio.c optional gpio dwgpio fdt dev/gpio/dwgpio/dwgpio_bus.c optional gpio dwgpio fdt diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h new file mode 100644 index 000000000000..61781cddee94 --- /dev/null +++ b/sys/dev/gve/gve.h @@ -0,0 +1,459 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_FBSD_H +#define _GVE_FBSD_H + +#include "gve_desc.h" +#include "gve_plat.h" +#include "gve_register.h" + +#ifndef PCI_VENDOR_ID_GOOGLE +#define PCI_VENDOR_ID_GOOGLE 0x1ae0 +#endif + +#define PCI_DEV_ID_GVNIC 0x0042 +#define GVE_REGISTER_BAR 0 +#define GVE_DOORBELL_BAR 2 + +/* Driver can alloc up to 2 segments for the header and 2 for the payload. */ +#define GVE_TX_MAX_DESCS 4 +#define GVE_TX_BUFRING_ENTRIES 4096 + +#define ADMINQ_SIZE PAGE_SIZE + +#define GVE_DEFAULT_RX_BUFFER_SIZE 2048 +/* Each RX bounce buffer page can fit two packet buffers. */ +#define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) + +/* + * Number of descriptors per queue page list. + * Page count AKA QPL size can be derived by dividing the number of elements in + * a page by the number of descriptors available. 
+ */ +#define GVE_QPL_DIVISOR 16 + +static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); + +struct gve_dma_handle { + bus_addr_t bus_addr; + void *cpu_addr; + bus_dma_tag_t tag; + bus_dmamap_t map; +}; + +union gve_tx_desc { + struct gve_tx_pkt_desc pkt; /* first desc for a packet */ + struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ + struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ +}; + +/* Tracks the memory in the fifo occupied by a segment of a packet */ +struct gve_tx_iovec { + uint32_t iov_offset; /* offset into this segment */ + uint32_t iov_len; /* length */ + uint32_t iov_padding; /* padding associated with this segment */ +}; + +/* Tracks allowed and current queue settings */ +struct gve_queue_config { + uint16_t max_queues; + uint16_t num_queues; /* current */ +}; + +struct gve_irq_db { + __be32 index; +} __aligned(CACHE_LINE_SIZE); + +/* + * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value + * when the entire configure_device_resources command is zeroed out and the + * queue_format is not specified. 
+ */ +enum gve_queue_format { + GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, + GVE_GQI_RDA_FORMAT = 0x1, + GVE_GQI_QPL_FORMAT = 0x2, + GVE_DQO_RDA_FORMAT = 0x3, +}; + +enum gve_state_flags_bit { + GVE_STATE_FLAG_ADMINQ_OK, + GVE_STATE_FLAG_RESOURCES_OK, + GVE_STATE_FLAG_QPLREG_OK, + GVE_STATE_FLAG_RX_RINGS_OK, + GVE_STATE_FLAG_TX_RINGS_OK, + GVE_STATE_FLAG_QUEUES_UP, + GVE_STATE_FLAG_LINK_UP, + GVE_STATE_FLAG_DO_RESET, + GVE_STATE_FLAG_IN_RESET, + GVE_NUM_STATE_FLAGS /* Not part of the enum space */ +}; + +BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS); + +#define GVE_DEVICE_STATUS_RESET (0x1 << 1) +#define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2) + +#define GVE_RING_LOCK(ring) mtx_lock(&(ring)->ring_mtx) +#define GVE_RING_TRYLOCK(ring) mtx_trylock(&(ring)->ring_mtx) +#define GVE_RING_UNLOCK(ring) mtx_unlock(&(ring)->ring_mtx) +#define GVE_RING_ASSERT(ring) mtx_assert(&(ring)->ring_mtx, MA_OWNED) + +#define GVE_IFACE_LOCK_INIT(lock) sx_init(&lock, "gve interface lock") +#define GVE_IFACE_LOCK_DESTROY(lock) sx_destroy(&lock) +#define GVE_IFACE_LOCK_LOCK(lock) sx_xlock(&lock) +#define GVE_IFACE_LOCK_UNLOCK(lock) sx_unlock(&lock) +#define GVE_IFACE_LOCK_ASSERT(lock) sx_assert(&lock, SA_XLOCKED) + +struct gve_queue_page_list { + uint32_t id; + uint32_t num_dmas; + uint32_t num_pages; + vm_offset_t kva; + vm_page_t *pages; + struct gve_dma_handle *dmas; +}; + +struct gve_irq { + struct resource *res; + void *cookie; +}; + +struct gve_rx_slot_page_info { + void *page_address; + vm_page_t page; + uint32_t page_offset; + uint16_t pad; +}; + +/* + * A single received packet split across multiple buffers may be + * reconstructed using the information in this structure. 
+ */ +struct gve_rx_ctx { + /* head and tail of mbuf chain for the current packet */ + struct mbuf *mbuf_head; + struct mbuf *mbuf_tail; + uint32_t total_size; + uint8_t frag_cnt; + bool drop_pkt; +}; + +struct gve_ring_com { + struct gve_priv *priv; + uint32_t id; + + /* + * BAR2 offset for this ring's doorbell and the + * counter-array offset for this ring's counter. + * Acquired from the device individually for each + * queue in the queue_create adminq command. + */ + struct gve_queue_resources *q_resources; + struct gve_dma_handle q_resources_mem; + + /* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */ + uint32_t irq_db_offset; + /* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */ + uint32_t db_offset; + /* + * Index, not byte-offset, into the counter array where this ring's + * 4-byte counter lies. + */ + uint32_t counter_idx; + + /* + * The index of the MSIX vector that was assigned to + * this ring in `gve_alloc_irqs`. + * + * It is passed to the device in the queue_create adminq + * command. + * + * Additionally, this also serves as the index into + * `priv->irq_db_indices` where this ring's irq doorbell's + * BAR2 offset, `irq_db_idx`, can be found. + */ + int ntfy_id; + + /* + * The fixed bounce buffer for this ring. + * Once allocated, has to be offered to the device + * over the register-page-list adminq command. 
+ */ + struct gve_queue_page_list *qpl; + + struct task cleanup_task; + struct taskqueue *cleanup_tq; +} __aligned(CACHE_LINE_SIZE); + +struct gve_rxq_stats { + counter_u64_t rbytes; + counter_u64_t rpackets; + counter_u64_t rx_dropped_pkt; + counter_u64_t rx_copybreak_cnt; + counter_u64_t rx_frag_flip_cnt; + counter_u64_t rx_frag_copy_cnt; + counter_u64_t rx_dropped_pkt_desc_err; + counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; +}; + +#define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) + +/* power-of-2 sized receive ring */ +struct gve_rx_ring { + struct gve_ring_com com; + struct gve_dma_handle desc_ring_mem; + struct gve_dma_handle data_ring_mem; + + /* accessed in the receive hot path */ + struct { + struct gve_rx_desc *desc_ring; + union gve_rx_data_slot *data_ring; + struct gve_rx_slot_page_info *page_info; + + struct gve_rx_ctx ctx; + struct lro_ctrl lro; + uint8_t seq_no; /* helps traverse the descriptor ring */ + uint32_t cnt; /* free-running total number of completed packets */ + uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ + uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ + struct gve_rxq_stats stats; + } __aligned(CACHE_LINE_SIZE); + +} __aligned(CACHE_LINE_SIZE); + +/* + * A contiguous representation of the pages composing the Tx bounce buffer. + * The xmit taskqueue and the completion taskqueue both simultaneously use it. + * Both operate on `available`: the xmit tq lowers it and the completion tq + * raises it. `head` is the last location written at and so only the xmit tq + * uses it. 
+ */ +struct gve_tx_fifo { + vm_offset_t base; /* address of base of FIFO */ + uint32_t size; /* total size */ + volatile int available; /* how much space is still available */ + uint32_t head; /* offset to write at */ +}; + +struct gve_tx_buffer_state { + struct mbuf *mbuf; + struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; +}; + +struct gve_txq_stats { + counter_u64_t tbytes; + counter_u64_t tpackets; + counter_u64_t tso_packet_cnt; + counter_u64_t tx_dropped_pkt; + counter_u64_t tx_dropped_pkt_nospace_device; + counter_u64_t tx_dropped_pkt_nospace_bufring; + counter_u64_t tx_dropped_pkt_vlan; +}; + +#define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) + +/* power-of-2 sized transmit ring */ +struct gve_tx_ring { + struct gve_ring_com com; + struct gve_dma_handle desc_ring_mem; + + struct task xmit_task; + struct taskqueue *xmit_tq; + + /* accessed in the transmit hot path */ + struct { + union gve_tx_desc *desc_ring; + struct gve_tx_buffer_state *info; + struct buf_ring *br; + + struct gve_tx_fifo fifo; + struct mtx ring_mtx; + + uint32_t req; /* free-running total number of packets written to the nic */ + uint32_t done; /* free-running total number of completed packets */ + uint32_t mask; /* masks the req and done to the size of the ring */ + struct gve_txq_stats stats; + } __aligned(CACHE_LINE_SIZE); + +} __aligned(CACHE_LINE_SIZE); + +struct gve_priv { + if_t ifp; + device_t dev; + struct ifmedia media; + + uint8_t mac[ETHER_ADDR_LEN]; + + struct gve_dma_handle aq_mem; + + struct resource *reg_bar; /* BAR0 */ + struct resource *db_bar; /* BAR2 */ + struct resource *msix_table; + + uint32_t mgmt_msix_idx; + uint32_t rx_copybreak; + + uint16_t num_event_counters; + uint16_t default_num_queues; + uint16_t tx_desc_cnt; + uint16_t rx_desc_cnt; + uint16_t rx_pages_per_qpl; + uint64_t max_registered_pages; + uint64_t num_registered_pages; + uint32_t supported_features; + uint16_t max_mtu; + + struct gve_dma_handle counter_array_mem; + __be32 
*counters; + struct gve_dma_handle irqs_db_mem; + struct gve_irq_db *irq_db_indices; + + enum gve_queue_format queue_format; + struct gve_queue_page_list *qpls; + struct gve_queue_config tx_cfg; + struct gve_queue_config rx_cfg; + uint32_t num_queues; + + struct gve_irq *irq_tbl; + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; + + /* + * Admin queue - see gve_adminq.h + * Since AQ cmds do not run in steady state, 32 bit counters suffice + */ + struct gve_adminq_command *adminq; + vm_paddr_t adminq_bus_addr; + uint32_t adminq_mask; /* masks prod_cnt to adminq size */ + uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ + uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ + uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ + /* free-running count of each distinct AQ cmd executed */ + uint32_t adminq_describe_device_cnt; + uint32_t adminq_cfg_device_resources_cnt; + uint32_t adminq_register_page_list_cnt; + uint32_t adminq_unregister_page_list_cnt; + uint32_t adminq_create_tx_queue_cnt; + uint32_t adminq_create_rx_queue_cnt; + uint32_t adminq_destroy_tx_queue_cnt; + uint32_t adminq_destroy_rx_queue_cnt; + uint32_t adminq_dcfg_device_resources_cnt; + uint32_t adminq_set_driver_parameter_cnt; + uint32_t adminq_verify_driver_compatibility_cnt; + + uint32_t interface_up_cnt; + uint32_t interface_down_cnt; + uint32_t reset_cnt; + + struct task service_task; + struct taskqueue *service_tq; + + struct gve_state_flags state_flags; + struct sx gve_iface_lock; +}; + +static inline bool +gve_get_state_flag(struct gve_priv *priv, int pos) +{ + return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); +} + +static inline void +gve_set_state_flag(struct gve_priv *priv, int pos) +{ + BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); +} + +static inline void +gve_clear_state_flag(struct gve_priv *priv, int pos) +{ + BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); +} + +/* Defined in 
gve_main.c */ +void gve_schedule_reset(struct gve_priv *priv); + +/* Register access functions defined in gve_utils.c */ +uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); +void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); +void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); + +/* QPL (Queue Page List) functions defined in gve_qpl.c */ +int gve_alloc_qpls(struct gve_priv *priv); +void gve_free_qpls(struct gve_priv *priv); +int gve_register_qpls(struct gve_priv *priv); +int gve_unregister_qpls(struct gve_priv *priv); + +/* TX functions defined in gve_tx.c */ +int gve_alloc_tx_rings(struct gve_priv *priv); +void gve_free_tx_rings(struct gve_priv *priv); +int gve_create_tx_rings(struct gve_priv *priv); +int gve_destroy_tx_rings(struct gve_priv *priv); +int gve_tx_intr(void *arg); +int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); +void gve_qflush(if_t ifp); +void gve_xmit_tq(void *arg, int pending); +void gve_tx_cleanup_tq(void *arg, int pending); + +/* RX functions defined in gve_rx.c */ +int gve_alloc_rx_rings(struct gve_priv *priv); +void gve_free_rx_rings(struct gve_priv *priv); +int gve_create_rx_rings(struct gve_priv *priv); +int gve_destroy_rx_rings(struct gve_priv *priv); +int gve_rx_intr(void *arg); +void gve_rx_cleanup_tq(void *arg, int pending); + +/* DMA functions defined in gve_utils.c */ +int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma); +void gve_dma_free_coherent(struct gve_dma_handle *dma); +int gve_dmamap_create(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma); +void gve_dmamap_destroy(struct gve_dma_handle *dma); + +/* IRQ functions defined in gve_utils.c */ +void gve_free_irqs(struct gve_priv *priv); +int gve_alloc_irqs(struct gve_priv *priv); +void gve_unmask_all_queue_irqs(struct gve_priv *priv); +void gve_mask_all_queue_irqs(struct gve_priv *priv); + +/* Systcl functions defined in 
gve_sysctl.c*/ +void gve_setup_sysctl(struct gve_priv *priv); +void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, + uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, + uint64_t *tbytes, uint64_t *tx_dropped_pkt); + +/* Stats functions defined in gve_utils.c */ +void gve_alloc_counters(counter_u64_t *stat, int num_stats); +void gve_free_counters(counter_u64_t *stat, int num_stats); + +#endif /* _GVE_FBSD_H_ */ diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c new file mode 100644 index 000000000000..3c332607ebd4 --- /dev/null +++ b/sys/dev/gve/gve_adminq.c @@ -0,0 +1,803 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <sys/endian.h> +#include <sys/socket.h> +#include <sys/time.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_var.h> + +#include "gve.h" +#include "gve_adminq.h" + +#define GVE_ADMINQ_SLEEP_LEN_MS 20 +#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 10 +#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 +#define GVE_REG_ADMINQ_ADDR 16 +#define ADMINQ_SLOTS (ADMINQ_SIZE / sizeof(struct gve_adminq_command)) + +#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \ + "Expected: length=%d, feature_mask=%x.\n" \ + "Actual: length=%d, feature_mask=%x.\n" + +#define GVE_DEVICE_OPTION_TOO_BIG_FMT "Length of %s option larger than expected." \ + " Possible older version of guest driver.\n" + +static +void gve_parse_device_option(struct gve_priv *priv, + struct gve_device_descriptor *device_descriptor, + struct gve_device_option *option, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ + uint32_t req_feat_mask = be32toh(option->required_features_mask); + uint16_t option_length = be16toh(option->option_length); + uint16_t option_id = be16toh(option->option_id); + + /* + * If the length or feature mask doesn't match, continue without + * enabling the feature. 
+ */ + switch (option_id) { + case GVE_DEV_OPT_ID_GQI_QPL: + if (option_length < sizeof(**dev_op_gqi_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "GQI QPL", (int)sizeof(**dev_op_gqi_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_gqi_qpl)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "GQI QPL"); + } + *dev_op_gqi_qpl = (void *)(option + 1); + break; + + case GVE_DEV_OPT_ID_JUMBO_FRAMES: + if (option_length < sizeof(**dev_op_jumbo_frames) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "Jumbo Frames", (int)sizeof(**dev_op_jumbo_frames), + GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_jumbo_frames)) { + device_printf(priv->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "Jumbo Frames"); + } + *dev_op_jumbo_frames = (void *)(option + 1); + break; + + default: + /* + * If we don't recognize the option just continue + * without doing anything. + */ + device_printf(priv->dev, "Unrecognized device option 0x%hx not enabled.\n", + option_id); + } +} + +/* Process all device options for a given describe device call. */ +static int +gve_process_device_options(struct gve_priv *priv, + struct gve_device_descriptor *descriptor, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ + char *desc_end = (char *)descriptor + be16toh(descriptor->total_length); + const int num_options = be16toh(descriptor->num_device_options); + struct gve_device_option *dev_opt; + int i; + + /* The options struct directly follows the device descriptor. 
*/ + dev_opt = (void *)(descriptor + 1); + for (i = 0; i < num_options; i++) { + if ((char *)(dev_opt + 1) > desc_end || + (char *)(dev_opt + 1) + be16toh(dev_opt->option_length) > desc_end) { + device_printf(priv->dev, + "options exceed device_descriptor's total length.\n"); + return (EINVAL); + } + + gve_parse_device_option(priv, descriptor, dev_opt, + dev_op_gqi_qpl, dev_op_jumbo_frames); + dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length)); + } + + return (0); +} + +static int gve_adminq_execute_cmd(struct gve_priv *priv, + struct gve_adminq_command *cmd); + +static int +gve_adminq_destroy_tx_queue(struct gve_priv *priv, uint32_t id) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_DESTROY_TX_QUEUE); + cmd.destroy_tx_queue.queue_id = htobe32(id); + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +static int +gve_adminq_destroy_rx_queue(struct gve_priv *priv, uint32_t id) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_DESTROY_RX_QUEUE); + cmd.destroy_rx_queue.queue_id = htobe32(id); + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +/* + * Issue a destroy command for rx queues 0 .. num_queues - 1. + * A failed destroy is logged and the remaining queues are still attempted. + * Returns the first nonzero error seen, or 0 if every destroy succeeded + * (including the num_queues == 0 case, which the create-queues abort path + * can pass in when the very first create fails). + */ +int +gve_adminq_destroy_rx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int first_err = 0; /* must start at 0: the loop may run zero times */ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_rx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to destroy rxq %d, err: %d\n", + i, err); + /* A later success must not hide this failure. */ + if (first_err == 0) + first_err = err; + } + } + + if (first_err != 0) + return (first_err); + + device_printf(priv->dev, "Destroyed %d rx queues\n", num_queues); + return (0); +} + +/* + * Issue a destroy command for tx queues 0 .. num_queues - 1. + * A failed destroy is logged and the remaining queues are still attempted. + * Returns the first nonzero error seen, or 0 if every destroy succeeded + * (including the num_queues == 0 case, which the create-queues abort path + * can pass in when the very first create fails). + */ +int +gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int first_err = 0; /* must start at 0: the loop may run zero times */ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_tx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to destroy txq %d, err: %d\n", + i, err); + /* A later success must not hide this failure. */ + if (first_err == 0) + first_err = err; + } + } + + if (first_err != 0) + return (first_err); + + device_printf(priv->dev, "Destroyed %d tx queues\n", 
num_queues); + return (0); +} + +static int +gve_adminq_create_rx_queue(struct gve_priv *priv, uint32_t queue_index) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + struct gve_rx_ring *rx = &priv->rx[queue_index]; + struct gve_dma_handle *qres_dma = &rx->com.q_resources_mem; + + bus_dmamap_sync(qres_dma->tag, qres_dma->map, BUS_DMASYNC_PREREAD); + + cmd.opcode = htobe32(GVE_ADMINQ_CREATE_RX_QUEUE); + cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { + .queue_id = htobe32(queue_index), + .index = htobe32(queue_index), + .ntfy_id = htobe32(rx->com.ntfy_id), + .queue_resources_addr = htobe64(qres_dma->bus_addr), + .rx_desc_ring_addr = htobe64(rx->desc_ring_mem.bus_addr), + .rx_data_ring_addr = htobe64(rx->data_ring_mem.bus_addr), + .queue_page_list_id = htobe32((rx->com.qpl)->id), + .rx_ring_size = htobe16(priv->rx_desc_cnt), + .packet_buffer_size = htobe16(GVE_DEFAULT_RX_BUFFER_SIZE), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +int +gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_rx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to create rxq %d, err: %d\n", + i, err); + goto abort; + } + } + + if (bootverbose) + device_printf(priv->dev, "Created %d rx queues\n", num_queues); + return (0); + +abort: + gve_adminq_destroy_rx_queues(priv, i); + return (err); +} + +static int +gve_adminq_create_tx_queue(struct gve_priv *priv, uint32_t queue_index) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + struct gve_tx_ring *tx = &priv->tx[queue_index]; + struct gve_dma_handle *qres_dma = &tx->com.q_resources_mem; + + bus_dmamap_sync(qres_dma->tag, qres_dma->map, BUS_DMASYNC_PREREAD); + + cmd.opcode = htobe32(GVE_ADMINQ_CREATE_TX_QUEUE); + cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) { + .queue_id = htobe32(queue_index), + .queue_resources_addr = 
htobe64(qres_dma->bus_addr), + .tx_ring_addr = htobe64(tx->desc_ring_mem.bus_addr), + .queue_page_list_id = htobe32((tx->com.qpl)->id), + .ntfy_id = htobe32(tx->com.ntfy_id), + .tx_ring_size = htobe16(priv->tx_desc_cnt), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} +/* Creates TX queues 0..num_queues-1 in order. On the first failure the queues created so far are destroyed and that error is returned; returns 0 on success. */ +int +gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_tx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to create txq %d, err: %d\n", + i, err); + goto abort; + } + } + + if (bootverbose) + device_printf(priv->dev, "Created %d tx queues\n", num_queues); + return (0); + +abort: /* queues [0, i) were created; i may be 0 here. The destroy result is ignored. */ + gve_adminq_destroy_tx_queues(priv, i); + return (err); +} +/* Sends SET_DRIVER_PARAMETER with parameter type GVE_SET_PARAM_MTU and the requested mtu as its value. */ +int +gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu) { + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_SET_DRIVER_PARAMETER); + cmd.set_driver_param = (struct gve_adminq_set_driver_parameter) { + .parameter_type = htobe32(GVE_SET_PARAM_MTU), + .parameter_value = htobe64(mtu), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} +/* Applies optional device features: if the jumbo-frames option was found and its bit is set in supported_features_mask, priv->max_mtu is replaced by the option's max_mtu. */ +static void +gve_enable_supported_features(struct gve_priv *priv, + uint32_t supported_features_mask, + const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames) +{ + if (dev_op_jumbo_frames && + (supported_features_mask & GVE_SUP_JUMBO_FRAMES_MASK)) { + if (bootverbose) + device_printf(priv->dev, "JUMBO FRAMES device option enabled: %u.\n", + be16toh(dev_op_jumbo_frames->max_mtu)); + priv->max_mtu = be16toh(dev_op_jumbo_frames->max_mtu); + } +} +/* Issues DESCRIBE_DEVICE into a temporary DMA buffer, parses the returned descriptor and its device options, and copies the results into priv. Returns 0 or an errno value. */ +int +gve_adminq_describe_device(struct gve_priv *priv) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + struct gve_device_descriptor *desc; + struct gve_dma_handle desc_mem; + struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; + struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; + uint32_t supported_features_mask = 0; + int rc; + int i; 
+ + /* Temporary buffer the device writes the descriptor into. */ + rc = gve_dma_alloc_coherent(priv, ADMINQ_SIZE, ADMINQ_SIZE, &desc_mem); + if (rc != 0) { + device_printf(priv->dev, "Failed to alloc DMA mem for DescribeDevice.\n"); + return (rc); + } + + desc = desc_mem.cpu_addr; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_DESCRIBE_DEVICE); + aq_cmd.describe_device.device_descriptor_addr = htobe64( + desc_mem.bus_addr); + aq_cmd.describe_device.device_descriptor_version = htobe32( + GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION); + aq_cmd.describe_device.available_length = htobe32(ADMINQ_SIZE); + + /* + * NOTE(review): the CPU only reads this buffer after the command, + * which bus_dma(9) describes as the PREREAD case (the queue create + * commands in this file use PREREAD); confirm PREWRITE is intended. + */ + bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_PREWRITE); + + rc = gve_adminq_execute_cmd(priv, &aq_cmd); + if (rc != 0) + goto free_device_descriptor; + + bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD); + + rc = gve_process_device_options(priv, desc, &dev_op_gqi_qpl, + &dev_op_jumbo_frames); + if (rc != 0) + goto free_device_descriptor; + + /* GQI QPL is the only queue format accepted here. */ + if (dev_op_gqi_qpl != NULL) { + priv->queue_format = GVE_GQI_QPL_FORMAT; + supported_features_mask = be32toh( + dev_op_gqi_qpl->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with GQI QPL queue format.\n"); + } else { + device_printf(priv->dev, "No compatible queue formats\n"); + rc = (EINVAL); + goto free_device_descriptor; + } + + /* Descriptor fields are big-endian; convert while copying into priv. */ + priv->num_event_counters = be16toh(desc->counters); + priv->default_num_queues = be16toh(desc->default_num_queues); + priv->tx_desc_cnt = be16toh(desc->tx_queue_entries); + priv->rx_desc_cnt = be16toh(desc->rx_queue_entries); + priv->rx_pages_per_qpl = be16toh(desc->rx_pages_per_qpl); + priv->max_registered_pages = be64toh(desc->max_registered_pages); + priv->max_mtu = be16toh(desc->mtu); + priv->supported_features = supported_features_mask; + + /* May replace max_mtu with the jumbo-frames option's value. */ + gve_enable_supported_features(priv, supported_features_mask, + dev_op_jumbo_frames); + + for (i = 0; i < ETHER_ADDR_LEN; i++) + priv->mac[i] = desc->mac[i]; + +free_device_descriptor: + 
gve_dma_free_coherent(&desc_mem); + + return (rc); +} +/* Registers a queue page list with the device: the bus address of every page in qpl is written big-endian into a temporary DMA buffer whose address is passed in REGISTER_PAGE_LIST. The buffer is freed once the command has been executed. */ +int +gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + uint32_t num_entries = qpl->num_pages; + uint32_t size = num_entries * sizeof(qpl->dmas[0].bus_addr); + __be64 *page_list; + struct gve_dma_handle dma; + int err; + int i; + + err = gve_dma_alloc_coherent(priv, size, PAGE_SIZE, &dma); + if (err != 0) + return (ENOMEM); /* NOTE(review): the allocator's own error code is replaced by ENOMEM */ + + page_list = dma.cpu_addr; + + for (i = 0; i < num_entries; i++) + page_list[i] = htobe64(qpl->dmas[i].bus_addr); + + bus_dmamap_sync(dma.tag, dma.map, BUS_DMASYNC_PREWRITE); + + cmd.opcode = htobe32(GVE_ADMINQ_REGISTER_PAGE_LIST); + cmd.reg_page_list = (struct gve_adminq_register_page_list) { + .page_list_id = htobe32(qpl->id), + .num_pages = htobe32(num_entries), + .page_address_list_addr = htobe64(dma.bus_addr), + .page_size = htobe64(PAGE_SIZE), + }; + + err = gve_adminq_execute_cmd(priv, &cmd); + gve_dma_free_coherent(&dma); + return (err); +} +/* Sends UNREGISTER_PAGE_LIST for the page list with the given id. */ +int +gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_UNREGISTER_PAGE_LIST); + cmd.unreg_page_list = (struct gve_adminq_unregister_page_list) { + .page_list_id = htobe32(page_list_id), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} +/* Value sent as ntfy_blk_msix_base_idx in CONFIGURE_DEVICE_RESOURCES. */ +#define GVE_NTFY_BLK_BASE_MSIX_IDX 0 +int +gve_adminq_configure_device_resources(struct gve_priv *priv) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_PREREAD); + bus_dmamap_sync(priv->counter_array_mem.tag, + priv->counter_array_mem.map, BUS_DMASYNC_PREREAD); + + aq_cmd.opcode = htobe32(GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES); + aq_cmd.configure_device_resources = + (struct gve_adminq_configure_device_resources) { + .counter_array = 
htobe64(priv->counter_array_mem.bus_addr), + .irq_db_addr = htobe64(priv->irqs_db_mem.bus_addr), + .num_counters = htobe32(priv->num_event_counters), + .num_irq_dbs = htobe32(priv->num_queues), + .irq_db_stride = htobe32(sizeof(struct gve_irq_db)), + .ntfy_blk_msix_base_idx = htobe32(GVE_NTFY_BLK_BASE_MSIX_IDX), + .queue_format = priv->queue_format, + }; + + return (gve_adminq_execute_cmd(priv, &aq_cmd)); +} +/* Sends DECONFIGURE_DEVICE_RESOURCES; only the opcode is set. */ +int +gve_adminq_deconfigure_device_resources(struct gve_priv *priv) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES); + return (gve_adminq_execute_cmd(priv, &aq_cmd)); +} +/* Sends VERIFY_DRIVER_COMPATIBILITY, passing the length and address of a driver-info buffer supplied by the caller. */ +int +gve_adminq_verify_driver_compatibility(struct gve_priv *priv, + uint64_t driver_info_len, + vm_paddr_t driver_info_addr) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY); + aq_cmd.verify_driver_compatibility = (struct gve_adminq_verify_driver_compatibility) { + .driver_info_len = htobe64(driver_info_len), + .driver_info_addr = htobe64(driver_info_addr), + }; + + return (gve_adminq_execute_cmd(priv, &aq_cmd)); +} +/* Sets up the admin queue: allocates its DMA memory unless priv->aq_mem is already populated, zeroes the bookkeeping counters and writes the queue address to the device. Returns 0 immediately when GVE_STATE_FLAG_ADMINQ_OK is already set. */ +int +gve_adminq_alloc(struct gve_priv *priv) +{ + int rc; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK)) + return (0); + + if (priv->aq_mem.cpu_addr == NULL) { + rc = gve_dma_alloc_coherent(priv, ADMINQ_SIZE, ADMINQ_SIZE, + &priv->aq_mem); + if (rc != 0) { + device_printf(priv->dev, "Failed to allocate admin queue mem\n"); + return (rc); + } + } + + priv->adminq = priv->aq_mem.cpu_addr; + priv->adminq_bus_addr = priv->aq_mem.bus_addr; + + if (priv->adminq == NULL) + return (ENOMEM); + + priv->adminq_mask = ADMINQ_SLOTS - 1; /* used as an index mask on the producer count */ + priv->adminq_prod_cnt = 0; + priv->adminq_cmd_fail = 0; + priv->adminq_timeouts = 0; + priv->adminq_describe_device_cnt = 0; + priv->adminq_cfg_device_resources_cnt = 0; + priv->adminq_register_page_list_cnt = 0; + priv->adminq_unregister_page_list_cnt = 0; + 
priv->adminq_create_tx_queue_cnt = 0; + priv->adminq_create_rx_queue_cnt = 0; + priv->adminq_destroy_tx_queue_cnt = 0; + priv->adminq_destroy_rx_queue_cnt = 0; + priv->adminq_dcfg_device_resources_cnt = 0; + priv->adminq_set_driver_parameter_cnt = 0; + /* Reset together with the other per-opcode counters. */ + priv->adminq_verify_driver_compatibility_cnt = 0; + + /* The register takes the queue address in units of ADMINQ_SIZE. */ + gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR, + priv->adminq_bus_addr / ADMINQ_SIZE); + + gve_set_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK); + return (0); +} + +/* + * Tears down the admin queue: writes 0 to the queue address register, waits + * until the register reads back 0, then frees the queue memory and clears + * GVE_STATE_FLAG_ADMINQ_OK. Does nothing if the flag is not set. + */ +void +gve_release_adminq(struct gve_priv *priv) +{ + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK)) + return; + + gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR, 0); + while (gve_reg_bar_read_4(priv, GVE_REG_ADMINQ_ADDR)) { + device_printf(priv->dev, "Waiting until admin queue is released.\n"); + pause("gve release adminq", GVE_ADMINQ_SLEEP_LEN_MS); + } + + gve_dma_free_coherent(&priv->aq_mem); + priv->aq_mem = (struct gve_dma_handle){}; + priv->adminq = 0; + priv->adminq_bus_addr = 0; + + gve_clear_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK); + + if (bootverbose) + device_printf(priv->dev, "Admin queue released\n"); +} + +/* + * Translates an admin queue status code into an errno value (0 for + * GVE_ADMINQ_COMMAND_PASSED). Any status other than PASSED or UNSET is + * logged and counted in priv->adminq_cmd_fail. + */ +static int +gve_adminq_parse_err(struct gve_priv *priv, uint32_t opcode, uint32_t status) +{ + if (status != GVE_ADMINQ_COMMAND_PASSED && + status != GVE_ADMINQ_COMMAND_UNSET) { + device_printf(priv->dev, "AQ command(%u): failed with status %d\n", opcode, status); + priv->adminq_cmd_fail++; + } + switch (status) { + case GVE_ADMINQ_COMMAND_PASSED: + return (0); + + case GVE_ADMINQ_COMMAND_UNSET: + device_printf(priv->dev, + "AQ command(%u): err and status both unset, this should not be possible.\n", + opcode); + return (EINVAL); + + case GVE_ADMINQ_COMMAND_ERROR_ABORTED: + case GVE_ADMINQ_COMMAND_ERROR_CANCELLED: + case GVE_ADMINQ_COMMAND_ERROR_DATALOSS: + case GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION: + case GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE: + return (EAGAIN); + + case GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS: + case GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR: + case GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT: + case 
GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND: + case GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE: + case GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR: + return (EINVAL); + + case GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED: + return (ETIMEDOUT); + + case GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED: + case GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED: + return (EACCES); + + case GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED: + return (ENOMEM); + + case GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED: + return (EOPNOTSUPP); + + default: /* status value not covered by any case above */ + device_printf(priv->dev, "AQ command(%u): unknown status code %d\n", + opcode, status); + return (EINVAL); + } +} +/* Writes the producer count to the admin queue doorbell register. */ +static void +gve_adminq_kick_cmd(struct gve_priv *priv, uint32_t prod_cnt) +{ + gve_reg_bar_write_4(priv, ADMINQ_DOORBELL, prod_cnt); + +} +/* Polls the event counter register until it equals prod_cnt, sleeping GVE_ADMINQ_SLEEP_LEN_MS between polls. Returns false after GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK unsuccessful polls. */ +static bool +gve_adminq_wait_for_cmd(struct gve_priv *priv, uint32_t prod_cnt) +{ + int i; + + for (i = 0; i < GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK; i++) { + if (gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER) == prod_cnt) + return (true); + pause("gve adminq cmd", GVE_ADMINQ_SLEEP_LEN_MS); + } + + return (false); +} + +/* + * Flushes all AQ commands currently queued and waits for them to complete. + * If there are failures, it will return the first error. 
+ */ +static int +gve_adminq_kick_and_wait(struct gve_priv *priv) +{ + struct gve_adminq_command *cmd; + uint32_t opcode, status; + uint32_t tail, head; + uint32_t i; + int err; + + /* Commands in [tail, head) have been queued but not yet checked. */ + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + head = priv->adminq_prod_cnt; + + gve_adminq_kick_cmd(priv, head); + if (!gve_adminq_wait_for_cmd(priv, head)) { + device_printf(priv->dev, "AQ commands timed out, need to reset AQ\n"); + priv->adminq_timeouts++; + return (ENOTRECOVERABLE); + } + bus_dmamap_sync( + priv->aq_mem.tag, priv->aq_mem.map, BUS_DMASYNC_POSTREAD); + + /* + * The index has the same unsigned type as the counters it walks, so + * the comparison involves no signed conversion and the increment + * cannot overflow a signed int. + */ + for (i = tail; i < head; i++) { + cmd = &priv->adminq[i & priv->adminq_mask]; + status = be32toh(cmd->status); + opcode = be32toh(cmd->opcode); + err = gve_adminq_parse_err(priv, opcode, status); + if (err != 0) + return (err); + } + + return (0); +} + +/* + * This function is not threadsafe - the caller is responsible for any + * necessary locks. + */ +static int +gve_adminq_issue_cmd(struct gve_priv *priv, struct gve_adminq_command *cmd_orig) +{ + struct gve_adminq_command *cmd; + uint32_t opcode; + uint32_t tail; + int err; + + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + + /* Check if next command will overflow the buffer. */ + if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask) { + /* Flush existing commands to make room. */ + err = gve_adminq_kick_and_wait(priv); + if (err != 0) + return (err); + + /* Retry. */ + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask) { + /* + * This should never happen. We just flushed the + * command queue so there should be enough space. 
+ */ + return (ENOMEM); + } + } +/* Claim the next slot; the producer count is reduced to a slot index with adminq_mask. */ + cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask]; + priv->adminq_prod_cnt++; + + memcpy(cmd, cmd_orig, sizeof(*cmd_orig)); + + bus_dmamap_sync( + priv->aq_mem.tag, priv->aq_mem.map, BUS_DMASYNC_PREWRITE); + + opcode = be32toh(cmd->opcode); +/* Bump the per-opcode statistics counter; an unrecognized opcode is only logged, the command stays queued. */ + switch (opcode) { + case GVE_ADMINQ_DESCRIBE_DEVICE: + priv->adminq_describe_device_cnt++; + break; + + case GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES: + priv->adminq_cfg_device_resources_cnt++; + break; + + case GVE_ADMINQ_REGISTER_PAGE_LIST: + priv->adminq_register_page_list_cnt++; + break; + + case GVE_ADMINQ_UNREGISTER_PAGE_LIST: + priv->adminq_unregister_page_list_cnt++; + break; + + case GVE_ADMINQ_CREATE_TX_QUEUE: + priv->adminq_create_tx_queue_cnt++; + break; + + case GVE_ADMINQ_CREATE_RX_QUEUE: + priv->adminq_create_rx_queue_cnt++; + break; + + case GVE_ADMINQ_DESTROY_TX_QUEUE: + priv->adminq_destroy_tx_queue_cnt++; + break; + + case GVE_ADMINQ_DESTROY_RX_QUEUE: + priv->adminq_destroy_rx_queue_cnt++; + break; + + case GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES: + priv->adminq_dcfg_device_resources_cnt++; + break; + + case GVE_ADMINQ_SET_DRIVER_PARAMETER: + priv->adminq_set_driver_parameter_cnt++; + break; + + case GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY: + priv->adminq_verify_driver_compatibility_cnt++; + break; + + default: + device_printf(priv->dev, "Unknown AQ command opcode %d\n", opcode); + } + + return (0); +} + +/* + * This function is not threadsafe - the caller is responsible for any + * necessary locks. + * The caller is also responsible for making sure there are no commands + * waiting to be executed. 
+ */ +static int +gve_adminq_execute_cmd(struct gve_priv *priv, struct gve_adminq_command *cmd_orig) +{ + uint32_t tail, head; + int err; + + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + head = priv->adminq_prod_cnt; + + if (tail != head) /* event counter differs from the producer count: refuse to issue */ + return (EINVAL); + err = gve_adminq_issue_cmd(priv, cmd_orig); + if (err != 0) + return (err); + return (gve_adminq_kick_and_wait(priv)); /* ring the doorbell, wait, and return the command's status as an errno */ +} diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h new file mode 100644 index 000000000000..5923e5f353d1 --- /dev/null +++ b/sys/dev/gve/gve_adminq.h @@ -0,0 +1,394 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_AQ_H_ +#define _GVE_AQ_H_ 1 + +#include <sys/types.h> +#include <net/if.h> +#include <net/iflib.h> +#include <machine/bus.h> +#include <machine/resource.h> + +/* Admin queue opcodes */ +enum gve_adminq_opcodes { + GVE_ADMINQ_DESCRIBE_DEVICE = 0x1, + GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES = 0x2, + GVE_ADMINQ_REGISTER_PAGE_LIST = 0x3, + GVE_ADMINQ_UNREGISTER_PAGE_LIST = 0x4, + GVE_ADMINQ_CREATE_TX_QUEUE = 0x5, + GVE_ADMINQ_CREATE_RX_QUEUE = 0x6, + GVE_ADMINQ_DESTROY_TX_QUEUE = 0x7, + GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8, + GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, + GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, + GVE_ADMINQ_REPORT_STATS = 0xC, + GVE_ADMINQ_REPORT_LINK_SPEED = 0xD, + GVE_ADMINQ_GET_PTYPE_MAP = 0xE, + GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY = 0xF, +}; + +/* Admin queue status codes */ +enum gve_adminq_statuses { + GVE_ADMINQ_COMMAND_UNSET = 0x0, + GVE_ADMINQ_COMMAND_PASSED = 0x1, + GVE_ADMINQ_COMMAND_ERROR_ABORTED = 0xFFFFFFF0, + GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS = 0xFFFFFFF1, + GVE_ADMINQ_COMMAND_ERROR_CANCELLED = 0xFFFFFFF2, + GVE_ADMINQ_COMMAND_ERROR_DATALOSS = 0xFFFFFFF3, + GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED = 0xFFFFFFF4, + GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION = 0xFFFFFFF5, + GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR = 0xFFFFFFF6, + GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT = 0xFFFFFFF7, + GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND = 0xFFFFFFF8, + GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE = 
0xFFFFFFF9, + GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED = 0xFFFFFFFA, + GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED = 0xFFFFFFFB, + GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED = 0xFFFFFFFC, + GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE = 0xFFFFFFFD, + GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED = 0xFFFFFFFE, + GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR = 0xFFFFFFFF, +}; + +#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 + +/* + * All AdminQ command structs should be naturally packed. The static_assert + * calls make sure this is the case at compile time. + */ + +struct gve_adminq_describe_device { + __be64 device_descriptor_addr; + __be32 device_descriptor_version; + __be32 available_length; +}; + +_Static_assert(sizeof(struct gve_adminq_describe_device) == 16, + "gve: bad admin queue struct length"); + +struct gve_device_descriptor { + __be64 max_registered_pages; + __be16 reserved1; + __be16 tx_queue_entries; + __be16 rx_queue_entries; + __be16 default_num_queues; + __be16 mtu; + __be16 counters; + __be16 reserved2; + __be16 rx_pages_per_qpl; + uint8_t mac[ETHER_ADDR_LEN]; + __be16 num_device_options; + __be16 total_length; + uint8_t reserved3[6]; +}; + +_Static_assert(sizeof(struct gve_device_descriptor) == 40, + "gve: bad admin queue struct length"); + +struct gve_device_option { + __be16 option_id; + __be16 option_length; + __be32 required_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option) == 8, + "gve: bad admin queue struct length"); + +struct gve_device_option_gqi_rda { + __be32 supported_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option_gqi_rda) == 4, + "gve: bad admin queue struct length"); + +struct gve_device_option_gqi_qpl { + __be32 supported_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4, + "gve: bad admin queue struct length"); + +struct gve_device_option_dqo_rda { + __be32 supported_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 4, + "gve: bad admin queue 
struct length"); + +struct gve_device_option_modify_ring { + __be32 supported_features_mask; + __be16 max_rx_ring_size; + __be16 max_tx_ring_size; +}; + +_Static_assert(sizeof(struct gve_device_option_modify_ring) == 8, + "gve: bad admin queue struct length"); + +struct gve_device_option_jumbo_frames { + __be32 supported_features_mask; + __be16 max_mtu; + uint8_t padding[2]; +}; + +_Static_assert(sizeof(struct gve_device_option_jumbo_frames) == 8, + "gve: bad admin queue struct length"); + +enum gve_dev_opt_id { + GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING = 0x1, + GVE_DEV_OPT_ID_GQI_RDA = 0x2, + GVE_DEV_OPT_ID_GQI_QPL = 0x3, + GVE_DEV_OPT_ID_DQO_RDA = 0x4, + GVE_DEV_OPT_ID_MODIFY_RING = 0x6, + GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, +}; + +/* + * These masks are a way to predicate the use of a particular option on the + * driver having particular bug fixes, each represented by a bit position in + * the mask. + * Currently they are all zero because there are no known bugs preventing the + * use of any option. + */ +enum gve_dev_opt_req_feat_mask { + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, +}; + +enum gve_sup_feature_mask { + GVE_SUP_MODIFY_RING_MASK = 1 << 0, + GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2, +}; + +#define GVE_VERSION_STR_LEN 128 + +enum gve_driver_capability { + gve_driver_capability_gqi_qpl = 0, + gve_driver_capability_gqi_rda = 1, + gve_driver_capability_dqo_qpl = 2, /* reserved for future use */ + gve_driver_capability_dqo_rda = 3, + gve_driver_capability_alt_miss_compl = 4, +}; + +#define GVE_CAP1(a) BIT((int) a) +#define GVE_CAP2(a) BIT(((int) a) - 64) +#define GVE_CAP3(a) BIT(((int) a) - 128) +#define GVE_CAP4(a) BIT(((int) a) - 192) + +/* + * The following four defines describe 256 compatibility bits. 
+ * Only a few bits (as shown in `enum gve_driver_capability`) are currently + * defined. The rest are reserved for future use. + */ +#define GVE_DRIVER_CAPABILITY_FLAGS1 (GVE_CAP1(gve_driver_capability_gqi_qpl)) +#define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 +#define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 +#define GVE_DRIVER_CAPABILITY_FLAGS4 0x0 + +struct gve_driver_info { + uint8_t os_type; + uint8_t driver_major; + uint8_t driver_minor; + uint8_t driver_sub; + __be32 os_version_major; + __be32 os_version_minor; + __be32 os_version_sub; + __be64 driver_capability_flags[4]; + uint8_t os_version_str1[GVE_VERSION_STR_LEN]; + uint8_t os_version_str2[GVE_VERSION_STR_LEN]; +}; + +struct gve_adminq_verify_driver_compatibility { + __be64 driver_info_len; + __be64 driver_info_addr; +}; + +_Static_assert(sizeof(struct gve_adminq_verify_driver_compatibility) == 16, + "gve: bad admin queue struct length"); + +struct gve_adminq_configure_device_resources { + __be64 counter_array; + __be64 irq_db_addr; + __be32 num_counters; + __be32 num_irq_dbs; + __be32 irq_db_stride; + __be32 ntfy_blk_msix_base_idx; + uint8_t queue_format; + uint8_t padding[7]; +}; + +_Static_assert(sizeof(struct gve_adminq_configure_device_resources) == 40, + "gve: bad admin queue struct length"); + +struct gve_adminq_register_page_list { + __be32 page_list_id; + __be32 num_pages; + __be64 page_address_list_addr; + __be64 page_size; +}; + +_Static_assert(sizeof(struct gve_adminq_register_page_list) == 24, + "gve: bad admin queue struct length"); + +struct gve_adminq_unregister_page_list { + __be32 page_list_id; +}; + +_Static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4, + "gve: bad admin queue struct length"); + +struct gve_adminq_create_tx_queue { + __be32 queue_id; + __be32 reserved; + __be64 queue_resources_addr; + __be64 tx_ring_addr; + __be32 queue_page_list_id; + __be32 ntfy_id; + __be64 tx_comp_ring_addr; + __be16 tx_ring_size; + __be16 tx_comp_ring_size; + uint8_t padding[4]; +}; + 
+_Static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48, + "gve: bad admin queue struct length"); + +struct gve_adminq_create_rx_queue { + __be32 queue_id; + __be32 index; + __be32 reserved; + __be32 ntfy_id; + __be64 queue_resources_addr; + __be64 rx_desc_ring_addr; + __be64 rx_data_ring_addr; + __be32 queue_page_list_id; + __be16 rx_ring_size; + __be16 packet_buffer_size; + __be16 rx_buff_ring_size; + uint8_t enable_rsc; + uint8_t padding[5]; +}; + +_Static_assert(sizeof(struct gve_adminq_create_rx_queue) == 56, + "gve: bad admin queue struct length"); + +/* Queue resources that are shared with the device */ +struct gve_queue_resources { + union { + struct { + __be32 db_index; /* Device -> Guest */ + __be32 counter_index; /* Device -> Guest */ + }; + uint8_t reserved[64]; + }; +}; + +_Static_assert(sizeof(struct gve_queue_resources) == 64, + "gve: bad admin queue struct length"); + +struct gve_adminq_destroy_tx_queue { + __be32 queue_id; +}; + +_Static_assert(sizeof(struct gve_adminq_destroy_tx_queue) == 4, + "gve: bad admin queue struct length"); + +struct gve_adminq_destroy_rx_queue { + __be32 queue_id; +}; + +_Static_assert(sizeof(struct gve_adminq_destroy_rx_queue) == 4, + "gve: bad admin queue struct length"); + +/* GVE Set Driver Parameter Types */ +enum gve_set_driver_param_types { + GVE_SET_PARAM_MTU = 0x1, +}; + +struct gve_adminq_set_driver_parameter { + __be32 parameter_type; + uint8_t reserved[4]; + __be64 parameter_value; +}; + +_Static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16, + "gve: bad admin queue struct length"); + +struct stats { + __be32 stat_name; + __be32 queue_id; + __be64 value; +}; + +_Static_assert(sizeof(struct stats) == 16, + "gve: bad admin queue struct length"); + +struct gve_adminq_command { + __be32 opcode; + __be32 status; + union { + struct gve_adminq_configure_device_resources + configure_device_resources; + struct gve_adminq_create_tx_queue create_tx_queue; + struct gve_adminq_create_rx_queue 
create_rx_queue; + struct gve_adminq_destroy_tx_queue destroy_tx_queue; + struct gve_adminq_destroy_rx_queue destroy_rx_queue; + struct gve_adminq_describe_device describe_device; + struct gve_adminq_register_page_list reg_page_list; + struct gve_adminq_unregister_page_list unreg_page_list; + struct gve_adminq_set_driver_parameter set_driver_param; + struct gve_adminq_verify_driver_compatibility + verify_driver_compatibility; + uint8_t reserved[56]; + }; +}; + +_Static_assert(sizeof(struct gve_adminq_command) == 64, + "gve: bad admin queue struct length"); + +int gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu); +int gve_adminq_alloc(struct gve_priv *priv); +void gve_reset_adminq(struct gve_priv *priv); +int gve_adminq_describe_device(struct gve_priv *priv); +int gve_adminq_configure_device_resources(struct gve_priv *priv); +int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); +void gve_release_adminq(struct gve_priv *priv); +int gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl); +int gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id); +int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, + uint64_t driver_info_len, vm_paddr_t driver_info_addr); +#endif /* _GVE_AQ_H_ */ diff --git a/sys/dev/gve/gve_desc.h b/sys/dev/gve/gve_desc.h new file mode 100644 index 000000000000..5f09cc8b77b8 --- /dev/null +++ b/sys/dev/gve/gve_desc.h @@ -0,0 +1,151 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following 
conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_DESC_H_ +#define _GVE_DESC_H_ + +#include "gve_plat.h" + +/* + * A note on seg_addrs + * + * Base addresses encoded in seg_addr are not assumed to be physical + * addresses. The ring format assumes these come from some linear address + * space. This could be physical memory, kernel virtual memory, user virtual + * memory. + * + * Each queue is assumed to be associated with a single such linear + * address space to ensure a consistent meaning for seg_addrs posted to its + * rings. 
+ */
+struct gve_tx_pkt_desc {
+	uint8_t	type_flags;  /* desc type is upper 4 bits (GVE_TXD_*), flags lower (GVE_TXF_*) */
+	uint8_t	l4_csum_offset;  /* relative offset of L4 csum word */
+	uint8_t	l4_hdr_offset;  /* Offset of start of L4 headers in packet */
+	uint8_t	desc_cnt;  /* Total descriptors for this packet */
+	__be16	len;  /* Total length of this packet (in bytes) */
+	__be16	seg_len;  /* Length of this descriptor's segment */
+	__be64	seg_addr;  /* Base address (see note) of this segment */
+} __packed;
+
+struct gve_tx_mtd_desc {
+	uint8_t	type_flags;  /* type is upper 4 bits (GVE_TXD_MTD); subtype presumably lower - TODO confirm */
+	uint8_t	path_state;  /* state is lower 4 bits, hash type upper */
+	__be16	reserved0;
+	__be32	path_hash;
+	__be64	reserved1;
+} __packed;
+
+struct gve_tx_seg_desc {
+	uint8_t	type_flags;  /* type is upper 4 bits (GVE_TXD_*), flags lower (GVE_TXSF_*) */
+	uint8_t	l3_offset;  /* TSO: 2 byte units to start of IPH */
+	__be16	reserved;
+	__be16	mss;  /* TSO MSS */
+	__be16	seg_len;
+	__be64	seg_addr;
+} __packed;
+
+/* GVE Transmit Descriptor Types */
+#define GVE_TXD_STD (0x0 << 4) /* Std with Host Address */
+#define GVE_TXD_TSO (0x1 << 4) /* TSO with Host Address */
+#define GVE_TXD_SEG (0x2 << 4) /* Seg with Host Address */
+#define GVE_TXD_MTD (0x3 << 4) /* Metadata */
+
+/* GVE Transmit Descriptor Flags for Std Pkts */
+#define GVE_TXF_L4CSUM BIT(0) /* Need csum offload */
+#define GVE_TXF_TSTAMP BIT(2) /* Timestamp required */
+
+/* GVE Transmit Descriptor Flags for TSO Segs */
+#define GVE_TXSF_IPV6 BIT(1) /* IPv6 TSO */
+
+/* GVE Transmit Descriptor Options for MTD Segs */
+#define GVE_MTD_SUBTYPE_PATH 0
+
+#define GVE_MTD_PATH_STATE_DEFAULT 0
+#define GVE_MTD_PATH_STATE_TIMEOUT 1
+#define GVE_MTD_PATH_STATE_CONGESTION 2
+#define GVE_MTD_PATH_STATE_RETRANSMIT 3
+
+#define GVE_MTD_PATH_HASH_NONE (0x0 << 4)
+#define GVE_MTD_PATH_HASH_L4 (0x1 << 4)
+
+/*
+ * GVE Receive Packet Descriptor
+ *
+ * The start of an ethernet packet comes 2 bytes into the rx buffer.
+ * gVNIC adds this padding so that both the DMA and the L3/4 protocol header
+ * access is aligned.
+ */
+#define GVE_RX_PAD 2
+
+struct gve_rx_desc {
+	uint8_t	padding[48];
+	__be32	rss_hash;  /* Receive-side scaling hash (Toeplitz for gVNIC) */
+	__be16	mss;
+	__be16	reserved;  /* Reserved to zero */
+	uint8_t	hdr_len;   /* Header length (L2-L4) including padding */
+	uint8_t	hdr_off;   /* 64-byte-scaled offset into RX_DATA entry */
+	uint16_t	csum;  /* 1's-complement partial checksum of L3+ bytes */
+	__be16	len;       /* Length of the received packet */
+	__be16	flags_seq; /* Flags [15:3] and sequence number [2:0] (1-7) */
+} __packed;
+_Static_assert(sizeof(struct gve_rx_desc) == 64, "gve: bad desc struct length");
+
+/*
+ * If the device supports raw dma addressing then the addr in data slot is
+ * the dma address of the buffer.
+ * If the device only supports registered segments then the addr is a byte
+ * offset into the registered segment (an ordered list of pages) where the
+ * buffer is.
+ */
+union gve_rx_data_slot {
+	__be64 qpl_offset;
+	__be64 addr;
+};
+
+/* GVE Receive Packet Descriptor Seq No: low 3 bits of the big-endian flags_seq */
+#define GVE_SEQNO(x) (be16toh(x) & 0x7)
+
+/* GVE Receive Packet Descriptor Flags: bit (3 + x) of flags_seq, kept in big-endian form */
+#define GVE_RXFLG(x) htobe16(1 << (3 + (x)))
+#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */
+#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */
+#define GVE_RXF_IPV6 GVE_RXFLG(5) /* IPv6 */
+#define GVE_RXF_TCP GVE_RXFLG(6) /* TCP Packet */
+#define GVE_RXF_UDP GVE_RXFLG(7) /* UDP Packet */
+#define GVE_RXF_ERR GVE_RXFLG(8) /* Packet Error Detected */
+#define GVE_RXF_PKT_CONT GVE_RXFLG(10) /* Multi Fragment RX packet */
+
+/* GVE IRQ */
+#define GVE_IRQ_ACK BIT(31)
+#define GVE_IRQ_MASK BIT(30)
+#define GVE_IRQ_EVENT BIT(29)
+
+#endif /* _GVE_DESC_H_ */
diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c
new file mode 100644
index 000000000000..ae45a0cfc24a
--- /dev/null
+++ b/sys/dev/gve/gve_main.c
@@ -0,0 +1,853 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2023 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" +#include "gve_adminq.h" + +#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.0\n" +#define GVE_VERSION_MAJOR 0 +#define GVE_VERSION_MINOR 9 +#define GVE_VERSION_SUB 0 + +#define GVE_DEFAULT_RX_COPYBREAK 256 + +struct sx gve_global_lock; + +static int +gve_verify_driver_compatibility(struct gve_priv *priv) +{ + int err; + struct gve_driver_info *driver_info; + struct gve_dma_handle driver_info_mem; + + err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), + PAGE_SIZE, &driver_info_mem); + + if (err != 0) + return (ENOMEM); + + driver_info = driver_info_mem.cpu_addr; + + *driver_info = (struct gve_driver_info) { + .os_type = 3, /* Freebsd */ + .driver_major = GVE_VERSION_MAJOR, + .driver_minor = GVE_VERSION_MINOR, + .driver_sub = GVE_VERSION_SUB, + .os_version_major = htobe32(FBSD_VERSION_MAJOR), + .os_version_minor = htobe32(FBSD_VERSION_MINOR), + .os_version_sub = htobe32(FBSD_VERSION_PATCH), + .driver_capability_flags = { + htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), + htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), + htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), + htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), + }, + }; + + snprintf(driver_info->os_version_str1, 
sizeof(driver_info->os_version_str1), + "FreeBSD %u", __FreeBSD_version); + + bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, + BUS_DMASYNC_PREREAD); + + err = gve_adminq_verify_driver_compatibility(priv, + sizeof(struct gve_driver_info), driver_info_mem.bus_addr); + + /* It's ok if the device doesn't support this */ + if (err == EOPNOTSUPP) + err = 0; + + gve_dma_free_coherent(&driver_info_mem); + + return (err); +} + +static int +gve_up(struct gve_priv *priv) +{ + if_t ifp = priv->ifp; + int err; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + if (device_is_attached(priv->dev) == 0) { + device_printf(priv->dev, "Cannot bring the iface up when detached\n"); + return (ENXIO); + } + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) + return (0); + + if_clearhwassist(ifp); + if (if_getcapenable(ifp) & IFCAP_TXCSUM) + if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); + if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) + if_sethwassistbits(ifp, CSUM_IP6_TCP | CSUM_IP6_UDP, 0); + if (if_getcapenable(ifp) & IFCAP_TSO4) + if_sethwassistbits(ifp, CSUM_IP_TSO, 0); + if (if_getcapenable(ifp) & IFCAP_TSO6) + if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); + + err = gve_register_qpls(priv); + if (err != 0) + goto reset; + + err = gve_create_rx_rings(priv); + if (err != 0) + goto reset; + + err = gve_create_tx_rings(priv); + if (err != 0) + goto reset; + + if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { + if_link_state_change(ifp, LINK_STATE_UP); + gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } + + gve_unmask_all_queue_irqs(priv); + gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); + priv->interface_up_cnt++; + return (0); + +reset: + gve_schedule_reset(priv); + return (err); +} + +static void +gve_down(struct gve_priv *priv) +{ + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) + return; + + if (gve_get_state_flag(priv, 
GVE_STATE_FLAG_LINK_UP)) { + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } + + if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); + + if (gve_destroy_rx_rings(priv) != 0) + goto reset; + + if (gve_destroy_tx_rings(priv) != 0) + goto reset; + + if (gve_unregister_qpls(priv) != 0) + goto reset; + + gve_mask_all_queue_irqs(priv); + gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); + priv->interface_down_cnt++; + return; + +reset: + gve_schedule_reset(priv); +} + +static int +gve_set_mtu(if_t ifp, uint32_t new_mtu) +{ + struct gve_priv *priv = if_getsoftc(ifp); + int err; + + if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { + device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", + new_mtu, priv->max_mtu, ETHERMIN); + return (EINVAL); + } + + err = gve_adminq_set_mtu(priv, new_mtu); + if (err == 0) { + if (bootverbose) + device_printf(priv->dev, "MTU set to %d\n", new_mtu); + if_setmtu(ifp, new_mtu); + } else { + device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); + } + + return (err); +} + +static void +gve_init(void *arg) +{ + struct gve_priv *priv = (struct gve_priv *)arg; + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } +} + +static int +gve_ioctl(if_t ifp, u_long command, caddr_t data) +{ + struct gve_priv *priv; + struct ifreq *ifr; + int rc = 0; + + priv = if_getsoftc(ifp); + ifr = (struct ifreq *)data; + + switch (command) { + case SIOCSIFMTU: + if (if_getmtu(ifp) == ifr->ifr_mtu) + break; + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_down(priv); + gve_set_mtu(ifp, ifr->ifr_mtu); + rc = gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + break; + + case SIOCSIFFLAGS: + if ((if_getflags(ifp) & IFF_UP) != 0) { + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { + 
GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + rc = gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + } else { + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_down(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + } + break; + + case SIOCSIFCAP: + if (ifr->ifr_reqcap == if_getcapenable(ifp)) + break; + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_down(priv); + if_setcapenable(ifp, ifr->ifr_reqcap); + rc = gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + break; + + case SIOCSIFMEDIA: + /* FALLTHROUGH */ + case SIOCGIFMEDIA: + rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); + break; + + default: + rc = ether_ioctl(ifp, command, data); + break; + } + + return (rc); +} + +static int +gve_media_change(if_t ifp) +{ + struct gve_priv *priv = if_getsoftc(ifp); + + device_printf(priv->dev, "Media change not supported\n"); + return (0); +} + +static void +gve_media_status(if_t ifp, struct ifmediareq *ifmr) +{ + struct gve_priv *priv = if_getsoftc(ifp); + + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= IFM_AUTO; + } else { + ifmr->ifm_active |= IFM_NONE; + } + + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); +} + +static uint64_t +gve_get_counter(if_t ifp, ift_counter cnt) +{ + struct gve_priv *priv; + uint64_t rpackets = 0; + uint64_t tpackets = 0; + uint64_t rbytes = 0; + uint64_t tbytes = 0; + uint64_t rx_dropped_pkt = 0; + uint64_t tx_dropped_pkt = 0; + + priv = if_getsoftc(ifp); + + gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, + &tbytes, &tx_dropped_pkt); + + switch (cnt) { + case IFCOUNTER_IPACKETS: + return (rpackets); + + case IFCOUNTER_OPACKETS: + return (tpackets); + + case IFCOUNTER_IBYTES: + return (rbytes); + + case IFCOUNTER_OBYTES: + return (tbytes); + + case 
IFCOUNTER_IQDROPS: + return (rx_dropped_pkt); + + case IFCOUNTER_OQDROPS: + return (tx_dropped_pkt); + + default: + return (if_get_counter_default(ifp, cnt)); + } +} + +static int +gve_setup_ifnet(device_t dev, struct gve_priv *priv) +{ + int caps = 0; + if_t ifp; + + ifp = priv->ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + device_printf(priv->dev, "Failed to allocate ifnet struct\n"); + return (ENXIO); + } + + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + if_setsoftc(ifp, priv); + if_setdev(ifp, dev); + if_setinitfn(ifp, gve_init); + if_setioctlfn(ifp, gve_ioctl); + if_settransmitfn(ifp, gve_xmit_ifp); + if_setqflushfn(ifp, gve_qflush); + +#if __FreeBSD_version >= 1400086 + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); +#else + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); +#endif + + ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); + if_setgetcounterfn(ifp, gve_get_counter); + + caps = IFCAP_RXCSUM | + IFCAP_TXCSUM | + IFCAP_TXCSUM_IPV6 | + IFCAP_TSO | + IFCAP_LRO; + + if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) + caps |= IFCAP_JUMBO_MTU; + + if_setcapabilities(ifp, caps); + if_setcapenable(ifp, caps); + + if (bootverbose) + device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); + if_setmtu(ifp, priv->max_mtu); + + ether_ifattach(ifp, priv->mac); + + ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); + + return (0); +} + +static int +gve_alloc_counter_array(struct gve_priv *priv) +{ + int err; + + err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, + PAGE_SIZE, &priv->counter_array_mem); + if (err != 0) + return (err); + + priv->counters = priv->counter_array_mem.cpu_addr; + return (0); +} + +static void +gve_free_counter_array(struct gve_priv *priv) +{ + if (priv->counters != NULL) + gve_dma_free_coherent(&priv->counter_array_mem); + 
priv->counter_array_mem = (struct gve_dma_handle){}; +} + +static int +gve_alloc_irq_db_array(struct gve_priv *priv) +{ + int err; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, + &priv->irqs_db_mem); + if (err != 0) + return (err); + + priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; + return (0); +} + +static void +gve_free_irq_db_array(struct gve_priv *priv) +{ + if (priv->irq_db_indices != NULL) + gve_dma_free_coherent(&priv->irqs_db_mem); + priv->irqs_db_mem = (struct gve_dma_handle){}; +} + +static void +gve_free_rings(struct gve_priv *priv) +{ + gve_free_irqs(priv); + gve_free_tx_rings(priv); + gve_free_rx_rings(priv); + gve_free_qpls(priv); +} + +static int +gve_alloc_rings(struct gve_priv *priv) +{ + int err; + + err = gve_alloc_qpls(priv); + if (err != 0) + goto abort; + + err = gve_alloc_rx_rings(priv); + if (err != 0) + goto abort; + + err = gve_alloc_tx_rings(priv); + if (err != 0) + goto abort; + + err = gve_alloc_irqs(priv); + if (err != 0) + goto abort; + + return (0); + +abort: + gve_free_rings(priv); + return (err); +} + +static void +gve_deconfigure_resources(struct gve_priv *priv) +{ + int err; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { + err = gve_adminq_deconfigure_device_resources(priv); + if (err != 0) { + device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", + err); + return; + } + if (bootverbose) + device_printf(priv->dev, "Deconfigured device resources\n"); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); + } + + gve_free_irq_db_array(priv); + gve_free_counter_array(priv); +} + +static int +gve_configure_resources(struct gve_priv *priv) +{ + int err; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) + return (0); + + err = gve_alloc_counter_array(priv); + if (err != 0) + return (err); + + err = gve_alloc_irq_db_array(priv); + if (err != 0) + goto abort; + + err = gve_adminq_configure_device_resources(priv); + if (err 
!= 0) { + device_printf(priv->dev, "Failed to configure device resources: err=%d\n", + err); + err = (ENXIO); + goto abort; + } + + gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); + if (bootverbose) + device_printf(priv->dev, "Configured device resources\n"); + return (0); + +abort: + gve_deconfigure_resources(priv); + return (err); +} + +static void +gve_set_queue_cnts(struct gve_priv *priv) +{ + priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); + priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); + priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; + priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; + + if (priv->default_num_queues > 0) { + priv->tx_cfg.num_queues = MIN(priv->default_num_queues, + priv->tx_cfg.num_queues); + priv->rx_cfg.num_queues = MIN(priv->default_num_queues, + priv->rx_cfg.num_queues); + } + + priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; + priv->mgmt_msix_idx = priv->num_queues; +} + +static int +gve_alloc_adminq_and_describe_device(struct gve_priv *priv) +{ + int err; + + if ((err = gve_adminq_alloc(priv)) != 0) + return (err); + + if ((err = gve_verify_driver_compatibility(priv)) != 0) { + device_printf(priv->dev, + "Failed to verify driver compatibility: err=%d\n", err); + goto abort; + } + + if ((err = gve_adminq_describe_device(priv)) != 0) + goto abort; + + gve_set_queue_cnts(priv); + + priv->num_registered_pages = 0; + return (0); + +abort: + gve_release_adminq(priv); + return (err); +} + +void +gve_schedule_reset(struct gve_priv *priv) +{ + if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) + return; + + device_printf(priv->dev, "Scheduling reset task!\n"); + gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); + taskqueue_enqueue(priv->service_tq, &priv->service_task); +} + +static void +gve_destroy(struct gve_priv *priv) +{ + gve_down(priv); + gve_deconfigure_resources(priv); + gve_release_adminq(priv); +} + +static void +gve_restore(struct gve_priv *priv) +{ + int 
err; + + err = gve_adminq_alloc(priv); + if (err != 0) + goto abort; + + err = gve_configure_resources(priv); + if (err != 0) + goto abort; + + err = gve_up(priv); + if (err != 0) + goto abort; + + return; + +abort: + device_printf(priv->dev, "Restore failed!\n"); + return; +} + +static void +gve_handle_reset(struct gve_priv *priv) +{ + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) + return; + + gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); + gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); + + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + + if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + + /* + * Releasing the adminq causes the NIC to destroy all resources + * registered with it, so by clearing the flags beneath we cause + * the subsequent gve_down call below to not attempt to tell the + * NIC to destroy these resources again. + * + * The call to gve_down is needed in the first place to refresh + * the state and the DMA-able memory within each driver ring. 
+ */ + gve_release_adminq(priv); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); + gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); + gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); + + gve_down(priv); + gve_restore(priv); + + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + + priv->reset_cnt++; + gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); +} + +static void +gve_handle_link_status(struct gve_priv *priv) +{ + uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); + bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; + + if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) + return; + + if (link_up) { + if (bootverbose) + device_printf(priv->dev, "Device link is up.\n"); + if_link_state_change(priv->ifp, LINK_STATE_UP); + gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } else { + device_printf(priv->dev, "Device link is down.\n"); + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } +} + +static void +gve_service_task(void *arg, int pending) +{ + struct gve_priv *priv = (struct gve_priv *)arg; + uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); + + if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && + !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { + device_printf(priv->dev, "Device requested reset\n"); + gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); + } + + gve_handle_reset(priv); + gve_handle_link_status(priv); +} + +static int +gve_probe(device_t dev) +{ + if (pci_get_vendor(dev) == PCI_VENDOR_ID_GOOGLE && + pci_get_device(dev) == PCI_DEV_ID_GVNIC) { + device_set_desc(dev, "gVNIC"); + return (BUS_PROBE_DEFAULT); + } + return (ENXIO); +} + +static void +gve_free_sys_res_mem(struct gve_priv *priv) +{ + if (priv->msix_table != NULL) + bus_release_resource(priv->dev, SYS_RES_MEMORY, + rman_get_rid(priv->msix_table), priv->msix_table); + + if (priv->db_bar != NULL) + 
bus_release_resource(priv->dev, SYS_RES_MEMORY, + rman_get_rid(priv->db_bar), priv->db_bar); + + if (priv->reg_bar != NULL) + bus_release_resource(priv->dev, SYS_RES_MEMORY, + rman_get_rid(priv->reg_bar), priv->reg_bar); +} + +static int +gve_attach(device_t dev) +{ + struct gve_priv *priv; + int rid; + int err; + + priv = device_get_softc(dev); + priv->dev = dev; + GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(GVE_REGISTER_BAR); + priv->reg_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (priv->reg_bar == NULL) { + device_printf(dev, "Failed to allocate BAR0\n"); + err = ENXIO; + goto abort; + } + + rid = PCIR_BAR(GVE_DOORBELL_BAR); + priv->db_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (priv->db_bar == NULL) { + device_printf(dev, "Failed to allocate BAR2\n"); + err = ENXIO; + goto abort; + } + + rid = pci_msix_table_bar(priv->dev); + priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (priv->msix_table == NULL) { + device_printf(dev, "Failed to allocate msix table\n"); + err = ENXIO; + goto abort; + } + + err = gve_alloc_adminq_and_describe_device(priv); + if (err != 0) + goto abort; + + err = gve_configure_resources(priv); + if (err != 0) + goto abort; + + err = gve_alloc_rings(priv); + if (err != 0) + goto abort; + + err = gve_setup_ifnet(dev, priv); + if (err != 0) + goto abort; + + priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; + + bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, + sizeof(GVE_DRIVER_VERSION) - 1); + + TASK_INIT(&priv->service_task, 0, gve_service_task, priv); + priv->service_tq = taskqueue_create("gve service", M_WAITOK | M_ZERO, + taskqueue_thread_enqueue, &priv->service_tq); + taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", + device_get_nameunit(priv->dev)); + + gve_setup_sysctl(priv); + + if (bootverbose) + device_printf(priv->dev, "Successfully attached 
%s", GVE_DRIVER_VERSION); + return (0); + +abort: + gve_free_rings(priv); + gve_deconfigure_resources(priv); + gve_release_adminq(priv); + gve_free_sys_res_mem(priv); + GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); + return (err); +} + +static int +gve_detach(device_t dev) +{ + struct gve_priv *priv = device_get_softc(dev); + if_t ifp = priv->ifp; + + ether_ifdetach(ifp); + + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_destroy(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + + gve_free_rings(priv); + gve_free_sys_res_mem(priv); + GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); + + while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) + taskqueue_drain(priv->service_tq, &priv->service_task); + taskqueue_free(priv->service_tq); + + if_free(ifp); + return (bus_generic_detach(dev)); +} + +static device_method_t gve_methods[] = { + DEVMETHOD(device_probe, gve_probe), + DEVMETHOD(device_attach, gve_attach), + DEVMETHOD(device_detach, gve_detach), + DEVMETHOD_END +}; + +static driver_t gve_driver = { + "gve", + gve_methods, + sizeof(struct gve_priv) +}; + +#if __FreeBSD_version < 1301503 +static devclass_t gve_devclass; + +DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); +#else +DRIVER_MODULE(gve, pci, gve_driver, 0, 0); +#endif diff --git a/sys/dev/gve/gve_plat.h b/sys/dev/gve/gve_plat.h new file mode 100644 index 000000000000..ad6bc1c92b36 --- /dev/null +++ b/sys/dev/gve/gve_plat.h @@ -0,0 +1,94 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _GVE_PLAT_FBSD_H +#define _GVE_PLAT_FBSD_H + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bitset.h> +#include <sys/bus.h> +#include <sys/endian.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/time.h> + +#include <net/bpf.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_types.h> +#include <net/if_var.h> +#include <net/if_vlan_var.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/tcp_lro.h> +#include <netinet/udp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> + +#include <machine/atomic.h> +#include <machine/bus.h> +#include <machine/in_cksum.h> +#include <machine/resource.h> + +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> + +typedef uint16_t __be16; +typedef uint32_t __be32; +typedef uint64_t __be64; +#define BIT(nr) (1UL << (nr)) + +#define FBSD_VERSION_MAJOR (__FreeBSD_version / 100000) +#define FBSD_VERSION_MINOR ((__FreeBSD_version / 1000) - FBSD_VERSION_MAJOR * 100) +#define FBSD_VERSION_PATCH (__FreeBSD_version - ((FBSD_VERSION_MAJOR * 100 + FBSD_VERSION_MINOR) * 1000)) + +#endif // _GVE_PLAT_FBSD_H diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c new file mode 100644 index 000000000000..891d132d2f10 --- /dev/null +++ b/sys/dev/gve/gve_qpl.c @@ -0,0 +1,284 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * 
are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <sys/malloc.h> + +#include "gve.h" +#include "gve_adminq.h" + +static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations"); + +static uint32_t +gve_num_tx_qpls(struct gve_priv *priv) +{ + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return (0); + + return (priv->tx_cfg.max_queues); +} + +static uint32_t +gve_num_rx_qpls(struct gve_priv *priv) +{ + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return (0); + + return (priv->rx_cfg.max_queues); +} + +static void +gve_free_qpl(struct gve_priv *priv, uint32_t id) +{ + struct gve_queue_page_list *qpl = &priv->qpls[id]; + int i; + + for (i = 0; i < qpl->num_dmas; i++) { + gve_dmamap_destroy(&qpl->dmas[i]); + } + + if (qpl->kva) { + pmap_qremove(qpl->kva, qpl->num_pages); + kva_free(qpl->kva, PAGE_SIZE * qpl->num_pages); + } + + for (i = 0; i < qpl->num_pages; i++) { + /* + * Free the page only if this is the last ref. + * Tx pages are known to have no other refs at + * this point, but Rx pages might still be in + * use by the networking stack, see gve_mextadd_free. 
+ */ + if (vm_page_unwire_noq(qpl->pages[i])) { + if (!qpl->kva) { + pmap_qremove((vm_offset_t)qpl->dmas[i].cpu_addr, 1); + kva_free((vm_offset_t)qpl->dmas[i].cpu_addr, PAGE_SIZE); + } + vm_page_free(qpl->pages[i]); + } + + priv->num_registered_pages--; + } + + if (qpl->pages != NULL) + free(qpl->pages, M_GVE_QPL); + + if (qpl->dmas != NULL) + free(qpl->dmas, M_GVE_QPL); +} + +static int +gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva) +{ + struct gve_queue_page_list *qpl = &priv->qpls[id]; + int err; + int i; + + if (npages + priv->num_registered_pages > priv->max_registered_pages) { + device_printf(priv->dev, "Reached max number of registered pages %lu > %lu\n", + npages + priv->num_registered_pages, + priv->max_registered_pages); + return (EINVAL); + } + + qpl->id = id; + qpl->num_pages = 0; + qpl->num_dmas = 0; + + qpl->dmas = malloc(npages * sizeof(*qpl->dmas), M_GVE_QPL, + M_WAITOK | M_ZERO); + + qpl->pages = malloc(npages * sizeof(*qpl->pages), M_GVE_QPL, + M_WAITOK | M_ZERO); + + qpl->kva = 0; + if (single_kva) { + qpl->kva = kva_alloc(PAGE_SIZE * npages); + if (!qpl->kva) { + device_printf(priv->dev, "Failed to create the single kva for QPL %d\n", id); + err = ENOMEM; + goto abort; + } + } + + for (i = 0; i < npages; i++) { + qpl->pages[i] = vm_page_alloc_noobj(VM_ALLOC_WIRED | + VM_ALLOC_WAITOK | + VM_ALLOC_ZERO); + + if (!single_kva) { + qpl->dmas[i].cpu_addr = (void *)kva_alloc(PAGE_SIZE); + if (!qpl->dmas[i].cpu_addr) { + device_printf(priv->dev, "Failed to create kva for page %d in QPL %d", i, id); + err = ENOMEM; + goto abort; + } + pmap_qenter((vm_offset_t)qpl->dmas[i].cpu_addr, &(qpl->pages[i]), 1); + } else + qpl->dmas[i].cpu_addr = (void *)(qpl->kva + (PAGE_SIZE * i)); + + + qpl->num_pages++; + } + + if (single_kva) + pmap_qenter(qpl->kva, qpl->pages, npages); + + for (i = 0; i < npages; i++) { + err = gve_dmamap_create(priv, /*size=*/PAGE_SIZE, /*align=*/PAGE_SIZE, + &qpl->dmas[i]); + if (err != 0) { + 
device_printf(priv->dev, "Failed to dma-map page %d in QPL %d\n", i, id); + goto abort; + } + + qpl->num_dmas++; + priv->num_registered_pages++; + } + + return (0); + +abort: + gve_free_qpl(priv, id); + return (err); +} + +void +gve_free_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int i; + + if (num_qpls == 0) + return; + + if (priv->qpls != NULL) { + for (i = 0; i < num_qpls; i++) + gve_free_qpl(priv, i); + free(priv->qpls, M_GVE_QPL); + } +} + +int gve_alloc_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + if (num_qpls == 0) + return (0); + + priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL, + M_WAITOK | M_ZERO); + + for (i = 0; i < gve_num_tx_qpls(priv); i++) { + err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, + /*single_kva=*/true); + if (err != 0) + goto abort; + } + + for (; i < num_qpls; i++) { + err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false); + if (err != 0) + goto abort; + } + + return (0); + +abort: + gve_free_qpls(priv); + return (err); +} + +static int +gve_unregister_n_qpls(struct gve_priv *priv, int n) +{ + int err; + int i; + + for (i = 0; i < n; i++) { + err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id); + if (err != 0) { + device_printf(priv->dev, + "Failed to unregister qpl %d, err: %d\n", + priv->qpls[i].id, err); + } + } + + if (err != 0) + return (err); + + return (0); +} + +int +gve_register_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) + return (0); + + for (i = 0; i < num_qpls; i++) { + err = gve_adminq_register_page_list(priv, &priv->qpls[i]); + if (err != 0) { + device_printf(priv->dev, + "Failed to register qpl %d, err: %d\n", + priv->qpls[i].id, err); + goto abort; + } + } + + gve_set_state_flag(priv, 
GVE_STATE_FLAG_QPLREG_OK); + return (0); + +abort: + gve_unregister_n_qpls(priv, i); + return (err); +} + +int +gve_unregister_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) + return (0); + + err = gve_unregister_n_qpls(priv, num_qpls); + if (err != 0) + return (err); + + gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); + return (0); +} diff --git a/sys/dev/gve/gve_register.h b/sys/dev/gve/gve_register.h new file mode 100644 index 000000000000..b3d4003faaa7 --- /dev/null +++ b/sys/dev/gve/gve_register.h @@ -0,0 +1,54 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
#ifndef _GVE_REGISTER_H_
#define _GVE_REGISTER_H_

/*
 * Fixed Configuration Registers
 *
 * Symbolic names for the device's fixed configuration registers.
 * NOTE(review): the values look like byte offsets into the register BAR
 * (most are 4 apart); confirm against the register accessors that use them.
 */
#define DEVICE_STATUS		0
#define DRIVER_STATUS		4
#define MAX_TX_QUEUES		8
#define MAX_RX_QUEUES		12
#define ADMINQ_PFN		16
#define ADMINQ_DOORBELL		20
#define ADMINQ_EVENT_COUNTER	24
#define RESERVED		28
#define DRIVER_VERSION		31
#define ADMINQ_BASE_ADDRESS_HI	32
#define ADMINQ_BASE_ADDRESS_LO	36
#define ADMINQ_LENGTH		40


/*
 * Single-bit masks built with BIT().
 * NOTE(review): presumably tested against the value read from
 * DEVICE_STATUS -- confirm at the call sites.
 */
enum gve_device_status_flags {
	GVE_DEVICE_STATUS_RESET_MASK		= BIT(1),
	GVE_DEVICE_STATUS_LINK_STATUS_MASK	= BIT(2),
};

#endif /* _GVE_REGISTER_H_ */
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" +#include "gve_adminq.h" + +static void +gve_rx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + /* Safe to call even if never allocated */ + gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); + + if (rx->page_info != NULL) { + free(rx->page_info, M_GVE); + rx->page_info = NULL; + } + + if (rx->data_ring != NULL) { + gve_dma_free_coherent(&rx->data_ring_mem); + rx->data_ring = NULL; + } + + if (rx->desc_ring != NULL) { + gve_dma_free_coherent(&rx->desc_ring_mem); + rx->desc_ring = NULL; + } + + if (com->q_resources != NULL) { + gve_dma_free_coherent(&com->q_resources_mem); + com->q_resources = NULL; + } +} + +static void +gve_prefill_rx_slots(struct gve_rx_ring *rx) +{ + struct gve_ring_com *com = &rx->com; + struct gve_dma_handle *dma; + int i; + + for (i = 0; i < com->priv->rx_desc_cnt; i++) { + rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i); + 
rx->page_info[i].page_offset = 0; + rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr; + rx->page_info[i].page = com->qpl->pages[i]; + + dma = &com->qpl->dmas[i]; + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD); + } + + bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static int +gve_rx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + int err; + + com->priv = priv; + com->id = i; + + rx->mask = priv->rx_pages_per_qpl - 1; + + com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + if (com->qpl == NULL) { + device_printf(priv->dev, "No QPL left for rx ring %d", i); + return (ENOMEM); + } + + rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, + M_WAITOK | M_ZERO); + + gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); + + err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), + PAGE_SIZE, &com->q_resources_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i); + goto abort; + } + com->q_resources = com->q_resources_mem.cpu_addr; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); + goto abort; + } + rx->desc_ring = rx->desc_ring_mem.cpu_addr; + + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->data_ring_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); + goto abort; + } + rx->data_ring = rx->data_ring_mem.cpu_addr; + + gve_prefill_rx_slots(rx); + return (0); + +abort: + gve_rx_free_ring(priv, i); + return (err); +} + +int +gve_alloc_rx_rings(struct gve_priv *priv) +{ + int err = 0; + int i; + + priv->rx = malloc(sizeof(struct gve_rx_ring) * 
priv->rx_cfg.num_queues, + M_GVE, M_WAITOK | M_ZERO); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + err = gve_rx_alloc_ring(priv, i); + if (err != 0) + goto free_rings; + } + + return (0); + +free_rings: + while (i--) + gve_rx_free_ring(priv, i); + free(priv->rx, M_GVE); + return (err); +} + +void +gve_free_rx_rings(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_free_ring(priv, i); + + free(priv->rx, M_GVE); +} + +static void +gve_rx_clear_data_ring(struct gve_rx_ring *rx) +{ + struct gve_priv *priv = rx->com.priv; + int i; + + /* + * The Rx data ring has this invariant: "the networking stack is not + * using the buffer beginning at any page_offset". This invariant is + * established initially by gve_prefill_rx_slots at alloc-time and is + * maintained by the cleanup taskqueue. This invariant implies that the + * ring can be considered to be fully posted with buffers at this point, + * even if there are unfreed mbufs still being processed, which is why we + * can fill the ring without waiting on can_flip at each slot to become true. 
+ */ + for (i = 0; i < priv->rx_desc_cnt; i++) { + rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i + + rx->page_info[i].page_offset); + rx->fill_cnt++; + } + + bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_rx_clear_desc_ring(struct gve_rx_ring *rx) +{ + struct gve_priv *priv = rx->com.priv; + int i; + + for (i = 0; i < priv->rx_desc_cnt; i++) + rx->desc_ring[i] = (struct gve_rx_desc){}; + + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_clear_rx_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + + rx->seq_no = 1; + rx->cnt = 0; + rx->fill_cnt = 0; + rx->mask = priv->rx_desc_cnt - 1; + + gve_rx_clear_desc_ring(rx); + gve_rx_clear_data_ring(rx); +} + +static void +gve_start_rx_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) { + if (tcp_lro_init(&rx->lro) != 0) + device_printf(priv->dev, "Failed to init lro for rx ring %d", i); + rx->lro.ifp = priv->ifp; + } + + NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); + com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, + taskqueue_thread_enqueue, &com->cleanup_tq); + + taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, + "%s rxq %d", device_get_nameunit(priv->dev), i); + + gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); +} + +int +gve_create_rx_rings(struct gve_priv *priv) +{ + struct gve_ring_com *com; + struct gve_rx_ring *rx; + int err; + int i; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) + return (0); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_clear_rx_ring(priv, i); + + err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); + if (err != 0) + return (err); + + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_POSTREAD); + + for (i = 0; i < 
priv->rx_cfg.num_queues; i++) { + rx = &priv->rx[i]; + com = &rx->com; + + com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); + + bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, + BUS_DMASYNC_POSTREAD); + com->db_offset = 4 * be32toh(com->q_resources->db_index); + com->counter_idx = be32toh(com->q_resources->counter_index); + + gve_start_rx_ring(priv, i); + } + + gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); + return (0); +} + +static void +gve_stop_rx_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + if (com->cleanup_tq != NULL) { + taskqueue_quiesce(com->cleanup_tq); + taskqueue_free(com->cleanup_tq); + com->cleanup_tq = NULL; + } + + tcp_lro_free(&rx->lro); + rx->ctx = (struct gve_rx_ctx){}; +} + +int +gve_destroy_rx_rings(struct gve_priv *priv) +{ + int err; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_stop_rx_ring(priv, i); + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) { + err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); + if (err != 0) + return (err); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); + } + + return (0); +} + +int +gve_rx_intr(void *arg) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + struct gve_ring_com *com = &rx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); + return (FILTER_HANDLED); +} + +static inline void +gve_set_rss_type(__be16 flag, struct mbuf *mbuf) +{ + if ((flag & GVE_RXF_IPV4) != 0) { + if ((flag & GVE_RXF_TCP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); + else if ((flag & GVE_RXF_UDP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); + else + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); + return; + } + + if ((flag & 
GVE_RXF_IPV6) != 0) { + if ((flag & GVE_RXF_TCP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); + else if ((flag & GVE_RXF_UDP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); + else + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); + return; + } +} + +static void +gve_mextadd_free(struct mbuf *mbuf) +{ + vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; + vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; + + /* + * Free the page only if this is the last ref. + * The interface might no longer exist by the time + * this callback is called, see gve_free_qpl. + */ + if (__predict_false(vm_page_unwire_noq(page))) { + pmap_qremove(va, 1); + kva_free(va, PAGE_SIZE); + vm_page_free(page); + } +} + +static void +gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) +{ + const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET); + page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; + *(slot_addr) ^= offset; +} + +static struct mbuf * +gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_slot_page_info *page_info, uint16_t len, + union gve_rx_data_slot *data_slot, bool is_only_frag) +{ + struct gve_rx_ctx *ctx = &rx->ctx; + struct mbuf *mbuf; + u_int ref_count; + bool can_flip; + + uint32_t offset = page_info->page_offset + page_info->pad; + void *va = (char *)page_info->page_address + offset; + + if (len <= priv->rx_copybreak && is_only_frag) { + mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); + if (__predict_false(mbuf == NULL)) + return (NULL); + + m_copyback(mbuf, 0, len, va); + counter_enter(); + counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); + counter_exit(); + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + struct mbuf *mbuf_tail = ctx->mbuf_tail; + KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf")); + + /* + * This page was created with VM_ALLOC_WIRED, thus the lowest + * wire count experienced by the page until the interface is + * destroyed is 1. 
+ * + * We wire the page again before supplying an mbuf pointing to + * it to the networking stack, so before the mbuf leaves the + * driver, the wire count rises to 2. + * + * If it is 1 again, it necessarily means that the mbuf has been + * consumed and it was gve_mextadd_free that brought down the wire + * count back to 1. We only need to eventually observe the 1. + */ + ref_count = atomic_load_int(&page_info->page->ref_count); + can_flip = VPRC_WIRE_COUNT(ref_count) == 1; + + if (mbuf_tail == NULL) { + if (can_flip) + mbuf = m_gethdr(M_NOWAIT, MT_DATA); + else + mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + if (can_flip) + mbuf = m_get(M_NOWAIT, MT_DATA); + else + mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); + + mbuf_tail->m_next = mbuf; + ctx->mbuf_tail = mbuf; + } + + if (__predict_false(mbuf == NULL)) + return (NULL); + + if (can_flip) { + MEXTADD(mbuf, va, len, gve_mextadd_free, + page_info->page, page_info->page_address, + 0, EXT_NET_DRV); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); + counter_exit(); + + /* + * Grab an extra ref to the page so that gve_mextadd_free + * does not end up freeing the page while the interface exists. 
+ */ + vm_page_wire(page_info->page); + + gve_rx_flip_buff(page_info, &data_slot->qpl_offset); + } else { + m_copyback(mbuf, 0, len, va); + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); + counter_exit(); + } + } + + mbuf->m_len = len; + ctx->total_size += len; + + return (mbuf); +} + +static inline bool +gve_needs_rss(__be16 flag) +{ + if ((flag & GVE_RXF_FRAG) != 0) + return (false); + if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0) + return (true); + return (false); +} + +static void +gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc, + uint32_t idx) +{ + struct gve_rx_slot_page_info *page_info; + struct gve_dma_handle *page_dma_handle; + union gve_rx_data_slot *data_slot; + struct gve_rx_ctx *ctx = &rx->ctx; + struct mbuf *mbuf = NULL; + if_t ifp = priv->ifp; + bool do_if_input; + uint16_t len; + + bool is_first_frag = ctx->frag_cnt == 0; + bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq); + bool is_only_frag = is_first_frag && is_last_frag; + + if (__predict_false(ctx->drop_pkt)) + goto finish_frag; + + if ((desc->flags_seq & GVE_RXF_ERR) != 0) { + ctx->drop_pkt = true; + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + goto finish_frag; + } + + page_info = &rx->page_info[idx]; + data_slot = &rx->data_ring[idx]; + page_dma_handle = &(rx->com.qpl->dmas[idx]); + + page_info->pad = is_first_frag ? 
GVE_RX_PAD : 0; + len = be16toh(desc->len) - page_info->pad; + + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_POSTREAD); + + mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot, + is_only_frag); + if (mbuf == NULL) { + ctx->drop_pkt = true; + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + goto finish_frag; + } + + if (is_first_frag) { + mbuf->m_pkthdr.rcvif = priv->ifp; + + if (gve_needs_rss(desc->flags_seq)) { + gve_set_rss_type(desc->flags_seq, mbuf); + mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash); + } + + if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) { + mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | + CSUM_IP_VALID | + CSUM_DATA_VALID | + CSUM_PSEUDO_HDR; + mbuf->m_pkthdr.csum_data = 0xffff; + } + } + + if (is_last_frag) { + mbuf = ctx->mbuf_head; + mbuf->m_pkthdr.len = ctx->total_size; + do_if_input = true; + + if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) && /* LRO is enabled */ + (desc->flags_seq & GVE_RXF_TCP) && /* pkt is a TCP pkt */ + ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */ + (rx->lro.lro_cnt != 0) && /* LRO resources exist */ + (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) + do_if_input = false; + + if (do_if_input) + if_input(ifp, mbuf); + + counter_enter(); + counter_u64_add_protected(rx->stats.rbytes, ctx->total_size); + counter_u64_add_protected(rx->stats.rpackets, 1); + counter_exit(); + } + +finish_frag: + ctx->frag_cnt++; + if (is_last_frag) + rx->ctx = (struct gve_rx_ctx){}; +} + +static bool +gve_rx_work_pending(struct gve_rx_ring *rx) +{ + struct gve_rx_desc *desc; + __be16 flags_seq; + uint32_t next_idx; + + next_idx = rx->cnt & rx->mask; + desc = rx->desc_ring + next_idx; + + flags_seq = desc->flags_seq; + + return (GVE_SEQNO(flags_seq) == rx->seq_no); +} + +static inline uint8_t 
+gve_next_seqno(uint8_t seq) +{ + return ((seq + 1) == 8 ? 1 : seq + 1); +} + +static void +gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) +{ + uint32_t idx = rx->cnt & rx->mask; + struct gve_rx_desc *desc; + struct gve_rx_ctx *ctx = &rx->ctx; + uint32_t work_done = 0; + + NET_EPOCH_ASSERT(); + + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_POSTREAD); + desc = &rx->desc_ring[idx]; + + while ((work_done < budget || ctx->frag_cnt) && + (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) { + + gve_rx(priv, rx, desc, idx); + + rx->cnt++; + idx = rx->cnt & rx->mask; + desc = &rx->desc_ring[idx]; + rx->seq_no = gve_next_seqno(rx->seq_no); + work_done++; + } + + /* The device will only send whole packets. */ + if (__predict_false(ctx->frag_cnt)) { + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; + device_printf(priv->dev, + "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset", + GVE_SEQNO(desc->flags_seq), rx->seq_no); + gve_schedule_reset(priv); + } + + if (work_done != 0) + tcp_lro_flush_all(&rx->lro); + + bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, + BUS_DMASYNC_PREWRITE); + + /* Buffers are refilled as the descs are processed */ + rx->fill_cnt += work_done; + gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt); +} + +void +gve_rx_cleanup_tq(void *arg, int pending) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + gve_rx_cleanup(priv, rx, /*budget=*/128); + + gve_db_bar_write_4(priv, rx->com.irq_db_offset, + GVE_IRQ_ACK | GVE_IRQ_EVENT); + + /* + * Fragments received before this barrier MAY NOT cause the NIC to send an + * interrupt but they will still be handled by the enqueue below. + * Fragments received after the barrier WILL trigger an interrupt. 
+ */ + mb(); + + if (gve_rx_work_pending(rx)) { + gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); + } +} diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c new file mode 100644 index 000000000000..924654f62adc --- /dev/null +++ b/sys/dev/gve/gve_sysctl.c @@ -0,0 +1,261 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "gve.h" + +static void +gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_rx_ring *rxq) +{ + struct sysctl_oid *node; + struct sysctl_oid_list *list; + struct gve_rxq_stats *stats; + char namebuf[16]; + + snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->com.id); + node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue"); + list = SYSCTL_CHILDREN(node); + + stats = &rxq->stats; + + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_bytes", CTLFLAG_RD, + &stats->rbytes, "Bytes received"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_packets", CTLFLAG_RD, + &stats->rpackets, "Packets received"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_copybreak_cnt", + CTLFLAG_RD, &stats->rx_copybreak_cnt, + "Total frags with mbufs allocated for copybreak"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_flip_cnt", + CTLFLAG_RD, &stats->rx_frag_flip_cnt, + "Total frags that allocated mbuf with page flip"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_copy_cnt", + CTLFLAG_RD, &stats->rx_frag_copy_cnt, + "Total frags with mbuf that copied payload into mbuf"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt", + CTLFLAG_RD, &stats->rx_dropped_pkt, + "Total rx packets dropped"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_dropped_pkt_desc_err", CTLFLAG_RD, + &stats->rx_dropped_pkt_desc_err, + "Packets dropped due to descriptor error"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, + &stats->rx_dropped_pkt_mbuf_alloc_fail, + "Packets dropped due to failed mbuf allocation"); + SYSCTL_ADD_U32(ctx, list, OID_AUTO, + "rx_completed_desc", CTLFLAG_RD, + &rxq->cnt, 0, "Number of descriptors completed"); + SYSCTL_ADD_U32(ctx, list, OID_AUTO, + "num_desc_posted", CTLFLAG_RD, + &rxq->fill_cnt, rxq->fill_cnt, + "Toal number of descriptors posted"); +} + +static void 
+gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_tx_ring *txq) +{ + struct sysctl_oid *node; + struct sysctl_oid_list *tx_list; + struct gve_txq_stats *stats; + char namebuf[16]; + + snprintf(namebuf, sizeof(namebuf), "txq%d", txq->com.id); + node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue"); + tx_list = SYSCTL_CHILDREN(node); + + stats = &txq->stats; + + SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, + "tx_posted_desc", CTLFLAG_RD, + &txq->req, 0, "Number of descriptors posted by NIC"); + SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, + "tx_completed_desc", CTLFLAG_RD, + &txq->done, 0, "Number of descriptors completed"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_packets", CTLFLAG_RD, + &stats->tpackets, "Packets transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_tso_packets", CTLFLAG_RD, + &stats->tso_packet_cnt, "TSO Packets transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_bytes", CTLFLAG_RD, + &stats->tbytes, "Bytes transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_dropped_pkt_nospace_device", CTLFLAG_RD, + &stats->tx_dropped_pkt_nospace_device, + "Packets dropped due to no space in device"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD, + &stats->tx_dropped_pkt_nospace_bufring, + "Packets dropped due to no space in br ring"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_dropped_pkt_vlan", CTLFLAG_RD, + &stats->tx_dropped_pkt_vlan, + "Dropped VLAN packets"); +} + +static void +gve_setup_queue_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, + struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + gve_setup_rxq_sysctl(ctx, child, &priv->rx[i]); + } + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + gve_setup_txq_sysctl(ctx, child, &priv->tx[i]); + } +} + +static void 
+gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_priv *priv) +{ + struct sysctl_oid *admin_node; + struct sysctl_oid_list *admin_list; + + /* Admin queue stats */ + admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "adminq_stats", + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue statistics"); + admin_list = SYSCTL_CHILDREN(admin_node); + + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_prod_cnt", CTLFLAG_RD, + &priv->adminq_prod_cnt, 0, "Adminq Commands issued"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cmd_fail", CTLFLAG_RD, + &priv->adminq_cmd_fail, 0, "Aqminq Failed commands"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_timeouts", CTLFLAG_RD, + &priv->adminq_timeouts, 0, "Adminq Timedout commands"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_describe_device_cnt", + CTLFLAG_RD, &priv->adminq_describe_device_cnt, 0, + "adminq_describe_device_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_cfg_device_resources_cnt", CTLFLAG_RD, + &priv->adminq_cfg_device_resources_cnt, 0, + "adminq_cfg_device_resources_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_register_page_list_cnt", CTLFLAG_RD, + &priv->adminq_register_page_list_cnt, 0, + "adminq_register_page_list_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_unregister_page_list_cnt", CTLFLAG_RD, + &priv->adminq_unregister_page_list_cnt, 0, + "adminq_unregister_page_list_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_tx_queue_cnt", + CTLFLAG_RD, &priv->adminq_create_tx_queue_cnt, 0, + "adminq_create_tx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_rx_queue_cnt", + CTLFLAG_RD, &priv->adminq_create_rx_queue_cnt, 0, + "adminq_create_rx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_tx_queue_cnt", + CTLFLAG_RD, &priv->adminq_destroy_tx_queue_cnt, 0, + "adminq_destroy_tx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, 
"adminq_destroy_rx_queue_cnt", + CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, + "adminq_destroy_rx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, + &priv->adminq_dcfg_device_resources_cnt, 0, + "adminq_dcfg_device_resources_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_set_driver_parameter_cnt", CTLFLAG_RD, + &priv->adminq_set_driver_parameter_cnt, 0, + "adminq_set_driver_parameter_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_verify_driver_compatibility_cnt", CTLFLAG_RD, + &priv->adminq_verify_driver_compatibility_cnt, 0, + "adminq_verify_driver_compatibility_cnt"); +} + +static void +gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_priv *priv) +{ + struct sysctl_oid *main_node; + struct sysctl_oid_list *main_list; + + /* Main stats */ + main_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "main_stats", + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Main statistics"); + main_list = SYSCTL_CHILDREN(main_node); + + SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_up_cnt", CTLFLAG_RD, + &priv->interface_up_cnt, 0, "Times interface was set to up"); + SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_down_cnt", CTLFLAG_RD, + &priv->interface_down_cnt, 0, "Times interface was set to down"); + SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "reset_cnt", CTLFLAG_RD, + &priv->reset_cnt, 0, "Times reset"); +} + +void gve_setup_sysctl(struct gve_priv *priv) +{ + device_t dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + + dev = priv->dev; + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + gve_setup_queue_stat_sysctl(ctx, child, priv); + gve_setup_adminq_stat_sysctl(ctx, child, priv); + gve_setup_main_stat_sysctl(ctx, child, priv); +} + +void +gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, + uint64_t *rbytes, uint64_t 
*rx_dropped_pkt, uint64_t *tpackets, + uint64_t *tbytes, uint64_t *tx_dropped_pkt) +{ + struct gve_rxq_stats *rxqstats; + struct gve_txq_stats *txqstats; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rxqstats = &priv->rx[i].stats; + *rpackets += counter_u64_fetch(rxqstats->rpackets); + *rbytes += counter_u64_fetch(rxqstats->rbytes); + *rx_dropped_pkt += counter_u64_fetch(rxqstats->rx_dropped_pkt); + } + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + txqstats = &priv->tx[i].stats; + *tpackets += counter_u64_fetch(txqstats->tpackets); + *tbytes += counter_u64_fetch(txqstats->tbytes); + *tx_dropped_pkt += counter_u64_fetch(txqstats->tx_dropped_pkt); + } +} diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c new file mode 100644 index 000000000000..dd48bc35a258 --- /dev/null +++ b/sys/dev/gve/gve_tx.c @@ -0,0 +1,806 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" +#include "gve_adminq.h" + +#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 + +static int +gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_queue_page_list *qpl = tx->com.qpl; + struct gve_tx_fifo *fifo = &tx->fifo; + + fifo->size = qpl->num_pages * PAGE_SIZE; + fifo->base = qpl->kva; + atomic_store_int(&fifo->available, fifo->size); + fifo->head = 0; + + return (0); +} + +static void +gve_tx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + /* Safe to call even if never alloced */ + gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); + + if (tx->br != NULL) { + buf_ring_free(tx->br, M_DEVBUF); + tx->br = NULL; + } + + if (mtx_initialized(&tx->ring_mtx)) + mtx_destroy(&tx->ring_mtx); + + if (tx->info != NULL) { + free(tx->info, M_GVE); + tx->info = NULL; + } + + if (tx->desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->desc_ring = NULL; + } + + if (com->q_resources != NULL) { + gve_dma_free_coherent(&com->q_resources_mem); + com->q_resources = NULL; + } +} + +static int +gve_tx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + char mtx_name[16]; + int err; + + com->priv = priv; + com->id = i; + + com->qpl = &priv->qpls[i]; + if (com->qpl == NULL) { + device_printf(priv->dev, "No QPL left for tx ring %d\n", 
i); + return (ENOMEM); + } + + err = gve_tx_fifo_init(priv, tx); + if (err != 0) + goto abort; + + tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, + M_GVE, M_WAITOK | M_ZERO); + + sprintf(mtx_name, "gvetx%d", i); + mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); + + tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, + M_WAITOK, &tx->ring_mtx); + + gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); + + err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), + PAGE_SIZE, &com->q_resources_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); + goto abort; + } + com->q_resources = com->q_resources_mem.cpu_addr; + + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->desc_ring = tx->desc_ring_mem.cpu_addr; + + return (0); + +abort: + gve_tx_free_ring(priv, i); + return (err); +} + +int +gve_alloc_tx_rings(struct gve_priv *priv) +{ + int err = 0; + int i; + + priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, + M_GVE, M_WAITOK | M_ZERO); + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + err = gve_tx_alloc_ring(priv, i); + if (err != 0) + goto free_rings; + + } + + return (0); + +free_rings: + while (i--) + gve_tx_free_ring(priv, i); + free(priv->tx, M_GVE); + return (err); +} + +void +gve_free_tx_rings(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_tx_free_ring(priv, i); + + free(priv->tx, M_GVE); +} + +static void +gve_tx_clear_desc_ring(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int i; + + for (i = 0; i < com->priv->tx_desc_cnt; i++) { + tx->desc_ring[i] = (union gve_tx_desc){}; + tx->info[i] = (struct gve_tx_buffer_state){}; + } + + bus_dmamap_sync(tx->desc_ring_mem.tag, 
tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_clear_tx_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_tx_fifo *fifo = &tx->fifo; + + tx->req = 0; + tx->done = 0; + tx->mask = priv->tx_desc_cnt - 1; + + atomic_store_int(&fifo->available, fifo->size); + fifo->head = 0; + + gve_tx_clear_desc_ring(tx); +} + +static void +gve_start_tx_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); + com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, + taskqueue_thread_enqueue, &com->cleanup_tq); + taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", + device_get_nameunit(priv->dev), i); + + TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx); + tx->xmit_tq = taskqueue_create_fast("gve tx xmit", + M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); + taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", + device_get_nameunit(priv->dev), i); +} + +int +gve_create_tx_rings(struct gve_priv *priv) +{ + struct gve_ring_com *com; + struct gve_tx_ring *tx; + int err; + int i; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) + return (0); + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_clear_tx_ring(priv, i); + + err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); + if (err != 0) + return (err); + + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_POSTREAD); + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + tx = &priv->tx[i]; + com = &tx->com; + + com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); + + bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, + BUS_DMASYNC_POSTREAD); + com->db_offset = 4 * be32toh(com->q_resources->db_index); + com->counter_idx = be32toh(com->q_resources->counter_index); + + gve_start_tx_ring(priv, i); + } + + gve_set_state_flag(priv, 
GVE_STATE_FLAG_TX_RINGS_OK); + return (0); +} + +static void +gve_stop_tx_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + if (com->cleanup_tq != NULL) { + taskqueue_quiesce(com->cleanup_tq); + taskqueue_free(com->cleanup_tq); + com->cleanup_tq = NULL; + } + + if (tx->xmit_tq != NULL) { + taskqueue_quiesce(tx->xmit_tq); + taskqueue_free(tx->xmit_tq); + tx->xmit_tq = NULL; + } +} + +int +gve_destroy_tx_rings(struct gve_priv *priv) +{ + int err; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_stop_tx_ring(priv, i); + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { + err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); + if (err != 0) + return (err); + gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); + } + + return (0); +} + +int +gve_tx_intr(void *arg) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + struct gve_ring_com *com = &tx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static uint32_t +gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, + BUS_DMASYNC_POSTREAD); + uint32_t counter = priv->counters[tx->com.counter_idx]; + return (be32toh(counter)); +} + +static void +gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) +{ + atomic_add_int(&fifo->available, bytes); +} + +void +gve_tx_cleanup_tq(void *arg, int pending) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + uint32_t nic_done = gve_tx_load_event_counter(priv, tx); + uint32_t todo = nic_done - tx->done; + size_t space_freed = 0; + int i, j; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 
0)) + return; + + for (j = 0; j < todo; j++) { + uint32_t idx = tx->done & tx->mask; + struct gve_tx_buffer_state *info = &tx->info[idx]; + struct mbuf *mbuf = info->mbuf; + + tx->done++; + if (mbuf == NULL) + continue; + + info->mbuf = NULL; + counter_enter(); + counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); + counter_u64_add_protected(tx->stats.tpackets, 1); + counter_exit(); + m_freem(mbuf); + + for (i = 0; i < GVE_TX_MAX_DESCS; i++) { + space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; + info->iov[i].iov_len = 0; + info->iov[i].iov_padding = 0; + } + } + + gve_tx_free_fifo(&tx->fifo, space_freed); + + gve_db_bar_write_4(priv, tx->com.irq_db_offset, + GVE_IRQ_ACK | GVE_IRQ_EVENT); + + /* + * Completions born before this barrier MAY NOT cause the NIC to send an + * interrupt but they will still be handled by the enqueue below. + * Completions born after the barrier WILL trigger an interrupt. + */ + mb(); + + nic_done = gve_tx_load_event_counter(priv, tx); + todo = nic_done - tx->done; + if (todo != 0) { + gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + } +} + +static void +gve_dma_sync_for_device(struct gve_queue_page_list *qpl, + uint64_t iov_offset, uint64_t iov_len) +{ + uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; + uint64_t first_page = iov_offset / PAGE_SIZE; + struct gve_dma_handle *dma; + uint64_t page; + + for (page = first_page; page <= last_page; page++) { + dma = &(qpl->dmas[page]); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); + } +} + +static void +gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) +{ + mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; + mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; + mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); + mtd_desc->reserved0 = 0; + mtd_desc->reserved1 = 0; +} + +static void 
+gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, + uint16_t l4_hdr_offset, uint32_t desc_cnt, + uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, + int csum_offset, uint16_t pkt_len) +{ + if (is_tso) { + pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; + pkt_desc->l4_csum_offset = csum_offset >> 1; + pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; + } else if (has_csum_flag) { + pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; + pkt_desc->l4_csum_offset = csum_offset >> 1; + pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; + } else { + pkt_desc->type_flags = GVE_TXD_STD; + pkt_desc->l4_csum_offset = 0; + pkt_desc->l4_hdr_offset = 0; + } + pkt_desc->desc_cnt = desc_cnt; + pkt_desc->len = htobe16(pkt_len); + pkt_desc->seg_len = htobe16(first_seg_len); + pkt_desc->seg_addr = htobe64(addr); +} + +static void +gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, + bool is_tso, uint16_t len, uint64_t addr, + bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) +{ + seg_desc->type_flags = GVE_TXD_SEG; + if (is_tso) { + if (is_ipv6) + seg_desc->type_flags |= GVE_TXSF_IPV6; + seg_desc->l3_offset = l3_off >> 1; + seg_desc->mss = htobe16(tso_mss); + } + seg_desc->seg_len = htobe16(len); + seg_desc->seg_addr = htobe64(addr); +} + +static inline uint32_t +gve_tx_avail(struct gve_tx_ring *tx) +{ + return (tx->mask + 1 - (tx->req - tx->done)); +} + +static bool +gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) +{ + return (atomic_load_int(&fifo->available) >= bytes); +} + +static inline bool +gve_can_tx(struct gve_tx_ring *tx, int bytes_required) +{ + return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && + gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); +} + +static int +gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) +{ + return (fifo->head + bytes < fifo->size) ? 
0 : fifo->size - fifo->head; +} + +static inline int +gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, + uint16_t pkt_len) +{ + int pad_bytes, align_hdr_pad; + int bytes; + + pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); + /* We need to take into account the header alignment padding. */ + align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; + bytes = align_hdr_pad + pad_bytes + pkt_len; + + return (bytes); +} + +static int +gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, + struct gve_tx_iovec iov[2]) +{ + size_t overflow, padding; + uint32_t aligned_head; + int nfrags = 0; + + if (bytes == 0) + return (0); + + /* + * This check happens before we know how much padding is needed to + * align to a cacheline boundary for the payload, but that is fine, + * because the FIFO head always start aligned, and the FIFO's boundaries + * are aligned, so if there is space for the data, there is space for + * the padding to the next alignment. + */ + KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), + ("Allocating gve tx fifo when there is no room")); + + nfrags++; + + iov[0].iov_offset = fifo->head; + iov[0].iov_len = bytes; + fifo->head += bytes; + + if (fifo->head > fifo->size) { + /* + * If the allocation did not fit in the tail fragment of the + * FIFO, also use the head fragment. 
+ */ + nfrags++; + overflow = fifo->head - fifo->size; + iov[0].iov_len -= overflow; + iov[1].iov_offset = 0; /* Start of fifo*/ + iov[1].iov_len = overflow; + + fifo->head = overflow; + } + + /* Re-align to a cacheline boundary */ + aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); + padding = aligned_head - fifo->head; + iov[nfrags - 1].iov_padding = padding; + atomic_add_int(&fifo->available, -(bytes + padding)); + fifo->head = aligned_head; + + if (fifo->head == fifo->size) + fifo->head = 0; + + return (nfrags); +} + +/* Only error this returns is ENOBUFS when the tx fifo is short of space */ +static int +gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) +{ + bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; + int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; + uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; + int pad_bytes, hdr_nfrags, payload_nfrags; + struct gve_tx_pkt_desc *pkt_desc; + struct gve_tx_seg_desc *seg_desc; + struct gve_tx_mtd_desc *mtd_desc; + struct gve_tx_buffer_state *info; + uint32_t idx = tx->req & tx->mask; + struct ether_header *eh; + struct mbuf *mbuf_next; + int payload_iov = 2; + int bytes_required; + struct ip6_hdr *ip6; + struct tcphdr *th; + uint32_t next_idx; + uint8_t l3_off; + struct ip *ip; + int i; + + info = &tx->info[idx]; + csum_flags = mbuf->m_pkthdr.csum_flags; + pkt_len = mbuf->m_pkthdr.len; + is_tso = csum_flags & CSUM_TSO; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; + tso_mss = is_tso ? 
mbuf->m_pkthdr.tso_segsz : 0; + + eh = mtod(mbuf, struct ether_header *); + KASSERT(eh->ether_type != ETHERTYPE_VLAN, + ("VLAN-tagged packets not supported")); + + is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; + l3_off = ETHER_HDR_LEN; + mbuf_next = m_getptr(mbuf, l3_off, &offset); + + if (is_ipv6) { + ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); + l4_off = l3_off + sizeof(struct ip6_hdr); + is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); + is_udp = (ip6->ip6_nxt == IPPROTO_UDP); + mbuf_next = m_getptr(mbuf, l4_off, &offset); + } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + ip = (struct ip *)(mtodo(mbuf_next, offset)); + l4_off = l3_off + (ip->ip_hl << 2); + is_tcp = (ip->ip_p == IPPROTO_TCP); + is_udp = (ip->ip_p == IPPROTO_UDP); + mbuf_next = m_getptr(mbuf, l4_off, &offset); + } + + l4_data_off = 0; + if (is_tcp) { + th = (struct tcphdr *)(mtodo(mbuf_next, offset)); + l4_data_off = l4_off + (th->th_off << 2); + } else if (is_udp) + l4_data_off = l4_off + sizeof(struct udphdr); + + if (has_csum_flag) { + if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) + csum_offset = offsetof(struct tcphdr, th_sum); + else + csum_offset = offsetof(struct udphdr, uh_sum); + } + + /* + * If this packet is neither a TCP nor a UDP packet, the first segment, + * the one represented by the packet descriptor, will carry the + * spec-stipulated minimum of 182B. + */ + if (l4_data_off != 0) + first_seg_len = l4_data_off; + else + first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); + + bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); + if (__predict_false(!gve_can_tx(tx, bytes_required))) { + counter_enter(); + counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1); + counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_exit(); + return (ENOBUFS); + } + + /* So that the cleanup taskqueue can free the mbuf eventually. 
*/ + info->mbuf = mbuf; + + /* + * We don't want to split the header, so if necessary, pad to the end + * of the fifo and then put the header at the beginning of the fifo. + */ + pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); + hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, + &info->iov[0]); + KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); + payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, + &info->iov[payload_iov]); + + pkt_desc = &tx->desc_ring[idx].pkt; + gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, + 1 + mtd_desc_nr + payload_nfrags, first_seg_len, + info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, + pkt_len); + + m_copydata(mbuf, 0, first_seg_len, + (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); + gve_dma_sync_for_device(tx->com.qpl, + info->iov[hdr_nfrags - 1].iov_offset, + info->iov[hdr_nfrags - 1].iov_len); + copy_offset = first_seg_len; + + if (mtd_desc_nr == 1) { + next_idx = (tx->req + 1) & tx->mask; + mtd_desc = &tx->desc_ring[next_idx].mtd; + gve_tx_fill_mtd_desc(mtd_desc, mbuf); + } + + for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { + next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; + seg_desc = &tx->desc_ring[next_idx].seg; + + gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, + info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); + + m_copydata(mbuf, copy_offset, info->iov[i].iov_len, + (char *)tx->fifo.base + info->iov[i].iov_offset); + gve_dma_sync_for_device(tx->com.qpl, + info->iov[i].iov_offset, info->iov[i].iov_len); + copy_offset += info->iov[i].iov_len; + } + + tx->req += (1 + mtd_desc_nr + payload_nfrags); + if (is_tso) { + counter_enter(); + counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); + counter_exit(); + } + return (0); +} + +static void +gve_xmit_br(struct gve_tx_ring *tx) +{ + struct gve_priv *priv = tx->com.priv; + struct ifnet *ifp = priv->ifp; + struct 
mbuf *mbuf; + + while (!drbr_empty(ifp, tx->br) && + (if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { + + mbuf = drbr_peek(ifp, tx->br); + if (__predict_false(gve_xmit(tx, mbuf) != 0)) { + drbr_putback(ifp, tx->br, mbuf); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + break; + } + + bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); + gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); + + drbr_advance(ifp, tx->br); + BPF_MTAP(ifp, mbuf); + } +} + +void +gve_xmit_tq(void *arg, int pending) +{ + struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; + + GVE_RING_LOCK(tx); + gve_xmit_br(tx); + GVE_RING_UNLOCK(tx); +} + +static bool +is_vlan_tagged_pkt(struct mbuf *mbuf) +{ + struct ether_header *eh; + + eh = mtod(mbuf, struct ether_header *); + return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); +} + +int +gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) +{ + struct gve_priv *priv = if_getsoftc(ifp); + struct gve_tx_ring *tx; + bool is_br_empty; + int err; + uint32_t i; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (ENODEV); + + if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) + i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; + else + i = curcpu % priv->tx_cfg.num_queues; + tx = &priv->tx[i]; + + if (__predict_false(is_vlan_tagged_pkt(mbuf))) { + counter_enter(); + counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); + counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_exit(); + m_freem(mbuf); + return (ENODEV); + } + + is_br_empty = drbr_empty(ifp, tx->br); + err = drbr_enqueue(ifp, tx->br, mbuf); + if (__predict_false(err != 0)) { + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + counter_enter(); + counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); + counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_exit(); + return (err); + } + + /* + * If the mbuf we just enqueued is the only one on the ring, then + * transmit it right away in the 
interests of low latency. + */ + if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { + gve_xmit_br(tx); + GVE_RING_UNLOCK(tx); + } else { + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } + + return (0); +} + +void +gve_qflush(if_t ifp) +{ + struct gve_priv *priv = if_getsoftc(ifp); + struct gve_tx_ring *tx; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; ++i) { + tx = &priv->tx[i]; + if (drbr_empty(ifp, tx->br) == 0) { + GVE_RING_LOCK(tx); + drbr_flush(ifp, tx->br); + GVE_RING_UNLOCK(tx); + } + } + + if_qflush(ifp); +} diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c new file mode 100644 index 000000000000..c05488770dbd --- /dev/null +++ b/sys/dev/gve/gve_utils.c @@ -0,0 +1,405 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" + +uint32_t +gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset) +{ + return (be32toh(bus_read_4(priv->reg_bar, offset))); +} + +void +gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) +{ + bus_write_4(priv->reg_bar, offset, htobe32(val)); +} + +void +gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) +{ + bus_write_4(priv->db_bar, offset, htobe32(val)); +} + +void +gve_alloc_counters(counter_u64_t *stat, int num_stats) +{ + int i; + + for (i = 0; i < num_stats; i++) + stat[i] = counter_u64_alloc(M_WAITOK); +} + +void +gve_free_counters(counter_u64_t *stat, int num_stats) +{ + int i; + + for (i = 0; i < num_stats; i++) + counter_u64_free(stat[i]); +} + +/* Currently assumes a single segment. 
*/ +static void +gve_dmamap_load_callback(void *arg, bus_dma_segment_t *segs, int nseg, + int error) +{ + if (error == 0) + *(bus_addr_t *) arg = segs[0].ds_addr; +} + +int +gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma) +{ + int err; + device_t dev = priv->dev; + + err = bus_dma_tag_create( + bus_get_dma_tag(dev), /* parent */ + align, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + size, /* maxsize */ + 1, /* nsegments */ + size, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &dma->tag); + if (err != 0) { + device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto clear_tag; + } + + err = bus_dmamem_alloc(dma->tag, (void **) &dma->cpu_addr, + BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, + &dma->map); + if (err != 0) { + device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", + __func__, (uintmax_t)size, err); + goto destroy_tag; + } + + /* An address set by the callback will never be -1 */ + dma->bus_addr = (bus_addr_t)-1; + err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, + gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_NOWAIT); + if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { + device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); + goto free_mem; + } + + return (0); + +free_mem: + bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); +destroy_tag: + bus_dma_tag_destroy(dma->tag); +clear_tag: + dma->tag = NULL; + + return (err); +} + +void +gve_dma_free_coherent(struct gve_dma_handle *dma) +{ + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(dma->tag, dma->map); + bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); + bus_dma_tag_destroy(dma->tag); +} + +int +gve_dmamap_create(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma) +{ 
+ int err; + device_t dev = priv->dev; + + err = bus_dma_tag_create( + bus_get_dma_tag(dev), /* parent */ + align, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + size, /* maxsize */ + 1, /* nsegments */ + size, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &dma->tag); + if (err != 0) { + device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto clear_tag; + } + + err = bus_dmamap_create(dma->tag, BUS_DMA_COHERENT, &dma->map); + if (err != 0) { + device_printf(dev, "%s: bus_dmamap_create failed: %d\n", + __func__, err); + goto destroy_tag; + } + + /* An address set by the callback will never be -1 */ + dma->bus_addr = (bus_addr_t)-1; + err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, + gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_WAITOK); + if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { + device_printf(dev, "%s: bus_dmamap_load failed: %d\n", + __func__, err); + goto destroy_map; + } + + return (0); + +destroy_map: + bus_dmamap_destroy(dma->tag, dma->map); +destroy_tag: + bus_dma_tag_destroy(dma->tag); +clear_tag: + dma->tag = NULL; + + return (err); +} + +void +gve_dmamap_destroy(struct gve_dma_handle *dma) +{ + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(dma->tag, dma->map); + bus_dmamap_destroy(dma->tag, dma->map); + bus_dma_tag_destroy(dma->tag); +} + +static int +gve_mgmnt_intr(void *arg) +{ + struct gve_priv *priv = arg; + + taskqueue_enqueue(priv->service_tq, &priv->service_task); + return (FILTER_HANDLED); +} + +void +gve_free_irqs(struct gve_priv *priv) +{ + struct gve_irq *irq; + int num_irqs; + int rid; + int rc; + int i; + + if (priv->irq_tbl == NULL) { + device_printf(priv->dev, "No irq table, nothing to free\n"); + return; + } + + num_irqs = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues + 1; + + for (i = 
0; i < num_irqs; i++) { + irq = &priv->irq_tbl[i]; + if (irq->res == NULL) + continue; + + rid = rman_get_rid(irq->res); + + rc = bus_teardown_intr(priv->dev, irq->res, irq->cookie); + if (rc != 0) + device_printf(priv->dev, "Failed to teardown irq num %d\n", + rid); + + rc = bus_release_resource(priv->dev, SYS_RES_IRQ, + rid, irq->res); + if (rc != 0) + device_printf(priv->dev, "Failed to release irq num %d\n", + rid); + + irq->res = NULL; + irq->cookie = NULL; + } + + free(priv->irq_tbl, M_GVE); + priv->irq_tbl = NULL; + + /* Safe to call even if msix was never alloced */ + pci_release_msi(priv->dev); +} + +int +gve_alloc_irqs(struct gve_priv *priv) +{ + int num_tx = priv->tx_cfg.num_queues; + int num_rx = priv->rx_cfg.num_queues; + int req_nvecs = num_tx + num_rx + 1; + int got_nvecs = req_nvecs; + struct gve_irq *irq; + int i, j, m; + int rid; + int err; + + struct gve_ring_com *com; + struct gve_rx_ring *rx; + struct gve_tx_ring *tx; + + if (pci_alloc_msix(priv->dev, &got_nvecs) != 0) { + device_printf(priv->dev, "Failed to acquire any msix vectors\n"); + err = ENXIO; + goto abort; + } else if (got_nvecs != req_nvecs) { + device_printf(priv->dev, "Tried to acquire %d msix vectors, got only %d\n", + req_nvecs, got_nvecs); + err = ENOSPC; + goto abort; + } + + if (bootverbose) + device_printf(priv->dev, "Enabled MSIX with %d vectors\n", got_nvecs); + + priv->irq_tbl = malloc(sizeof(struct gve_irq) * req_nvecs, M_GVE, + M_WAITOK | M_ZERO); + + for (i = 0; i < num_tx; i++) { + irq = &priv->irq_tbl[i]; + tx = &priv->tx[i]; + com = &tx->com; + rid = i + 1; + + irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (irq->res == NULL) { + device_printf(priv->dev, "Failed to alloc irq %d for Tx queue %d\n", + rid, i); + err = ENOMEM; + goto abort; + } + + err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, + gve_tx_intr, NULL, &priv->tx[i], &irq->cookie); + if (err != 0) { + device_printf(priv->dev, "Failed to setup irq %d 
for Tx queue %d, " + "err: %d\n", rid, i, err); + goto abort; + } + + bus_describe_intr(priv->dev, irq->res, irq->cookie, "tx%d", i); + com->ntfy_id = i; + } + + for (j = 0; j < num_rx; j++) { + irq = &priv->irq_tbl[i + j]; + rx = &priv->rx[j]; + com = &rx->com; + rid = i + j + 1; + + irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (irq->res == NULL) { + device_printf(priv->dev, + "Failed to alloc irq %d for Rx queue %d", rid, j); + err = ENOMEM; + goto abort; + } + + err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, + gve_rx_intr, NULL, &priv->rx[j], &irq->cookie); + if (err != 0) { + device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, " + "err: %d\n", rid, j, err); + goto abort; + } + + bus_describe_intr(priv->dev, irq->res, irq->cookie, "rx%d", j); + com->ntfy_id = i + j; + } + + m = i + j; + rid = m + 1; + irq = &priv->irq_tbl[m]; + + irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (irq->res == NULL) { + device_printf(priv->dev, "Failed to allocate irq %d for mgmnt queue\n", rid); + err = ENOMEM; + goto abort; + } + + err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, + gve_mgmnt_intr, NULL, priv, &irq->cookie); + if (err != 0) { + device_printf(priv->dev, "Failed to setup irq %d for mgmnt queue, err: %d\n", + rid, err); + goto abort; + } + + bus_describe_intr(priv->dev, irq->res, irq->cookie, "mgmnt"); + + return (0); + +abort: + gve_free_irqs(priv); + return (err); +} + +void +gve_unmask_all_queue_irqs(struct gve_priv *priv) +{ + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; + int idx; + + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + tx = &priv->tx[idx]; + gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); + } + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + rx = &priv->rx[idx]; + gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); + } +} + +void +gve_mask_all_queue_irqs(struct gve_priv *priv) +{ + for (int idx 
= 0; idx < priv->tx_cfg.num_queues; idx++) { + struct gve_tx_ring *tx = &priv->tx[idx]; + gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); + } + for (int idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + struct gve_rx_ring *rx = &priv->rx[idx]; + gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); + } +} diff --git a/sys/modules/Makefile b/sys/modules/Makefile index f14dd383221f..8e91e20720ca 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -131,6 +131,7 @@ SUBDIR= \ ${_glxiic} \ ${_glxsb} \ gpio \ + ${_gve} \ hid \ hifn \ ${_hpt27xx} \ @@ -554,6 +555,10 @@ _mlx5ib= mlx5ib .endif .endif +.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" +_gve= gve +.endif + .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" _ena= ena diff --git a/sys/modules/gve/Makefile b/sys/modules/gve/Makefile new file mode 100644 index 000000000000..13c88c6c0a96 --- /dev/null +++ b/sys/modules/gve/Makefile @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: BSD-3-Clause +# +# Copyright (c) 2023 Google LLC +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +.PATH: ${SRCTOP}/sys/dev/gve + +KMOD= if_gve +SRCS= gve_main.c gve_adminq.c gve_utils.c gve_qpl.c gve_rx.c gve_tx.c gve_sysctl.c +SRCS+= device_if.h bus_if.h pci_if.h + +.include <bsd.kmod.mk> |