author | Wei Hu <whu@FreeBSD.org> | 2021-08-20 08:43:10 +0000
---|---|---
committer | Wei Hu <whu@FreeBSD.org> | 2021-08-20 10:44:57 +0000
commit | ce110ea12fcea71ae437d0a1d0549d3d32055b0e (patch) |
tree | 306e7c8986b92e503a57fca6b50172b887ed64b3 |
parent | 464a166c27bb8947d5de1f617170ef6813587cc3 (diff) |
download | src-ce110ea12fce.tar.gz src-ce110ea12fce.zip |
Microsoft Azure Network Adapter (MANA) VF support
MANA is the new network adapter from Microsoft that will be available
in the Azure public cloud. It is exposed as an SR-IOV NIC virtual
function (VF) to the guest OS running on Hyper-V.
The code is divided into two major parts: gdma_main.c brings up the
hardware and drives the underlying hardware queue infrastructure, while
mana_en.c contains the main Ethernet driver code.
The driver has only been tested on, and is currently supported only on,
the amd64 architecture.
PR: 256336
Reviewed by: decui@microsoft.com
Tested by: whu
MFC after: 2 weeks
Relnotes: yes
Sponsored by: Microsoft
Differential Revision: https://reviews.freebsd.org/D31150
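The hunks below wire the new sources into the build as the optional
"mana" kernel option (sys/conf/files.x86) and add a loadable module
under sys/modules/mana. A minimal usage sketch, assuming the kernel
option keeps the "mana" name used in files.x86 and the module installs
as mana.ko (the committed sys/modules/mana/Makefile is authoritative
for the exact names):

    # Custom amd64 kernel configuration (assumed syntax):
    device          mana

    # Or load the driver as a module at runtime (assumed module name):
    kldload mana

    # Or load it at boot from /boot/loader.conf (assumed knob name):
    mana_load="YES"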
-rw-r--r-- | sys/conf/files.x86 | 6
-rw-r--r-- | sys/dev/mana/gdma.h | 744
-rw-r--r-- | sys/dev/mana/gdma_main.c | 1961
-rw-r--r-- | sys/dev/mana/gdma_util.c | 96
-rw-r--r-- | sys/dev/mana/gdma_util.h | 206
-rw-r--r-- | sys/dev/mana/hw_channel.c | 950
-rw-r--r-- | sys/dev/mana/hw_channel.h | 222
-rw-r--r-- | sys/dev/mana/mana.h | 689
-rw-r--r-- | sys/dev/mana/mana_en.c | 2699
-rw-r--r-- | sys/dev/mana/mana_sysctl.c | 219
-rw-r--r-- | sys/dev/mana/mana_sysctl.h | 48
-rw-r--r-- | sys/dev/mana/shm_channel.c | 337
-rw-r--r-- | sys/dev/mana/shm_channel.h | 52
-rw-r--r-- | sys/modules/Makefile | 2
-rw-r--r-- | sys/modules/mana/Makefile | 12 |
15 files changed, 8243 insertions(+), 0 deletions(-)
diff --git a/sys/conf/files.x86 b/sys/conf/files.x86 index d0cda2da8580..925a3c5fe889 100644 --- a/sys/conf/files.x86 +++ b/sys/conf/files.x86 @@ -264,6 +264,12 @@ dev/isci/scil/scif_sas_task_request_states.c optional isci dev/isci/scil/scif_sas_timer.c optional isci dev/itwd/itwd.c optional itwd dev/kvm_clock/kvm_clock.c optional kvm_clock +dev/mana/gdma_main.c optional mana +dev/mana/mana_en.c optional mana +dev/mana/mana_sysctl.c optional mana +dev/mana/shm_channel.c optional mana +dev/mana/hw_channel.c optional mana +dev/mana/gdma_util.c optional mana dev/qat/qat.c optional qat dev/qat/qat_ae.c optional qat dev/qat/qat_c2xxx.c optional qat diff --git a/sys/dev/mana/gdma.h b/sys/dev/mana/gdma.h new file mode 100644 index 000000000000..097b2b65e545 --- /dev/null +++ b/sys/dev/mana/gdma.h @@ -0,0 +1,744 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _GDMA_H +#define _GDMA_H + +#include <sys/bus.h> +#include <sys/bus_dma.h> +#include <sys/types.h> +#include <sys/limits.h> +#include <sys/sx.h> + +#include "gdma_util.h" +#include "shm_channel.h" + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. 
+ */ + +#define GDMA_BAR0 0 + +#define GDMA_IRQNAME_SZ 40 + +struct gdma_bus { + bus_space_handle_t bar0_h; + bus_space_tag_t bar0_t; +}; + +struct gdma_msix_entry { + int entry; + int vector; +}; + +enum gdma_request_type { + GDMA_VERIFY_VF_DRIVER_VERSION = 1, + GDMA_QUERY_MAX_RESOURCES = 2, + GDMA_LIST_DEVICES = 3, + GDMA_REGISTER_DEVICE = 4, + GDMA_DEREGISTER_DEVICE = 5, + GDMA_GENERATE_TEST_EQE = 10, + GDMA_CREATE_QUEUE = 12, + GDMA_DISABLE_QUEUE = 13, + GDMA_CREATE_DMA_REGION = 25, + GDMA_DMA_REGION_ADD_PAGES = 26, + GDMA_DESTROY_DMA_REGION = 27, +}; + +enum gdma_queue_type { + GDMA_INVALID_QUEUE, + GDMA_SQ, + GDMA_RQ, + GDMA_CQ, + GDMA_EQ, +}; + +enum gdma_work_request_flags { + GDMA_WR_NONE = 0, + GDMA_WR_OOB_IN_SGL = BIT(0), + GDMA_WR_PAD_BY_SGE0 = BIT(1), +}; + +enum gdma_eqe_type { + GDMA_EQE_COMPLETION = 3, + GDMA_EQE_TEST_EVENT = 64, + GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, + GDMA_EQE_HWC_INIT_DATA = 130, + GDMA_EQE_HWC_INIT_DONE = 131, +}; + +enum { + GDMA_DEVICE_NONE = 0, + GDMA_DEVICE_HWC = 1, + GDMA_DEVICE_MANA = 2, +}; + + +struct gdma_resource { + /* Protect the bitmap */ + struct mtx lock_spin; + + /* The bitmap size in bits. */ + uint32_t size; + + /* The bitmap tracks the resources. */ + unsigned long *map; +}; + +union gdma_doorbell_entry { + uint64_t as_uint64; + + struct { + uint64_t id : 24; + uint64_t reserved : 8; + uint64_t tail_ptr : 31; + uint64_t arm : 1; + } cq; + + struct { + uint64_t id : 24; + uint64_t wqe_cnt : 8; + uint64_t tail_ptr : 32; + } rq; + + struct { + uint64_t id : 24; + uint64_t reserved : 8; + uint64_t tail_ptr : 32; + } sq; + + struct { + uint64_t id : 16; + uint64_t reserved : 16; + uint64_t tail_ptr : 31; + uint64_t arm : 1; + } eq; +}; /* HW DATA */ + +struct gdma_msg_hdr { + uint32_t hdr_type; + uint32_t msg_type; + uint16_t msg_version; + uint16_t hwc_msg_id; + uint32_t msg_size; +}; /* HW DATA */ + +struct gdma_dev_id { + union { + struct { + uint16_t type; + uint16_t instance; + }; + + uint32_t as_uint32; + }; +}; /* HW DATA */ + +struct gdma_req_hdr { + struct gdma_msg_hdr req; + struct gdma_msg_hdr resp; /* The expected response */ + struct gdma_dev_id dev_id; + uint32_t activity_id; +}; /* HW DATA */ + +struct gdma_resp_hdr { + struct gdma_msg_hdr response; + struct gdma_dev_id dev_id; + uint32_t activity_id; + uint32_t status; + uint32_t reserved; +}; /* HW DATA */ + +struct gdma_general_req { + struct gdma_req_hdr hdr; +}; /* HW DATA */ + +#define GDMA_MESSAGE_V1 1 + +struct gdma_general_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define GDMA_STANDARD_HEADER_TYPE 0 + +static inline void +mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, uint32_t code, + uint32_t req_size, uint32_t resp_size) +{ + hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->req.msg_type = code; + hdr->req.msg_version = GDMA_MESSAGE_V1; + hdr->req.msg_size = req_size; + + hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->resp.msg_type = code; + hdr->resp.msg_version = GDMA_MESSAGE_V1; + hdr->resp.msg_size = resp_size; +} + +/* The 16-byte struct is part of the GDMA work queue entry (WQE). 
*/ +struct gdma_sge { + uint64_t address; + uint32_t mem_key; + uint32_t size; +}; /* HW DATA */ + +struct gdma_wqe_request { + struct gdma_sge *sgl; + uint32_t num_sge; + + uint32_t inline_oob_size; + const void *inline_oob_data; + + uint32_t flags; + uint32_t client_data_unit; +}; + +enum gdma_page_type { + GDMA_PAGE_TYPE_4K, +}; + +#define GDMA_INVALID_DMA_REGION 0 + +struct gdma_mem_info { + device_t dev; + + bus_dma_tag_t dma_tag; + bus_dmamap_t dma_map; + bus_addr_t dma_handle; /* Physical address */ + void *virt_addr; /* Virtual address */ + uint64_t length; + + /* Allocated by the PF driver */ + uint64_t gdma_region; +}; + +#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 + +struct gdma_dev { + struct gdma_context *gdma_context; + + struct gdma_dev_id dev_id; + + uint32_t pdid; + uint32_t doorbell; + uint32_t gpa_mkey; + + /* GDMA driver specific pointer */ + void *driver_data; +}; + +#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE + +#define GDMA_CQE_SIZE 64 +#define GDMA_EQE_SIZE 16 +#define GDMA_MAX_SQE_SIZE 512 +#define GDMA_MAX_RQE_SIZE 256 + +#define GDMA_COMP_DATA_SIZE 0x3C + +#define GDMA_EVENT_DATA_SIZE 0xC + +/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */ +#define GDMA_WQE_BU_SIZE 32 + +#define INVALID_PDID UINT_MAX +#define INVALID_DOORBELL UINT_MAX +#define INVALID_MEM_KEY UINT_MAX +#define INVALID_QUEUE_ID UINT_MAX +#define INVALID_PCI_MSIX_INDEX UINT_MAX + +struct gdma_comp { + uint32_t cqe_data[GDMA_COMP_DATA_SIZE / 4]; + uint32_t wq_num; + bool is_sq; +}; + +struct gdma_event { + uint32_t details[GDMA_EVENT_DATA_SIZE / 4]; + uint8_t type; +}; + +struct gdma_queue; + +#define CQE_POLLING_BUFFER 512 + +typedef void gdma_eq_callback(void *context, struct gdma_queue *q, + struct gdma_event *e); + +typedef void gdma_cq_callback(void *context, struct gdma_queue *q); + +/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE + * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the + * driver increases the 'head' in BUs rather than in bytes, and notifies + * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track + * the HW head, and increases the 'head' by 1 for every processed EQE/CQE. + * + * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is + * processed, the driver increases the 'tail' to indicate that WQEs have + * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ. + * + * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures + * that the EQ/CQ is big enough so they can't overflow, and the driver uses + * the owner bits mechanism to detect if the queue has become empty. + */ +struct gdma_queue { + struct gdma_dev *gdma_dev; + + enum gdma_queue_type type; + uint32_t id; + + struct gdma_mem_info mem_info; + + void *queue_mem_ptr; + uint32_t queue_size; + + bool monitor_avl_buf; + + uint32_t head; + uint32_t tail; + + /* Extra fields specific to EQ/CQ. */ + union { + struct { + bool disable_needed; + + gdma_eq_callback *callback; + void *context; + + unsigned int msix_index; + + uint32_t log2_throttle_limit; + + struct task cleanup_task; + struct taskqueue *cleanup_tq; + int cpu; + bool do_not_ring_db; + + int work_done; + int budget; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + /* For CQ/EQ relationship */ + struct gdma_queue *parent; + } cq; + }; +}; + +struct gdma_queue_spec { + enum gdma_queue_type type; + bool monitor_avl_buf; + unsigned int queue_size; + + /* Extra fields specific to EQ/CQ. 
*/ + union { + struct { + gdma_eq_callback *callback; + void *context; + + unsigned long log2_throttle_limit; + + /* Only used by the MANA device. */ + struct ifnet *ndev; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent_eq; + + } cq; + }; +}; + +struct mana_eq { + struct gdma_queue *eq; + struct gdma_comp cqe_poll[CQE_POLLING_BUFFER]; +}; + +struct gdma_irq_context { + struct gdma_msix_entry msix_e; + struct resource *res; + driver_intr_t *handler; + void *arg; + void *cookie; + bool requested; + int cpu; + char name[GDMA_IRQNAME_SZ]; +}; + +struct gdma_context { + device_t dev; + + struct gdma_bus gd_bus; + + /* Per-vPort max number of queues */ + unsigned int max_num_queues; + unsigned int max_num_msix; + unsigned int num_msix_usable; + struct gdma_resource msix_resource; + struct gdma_irq_context *irq_contexts; + + /* This maps a CQ index to the queue structure. */ + unsigned int max_num_cqs; + struct gdma_queue **cq_table; + + /* Protect eq_test_event and test_event_eq_id */ + struct sx eq_test_event_sx; + struct completion eq_test_event; + uint32_t test_event_eq_id; + + struct resource *bar0; + struct resource *msix; + int msix_rid; + void __iomem *shm_base; + void __iomem *db_page_base; + uint32_t db_page_size; + + /* Shared memory chanenl (used to bootstrap HWC) */ + struct shm_channel shm_channel; + + /* Hardware communication channel (HWC) */ + struct gdma_dev hwc; + + /* Azure network adapter */ + struct gdma_dev mana; +}; + +#define MAX_NUM_GDMA_DEVICES 4 + +static inline bool mana_gd_is_mana(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_MANA; +} + +static inline bool mana_gd_is_hwc(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_HWC; +} + +uint8_t *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, uint32_t wqe_offset); +uint32_t mana_gd_wq_avail_space(struct gdma_queue *wq); + +int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq); + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue); + +int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); + +void mana_gd_arm_cq(struct gdma_queue *cq); + +struct gdma_wqe { + uint32_t reserved :24; + uint32_t last_vbytes :8; + + union { + uint32_t flags; + + struct { + uint32_t num_sge :8; + uint32_t inline_oob_size_div4 :3; + uint32_t client_oob_in_sgl :1; + uint32_t reserved1 :4; + uint32_t client_data_unit :14; + uint32_t reserved2 :2; + }; + }; +}; /* HW DATA */ + +#define INLINE_OOB_SMALL_SIZE 8 +#define INLINE_OOB_LARGE_SIZE 24 + +#define MAX_TX_WQE_SIZE 512 +#define MAX_RX_WQE_SIZE 256 + +struct gdma_cqe { + uint32_t cqe_data[GDMA_COMP_DATA_SIZE / 4]; + + union { + uint32_t as_uint32; + + struct { + uint32_t wq_num :24; + uint32_t is_sq :1; + uint32_t reserved :4; + uint32_t owner_bits :3; + }; + } cqe_info; +}; /* HW DATA */ + +#define GDMA_CQE_OWNER_BITS 3 + +#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1) + +#define SET_ARM_BIT 1 + +#define GDMA_EQE_OWNER_BITS 3 + +union gdma_eqe_info { + uint32_t as_uint32; + + struct { + uint32_t type : 8; + uint32_t reserved1 : 8; + uint32_t client_id : 2; + uint32_t 
reserved2 : 11; + uint32_t owner_bits : 3; + }; +}; /* HW DATA */ + +#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1) +#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) + +struct gdma_eqe { + uint32_t details[GDMA_EVENT_DATA_SIZE / 4]; + uint32_t eqe_info; +}; /* HW DATA */ + +#define GDMA_REG_DB_PAGE_OFFSET 8 +#define GDMA_REG_DB_PAGE_SIZE 0x10 +#define GDMA_REG_SHM_OFFSET 0x18 + +struct gdma_posted_wqe_info { + uint32_t wqe_size_in_bu; +}; + +/* GDMA_GENERATE_TEST_EQE */ +struct gdma_generate_test_event_req { + struct gdma_req_hdr hdr; + uint32_t queue_index; +}; /* HW DATA */ + +/* GDMA_VERIFY_VF_DRIVER_VERSION */ +enum { + GDMA_PROTOCOL_V1 = 1, + GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1, + GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1, +}; + +struct gdma_verify_ver_req { + struct gdma_req_hdr hdr; + + /* Mandatory fields required for protocol establishment */ + uint64_t protocol_ver_min; + uint64_t protocol_ver_max; + uint64_t drv_cap_flags1; + uint64_t drv_cap_flags2; + uint64_t drv_cap_flags3; + uint64_t drv_cap_flags4; + + /* Advisory fields */ + uint64_t drv_ver; + uint32_t os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */ + uint32_t reserved; + uint32_t os_ver_major; + uint32_t os_ver_minor; + uint32_t os_ver_build; + uint32_t os_ver_platform; + uint64_t reserved_2; + uint8_t os_ver_str1[128]; + uint8_t os_ver_str2[128]; + uint8_t os_ver_str3[128]; + uint8_t os_ver_str4[128]; +}; /* HW DATA */ + +struct gdma_verify_ver_resp { + struct gdma_resp_hdr hdr; + uint64_t gdma_protocol_ver; + uint64_t pf_cap_flags1; + uint64_t pf_cap_flags2; + uint64_t pf_cap_flags3; + uint64_t pf_cap_flags4; +}; /* HW DATA */ + +/* GDMA_QUERY_MAX_RESOURCES */ +struct gdma_query_max_resources_resp { + struct gdma_resp_hdr hdr; + uint32_t status; + uint32_t max_sq; + uint32_t max_rq; + uint32_t max_cq; + uint32_t max_eq; + uint32_t max_db; + uint32_t max_mst; + uint32_t max_cq_mod_ctx; + uint32_t max_mod_cq; + uint32_t max_msix; +}; /* HW DATA */ + +/* GDMA_LIST_DEVICES */ +struct gdma_list_devices_resp { + struct gdma_resp_hdr hdr; + uint32_t num_of_devs; + uint32_t reserved; + struct gdma_dev_id devs[64]; +}; /* HW DATA */ + +/* GDMA_REGISTER_DEVICE */ +struct gdma_register_device_resp { + struct gdma_resp_hdr hdr; + uint32_t pdid; + uint32_t gpa_mkey; + uint32_t db_id; +}; /* HW DATA */ + +/* GDMA_CREATE_QUEUE */ +struct gdma_create_queue_req { + struct gdma_req_hdr hdr; + uint32_t type; + uint32_t reserved1; + uint32_t pdid; + uint32_t doolbell_id; + uint64_t gdma_region; + uint32_t reserved2; + uint32_t queue_size; + uint32_t log2_throttle_limit; + uint32_t eq_pci_msix_index; + uint32_t cq_mod_ctx_id; + uint32_t cq_parent_eq_id; + uint8_t rq_drop_on_overrun; + uint8_t rq_err_on_wqe_overflow; + uint8_t rq_chain_rec_wqes; + uint8_t sq_hw_db; + uint32_t reserved3; +}; /* HW DATA */ + +struct gdma_create_queue_resp { + struct gdma_resp_hdr hdr; + uint32_t queue_index; +}; /* HW DATA */ + +/* GDMA_DISABLE_QUEUE */ +struct gdma_disable_queue_req { + struct gdma_req_hdr hdr; + uint32_t type; + uint32_t queue_index; + uint32_t alloc_res_id_on_creation; +}; /* HW DATA */ + +/* GDMA_CREATE_DMA_REGION */ +struct gdma_create_dma_region_req { + struct gdma_req_hdr hdr; + + /* The total size of the DMA region */ + uint64_t length; + + /* The offset in the first page */ + uint32_t offset_in_page; + + /* enum gdma_page_type */ + uint32_t gdma_page_type; + + /* The total number of pages */ + uint32_t page_count; + + /* If page_addr_list_len is smaller than page_count, + * the 
remaining page addresses will be added via the + * message GDMA_DMA_REGION_ADD_PAGES. + */ + uint32_t page_addr_list_len; + uint64_t page_addr_list[]; +}; /* HW DATA */ + +struct gdma_create_dma_region_resp { + struct gdma_resp_hdr hdr; + uint64_t gdma_region; +}; /* HW DATA */ + +/* GDMA_DMA_REGION_ADD_PAGES */ +struct gdma_dma_region_add_pages_req { + struct gdma_req_hdr hdr; + + uint64_t gdma_region; + + uint32_t page_addr_list_len; + uint32_t reserved3; + + uint64_t page_addr_list[]; +}; /* HW DATA */ + +/* GDMA_DESTROY_DMA_REGION */ +struct gdma_destroy_dma_region_req { + struct gdma_req_hdr hdr; + + uint64_t gdma_region; +}; /* HW DATA */ + +int mana_gd_verify_vf_version(device_t dev); + +int mana_gd_register_device(struct gdma_dev *gd); +int mana_gd_deregister_device(struct gdma_dev *gd); + +int mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_post_and_ring(struct gdma_queue *queue, + const struct gdma_wqe_request *wqe, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_alloc_res_map(uint32_t res_avil, struct gdma_resource *r, + const char *lock_name); +void mana_gd_free_res_map(struct gdma_resource *r); + +void mana_gd_wq_ring_doorbell(struct gdma_context *gc, + struct gdma_queue *queue); + +int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi); + +void mana_gd_free_memory(struct gdma_mem_info *gmi); + +void mana_gd_dma_map_paddr(void *arg, bus_dma_segment_t *segs, + int nseg, int error); + +int mana_gd_send_request(struct gdma_context *gc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp); +#endif /* _GDMA_H */ diff --git a/sys/dev/mana/gdma_main.c b/sys/dev/mana/gdma_main.c new file mode 100644 index 000000000000..910992ce17a4 --- /dev/null +++ b/sys/dev/mana/gdma_main.c @@ -0,0 +1,1961 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/time.h> +#include <sys/eventhandler.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/in_cksum.h> + +#include <net/if.h> +#include <net/if_var.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include "gdma_util.h" +#include "mana.h" + + +static mana_vendor_id_t mana_id_table[] = { + { PCI_VENDOR_ID_MICROSOFT, PCI_DEV_ID_MANA_VF}, + /* Last entry */ + { 0, 0} +}; + +static inline uint32_t +mana_gd_r32(struct gdma_context *g, uint64_t offset) +{ + uint32_t v = bus_space_read_4(g->gd_bus.bar0_t, + g->gd_bus.bar0_h, offset); + rmb(); + return (v); +} + +#if defined(__amd64__) +static inline uint64_t +mana_gd_r64(struct gdma_context *g, uint64_t offset) +{ + uint64_t v = bus_space_read_8(g->gd_bus.bar0_t, + g->gd_bus.bar0_h, offset); + rmb(); + return (v); +} +#else +static inline uint64_t +mana_gd_r64(struct gdma_context *g, uint64_t offset) +{ + uint64_t v; + uint32_t *vp = (uint32_t *)&v; + + *vp = mana_gd_r32(g, offset); + *(vp + 1) = mana_gd_r32(g, offset + 4); + rmb(); + return (v); +} +#endif + +static int +mana_gd_query_max_resources(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_query_max_resources_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES, + sizeof(req), sizeof(resp)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to query resource info: %d, 0x%x\n", + err, resp.hdr.status); + return err ? err : EPROTO; + } + + mana_dbg(NULL, "max_msix %u, max_eq %u, max_cq %u, " + "max_sq %u, max_rq %u\n", + resp.max_msix, resp.max_eq, resp.max_cq, + resp.max_sq, resp.max_rq); + + if (gc->num_msix_usable > resp.max_msix) + gc->num_msix_usable = resp.max_msix; + + if (gc->num_msix_usable <= 1) + return ENOSPC; + + gc->max_num_queues = mp_ncpus; + if (gc->max_num_queues > MANA_MAX_NUM_QUEUES) + gc->max_num_queues = MANA_MAX_NUM_QUEUES; + + if (gc->max_num_queues > resp.max_eq) + gc->max_num_queues = resp.max_eq; + + if (gc->max_num_queues > resp.max_cq) + gc->max_num_queues = resp.max_cq; + + if (gc->max_num_queues > resp.max_sq) + gc->max_num_queues = resp.max_sq; + + if (gc->max_num_queues > resp.max_rq) + gc->max_num_queues = resp.max_rq; + + return 0; +} + +static int +mana_gd_detect_devices(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_list_devices_resp resp = {}; + struct gdma_general_req req = {}; + struct gdma_dev_id gd_dev; + uint32_t i, max_num_devs; + uint16_t dev_type; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req), + sizeof(resp)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to detect devices: %d, 0x%x\n", err, + resp.hdr.status); + return err ? 
err : EPROTO; + } + + max_num_devs = min_t(uint32_t, MAX_NUM_GDMA_DEVICES, resp.num_of_devs); + + for (i = 0; i < max_num_devs; i++) { + gd_dev = resp.devs[i]; + dev_type = gd_dev.type; + + mana_dbg(NULL, "gdma dev %d, type %u\n", + i, dev_type); + + /* HWC is already detected in mana_hwc_create_channel(). */ + if (dev_type == GDMA_DEVICE_HWC) + continue; + + if (dev_type == GDMA_DEVICE_MANA) { + gc->mana.gdma_context = gc; + gc->mana.dev_id = gd_dev; + } + } + + return gc->mana.dev_id.type == 0 ? ENODEV : 0; +} + +int +mana_gd_send_request(struct gdma_context *gc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp) +{ + struct hw_channel_context *hwc = gc->hwc.driver_data; + + return mana_hwc_send_request(hwc, req_len, req, resp_len, resp); +} + +void +mana_gd_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) +{ + bus_addr_t *paddr = arg; + + if (error) + return; + + KASSERT(nseg == 1, ("too many segments %d!", nseg)); + *paddr = segs->ds_addr; +} + +int +mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi) +{ + bus_addr_t dma_handle; + void *buf; + int err; + + if (!gc || !gmi) + return EINVAL; + + if (length < PAGE_SIZE || (length != roundup_pow_of_two(length))) + return EINVAL; + + err = bus_dma_tag_create(bus_get_dma_tag(gc->dev), /* parent */ + PAGE_SIZE, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + length, /* maxsize */ + 1, /* nsegments */ + length, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg*/ + &gmi->dma_tag); + if (err) { + device_printf(gc->dev, + "failed to create dma tag, err: %d\n", err); + return (err); + } + + /* + * Must have BUS_DMA_ZERO flag to clear the dma memory. + * Otherwise the queue overflow detection mechanism does + * not work. 
+ */ + err = bus_dmamem_alloc(gmi->dma_tag, &buf, + BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &gmi->dma_map); + if (err) { + device_printf(gc->dev, + "failed to alloc dma mem, err: %d\n", err); + bus_dma_tag_destroy(gmi->dma_tag); + return (err); + } + + err = bus_dmamap_load(gmi->dma_tag, gmi->dma_map, buf, + length, mana_gd_dma_map_paddr, &dma_handle, BUS_DMA_NOWAIT); + if (err) { + device_printf(gc->dev, + "failed to load dma mem, err: %d\n", err); + bus_dmamem_free(gmi->dma_tag, buf, gmi->dma_map); + bus_dma_tag_destroy(gmi->dma_tag); + return (err); + } + + gmi->dev = gc->dev; + gmi->dma_handle = dma_handle; + gmi->virt_addr = buf; + gmi->length = length; + + return 0; +} + +void +mana_gd_free_memory(struct gdma_mem_info *gmi) +{ + bus_dmamap_unload(gmi->dma_tag, gmi->dma_map); + bus_dmamem_free(gmi->dma_tag, gmi->virt_addr, gmi->dma_map); + bus_dma_tag_destroy(gmi->dma_tag); +} + +static int +mana_gd_create_hw_eq(struct gdma_context *gc, + struct gdma_queue *queue) +{ + struct gdma_create_queue_resp resp = {}; + struct gdma_create_queue_req req = {}; + int err; + + if (queue->type != GDMA_EQ) + return EINVAL; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_QUEUE, + sizeof(req), sizeof(resp)); + + req.hdr.dev_id = queue->gdma_dev->dev_id; + req.type = queue->type; + req.pdid = queue->gdma_dev->pdid; + req.doolbell_id = queue->gdma_dev->doorbell; + req.gdma_region = queue->mem_info.gdma_region; + req.queue_size = queue->queue_size; + req.log2_throttle_limit = queue->eq.log2_throttle_limit; + req.eq_pci_msix_index = queue->eq.msix_index; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to create queue: %d, 0x%x\n", + err, resp.hdr.status); + return err ? err : EPROTO; + } + + queue->id = resp.queue_index; + queue->eq.disable_needed = true; + queue->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + return 0; +} + +static +int mana_gd_disable_queue(struct gdma_queue *queue) +{ + struct gdma_context *gc = queue->gdma_dev->gdma_context; + struct gdma_disable_queue_req req = {}; + struct gdma_general_resp resp = {}; + int err; + + if (queue->type != GDMA_EQ) + mana_warn(NULL, "Not event queue type 0x%x\n", + queue->type); + + mana_gd_init_req_hdr(&req.hdr, GDMA_DISABLE_QUEUE, + sizeof(req), sizeof(resp)); + + req.hdr.dev_id = queue->gdma_dev->dev_id; + req.type = queue->type; + req.queue_index = queue->id; + req.alloc_res_id_on_creation = 1; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to disable queue: %d, 0x%x\n", err, + resp.hdr.status); + return err ? 
err : EPROTO; + } + + return 0; +} + +#define DOORBELL_OFFSET_SQ 0x0 +#define DOORBELL_OFFSET_RQ 0x400 +#define DOORBELL_OFFSET_CQ 0x800 +#define DOORBELL_OFFSET_EQ 0xFF8 + +static void +mana_gd_ring_doorbell(struct gdma_context *gc, uint32_t db_index, + enum gdma_queue_type q_type, uint32_t qid, + uint32_t tail_ptr, uint8_t num_req) +{ + union gdma_doorbell_entry e = {}; + void __iomem *addr; + + addr = (char *)gc->db_page_base + gc->db_page_size * db_index; + switch (q_type) { + case GDMA_EQ: + e.eq.id = qid; + e.eq.tail_ptr = tail_ptr; + e.eq.arm = num_req; + + addr = (char *)addr + DOORBELL_OFFSET_EQ; + break; + + case GDMA_CQ: + e.cq.id = qid; + e.cq.tail_ptr = tail_ptr; + e.cq.arm = num_req; + + addr = (char *)addr + DOORBELL_OFFSET_CQ; + break; + + case GDMA_RQ: + e.rq.id = qid; + e.rq.tail_ptr = tail_ptr; + e.rq.wqe_cnt = num_req; + + addr = (char *)addr + DOORBELL_OFFSET_RQ; + break; + + case GDMA_SQ: + e.sq.id = qid; + e.sq.tail_ptr = tail_ptr; + + addr = (char *)addr + DOORBELL_OFFSET_SQ; + break; + + default: + mana_warn(NULL, "Invalid queue type 0x%x\n", q_type); + return; + } + + /* Ensure all writes are done before ring doorbell */ + wmb(); + +#if defined(__amd64__) + writeq(addr, e.as_uint64); +#else + uint32_t *p = (uint32_t *)&e.as_uint64; + writel(addr, *p); + writel((char *)addr + 4, *(p + 1)); +#endif +} + +void +mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue) +{ + mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type, + queue->id, queue->head * GDMA_WQE_BU_SIZE, 1); +} + +void +mana_gd_arm_cq(struct gdma_queue *cq) +{ + struct gdma_context *gc = cq->gdma_dev->gdma_context; + + uint32_t num_cqe = cq->queue_size / GDMA_CQE_SIZE; + + uint32_t head = cq->head % (num_cqe << GDMA_CQE_OWNER_BITS); + + mana_gd_ring_doorbell(gc, cq->gdma_dev->doorbell, cq->type, cq->id, + head, SET_ARM_BIT); +} + +static void +mana_gd_process_eqe(struct gdma_queue *eq) +{ + uint32_t head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); + struct gdma_context *gc = eq->gdma_dev->gdma_context; + struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; + union gdma_eqe_info eqe_info; + enum gdma_eqe_type type; + struct gdma_event event; + struct gdma_queue *cq; + struct gdma_eqe *eqe; + uint32_t cq_id; + + eqe = &eq_eqe_ptr[head]; + eqe_info.as_uint32 = eqe->eqe_info; + type = eqe_info.type; + + switch (type) { + case GDMA_EQE_COMPLETION: + cq_id = eqe->details[0] & 0xFFFFFF; + if (cq_id >= gc->max_num_cqs) { + mana_warn(NULL, + "failed: cq_id %u > max_num_cqs %u\n", + cq_id, gc->max_num_cqs); + break; + } + + cq = gc->cq_table[cq_id]; + if (!cq || cq->type != GDMA_CQ || cq->id != cq_id) { + mana_warn(NULL, + "failed: invalid cq_id %u\n", cq_id); + break; + } + + if (cq->cq.callback) + cq->cq.callback(cq->cq.context, cq); + + break; + + case GDMA_EQE_TEST_EVENT: + gc->test_event_eq_id = eq->id; + + mana_dbg(NULL, + "EQE TEST EVENT received for EQ %u\n", eq->id); + + complete(&gc->eq_test_event); + break; + + case GDMA_EQE_HWC_INIT_EQ_ID_DB: + case GDMA_EQE_HWC_INIT_DATA: + case GDMA_EQE_HWC_INIT_DONE: + if (!eq->eq.callback) + break; + + event.type = type; + memcpy(&event.details, &eqe->details, GDMA_EVENT_DATA_SIZE); + eq->eq.callback(eq->eq.context, eq, &event); + break; + + default: + break; + } +} + +static void +mana_gd_process_eq_events(void *arg) +{ + uint32_t owner_bits, new_bits, old_bits; + union gdma_eqe_info eqe_info; + struct gdma_eqe *eq_eqe_ptr; + struct gdma_queue *eq = arg; + struct gdma_context *gc; + uint32_t head, num_eqe; + struct gdma_eqe *eqe; + 
unsigned int arm_bit; + int i, j; + + gc = eq->gdma_dev->gdma_context; + + num_eqe = eq->queue_size / GDMA_EQE_SIZE; + eq_eqe_ptr = eq->queue_mem_ptr; + + bus_dmamap_sync(eq->mem_info.dma_tag, eq->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + /* Process up to 5 EQEs at a time, and update the HW head. */ + for (i = 0; i < 5; i++) { + eqe = &eq_eqe_ptr[eq->head % num_eqe]; + eqe_info.as_uint32 = eqe->eqe_info; + owner_bits = eqe_info.owner_bits; + + old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK; + + /* No more entries */ + if (owner_bits == old_bits) + break; + + new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK; + if (owner_bits != new_bits) { + /* Something wrong. Log for debugging purpose */ + device_printf(gc->dev, + "EQ %d: overflow detected, " + "i = %d, eq->head = %u " + "got owner_bits = %u, new_bits = %u " + "eqe addr %p, eqe->eqe_info 0x%x, " + "eqe type = %x, reserved1 = %x, client_id = %x, " + "reserved2 = %x, owner_bits = %x\n", + eq->id, i, eq->head, + owner_bits, new_bits, + eqe, eqe->eqe_info, + eqe_info.type, eqe_info.reserved1, + eqe_info.client_id, eqe_info.reserved2, + eqe_info.owner_bits); + + uint32_t *eqe_dump = (uint32_t *) eq_eqe_ptr; + for (j = 0; j < 20; j++) { + device_printf(gc->dev, "%p: %x\t%x\t%x\t%x\n", + &eqe_dump[j * 4], eqe_dump[j * 4], eqe_dump[j * 4 + 1], + eqe_dump[j * 4 + 2], eqe_dump[j * 4 + 3]); + } + break; + } + + mana_gd_process_eqe(eq); + + eq->head++; + } + + bus_dmamap_sync(eq->mem_info.dma_tag, eq->mem_info.dma_map, + BUS_DMASYNC_PREREAD); + + /* Always rearm the EQ for HWC. */ + if (mana_gd_is_hwc(eq->gdma_dev)) { + arm_bit = SET_ARM_BIT; + } else if (eq->eq.work_done < eq->eq.budget && + eq->eq.do_not_ring_db == false) { + arm_bit = SET_ARM_BIT; + } else { + arm_bit = 0; + } + + head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS); + + mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq->id, + head, arm_bit); +} + +#define MANA_POLL_BUDGET 8 +#define MANA_RX_BUDGET 256 + +static void +mana_poll(void *arg, int pending) +{ + struct gdma_queue *eq = arg; + int i; + + eq->eq.work_done = 0; + eq->eq.budget = MANA_RX_BUDGET; + + for (i = 0; i < MANA_POLL_BUDGET; i++) { + /* + * If this is the last loop, set the budget big enough + * so it will arm the EQ any way. 
+ */ + if (i == (MANA_POLL_BUDGET - 1)) + eq->eq.budget = CQE_POLLING_BUFFER + 1; + + mana_gd_process_eq_events(eq); + + if (eq->eq.work_done < eq->eq.budget) + break; + + eq->eq.work_done = 0; + } +} + +static void +mana_gd_schedule_task(void *arg) +{ + struct gdma_queue *eq = arg; + + taskqueue_enqueue(eq->eq.cleanup_tq, &eq->eq.cleanup_task); +} + +static int +mana_gd_register_irq(struct gdma_queue *queue, + const struct gdma_queue_spec *spec) +{ + static int mana_last_bind_cpu = -1; + struct gdma_dev *gd = queue->gdma_dev; + bool is_mana = mana_gd_is_mana(gd); + struct gdma_irq_context *gic; + struct gdma_context *gc; + struct gdma_resource *r; + unsigned int msi_index; + int err; + + gc = gd->gdma_context; + r = &gc->msix_resource; + + mtx_lock_spin(&r->lock_spin); + + msi_index = find_first_zero_bit(r->map, r->size); + if (msi_index >= r->size) { + err = ENOSPC; + } else { + bitmap_set(r->map, msi_index, 1); + queue->eq.msix_index = msi_index; + err = 0; + } + + mtx_unlock_spin(&r->lock_spin); + + if (err) + return err; + + if (unlikely(msi_index >= gc->num_msix_usable)) { + device_printf(gc->dev, + "chose an invalid msix index %d, usable %d\n", + msi_index, gc->num_msix_usable); + return ENOSPC; + } + + gic = &gc->irq_contexts[msi_index]; + + if (is_mana) { + struct mana_port_context *apc = if_getsoftc(spec->eq.ndev); + queue->eq.do_not_ring_db = false; + + NET_TASK_INIT(&queue->eq.cleanup_task, 0, mana_poll, queue); + queue->eq.cleanup_tq = + taskqueue_create_fast("mana eq cleanup", + M_WAITOK, taskqueue_thread_enqueue, + &queue->eq.cleanup_tq); + + if (mana_last_bind_cpu < 0) + mana_last_bind_cpu = CPU_FIRST(); + queue->eq.cpu = mana_last_bind_cpu; + mana_last_bind_cpu = CPU_NEXT(mana_last_bind_cpu); + + /* XXX Name is not optimal. However we have to start + * the task here. Otherwise, test eq will have no + * handler. + */ + if (apc->bind_cleanup_thread_cpu) { + cpuset_t cpu_mask; + CPU_SETOF(queue->eq.cpu, &cpu_mask); + taskqueue_start_threads_cpuset(&queue->eq.cleanup_tq, + 1, PI_NET, &cpu_mask, + "mana eq poll msix %u on cpu %d", + msi_index, queue->eq.cpu); + } else { + + taskqueue_start_threads(&queue->eq.cleanup_tq, 1, + PI_NET, "mana eq poll on msix %u", msi_index); + } + } + + if (unlikely(gic->handler || gic->arg)) { + device_printf(gc->dev, + "interrupt handler or arg already assigned, " + "msix index: %d\n", msi_index); + } + + gic->arg = queue; + + if (is_mana) + gic->handler = mana_gd_schedule_task; + else + gic->handler = mana_gd_process_eq_events; + + mana_dbg(NULL, "registered msix index %d vector %d irq %ju\n", + msi_index, gic->msix_e.vector, rman_get_start(gic->res)); + + return 0; +} + +static void +mana_gd_deregiser_irq(struct gdma_queue *queue) +{ + struct gdma_dev *gd = queue->gdma_dev; + struct gdma_irq_context *gic; + struct gdma_context *gc; + struct gdma_resource *r; + unsigned int msix_index; + + gc = gd->gdma_context; + r = &gc->msix_resource; + + /* At most num_online_cpus() + 1 interrupts are used. 
*/ + msix_index = queue->eq.msix_index; + if (unlikely(msix_index >= gc->num_msix_usable)) + return; + + gic = &gc->irq_contexts[msix_index]; + gic->handler = NULL; + gic->arg = NULL; + + mtx_lock_spin(&r->lock_spin); + bitmap_clear(r->map, msix_index, 1); + mtx_unlock_spin(&r->lock_spin); + + queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; + + mana_dbg(NULL, "deregistered msix index %d vector %d irq %ju\n", + msix_index, gic->msix_e.vector, rman_get_start(gic->res)); +} + +int +mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq) +{ + struct gdma_generate_test_event_req req = {}; + struct gdma_general_resp resp = {}; + device_t dev = gc->dev; + int err; + + sx_xlock(&gc->eq_test_event_sx); + + init_completion(&gc->eq_test_event); + gc->test_event_eq_id = INVALID_QUEUE_ID; + + mana_gd_init_req_hdr(&req.hdr, GDMA_GENERATE_TEST_EQE, + sizeof(req), sizeof(resp)); + + req.hdr.dev_id = eq->gdma_dev->dev_id; + req.queue_index = eq->id; + + err = mana_gd_send_request(gc, sizeof(req), &req, + sizeof(resp), &resp); + if (err) { + device_printf(dev, "test_eq failed: %d\n", err); + goto out; + } + + err = EPROTO; + + if (resp.hdr.status) { + device_printf(dev, "test_eq failed: 0x%x\n", + resp.hdr.status); + goto out; + } + + if (wait_for_completion_timeout(&gc->eq_test_event, 30 * hz)) { + device_printf(dev, "test_eq timed out on queue %d\n", + eq->id); + goto out; + } + + if (eq->id != gc->test_event_eq_id) { + device_printf(dev, + "test_eq got an event on wrong queue %d (%d)\n", + gc->test_event_eq_id, eq->id); + goto out; + } + + err = 0; +out: + sx_xunlock(&gc->eq_test_event_sx); + return err; +} + +static void +mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets, + struct gdma_queue *queue) +{ + int err; + + if (flush_evenets) { + err = mana_gd_test_eq(gc, queue); + if (err) + device_printf(gc->dev, + "Failed to flush EQ: %d\n", err); + } + + mana_gd_deregiser_irq(queue); + + if (mana_gd_is_mana(queue->gdma_dev)) { + while (taskqueue_cancel(queue->eq.cleanup_tq, + &queue->eq.cleanup_task, NULL)) + taskqueue_drain(queue->eq.cleanup_tq, + &queue->eq.cleanup_task); + + taskqueue_free(queue->eq.cleanup_tq); + } + + if (queue->eq.disable_needed) + mana_gd_disable_queue(queue); +} + +static int mana_gd_create_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + bool create_hwq, struct gdma_queue *queue) +{ + struct gdma_context *gc = gd->gdma_context; + device_t dev = gc->dev; + uint32_t log2_num_entries; + int err; + + queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; + + log2_num_entries = ilog2(queue->queue_size / GDMA_EQE_SIZE); + + if (spec->eq.log2_throttle_limit > log2_num_entries) { + device_printf(dev, + "EQ throttling limit (%lu) > maximum EQE (%u)\n", + spec->eq.log2_throttle_limit, log2_num_entries); + return EINVAL; + } + + err = mana_gd_register_irq(queue, spec); + if (err) { + device_printf(dev, "Failed to register irq: %d\n", err); + return err; + } + + queue->eq.callback = spec->eq.callback; + queue->eq.context = spec->eq.context; + queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries); + queue->eq.log2_throttle_limit = spec->eq.log2_throttle_limit ?: 1; + + if (create_hwq) { + err = mana_gd_create_hw_eq(gc, queue); + if (err) + goto out; + + err = mana_gd_test_eq(gc, queue); + if (err) + goto out; + } + + return 0; +out: + device_printf(dev, "Failed to create EQ: %d\n", err); + mana_gd_destroy_eq(gc, false, queue); + return err; +} + +static void +mana_gd_create_cq(const struct gdma_queue_spec *spec, + struct gdma_queue *queue) +{ + uint32_t 
log2_num_entries = ilog2(spec->queue_size / GDMA_CQE_SIZE); + + queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries); + queue->cq.parent = spec->cq.parent_eq; + queue->cq.context = spec->cq.context; + queue->cq.callback = spec->cq.callback; +} + +static void +mana_gd_destroy_cq(struct gdma_context *gc, + struct gdma_queue *queue) +{ + uint32_t id = queue->id; + + if (id >= gc->max_num_cqs) + return; + + if (!gc->cq_table[id]) + return; + + gc->cq_table[id] = NULL; +} + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_mem_info *gmi; + struct gdma_queue *queue; + int err; + + queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO); + if (!queue) + return ENOMEM; + + gmi = &queue->mem_info; + err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); + if (err) + goto free_q; + + queue->head = 0; + queue->tail = 0; + queue->queue_mem_ptr = gmi->virt_addr; + queue->queue_size = spec->queue_size; + queue->monitor_avl_buf = spec->monitor_avl_buf; + queue->type = spec->type; + queue->gdma_dev = gd; + + if (spec->type == GDMA_EQ) + err = mana_gd_create_eq(gd, spec, false, queue); + else if (spec->type == GDMA_CQ) + mana_gd_create_cq(spec, queue); + + if (err) + goto out; + + *queue_ptr = queue; + return 0; +out: + mana_gd_free_memory(gmi); +free_q: + free(queue, M_DEVBUF); + return err; +} + +static void +mana_gd_destroy_dma_region(struct gdma_context *gc, uint64_t gdma_region) +{ + struct gdma_destroy_dma_region_req req = {}; + struct gdma_general_resp resp = {}; + int err; + + if (gdma_region == GDMA_INVALID_DMA_REGION) + return; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DMA_REGION, sizeof(req), + sizeof(resp)); + req.gdma_region = gdma_region; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), + &resp); + if (err || resp.hdr.status) + device_printf(gc->dev, + "Failed to destroy DMA region: %d, 0x%x\n", + err, resp.hdr.status); +} + +static int +mana_gd_create_dma_region(struct gdma_dev *gd, + struct gdma_mem_info *gmi) +{ + unsigned int num_page = gmi->length / PAGE_SIZE; + struct gdma_create_dma_region_req *req = NULL; + struct gdma_create_dma_region_resp resp = {}; + struct gdma_context *gc = gd->gdma_context; + struct hw_channel_context *hwc; + uint32_t length = gmi->length; + uint32_t req_msg_size; + int err; + int i; + + if (length < PAGE_SIZE || !is_power_of_2(length)) { + mana_err(NULL, "gmi size incorrect: %u\n", length); + return EINVAL; + } + + if (offset_in_page((uint64_t)gmi->virt_addr) != 0) { + mana_err(NULL, "gmi not page aligned: %p\n", + gmi->virt_addr); + return EINVAL; + } + + hwc = gc->hwc.driver_data; + req_msg_size = sizeof(*req) + num_page * sizeof(uint64_t); + if (req_msg_size > hwc->max_req_msg_size) { + mana_err(NULL, "req msg size too large: %u, %u\n", + req_msg_size, hwc->max_req_msg_size); + return EINVAL; + } + + req = malloc(req_msg_size, M_DEVBUF, M_WAITOK | M_ZERO); + if (!req) + return ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, GDMA_CREATE_DMA_REGION, + req_msg_size, sizeof(resp)); + req->length = length; + req->offset_in_page = 0; + req->gdma_page_type = GDMA_PAGE_TYPE_4K; + req->page_count = num_page; + req->page_addr_list_len = num_page; + + for (i = 0; i < num_page; i++) + req->page_addr_list[i] = gmi->dma_handle + i * PAGE_SIZE; + + err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp); + if (err) + goto out; + + if (resp.hdr.status || resp.gdma_region == GDMA_INVALID_DMA_REGION) 
{ + device_printf(gc->dev, "Failed to create DMA region: 0x%x\n", + resp.hdr.status); + err = EPROTO; + goto out; + } + + gmi->gdma_region = resp.gdma_region; +out: + free(req, M_DEVBUF); + return err; +} + +int +mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_mem_info *gmi; + struct gdma_queue *queue; + int err; + + if (spec->type != GDMA_EQ) + return EINVAL; + + queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO); + if (!queue) + return ENOMEM; + + gmi = &queue->mem_info; + err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); + if (err) + goto free_q; + + err = mana_gd_create_dma_region(gd, gmi); + if (err) + goto out; + + queue->head = 0; + queue->tail = 0; + queue->queue_mem_ptr = gmi->virt_addr; + queue->queue_size = spec->queue_size; + queue->monitor_avl_buf = spec->monitor_avl_buf; + queue->type = spec->type; + queue->gdma_dev = gd; + + err = mana_gd_create_eq(gd, spec, true, queue); + if (err) + goto out; + + *queue_ptr = queue; + return 0; + +out: + mana_gd_free_memory(gmi); +free_q: + free(queue, M_DEVBUF); + return err; +} + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_mem_info *gmi; + struct gdma_queue *queue; + int err; + + if (spec->type != GDMA_CQ && spec->type != GDMA_SQ && + spec->type != GDMA_RQ) + return EINVAL; + + queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO); + if (!queue) + return ENOMEM; + + gmi = &queue->mem_info; + err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); + if (err) + goto free_q; + + err = mana_gd_create_dma_region(gd, gmi); + if (err) + goto out; + + queue->head = 0; + queue->tail = 0; + queue->queue_mem_ptr = gmi->virt_addr; + queue->queue_size = spec->queue_size; + queue->monitor_avl_buf = spec->monitor_avl_buf; + queue->type = spec->type; + queue->gdma_dev = gd; + + if (spec->type == GDMA_CQ) + mana_gd_create_cq(spec, queue); + + *queue_ptr = queue; + return 0; + +out: + mana_gd_free_memory(gmi); +free_q: + free(queue, M_DEVBUF); + return err; +} + +void +mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue) +{ + struct gdma_mem_info *gmi = &queue->mem_info; + + switch (queue->type) { + case GDMA_EQ: + mana_gd_destroy_eq(gc, queue->eq.disable_needed, queue); + break; + + case GDMA_CQ: + mana_gd_destroy_cq(gc, queue); + break; + + case GDMA_RQ: + break; + + case GDMA_SQ: + break; + + default: + device_printf(gc->dev, + "Can't destroy unknown queue: type = %d\n", + queue->type); + return; + } + + mana_gd_destroy_dma_region(gc, gmi->gdma_region); + mana_gd_free_memory(gmi); + free(queue, M_DEVBUF); +} + +int +mana_gd_verify_vf_version(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_verify_ver_resp resp = {}; + struct gdma_verify_ver_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION, + sizeof(req), sizeof(resp)); + + req.protocol_ver_min = GDMA_PROTOCOL_FIRST; + req.protocol_ver_max = GDMA_PROTOCOL_LAST; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "VfVerifyVersionOutput: %d, status=0x%x\n", + err, resp.hdr.status); + return err ? 
err : EPROTO; + } + + return 0; +} + +int +mana_gd_register_device(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_register_device_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + gd->pdid = INVALID_PDID; + gd->doorbell = INVALID_DOORBELL; + gd->gpa_mkey = INVALID_MEM_KEY; + + mana_gd_init_req_hdr(&req.hdr, GDMA_REGISTER_DEVICE, sizeof(req), + sizeof(resp)); + + req.hdr.dev_id = gd->dev_id; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "gdma_register_device_resp failed: %d, 0x%x\n", + err, resp.hdr.status); + return err ? err : -EPROTO; + } + + gd->pdid = resp.pdid; + gd->gpa_mkey = resp.gpa_mkey; + gd->doorbell = resp.db_id; + + mana_dbg(NULL, "mana device pdid %u, gpa_mkey %u, doorbell %u \n", + gd->pdid, gd->gpa_mkey, gd->doorbell); + + return 0; +} + +int +mana_gd_deregister_device(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_general_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + if (gd->pdid == INVALID_PDID) + return EINVAL; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req), + sizeof(resp)); + + req.hdr.dev_id = gd->dev_id; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to deregister device: %d, 0x%x\n", + err, resp.hdr.status); + if (!err) + err = EPROTO; + } + + gd->pdid = INVALID_PDID; + gd->doorbell = INVALID_DOORBELL; + gd->gpa_mkey = INVALID_MEM_KEY; + + return err; +} + +uint32_t +mana_gd_wq_avail_space(struct gdma_queue *wq) +{ + uint32_t used_space = (wq->head - wq->tail) * GDMA_WQE_BU_SIZE; + uint32_t wq_size = wq->queue_size; + + if (used_space > wq_size) { + mana_warn(NULL, "failed: used space %u > queue size %u\n", + used_space, wq_size); + } + + return wq_size - used_space; +} + +uint8_t * +mana_gd_get_wqe_ptr(const struct gdma_queue *wq, uint32_t wqe_offset) +{ + uint32_t offset = + (wqe_offset * GDMA_WQE_BU_SIZE) & (wq->queue_size - 1); + + if ((offset + GDMA_WQE_BU_SIZE) > wq->queue_size) { + mana_warn(NULL, "failed: write end out of queue bound %u, " + "queue size %u\n", + offset + GDMA_WQE_BU_SIZE, wq->queue_size); + } + + return (uint8_t *)wq->queue_mem_ptr + offset; +} + +static uint32_t +mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req, + enum gdma_queue_type q_type, + uint32_t client_oob_size, uint32_t sgl_data_size, + uint8_t *wqe_ptr) +{ + bool oob_in_sgl = !!(wqe_req->flags & GDMA_WR_OOB_IN_SGL); + bool pad_data = !!(wqe_req->flags & GDMA_WR_PAD_BY_SGE0); + struct gdma_wqe *header = (struct gdma_wqe *)wqe_ptr; + uint8_t *ptr; + + memset(header, 0, sizeof(struct gdma_wqe)); + header->num_sge = wqe_req->num_sge; + header->inline_oob_size_div4 = client_oob_size / sizeof(uint32_t); + + if (oob_in_sgl) { + if (!pad_data || wqe_req->num_sge < 2) { + mana_warn(NULL, "no pad_data or num_sge < 2\n"); + } + + header->client_oob_in_sgl = 1; + + if (pad_data) + header->last_vbytes = wqe_req->sgl[0].size; + } + + if (q_type == GDMA_SQ) + header->client_data_unit = wqe_req->client_data_unit; + + /* + * The size of gdma_wqe + client_oob_size must be less than or equal + * to one Basic Unit (i.e. 32 bytes), so the pointer can't go beyond + * the queue memory buffer boundary. 
+ */ + ptr = wqe_ptr + sizeof(header); + + if (wqe_req->inline_oob_data && wqe_req->inline_oob_size > 0) { + memcpy(ptr, wqe_req->inline_oob_data, wqe_req->inline_oob_size); + + if (client_oob_size > wqe_req->inline_oob_size) + memset(ptr + wqe_req->inline_oob_size, 0, + client_oob_size - wqe_req->inline_oob_size); + } + + return sizeof(header) + client_oob_size; +} + +static void +mana_gd_write_sgl(struct gdma_queue *wq, uint8_t *wqe_ptr, + const struct gdma_wqe_request *wqe_req) +{ + uint32_t sgl_size = sizeof(struct gdma_sge) * wqe_req->num_sge; + const uint8_t *address = (uint8_t *)wqe_req->sgl; + uint8_t *base_ptr, *end_ptr; + uint32_t size_to_end; + + base_ptr = wq->queue_mem_ptr; + end_ptr = base_ptr + wq->queue_size; + size_to_end = (uint32_t)(end_ptr - wqe_ptr); + + if (size_to_end < sgl_size) { + memcpy(wqe_ptr, address, size_to_end); + + wqe_ptr = base_ptr; + address += size_to_end; + sgl_size -= size_to_end; + } + + memcpy(wqe_ptr, address, sgl_size); +} + +int +mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info) +{ + uint32_t client_oob_size = wqe_req->inline_oob_size; + struct gdma_context *gc; + uint32_t sgl_data_size; + uint32_t max_wqe_size; + uint32_t wqe_size; + uint8_t *wqe_ptr; + + if (wqe_req->num_sge == 0) + return EINVAL; + + if (wq->type == GDMA_RQ) { + if (client_oob_size != 0) + return EINVAL; + + client_oob_size = INLINE_OOB_SMALL_SIZE; + + max_wqe_size = GDMA_MAX_RQE_SIZE; + } else { + if (client_oob_size != INLINE_OOB_SMALL_SIZE && + client_oob_size != INLINE_OOB_LARGE_SIZE) + return EINVAL; + + max_wqe_size = GDMA_MAX_SQE_SIZE; + } + + sgl_data_size = sizeof(struct gdma_sge) * wqe_req->num_sge; + wqe_size = ALIGN(sizeof(struct gdma_wqe) + client_oob_size + + sgl_data_size, GDMA_WQE_BU_SIZE); + if (wqe_size > max_wqe_size) + return EINVAL; + + if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq)) { + gc = wq->gdma_dev->gdma_context; + device_printf(gc->dev, "unsuccessful flow control!\n"); + return ENOSPC; + } + + if (wqe_info) + wqe_info->wqe_size_in_bu = wqe_size / GDMA_WQE_BU_SIZE; + + wqe_ptr = mana_gd_get_wqe_ptr(wq, wq->head); + wqe_ptr += mana_gd_write_client_oob(wqe_req, wq->type, client_oob_size, + sgl_data_size, wqe_ptr); + if (wqe_ptr >= (uint8_t *)wq->queue_mem_ptr + wq->queue_size) + wqe_ptr -= wq->queue_size; + + mana_gd_write_sgl(wq, wqe_ptr, wqe_req); + + wq->head += wqe_size / GDMA_WQE_BU_SIZE; + + bus_dmamap_sync(wq->mem_info.dma_tag, wq->mem_info.dma_map, + BUS_DMASYNC_PREWRITE); + + return 0; +} + +int +mana_gd_post_and_ring(struct gdma_queue *queue, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info) +{ + struct gdma_context *gc = queue->gdma_dev->gdma_context; + int err; + + err = mana_gd_post_work_request(queue, wqe_req, wqe_info); + if (err) + return err; + + mana_gd_wq_ring_doorbell(gc, queue); + + return 0; +} + +static int +mana_gd_read_cqe(struct gdma_queue *cq, struct gdma_comp *comp) +{ + unsigned int num_cqe = cq->queue_size / sizeof(struct gdma_cqe); + struct gdma_cqe *cq_cqe = cq->queue_mem_ptr; + uint32_t owner_bits, new_bits, old_bits; + struct gdma_cqe *cqe; + + cqe = &cq_cqe[cq->head % num_cqe]; + owner_bits = cqe->cqe_info.owner_bits; + + old_bits = (cq->head / num_cqe - 1) & GDMA_CQE_OWNER_MASK; + /* Return 0 if no more entries. */ + if (owner_bits == old_bits) + return 0; + + new_bits = (cq->head / num_cqe) & GDMA_CQE_OWNER_MASK; + /* Return -1 if overflow detected. 
*/ + if (owner_bits != new_bits) + return -1; + + comp->wq_num = cqe->cqe_info.wq_num; + comp->is_sq = cqe->cqe_info.is_sq; + memcpy(comp->cqe_data, cqe->cqe_data, GDMA_COMP_DATA_SIZE); + + return 1; +} + +int +mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe) +{ + int cqe_idx; + int ret; + + bus_dmamap_sync(cq->mem_info.dma_tag, cq->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + for (cqe_idx = 0; cqe_idx < num_cqe; cqe_idx++) { + ret = mana_gd_read_cqe(cq, &comp[cqe_idx]); + + if (ret < 0) { + cq->head -= cqe_idx; + return ret; + } + + if (ret == 0) + break; + + cq->head++; + } + + return cqe_idx; +} + +static void +mana_gd_intr(void *arg) +{ + struct gdma_irq_context *gic = arg; + + if (gic->handler) { + gic->handler(gic->arg); + } +} + +int +mana_gd_alloc_res_map(uint32_t res_avail, + struct gdma_resource *r, const char *lock_name) +{ + int n = howmany(res_avail, BITS_PER_LONG); + + r->map = + malloc(n * sizeof(unsigned long), M_DEVBUF, M_WAITOK | M_ZERO); + if (!r->map) + return ENOMEM; + + r->size = res_avail; + mtx_init(&r->lock_spin, lock_name, NULL, MTX_SPIN); + + mana_dbg(NULL, + "total res %u, total number of unsigned longs %u\n", + r->size, n); + return (0); +} + +void +mana_gd_free_res_map(struct gdma_resource *r) +{ + if (!r || !r->map) + return; + + free(r->map, M_DEVBUF); + r->map = NULL; + r->size = 0; +} + +static void +mana_gd_init_registers(struct gdma_context *gc) +{ + uint64_t bar0_va = rman_get_bushandle(gc->bar0); + + gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF; + + gc->db_page_base = + (void *) (bar0_va + mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET)); + + gc->shm_base = + (void *) (bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET)); + + mana_dbg(NULL, "db_page_size 0x%xx, db_page_base %p," + " shm_base %p\n", + gc->db_page_size, gc->db_page_base, gc->shm_base); +} + +static struct resource * +mana_gd_alloc_bar(device_t dev, int bar) +{ + struct resource *res = NULL; + struct pci_map *pm; + int rid, type; + + if (bar < 0 || bar > PCIR_MAX_BAR_0) + goto alloc_bar_out; + + pm = pci_find_bar(dev, PCIR_BAR(bar)); + if (!pm) + goto alloc_bar_out; + + if (PCI_BAR_IO(pm->pm_value)) + type = SYS_RES_IOPORT; + else + type = SYS_RES_MEMORY; + if (type < 0) + goto alloc_bar_out; + + rid = PCIR_BAR(bar); + res = bus_alloc_resource_any(dev, type, &rid, RF_ACTIVE); +#if defined(__amd64__) + if (res) + mana_dbg(NULL, "bar %d: rid 0x%x, type 0x%jx," + " handle 0x%jx\n", + bar, rid, res->r_bustag, res->r_bushandle); +#endif + +alloc_bar_out: + return (res); +} + +static void +mana_gd_free_pci_res(struct gdma_context *gc) +{ + if (!gc || gc->dev) + return; + + if (gc->bar0 != NULL) { + bus_release_resource(gc->dev, SYS_RES_MEMORY, + PCIR_BAR(GDMA_BAR0), gc->bar0); + } + + if (gc->msix != NULL) { + bus_release_resource(gc->dev, SYS_RES_MEMORY, + gc->msix_rid, gc->msix); + } +} + +static int +mana_gd_setup_irqs(device_t dev) +{ + unsigned int max_queues_per_port = mp_ncpus; + struct gdma_context *gc = device_get_softc(dev); + struct gdma_irq_context *gic; + unsigned int max_irqs; + int nvec; + int rc, rcc, i; + + if (max_queues_per_port > MANA_MAX_NUM_QUEUES) + max_queues_per_port = MANA_MAX_NUM_QUEUES; + + max_irqs = max_queues_per_port * MAX_PORTS_IN_MANA_DEV; + + /* Need 1 interrupt for the Hardware communication Channel (HWC) */ + max_irqs++; + + nvec = max_irqs; + rc = pci_alloc_msix(dev, &nvec); + if (unlikely(rc != 0)) { + device_printf(dev, + "Failed to allocate MSIX, vectors %d, error: %d\n", + nvec, rc); + rc = ENOSPC; + goto 
err_setup_irq_alloc; + } + + if (nvec != max_irqs) { + if (nvec == 1) { + device_printf(dev, + "Not enough number of MSI-x allocated: %d\n", + nvec); + rc = ENOSPC; + goto err_setup_irq_release; + } + device_printf(dev, "Allocated only %d MSI-x (%d requested)\n", + nvec, max_irqs); + } + + gc->irq_contexts = malloc(nvec * sizeof(struct gdma_irq_context), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!gc->irq_contexts) { + rc = ENOMEM; + goto err_setup_irq_release; + } + + for (i = 0; i < nvec; i++) { + gic = &gc->irq_contexts[i]; + gic->msix_e.entry = i; + /* Vector starts from 1. */ + gic->msix_e.vector = i + 1; + gic->handler = NULL; + gic->arg = NULL; + + gic->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &gic->msix_e.vector, RF_ACTIVE | RF_SHAREABLE); + if (unlikely(gic->res == NULL)) { + rc = ENOMEM; + device_printf(dev, "could not allocate resource " + "for irq vector %d\n", gic->msix_e.vector); + goto err_setup_irq; + } + + rc = bus_setup_intr(dev, gic->res, + INTR_TYPE_NET | INTR_MPSAFE, NULL, mana_gd_intr, + gic, &gic->cookie); + if (unlikely(rc != 0)) { + device_printf(dev, "failed to register interrupt " + "handler for irq %ju vector %d: error %d\n", + rman_get_start(gic->res), gic->msix_e.vector, rc); + goto err_setup_irq; + } + gic->requested = true; + + mana_dbg(NULL, "added msix vector %d irq %ju\n", + gic->msix_e.vector, rman_get_start(gic->res)); + } + + rc = mana_gd_alloc_res_map(nvec, &gc->msix_resource, + "gdma msix res lock"); + if (rc != 0) { + device_printf(dev, "failed to allocate memory " + "for msix bitmap\n"); + goto err_setup_irq; + } + + gc->max_num_msix = nvec; + gc->num_msix_usable = nvec; + + mana_dbg(NULL, "setup %d msix interrupts\n", nvec); + + return (0); + +err_setup_irq: + for (; i >= 0; i--) { + gic = &gc->irq_contexts[i]; + rcc = 0; + + /* + * If gic->requested is true, we need to free both intr and + * resources. 
+ */ + if (gic->requested) + rcc = bus_teardown_intr(dev, gic->res, gic->cookie); + if (unlikely(rcc != 0)) + device_printf(dev, "could not release " + "irq vector %d, error: %d\n", + gic->msix_e.vector, rcc); + + rcc = 0; + if (gic->res != NULL) { + rcc = bus_release_resource(dev, SYS_RES_IRQ, + gic->msix_e.vector, gic->res); + } + if (unlikely(rcc != 0)) + device_printf(dev, "dev has no parent while " + "releasing resource for irq vector %d\n", + gic->msix_e.vector); + gic->requested = false; + gic->res = NULL; + } + + free(gc->irq_contexts, M_DEVBUF); + gc->irq_contexts = NULL; +err_setup_irq_release: + pci_release_msi(dev); +err_setup_irq_alloc: + return (rc); +} + +static void +mana_gd_remove_irqs(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_irq_context *gic; + int rc, i; + + mana_gd_free_res_map(&gc->msix_resource); + + for (i = 0; i < gc->max_num_msix; i++) { + gic = &gc->irq_contexts[i]; + if (gic->requested) { + rc = bus_teardown_intr(dev, gic->res, gic->cookie); + if (unlikely(rc != 0)) { + device_printf(dev, "failed to tear down " + "irq vector %d, error: %d\n", + gic->msix_e.vector, rc); + } + gic->requested = false; + } + + if (gic->res != NULL) { + rc = bus_release_resource(dev, SYS_RES_IRQ, + gic->msix_e.vector, gic->res); + if (unlikely(rc != 0)) { + device_printf(dev, "dev has no parent while " + "releasing resource for irq vector %d\n", + gic->msix_e.vector); + } + gic->res = NULL; + } + } + + gc->max_num_msix = 0; + gc->num_msix_usable = 0; + free(gc->irq_contexts, M_DEVBUF); + gc->irq_contexts = NULL; + + pci_release_msi(dev); +} + +static int +mana_gd_probe(device_t dev) +{ + mana_vendor_id_t *ent; + char adapter_name[60]; + uint16_t pci_vendor_id = 0; + uint16_t pci_device_id = 0; + + pci_vendor_id = pci_get_vendor(dev); + pci_device_id = pci_get_device(dev); + + ent = mana_id_table; + while (ent->vendor_id != 0) { + if ((pci_vendor_id == ent->vendor_id) && + (pci_device_id == ent->device_id)) { + mana_dbg(NULL, "vendor=%x device=%x\n", + pci_vendor_id, pci_device_id); + + sprintf(adapter_name, DEVICE_DESC); + device_set_desc_copy(dev, adapter_name); + return (BUS_PROBE_DEFAULT); + } + + ent++; + } + + return (ENXIO); +} + +/** + * mana_attach - Device Initialization Routine + * @dev: device information struct + * + * Returns 0 on success, otherwise on failure. + * + * mana_attach initializes a GDMA adapter identified by a device structure. 
+ **/ +static int +mana_gd_attach(device_t dev) +{ + struct gdma_context *gc; + int msix_rid; + int rc; + + gc = device_get_softc(dev); + gc->dev = dev; + + pci_enable_io(dev, SYS_RES_IOPORT); + pci_enable_io(dev, SYS_RES_MEMORY); + + pci_enable_busmaster(dev); + + gc->bar0 = mana_gd_alloc_bar(dev, GDMA_BAR0); + if (unlikely(gc->bar0 == NULL)) { + device_printf(dev, + "unable to allocate bus resource for bar0!\n"); + rc = ENOMEM; + goto err_disable_dev; + } + + /* Store bar0 tage and handle for quick access */ + gc->gd_bus.bar0_t = rman_get_bustag(gc->bar0); + gc->gd_bus.bar0_h = rman_get_bushandle(gc->bar0); + + /* Map MSI-x vector table */ + msix_rid = pci_msix_table_bar(dev); + + mana_dbg(NULL, "msix_rid 0x%x\n", msix_rid); + + gc->msix = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &msix_rid, RF_ACTIVE); + if (unlikely(gc->msix == NULL)) { + device_printf(dev, + "unable to allocate bus resource for msix!\n"); + rc = ENOMEM; + goto err_free_pci_res; + } + gc->msix_rid = msix_rid; + + if (unlikely(gc->gd_bus.bar0_h == 0)) { + device_printf(dev, "failed to map bar0!\n"); + rc = ENXIO; + goto err_free_pci_res; + } + + mana_gd_init_registers(gc); + + mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base); + + rc = mana_gd_setup_irqs(dev); + if (rc) { + goto err_free_pci_res; + } + + sx_init(&gc->eq_test_event_sx, "gdma test event sx"); + + rc = mana_hwc_create_channel(gc); + if (rc) { + mana_dbg(NULL, "Failed to create hwc channel\n"); + if (rc == EIO) + goto err_clean_up_gdma; + else + goto err_remove_irq; + } + + rc = mana_gd_verify_vf_version(dev); + if (rc) { + mana_dbg(NULL, "Failed to verify vf\n"); + goto err_clean_up_gdma; + } + + rc = mana_gd_query_max_resources(dev); + if (rc) { + mana_dbg(NULL, "Failed to query max resources\n"); + goto err_clean_up_gdma; + } + + rc = mana_gd_detect_devices(dev); + if (rc) { + mana_dbg(NULL, "Failed to detect mana device\n"); + goto err_clean_up_gdma; + } + + rc = mana_probe(&gc->mana); + if (rc) { + mana_dbg(NULL, "Failed to probe mana device\n"); + goto err_clean_up_gdma; + } + + return (0); + +err_clean_up_gdma: + mana_hwc_destroy_channel(gc); + if (gc->cq_table) + free(gc->cq_table, M_DEVBUF); + gc->cq_table = NULL; +err_remove_irq: + mana_gd_remove_irqs(dev); +err_free_pci_res: + mana_gd_free_pci_res(gc); +err_disable_dev: + pci_disable_busmaster(dev); + + return(rc); +} + +/** + * mana_detach - Device Removal Routine + * @pdev: device information struct + * + * mana_detach is called by the device subsystem to alert the driver + * that it should release a PCI device. 
+ **/ +static int +mana_gd_detach(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + + mana_remove(&gc->mana); + + mana_hwc_destroy_channel(gc); + free(gc->cq_table, M_DEVBUF); + gc->cq_table = NULL; + + mana_gd_remove_irqs(dev); + + mana_gd_free_pci_res(gc); + + pci_disable_busmaster(dev); + + return (bus_generic_detach(dev)); +} + + +/********************************************************************* + * FreeBSD Device Interface Entry Points + *********************************************************************/ + +static device_method_t mana_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, mana_gd_probe), + DEVMETHOD(device_attach, mana_gd_attach), + DEVMETHOD(device_detach, mana_gd_detach), + DEVMETHOD_END +}; + +static driver_t mana_driver = { + "mana", mana_methods, sizeof(struct gdma_context), +}; + +devclass_t mana_devclass; +DRIVER_MODULE(mana, pci, mana_driver, mana_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device", pci, mana, mana_id_table, + nitems(mana_id_table) - 1); +MODULE_DEPEND(mana, pci, 1, 1, 1); +MODULE_DEPEND(mana, ether, 1, 1, 1); + +/*********************************************************************/ diff --git a/sys/dev/mana/gdma_util.c b/sys/dev/mana/gdma_util.c new file mode 100644 index 000000000000..304caa28ec7a --- /dev/null +++ b/sys/dev/mana/gdma_util.c @@ -0,0 +1,96 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include "gdma_util.h" + + +void +init_completion(struct completion *c) +{ + memset(c, 0, sizeof(*c)); + mtx_init(&c->lock, "gdma_completion", NULL, MTX_DEF); + c->done = 0; +} + +void +free_completion(struct completion *c) +{ + mtx_destroy(&c->lock); +} + +void +complete(struct completion *c) +{ + mtx_lock(&c->lock); + c->done++; + mtx_unlock(&c->lock); + wakeup(c); +} + +void +wait_for_completion(struct completion *c) +{ + mtx_lock(&c->lock); + while (c->done == 0) + mtx_sleep(c, &c->lock, 0, "gdma_wfc", 0); + c->done--; + mtx_unlock(&c->lock); +} + +/* + * Return: 0 if completed, a non-zero value if timed out. + */ +int +wait_for_completion_timeout(struct completion *c, int timeout) +{ + int ret; + + mtx_lock(&c->lock); + + if (c->done == 0) + mtx_sleep(c, &c->lock, 0, "gdma_wfc", timeout); + + if (c->done > 0) { + c->done--; + ret = 0; + } else { + ret = 1; + } + + mtx_unlock(&c->lock); + + return (ret); +} diff --git a/sys/dev/mana/gdma_util.h b/sys/dev/mana/gdma_util.h new file mode 100644 index 000000000000..da2dfe54f1b9 --- /dev/null +++ b/sys/dev/mana/gdma_util.h @@ -0,0 +1,206 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _GDMA_UTIL_H_ +#define _GDMA_UTIL_H_ + +#include <sys/types.h> +#include <sys/param.h> + +/* Log Levels */ +#define MANA_ALERT (1 << 0) /* Alerts are providing more error info. */ +#define MANA_WARNING (1 << 1) /* Driver output is more error sensitive. */ +#define MANA_INFO (1 << 2) /* Provides additional driver info. */ +#define MANA_DBG (1 << 3) /* Driver output for debugging. */ + +extern int mana_log_level; + +#define mana_trace_raw(ctx, level, fmt, args...) \ + do { \ + ((void)(ctx)); \ + if (((level) & mana_log_level) != (level)) \ + break; \ + printf(fmt, ##args); \ + } while (0) + +#define mana_trace(ctx, level, fmt, args...) \ + mana_trace_raw(ctx, level, "%s() [TID:%d]: " \ + fmt, __func__, curthread->td_tid, ##args) + + +#define mana_dbg(ctx, format, arg...) 
\ + mana_trace(ctx, MANA_DBG, format, ##arg) +#define mana_info(ctx, format, arg...) \ + mana_trace(ctx, MANA_INFO, format, ##arg) +#define mana_warn(ctx, format, arg...) \ + mana_trace(ctx, MANA_WARNING, format, ##arg) +#define mana_err(ctx, format, arg...) \ + mana_trace(ctx, MANA_ALERT, format, ##arg) + +#define unlikely(x) __predict_false(!!(x)) +#define likely(x) __predict_true(!!(x)) + + +#define BITS_PER_LONG (sizeof(long) * NBBY) + +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) +#define BITMAP_LAST_WORD_MASK(n) (~0UL >> (BITS_PER_LONG - (n))) +#define BITS_TO_LONGS(n) howmany((n), BITS_PER_LONG) +#define BIT_MASK(nr) (1UL << ((nr) & (BITS_PER_LONG - 1))) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +#undef ALIGN +#define ALIGN(x, y) roundup2((x), (y)) +#define IS_ALIGNED(x, a) (((x) & ((__typeof(x))(a) - 1)) == 0) + +#define BIT(n) (1ULL << (n)) + +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) +#define offset_in_page(x) ((x) & PAGE_MASK) + +#define min_t(type, _x, _y) \ + ((type)(_x) < (type)(_y) ? (type)(_x) : (type)(_y)) + +#define test_bit(i, a) \ + ((((volatile const unsigned long *)(a))[BIT_WORD(i)]) & BIT_MASK(i)) + +typedef volatile uint32_t atomic_t; + +#define atomic_add_return(v, p) (atomic_fetchadd_int(p, v) + (v)) +#define atomic_sub_return(v, p) (atomic_fetchadd_int(p, -(v)) - (v)) +#define atomic_inc_return(p) atomic_add_return(1, p) +#define atomic_dec_return(p) atomic_sub_return(1, p) +#define atomic_read(p) atomic_add_return(0, p) + +#define usleep_range(_1, _2) \ + pause_sbt("gdma-usleep-range", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE) + +static inline void +gdma_msleep(unsigned int ms) +{ + if (ms == 0) + ms = 1; + pause_sbt("gdma-msleep", mstosbt(ms), 0, C_HARDCLOCK); +} + +static inline void +bitmap_set(unsigned long *map, unsigned int start, int nr) +{ + const unsigned int size = start + nr; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + map += BIT_WORD(start); + + while (nr - bits_to_set >= 0) { + *map |= mask_to_set; + nr -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + map++; + } + + if (nr) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *map |= mask_to_set; + } +} + +static inline void +bitmap_clear(unsigned long *map, unsigned int start, int nr) +{ + const unsigned int size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + map += BIT_WORD(start); + + while (nr - bits_to_clear >= 0) { + *map &= ~mask_to_clear; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + map++; + } + + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *map &= ~mask_to_clear; + } +} + +static inline unsigned long +find_first_zero_bit(const unsigned long *p, unsigned long max) +{ + unsigned long i, n; + + for (i = 0; i < max / BITS_PER_LONG + 1; i++) { + n = ~p[i]; + if (n != 0) + return (i * BITS_PER_LONG + ffsl(n) - 1); + } + return (max); +} + +static inline unsigned long +ilog2(unsigned long x) +{ + unsigned long log = x; + while (x >>= 1) + log++; + return (log); +} + +static inline unsigned long +roundup_pow_of_two(unsigned long x) +{ + return (1UL << flsl(x - 1)); +} + +static inline int +is_power_of_2(unsigned long n) +{ + return (n == roundup_pow_of_two(n)); +} + +struct completion { + unsigned int done; + struct mtx lock; +}; + +void init_completion(struct completion *c); +void free_completion(struct 
completion *c); +void complete(struct completion *c); +void wait_for_completion(struct completion *c); +int wait_for_completion_timeout(struct completion *c, int timeout); +#endif /* _GDMA_UTIL_H_ */ diff --git a/sys/dev/mana/hw_channel.c b/sys/dev/mana/hw_channel.c new file mode 100644 index 000000000000..1949f1d2e049 --- /dev/null +++ b/sys/dev/mana/hw_channel.c @@ -0,0 +1,950 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/bus.h> +#include <machine/bus.h> + +#include "mana.h" +#include "hw_channel.h" + +static int +mana_hwc_get_msg_index(struct hw_channel_context *hwc, uint16_t *msg_id) +{ + struct gdma_resource *r = &hwc->inflight_msg_res; + uint32_t index; + + sema_wait(&hwc->sema); + + mtx_lock_spin(&r->lock_spin); + + index = find_first_zero_bit(hwc->inflight_msg_res.map, + hwc->inflight_msg_res.size); + + bitmap_set(hwc->inflight_msg_res.map, index, 1); + + mtx_unlock_spin(&r->lock_spin); + + *msg_id = index; + + return 0; +} + +static void +mana_hwc_put_msg_index(struct hw_channel_context *hwc, uint16_t msg_id) +{ + struct gdma_resource *r = &hwc->inflight_msg_res; + + mtx_lock_spin(&r->lock_spin); + bitmap_clear(hwc->inflight_msg_res.map, msg_id, 1); + mtx_unlock_spin(&r->lock_spin); + + sema_post(&hwc->sema); +} + +static int +mana_hwc_verify_resp_msg(const struct hwc_caller_ctx *caller_ctx, + const struct gdma_resp_hdr *resp_msg, + uint32_t resp_len) +{ + if (resp_len < sizeof(*resp_msg)) + return EPROTO; + + if (resp_len > caller_ctx->output_buflen) + return EPROTO; + + return 0; +} + +static void +mana_hwc_handle_resp(struct hw_channel_context *hwc, uint32_t resp_len, + const struct gdma_resp_hdr *resp_msg) +{ + struct hwc_caller_ctx *ctx; + int err; + + if (!test_bit(resp_msg->response.hwc_msg_id, + hwc->inflight_msg_res.map)) { + device_printf(hwc->dev, "hwc_rx: invalid msg_id = %u\n", + resp_msg->response.hwc_msg_id); + return; + } + + ctx = hwc->caller_ctx + resp_msg->response.hwc_msg_id; + err = mana_hwc_verify_resp_msg(ctx, resp_msg, resp_len); + if (err) + goto out; + + ctx->status_code = resp_msg->status; + + memcpy(ctx->output_buf, resp_msg, resp_len); +out: + ctx->error = err; + complete(&ctx->comp_event); +} + +static int +mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq, + struct hwc_work_request *req) +{ + device_t dev = hwc_rxq->hwc->dev; + struct gdma_sge *sge; + int err; + + sge = &req->sge; + sge->address = (uint64_t)req->buf_sge_addr; + sge->mem_key = hwc_rxq->msg_buf->gpa_mkey; + sge->size = req->buf_len; + + memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request)); + req->wqe_req.sgl = sge; + req->wqe_req.num_sge = 1; + req->wqe_req.client_data_unit = 0; + + err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL); + if (err) + device_printf(dev, + "Failed to post WQE on HWC RQ: %d\n", err); + return err; +} + +static void +mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self, + struct gdma_event *event) +{ + struct hw_channel_context *hwc = ctx; + struct gdma_dev *gd = hwc->gdma_dev; + union hwc_init_type_data type_data; + union hwc_init_eq_id_db eq_db; + uint32_t type, val; + + switch (event->type) { + case GDMA_EQE_HWC_INIT_EQ_ID_DB: + eq_db.as_uint32 = event->details[0]; + hwc->cq->gdma_eq->id = eq_db.eq_id; + gd->doorbell = eq_db.doorbell; + break; + + case GDMA_EQE_HWC_INIT_DATA: + type_data.as_uint32 = event->details[0]; + type = type_data.type; + val = type_data.value; + + switch (type) { + case HWC_INIT_DATA_CQID: + hwc->cq->gdma_cq->id = val; + break; + + case HWC_INIT_DATA_RQID: + hwc->rxq->gdma_wq->id = val; + break; + + case HWC_INIT_DATA_SQID: + hwc->txq->gdma_wq->id = val; + break; + + case HWC_INIT_DATA_QUEUE_DEPTH: + hwc->hwc_init_q_depth_max = (uint16_t)val; + break; + + case 
HWC_INIT_DATA_MAX_REQUEST: + hwc->hwc_init_max_req_msg_size = val; + break; + + case HWC_INIT_DATA_MAX_RESPONSE: + hwc->hwc_init_max_resp_msg_size = val; + break; + + case HWC_INIT_DATA_MAX_NUM_CQS: + gd->gdma_context->max_num_cqs = val; + break; + + case HWC_INIT_DATA_PDID: + hwc->gdma_dev->pdid = val; + break; + + case HWC_INIT_DATA_GPA_MKEY: + hwc->rxq->msg_buf->gpa_mkey = val; + hwc->txq->msg_buf->gpa_mkey = val; + break; + } + + break; + + case GDMA_EQE_HWC_INIT_DONE: + complete(&hwc->hwc_init_eqe_comp); + break; + + default: + /* Ignore unknown events, which should never happen. */ + break; + } +} + +static void +mana_hwc_rx_event_handler(void *ctx, uint32_t gdma_rxq_id, + const struct hwc_rx_oob *rx_oob) +{ + struct hw_channel_context *hwc = ctx; + struct hwc_wq *hwc_rxq = hwc->rxq; + struct hwc_work_request *rx_req; + struct gdma_resp_hdr *resp; + struct gdma_wqe *dma_oob; + struct gdma_queue *rq; + struct gdma_sge *sge; + uint64_t rq_base_addr; + uint64_t rx_req_idx; + uint8_t *wqe; + + if (hwc_rxq->gdma_wq->id != gdma_rxq_id) { + mana_warn(NULL, "unmatched rx queue %u != %u\n", + hwc_rxq->gdma_wq->id, gdma_rxq_id); + return; + } + + + rq = hwc_rxq->gdma_wq; + wqe = mana_gd_get_wqe_ptr(rq, rx_oob->wqe_offset / GDMA_WQE_BU_SIZE); + dma_oob = (struct gdma_wqe *)wqe; + + bus_dmamap_sync(rq->mem_info.dma_tag, rq->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + sge = (struct gdma_sge *)(wqe + 8 + dma_oob->inline_oob_size_div4 * 4); + + /* Select the RX work request for virtual address and for reposting. */ + rq_base_addr = hwc_rxq->msg_buf->mem_info.dma_handle; + rx_req_idx = (sge->address - rq_base_addr) / hwc->max_req_msg_size; + + bus_dmamap_sync(hwc_rxq->msg_buf->mem_info.dma_tag, + hwc_rxq->msg_buf->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + rx_req = &hwc_rxq->msg_buf->reqs[rx_req_idx]; + resp = (struct gdma_resp_hdr *)rx_req->buf_va; + + if (resp->response.hwc_msg_id >= hwc->num_inflight_msg) { + device_printf(hwc->dev, "HWC RX: wrong msg_id=%u\n", + resp->response.hwc_msg_id); + return; + } + + mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, resp); + + /* Do no longer use 'resp', because the buffer is posted to the HW + * in the below mana_hwc_post_rx_wqe(). 
+ */ + resp = NULL; + + bus_dmamap_sync(hwc_rxq->msg_buf->mem_info.dma_tag, + hwc_rxq->msg_buf->mem_info.dma_map, + BUS_DMASYNC_PREREAD); + + mana_hwc_post_rx_wqe(hwc_rxq, rx_req); +} + +static void +mana_hwc_tx_event_handler(void *ctx, uint32_t gdma_txq_id, + const struct hwc_rx_oob *rx_oob) +{ + struct hw_channel_context *hwc = ctx; + struct hwc_wq *hwc_txq = hwc->txq; + + if (!hwc_txq || hwc_txq->gdma_wq->id != gdma_txq_id) { + mana_warn(NULL, "unmatched tx queue %u != %u\n", + hwc_txq->gdma_wq->id, gdma_txq_id); + } + + bus_dmamap_sync(hwc_txq->gdma_wq->mem_info.dma_tag, + hwc_txq->gdma_wq->mem_info.dma_map, + BUS_DMASYNC_POSTWRITE); +} + +static int +mana_hwc_create_gdma_wq(struct hw_channel_context *hwc, + enum gdma_queue_type type, uint64_t queue_size, + struct gdma_queue **queue) +{ + struct gdma_queue_spec spec = {}; + + if (type != GDMA_SQ && type != GDMA_RQ) + return EINVAL; + + spec.type = type; + spec.monitor_avl_buf = false; + spec.queue_size = queue_size; + + return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue); +} + +static int +mana_hwc_create_gdma_cq(struct hw_channel_context *hwc, + uint64_t queue_size, + void *ctx, gdma_cq_callback *cb, + struct gdma_queue *parent_eq, + struct gdma_queue **queue) +{ + struct gdma_queue_spec spec = {}; + + spec.type = GDMA_CQ; + spec.monitor_avl_buf = false; + spec.queue_size = queue_size; + spec.cq.context = ctx; + spec.cq.callback = cb; + spec.cq.parent_eq = parent_eq; + + return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue); +} + +static int +mana_hwc_create_gdma_eq(struct hw_channel_context *hwc, + uint64_t queue_size, + void *ctx, gdma_eq_callback *cb, + struct gdma_queue **queue) +{ + struct gdma_queue_spec spec = {}; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = queue_size; + spec.eq.context = ctx; + spec.eq.callback = cb; + spec.eq.log2_throttle_limit = DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ; + + return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue); +} + +static void +mana_hwc_comp_event(void *ctx, struct gdma_queue *q_self) +{ + struct hwc_rx_oob comp_data = {}; + struct gdma_comp *completions; + struct hwc_cq *hwc_cq = ctx; + int comp_read, i; + + completions = hwc_cq->comp_buf; + comp_read = mana_gd_poll_cq(q_self, completions, hwc_cq->queue_depth); + + for (i = 0; i < comp_read; ++i) { + comp_data = *(struct hwc_rx_oob *)completions[i].cqe_data; + + if (completions[i].is_sq) + hwc_cq->tx_event_handler(hwc_cq->tx_event_ctx, + completions[i].wq_num, + &comp_data); + else + hwc_cq->rx_event_handler(hwc_cq->rx_event_ctx, + completions[i].wq_num, + &comp_data); + } + + bus_dmamap_sync(q_self->mem_info.dma_tag, q_self->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + mana_gd_arm_cq(q_self); +} + +static void +mana_hwc_destroy_cq(struct gdma_context *gc, struct hwc_cq *hwc_cq) +{ + if (!hwc_cq) + return; + + if (hwc_cq->comp_buf) + free(hwc_cq->comp_buf, M_DEVBUF); + + if (hwc_cq->gdma_cq) + mana_gd_destroy_queue(gc, hwc_cq->gdma_cq); + + if (hwc_cq->gdma_eq) + mana_gd_destroy_queue(gc, hwc_cq->gdma_eq); + + free(hwc_cq, M_DEVBUF); +} + +static int +mana_hwc_create_cq(struct hw_channel_context *hwc, + uint16_t q_depth, + gdma_eq_callback *callback, void *ctx, + hwc_rx_event_handler_t *rx_ev_hdlr, void *rx_ev_ctx, + hwc_tx_event_handler_t *tx_ev_hdlr, void *tx_ev_ctx, + struct hwc_cq **hwc_cq_ptr) +{ + struct gdma_queue *eq, *cq; + struct gdma_comp *comp_buf; + struct hwc_cq *hwc_cq; + uint32_t eq_size, cq_size; + int err; + + eq_size = roundup_pow_of_two(GDMA_EQE_SIZE * q_depth); + 
if (eq_size < MINIMUM_SUPPORTED_PAGE_SIZE) + eq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + + cq_size = roundup_pow_of_two(GDMA_CQE_SIZE * q_depth); + if (cq_size < MINIMUM_SUPPORTED_PAGE_SIZE) + cq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + + hwc_cq = malloc(sizeof(*hwc_cq), M_DEVBUF, M_WAITOK | M_ZERO); + if (!hwc_cq) + return ENOMEM; + + err = mana_hwc_create_gdma_eq(hwc, eq_size, ctx, callback, &eq); + if (err) { + device_printf(hwc->dev, + "Failed to create HWC EQ for RQ: %d\n", err); + goto out; + } + hwc_cq->gdma_eq = eq; + + err = mana_hwc_create_gdma_cq(hwc, cq_size, hwc_cq, + mana_hwc_comp_event, eq, &cq); + if (err) { + device_printf(hwc->dev, + "Failed to create HWC CQ for RQ: %d\n", err); + goto out; + } + hwc_cq->gdma_cq = cq; + + comp_buf = mallocarray(q_depth, sizeof(struct gdma_comp), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!comp_buf) { + err = ENOMEM; + goto out; + } + + hwc_cq->hwc = hwc; + hwc_cq->comp_buf = comp_buf; + hwc_cq->queue_depth = q_depth; + hwc_cq->rx_event_handler = rx_ev_hdlr; + hwc_cq->rx_event_ctx = rx_ev_ctx; + hwc_cq->tx_event_handler = tx_ev_hdlr; + hwc_cq->tx_event_ctx = tx_ev_ctx; + + *hwc_cq_ptr = hwc_cq; + return 0; +out: + mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc_cq); + return err; +} + +static int +mana_hwc_alloc_dma_buf(struct hw_channel_context *hwc, uint16_t q_depth, + uint32_t max_msg_size, + struct hwc_dma_buf **dma_buf_ptr) +{ + struct gdma_context *gc = hwc->gdma_dev->gdma_context; + struct hwc_work_request *hwc_wr; + struct hwc_dma_buf *dma_buf; + struct gdma_mem_info *gmi; + uint32_t buf_size; + uint8_t *base_pa; + void *virt_addr; + uint16_t i; + int err; + + dma_buf = malloc(sizeof(*dma_buf) + + q_depth * sizeof(struct hwc_work_request), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!dma_buf) + return ENOMEM; + + dma_buf->num_reqs = q_depth; + + buf_size = ALIGN(q_depth * max_msg_size, PAGE_SIZE); + + gmi = &dma_buf->mem_info; + err = mana_gd_alloc_memory(gc, buf_size, gmi); + if (err) { + device_printf(hwc->dev, + "Failed to allocate DMA buffer: %d\n", err); + goto out; + } + + virt_addr = dma_buf->mem_info.virt_addr; + base_pa = (uint8_t *)dma_buf->mem_info.dma_handle; + + for (i = 0; i < q_depth; i++) { + hwc_wr = &dma_buf->reqs[i]; + + hwc_wr->buf_va = (char *)virt_addr + i * max_msg_size; + hwc_wr->buf_sge_addr = base_pa + i * max_msg_size; + + hwc_wr->buf_len = max_msg_size; + } + + *dma_buf_ptr = dma_buf; + return 0; +out: + free(dma_buf, M_DEVBUF); + return err; +} + +static void +mana_hwc_dealloc_dma_buf(struct hw_channel_context *hwc, + struct hwc_dma_buf *dma_buf) +{ + if (!dma_buf) + return; + + mana_gd_free_memory(&dma_buf->mem_info); + + free(dma_buf, M_DEVBUF); +} + +static void +mana_hwc_destroy_wq(struct hw_channel_context *hwc, + struct hwc_wq *hwc_wq) +{ + if (!hwc_wq) + return; + + mana_hwc_dealloc_dma_buf(hwc, hwc_wq->msg_buf); + + if (hwc_wq->gdma_wq) + mana_gd_destroy_queue(hwc->gdma_dev->gdma_context, + hwc_wq->gdma_wq); + + free(hwc_wq, M_DEVBUF); +} + +static int +mana_hwc_create_wq(struct hw_channel_context *hwc, + enum gdma_queue_type q_type, uint16_t q_depth, + uint32_t max_msg_size, struct hwc_cq *hwc_cq, + struct hwc_wq **hwc_wq_ptr) +{ + struct gdma_queue *queue; + struct hwc_wq *hwc_wq; + uint32_t queue_size; + int err; + + if (q_type != GDMA_SQ && q_type != GDMA_RQ) { + /* XXX should fail and return error? 
*/ + mana_warn(NULL, "Invalid q_type %u\n", q_type); + } + + if (q_type == GDMA_RQ) + queue_size = roundup_pow_of_two(GDMA_MAX_RQE_SIZE * q_depth); + else + queue_size = roundup_pow_of_two(GDMA_MAX_SQE_SIZE * q_depth); + + if (queue_size < MINIMUM_SUPPORTED_PAGE_SIZE) + queue_size = MINIMUM_SUPPORTED_PAGE_SIZE; + + hwc_wq = malloc(sizeof(*hwc_wq), M_DEVBUF, M_WAITOK | M_ZERO); + if (!hwc_wq) + return ENOMEM; + + err = mana_hwc_create_gdma_wq(hwc, q_type, queue_size, &queue); + if (err) + goto out; + + err = mana_hwc_alloc_dma_buf(hwc, q_depth, max_msg_size, + &hwc_wq->msg_buf); + if (err) + goto out; + + hwc_wq->hwc = hwc; + hwc_wq->gdma_wq = queue; + hwc_wq->queue_depth = q_depth; + hwc_wq->hwc_cq = hwc_cq; + + *hwc_wq_ptr = hwc_wq; + return 0; +out: + if (err) + mana_hwc_destroy_wq(hwc, hwc_wq); + return err; +} + +static int +mana_hwc_post_tx_wqe(const struct hwc_wq *hwc_txq, + struct hwc_work_request *req, + uint32_t dest_virt_rq_id, uint32_t dest_virt_rcq_id, + bool dest_pf) +{ + device_t dev = hwc_txq->hwc->dev; + struct hwc_tx_oob *tx_oob; + struct gdma_sge *sge; + int err; + + if (req->msg_size == 0 || req->msg_size > req->buf_len) { + device_printf(dev, "wrong msg_size: %u, buf_len: %u\n", + req->msg_size, req->buf_len); + return EINVAL; + } + + tx_oob = &req->tx_oob; + + tx_oob->vrq_id = dest_virt_rq_id; + tx_oob->dest_vfid = 0; + tx_oob->vrcq_id = dest_virt_rcq_id; + tx_oob->vscq_id = hwc_txq->hwc_cq->gdma_cq->id; + tx_oob->loopback = false; + tx_oob->lso_override = false; + tx_oob->dest_pf = dest_pf; + tx_oob->vsq_id = hwc_txq->gdma_wq->id; + + sge = &req->sge; + sge->address = (uint64_t)req->buf_sge_addr; + sge->mem_key = hwc_txq->msg_buf->gpa_mkey; + sge->size = req->msg_size; + + memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request)); + req->wqe_req.sgl = sge; + req->wqe_req.num_sge = 1; + req->wqe_req.inline_oob_size = sizeof(struct hwc_tx_oob); + req->wqe_req.inline_oob_data = tx_oob; + req->wqe_req.client_data_unit = 0; + + err = mana_gd_post_and_ring(hwc_txq->gdma_wq, &req->wqe_req, NULL); + if (err) + device_printf(dev, + "Failed to post WQE on HWC SQ: %d\n", err); + return err; +} + +static int +mana_hwc_init_inflight_msg(struct hw_channel_context *hwc, uint16_t num_msg) +{ + int err; + + sema_init(&hwc->sema, num_msg, "gdma hwc sema"); + + err = mana_gd_alloc_res_map(num_msg, &hwc->inflight_msg_res, + "gdma hwc res lock"); + if (err) + device_printf(hwc->dev, + "Failed to init inflight_msg_res: %d\n", err); + + return (err); +} + +static int +mana_hwc_test_channel(struct hw_channel_context *hwc, uint16_t q_depth, + uint32_t max_req_msg_size, uint32_t max_resp_msg_size) +{ + struct gdma_context *gc = hwc->gdma_dev->gdma_context; + struct hwc_wq *hwc_rxq = hwc->rxq; + struct hwc_work_request *req; + struct hwc_caller_ctx *ctx; + int err; + int i; + + /* Post all WQEs on the RQ */ + for (i = 0; i < q_depth; i++) { + req = &hwc_rxq->msg_buf->reqs[i]; + err = mana_hwc_post_rx_wqe(hwc_rxq, req); + if (err) + return err; + } + + ctx = malloc(q_depth * sizeof(struct hwc_caller_ctx), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!ctx) + return ENOMEM; + + for (i = 0; i < q_depth; ++i) + init_completion(&ctx[i].comp_event); + + hwc->caller_ctx = ctx; + + return mana_gd_test_eq(gc, hwc->cq->gdma_eq); +} + +static int +mana_hwc_establish_channel(struct gdma_context *gc, uint16_t *q_depth, + uint32_t *max_req_msg_size, + uint32_t *max_resp_msg_size) +{ + struct hw_channel_context *hwc = gc->hwc.driver_data; + struct gdma_queue *rq = hwc->rxq->gdma_wq; + struct gdma_queue *sq = 
hwc->txq->gdma_wq; + struct gdma_queue *eq = hwc->cq->gdma_eq; + struct gdma_queue *cq = hwc->cq->gdma_cq; + int err; + + init_completion(&hwc->hwc_init_eqe_comp); + + err = mana_smc_setup_hwc(&gc->shm_channel, false, + eq->mem_info.dma_handle, + cq->mem_info.dma_handle, + rq->mem_info.dma_handle, + sq->mem_info.dma_handle, + eq->eq.msix_index); + if (err) + return err; + + if (wait_for_completion_timeout(&hwc->hwc_init_eqe_comp, 60 * hz)) + return ETIMEDOUT; + + *q_depth = hwc->hwc_init_q_depth_max; + *max_req_msg_size = hwc->hwc_init_max_req_msg_size; + *max_resp_msg_size = hwc->hwc_init_max_resp_msg_size; + + if (cq->id >= gc->max_num_cqs) { + mana_warn(NULL, "invalid cq id %u > %u\n", + cq->id, gc->max_num_cqs); + return EPROTO; + } + + gc->cq_table = malloc(gc->max_num_cqs * sizeof(struct gdma_queue *), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!gc->cq_table) + return ENOMEM; + + gc->cq_table[cq->id] = cq; + + return 0; +} + +static int +mana_hwc_init_queues(struct hw_channel_context *hwc, uint16_t q_depth, + uint32_t max_req_msg_size, uint32_t max_resp_msg_size) +{ + struct hwc_wq *hwc_rxq = NULL; + struct hwc_wq *hwc_txq = NULL; + struct hwc_cq *hwc_cq = NULL; + int err; + + err = mana_hwc_init_inflight_msg(hwc, q_depth); + if (err) + return err; + + /* CQ is shared by SQ and RQ, so CQ's queue depth is the sum of SQ + * queue depth and RQ queue depth. + */ + err = mana_hwc_create_cq(hwc, q_depth * 2, + mana_hwc_init_event_handler, hwc, + mana_hwc_rx_event_handler, hwc, + mana_hwc_tx_event_handler, hwc, &hwc_cq); + if (err) { + device_printf(hwc->dev, "Failed to create HWC CQ: %d\n", err); + goto out; + } + hwc->cq = hwc_cq; + + err = mana_hwc_create_wq(hwc, GDMA_RQ, q_depth, max_req_msg_size, + hwc_cq, &hwc_rxq); + if (err) { + device_printf(hwc->dev, "Failed to create HWC RQ: %d\n", err); + goto out; + } + hwc->rxq = hwc_rxq; + + err = mana_hwc_create_wq(hwc, GDMA_SQ, q_depth, max_resp_msg_size, + hwc_cq, &hwc_txq); + if (err) { + device_printf(hwc->dev, "Failed to create HWC SQ: %d\n", err); + goto out; + } + hwc->txq = hwc_txq; + + hwc->num_inflight_msg = q_depth; + hwc->max_req_msg_size = max_req_msg_size; + + return 0; +out: + if (hwc_txq) + mana_hwc_destroy_wq(hwc, hwc_txq); + + if (hwc_rxq) + mana_hwc_destroy_wq(hwc, hwc_rxq); + + if (hwc_cq) + mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc_cq); + + mana_gd_free_res_map(&hwc->inflight_msg_res); + return err; +} + +int +mana_hwc_create_channel(struct gdma_context *gc) +{ + uint32_t max_req_msg_size, max_resp_msg_size; + struct gdma_dev *gd = &gc->hwc; + struct hw_channel_context *hwc; + uint16_t q_depth_max; + int err; + + hwc = malloc(sizeof(*hwc), M_DEVBUF, M_WAITOK | M_ZERO); + if (!hwc) + return ENOMEM; + + gd->gdma_context = gc; + gd->driver_data = hwc; + hwc->gdma_dev = gd; + hwc->dev = gc->dev; + + /* HWC's instance number is always 0. 
*/ + gd->dev_id.as_uint32 = 0; + gd->dev_id.type = GDMA_DEVICE_HWC; + + gd->pdid = INVALID_PDID; + gd->doorbell = INVALID_DOORBELL; + + err = mana_hwc_init_queues(hwc, HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH, + HW_CHANNEL_MAX_REQUEST_SIZE, + HW_CHANNEL_MAX_RESPONSE_SIZE); + if (err) { + device_printf(hwc->dev, "Failed to initialize HWC: %d\n", + err); + goto out; + } + + err = mana_hwc_establish_channel(gc, &q_depth_max, &max_req_msg_size, + &max_resp_msg_size); + if (err) { + device_printf(hwc->dev, "Failed to establish HWC: %d\n", err); + goto out; + } + + err = mana_hwc_test_channel(gc->hwc.driver_data, + HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH, + max_req_msg_size, max_resp_msg_size); + if (err) { + /* Test failed, but the channel has been established */ + device_printf(hwc->dev, "Failed to test HWC: %d\n", err); + return EIO; + } + + return 0; +out: + free(hwc, M_DEVBUF); + return (err); +} + +void +mana_hwc_destroy_channel(struct gdma_context *gc) +{ + struct hw_channel_context *hwc = gc->hwc.driver_data; + struct hwc_caller_ctx *ctx; + + mana_smc_teardown_hwc(&gc->shm_channel, false); + + ctx = hwc->caller_ctx; + free(ctx, M_DEVBUF); + hwc->caller_ctx = NULL; + + mana_hwc_destroy_wq(hwc, hwc->txq); + hwc->txq = NULL; + + mana_hwc_destroy_wq(hwc, hwc->rxq); + hwc->rxq = NULL; + + mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc->cq); + hwc->cq = NULL; + + mana_gd_free_res_map(&hwc->inflight_msg_res); + + hwc->num_inflight_msg = 0; + + if (hwc->gdma_dev->pdid != INVALID_PDID) { + hwc->gdma_dev->doorbell = INVALID_DOORBELL; + hwc->gdma_dev->pdid = INVALID_PDID; + } + + free(hwc, M_DEVBUF); + gc->hwc.driver_data = NULL; + gc->hwc.gdma_context = NULL; +} + +int +mana_hwc_send_request(struct hw_channel_context *hwc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp) +{ + struct hwc_work_request *tx_wr; + struct hwc_wq *txq = hwc->txq; + struct gdma_req_hdr *req_msg; + struct hwc_caller_ctx *ctx; + uint16_t msg_id; + int err; + + mana_hwc_get_msg_index(hwc, &msg_id); + + tx_wr = &txq->msg_buf->reqs[msg_id]; + + if (req_len > tx_wr->buf_len) { + device_printf(hwc->dev, + "HWC: req msg size: %d > %d\n", req_len, + tx_wr->buf_len); + err = EINVAL; + goto out; + } + + ctx = hwc->caller_ctx + msg_id; + ctx->output_buf = resp; + ctx->output_buflen = resp_len; + + req_msg = (struct gdma_req_hdr *)tx_wr->buf_va; + if (req) + memcpy(req_msg, req, req_len); + + req_msg->req.hwc_msg_id = msg_id; + + tx_wr->msg_size = req_len; + + err = mana_hwc_post_tx_wqe(txq, tx_wr, 0, 0, false); + if (err) { + device_printf(hwc->dev, + "HWC: Failed to post send WQE: %d\n", err); + goto out; + } + + if (wait_for_completion_timeout(&ctx->comp_event, 30 * hz)) { + device_printf(hwc->dev, "HWC: Request timed out!\n"); + err = ETIMEDOUT; + goto out; + } + + if (ctx->error) { + err = ctx->error; + goto out; + } + + if (ctx->status_code) { + device_printf(hwc->dev, + "HWC: Failed hw_channel req: 0x%x\n", ctx->status_code); + err = EPROTO; + goto out; + } +out: + mana_hwc_put_msg_index(hwc, msg_id); + return err; +} diff --git a/sys/dev/mana/hw_channel.h b/sys/dev/mana/hw_channel.h new file mode 100644 index 000000000000..368cc1ecd5f9 --- /dev/null +++ b/sys/dev/mana/hw_channel.h @@ -0,0 +1,222 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _HW_CHANNEL_H +#define _HW_CHANNEL_H + +#include <sys/sema.h> + +#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4 + +#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000 +#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000 + +#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1 + +#define HWC_INIT_DATA_CQID 1 +#define HWC_INIT_DATA_RQID 2 +#define HWC_INIT_DATA_SQID 3 +#define HWC_INIT_DATA_QUEUE_DEPTH 4 +#define HWC_INIT_DATA_MAX_REQUEST 5 +#define HWC_INIT_DATA_MAX_RESPONSE 6 +#define HWC_INIT_DATA_MAX_NUM_CQS 7 +#define HWC_INIT_DATA_PDID 8 +#define HWC_INIT_DATA_GPA_MKEY 9 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +union hwc_init_eq_id_db { + uint32_t as_uint32; + + struct { + uint32_t eq_id : 16; + uint32_t doorbell: 16; + }; +}; /* HW DATA */ + +union hwc_init_type_data { + uint32_t as_uint32; + + struct { + uint32_t value : 24; + uint32_t type : 8; + }; +}; /* HW DATA */ + +struct hwc_rx_oob { + uint32_t type : 6; + uint32_t eom : 1; + uint32_t som : 1; + uint32_t vendor_err : 8; + uint32_t reserved1 : 16; + + uint32_t src_virt_wq : 24; + uint32_t src_vfid : 8; + + uint32_t reserved2; + + union { + uint32_t wqe_addr_low; + uint32_t wqe_offset; + }; + + uint32_t wqe_addr_high; + + uint32_t client_data_unit : 14; + uint32_t reserved3 : 18; + + uint32_t tx_oob_data_size; + + uint32_t chunk_offset : 21; + uint32_t reserved4 : 11; +}; /* HW DATA */ + +struct hwc_tx_oob { + uint32_t reserved1; + + uint32_t reserved2; + + uint32_t vrq_id : 24; + uint32_t dest_vfid : 8; + + uint32_t vrcq_id : 24; + uint32_t reserved3 : 8; + + uint32_t vscq_id : 24; + uint32_t loopback : 1; + uint32_t lso_override: 1; + uint32_t dest_pf : 1; + uint32_t reserved4 : 5; + + uint32_t vsq_id : 24; + uint32_t reserved5 : 8; +}; /* HW DATA */ + +struct hwc_work_request { + void *buf_va; + void *buf_sge_addr; + uint32_t buf_len; + uint32_t msg_size; + + struct gdma_wqe_request wqe_req; + struct hwc_tx_oob tx_oob; + + struct gdma_sge sge; +}; + +/* hwc_dma_buf represents the array of in-flight WQEs. + * mem_info as know as the GDMA mapped memory is partitioned and used by + * in-flight WQEs. + * The number of WQEs is determined by the number of in-flight messages. 
+ */ +struct hwc_dma_buf { + struct gdma_mem_info mem_info; + + uint32_t gpa_mkey; + + uint32_t num_reqs; + struct hwc_work_request reqs[]; +}; + +typedef void hwc_rx_event_handler_t(void *ctx, uint32_t gdma_rxq_id, + const struct hwc_rx_oob *rx_oob); + +typedef void hwc_tx_event_handler_t(void *ctx, uint32_t gdma_txq_id, + const struct hwc_rx_oob *rx_oob); + +struct hwc_cq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_cq; + struct gdma_queue *gdma_eq; + struct gdma_comp *comp_buf; + uint16_t queue_depth; + + hwc_rx_event_handler_t *rx_event_handler; + void *rx_event_ctx; + + hwc_tx_event_handler_t *tx_event_handler; + void *tx_event_ctx; +}; + +struct hwc_wq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_wq; + struct hwc_dma_buf *msg_buf; + uint16_t queue_depth; + + struct hwc_cq *hwc_cq; +}; + +struct hwc_caller_ctx { + struct completion comp_event; + void *output_buf; + uint32_t output_buflen; + + uint32_t error; /* Error code */ + uint32_t status_code; +}; + +struct hw_channel_context { + struct gdma_dev *gdma_dev; + device_t dev; + + uint16_t num_inflight_msg; + uint32_t max_req_msg_size; + + uint16_t hwc_init_q_depth_max; + uint32_t hwc_init_max_req_msg_size; + uint32_t hwc_init_max_resp_msg_size; + + struct completion hwc_init_eqe_comp; + + struct hwc_wq *rxq; + struct hwc_wq *txq; + struct hwc_cq *cq; + + struct sema sema; + struct gdma_resource inflight_msg_res; + + struct hwc_caller_ctx *caller_ctx; +}; + +int mana_hwc_create_channel(struct gdma_context *gc); +void mana_hwc_destroy_channel(struct gdma_context *gc); + +int mana_hwc_send_request(struct hw_channel_context *hwc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp); + +#endif /* _HW_CHANNEL_H */ diff --git a/sys/dev/mana/mana.h b/sys/dev/mana/mana.h new file mode 100644 index 000000000000..683ab67a6abd --- /dev/null +++ b/sys/dev/mana/mana.h @@ -0,0 +1,689 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + +#ifndef _MANA_H +#define _MANA_H + +#include <sys/types.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/counter.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_media.h> +#include <netinet/tcp_lro.h> + +#include "gdma.h" +#include "hw_channel.h" + + +/* Microsoft Azure Network Adapter (MANA)'s definitions + * + * Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ +/* MANA protocol version */ +#define MANA_MAJOR_VERSION 0 +#define MANA_MINOR_VERSION 1 +#define MANA_MICRO_VERSION 1 + +#define DRV_MODULE_NAME "mana" + +#ifndef DRV_MODULE_VERSION +#define DRV_MODULE_VERSION \ + __XSTRING(MANA_MAJOR_VERSION) "." \ + __XSTRING(MANA_MINOR_VERSION) "." \ + __XSTRING(MANA_MICRO_VERSION) +#endif +#define DEVICE_NAME "Microsoft Azure Network Adapter (MANA)" +#define DEVICE_DESC "MANA adapter" + +/* + * Supported PCI vendor and devices IDs + */ +#ifndef PCI_VENDOR_ID_MICROSOFT +#define PCI_VENDOR_ID_MICROSOFT 0x1414 +#endif + +#define PCI_DEV_ID_MANA_VF 0x00ba + +typedef struct _mana_vendor_id_t { + uint16_t vendor_id; + uint16_t device_id; +} mana_vendor_id_t; + +typedef uint64_t mana_handle_t; +#define INVALID_MANA_HANDLE ((mana_handle_t)-1) + +enum TRI_STATE { + TRI_STATE_UNKNOWN = -1, + TRI_STATE_FALSE = 0, + TRI_STATE_TRUE = 1 +}; + +/* Number of entries for hardware indirection table must be in power of 2 */ +#define MANA_INDIRECT_TABLE_SIZE 64 +#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1) + +/* The Toeplitz hash key's length in bytes: should be multiple of 8 */ +#define MANA_HASH_KEY_SIZE 40 + +#define COMP_ENTRY_SIZE 64 + +#define MIN_FRAME_SIZE 146 +#define ADAPTER_MTU_SIZE 1500 +#define DEFAULT_FRAME_SIZE (ADAPTER_MTU_SIZE + 14) +#define MAX_FRAME_SIZE 4096 + +#define RX_BUFFERS_PER_QUEUE 512 + +#define MAX_SEND_BUFFERS_PER_QUEUE 256 + +#define EQ_SIZE (8 * PAGE_SIZE) +#define LOG2_EQ_THROTTLE 3 + +#if 1 /* XXX */ +#define MAX_PORTS_IN_MANA_DEV 1 +#else +#define MAX_PORTS_IN_MANA_DEV 16 +#endif + +struct mana_send_buf_info { + struct mbuf *mbuf; + bus_dmamap_t dma_map; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. + */ + struct gdma_posted_wqe_info wqe_inf; +}; + +struct mana_stats { + counter_u64_t packets; /* rx, tx */ + counter_u64_t bytes; /* rx, tx */ + counter_u64_t stop; /* tx */ + counter_u64_t wakeup; /* tx */ + counter_u64_t collapse; /* tx */ + counter_u64_t collapse_err; /* tx */ + counter_u64_t dma_mapping_err; /* rx, tx */ + counter_u64_t mbuf_alloc_fail; /* rx */ + counter_u64_t alt_chg; /* tx */ + counter_u64_t alt_reset; /* tx */ +}; + +struct mana_txq { + struct gdma_queue *gdma_sq; + + union { + uint32_t gdma_txq_id; + struct { + uint32_t reserved1 :10; + uint32_t vsq_frame :14; + uint32_t reserved2 :8; + }; + }; + + uint16_t vp_offset; + + struct ifnet *ndev; + /* Store index to the array of tx_qp in port structure */ + int idx; + /* The alternative txq idx when this txq is under heavy load */ + int alt_txq_idx; + + /* The mbufs are sent to the HW and we are waiting for the CQEs. 
*/ + struct mana_send_buf_info *tx_buf_info; + uint16_t next_to_use; + uint16_t next_to_complete; + + atomic_t pending_sends; + + struct buf_ring *txq_br; + struct mtx txq_mtx; + char txq_mtx_name[16]; + + struct task enqueue_task; + struct taskqueue *enqueue_tq; + + struct mana_stats stats; +}; + + +/* + * Max WQE size is 512B. The first 8B is for GDMA Out of Band (OOB), + * next is the Client OOB can be either 8B or 24B. Thus, the max + * space for SGL entries in a singel WQE is 512 - 8 - 8 = 496B. Since each + * SGL is 16B in size, the max number of SGLs in a WQE is 496/16 = 31. + * Save one for emergency use, set the MAX_MBUF_FRAGS allowed to 30. + */ +#define MAX_MBUF_FRAGS 30 +#define MANA_TSO_MAXSEG_SZ PAGE_SIZE + +/* mbuf data and frags dma mappings */ +struct mana_mbuf_head { + bus_addr_t dma_handle[MAX_MBUF_FRAGS + 1]; + + uint32_t size[MAX_MBUF_FRAGS + 1]; +}; + +#define MANA_HEADROOM sizeof(struct mana_mbuf_head) + +enum mana_tx_pkt_format { + MANA_SHORT_PKT_FMT = 0, + MANA_LONG_PKT_FMT = 1, +}; + +struct mana_tx_short_oob { + uint32_t pkt_fmt :2; + uint32_t is_outer_ipv4 :1; + uint32_t is_outer_ipv6 :1; + uint32_t comp_iphdr_csum :1; + uint32_t comp_tcp_csum :1; + uint32_t comp_udp_csum :1; + uint32_t supress_txcqe_gen :1; + uint32_t vcq_num :24; + + uint32_t trans_off :10; /* Transport header offset */ + uint32_t vsq_frame :14; + uint32_t short_vp_offset :8; +}; /* HW DATA */ + +struct mana_tx_long_oob { + uint32_t is_encap :1; + uint32_t inner_is_ipv6 :1; + uint32_t inner_tcp_opt :1; + uint32_t inject_vlan_pri_tag :1; + uint32_t reserved1 :12; + uint32_t pcp :3; /* 802.1Q */ + uint32_t dei :1; /* 802.1Q */ + uint32_t vlan_id :12; /* 802.1Q */ + + uint32_t inner_frame_offset :10; + uint32_t inner_ip_rel_offset :6; + uint32_t long_vp_offset :12; + uint32_t reserved2 :4; + + uint32_t reserved3; + uint32_t reserved4; +}; /* HW DATA */ + +struct mana_tx_oob { + struct mana_tx_short_oob s_oob; + struct mana_tx_long_oob l_oob; +}; /* HW DATA */ + +enum mana_cq_type { + MANA_CQ_TYPE_RX, + MANA_CQ_TYPE_TX, +}; + +enum mana_cqe_type { + CQE_INVALID = 0, + CQE_RX_OKAY = 1, + CQE_RX_COALESCED_4 = 2, + CQE_RX_OBJECT_FENCE = 3, + CQE_RX_TRUNCATED = 4, + + CQE_TX_OKAY = 32, + CQE_TX_SA_DROP = 33, + CQE_TX_MTU_DROP = 34, + CQE_TX_INVALID_OOB = 35, + CQE_TX_INVALID_ETH_TYPE = 36, + CQE_TX_HDR_PROCESSING_ERROR = 37, + CQE_TX_VF_DISABLED = 38, + CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39, + CQE_TX_VPORT_DISABLED = 40, + CQE_TX_VLAN_TAGGING_VIOLATION = 41, +}; + +#define MANA_CQE_COMPLETION 1 + +struct mana_cqe_header { + uint32_t cqe_type :6; + uint32_t client_type :2; + uint32_t vendor_err :24; +}; /* HW DATA */ + +/* NDIS HASH Types */ +#define NDIS_HASH_IPV4 BIT(0) +#define NDIS_HASH_TCP_IPV4 BIT(1) +#define NDIS_HASH_UDP_IPV4 BIT(2) +#define NDIS_HASH_IPV6 BIT(3) +#define NDIS_HASH_TCP_IPV6 BIT(4) +#define NDIS_HASH_UDP_IPV6 BIT(5) +#define NDIS_HASH_IPV6_EX BIT(6) +#define NDIS_HASH_TCP_IPV6_EX BIT(7) +#define NDIS_HASH_UDP_IPV6_EX BIT(8) + +#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define MANA_HASH_L4 \ + (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) + +#define NDIS_HASH_IPV4_L3_MASK (NDIS_HASH_IPV4) +#define NDIS_HASH_IPV4_L4_MASK (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4) +#define NDIS_HASH_IPV6_L3_MASK (NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define NDIS_HASH_IPV6_L4_MASK \ + (NDIS_HASH_TCP_IPV6 | NDIS_HASH_UDP_IPV6 | \ + NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) 
+#define NDIS_HASH_IPV4_MASK \ + (NDIS_HASH_IPV4_L3_MASK | NDIS_HASH_IPV4_L4_MASK) +#define NDIS_HASH_IPV6_MASK \ + (NDIS_HASH_IPV6_L3_MASK | NDIS_HASH_IPV6_L4_MASK) + + +struct mana_rxcomp_perpkt_info { + uint32_t pkt_len :16; + uint32_t reserved1 :16; + uint32_t reserved2; + uint32_t pkt_hash; +}; /* HW DATA */ + +#define MANA_RXCOMP_OOB_NUM_PPI 4 + +/* Receive completion OOB */ +struct mana_rxcomp_oob { + struct mana_cqe_header cqe_hdr; + + uint32_t rx_vlan_id :12; + uint32_t rx_vlantag_present :1; + uint32_t rx_outer_iphdr_csum_succeed :1; + uint32_t rx_outer_iphdr_csum_fail :1; + uint32_t reserved1 :1; + uint32_t rx_hashtype :9; + uint32_t rx_iphdr_csum_succeed :1; + uint32_t rx_iphdr_csum_fail :1; + uint32_t rx_tcp_csum_succeed :1; + uint32_t rx_tcp_csum_fail :1; + uint32_t rx_udp_csum_succeed :1; + uint32_t rx_udp_csum_fail :1; + uint32_t reserved2 :1; + + struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; + + uint32_t rx_wqe_offset; +}; /* HW DATA */ + +struct mana_tx_comp_oob { + struct mana_cqe_header cqe_hdr; + + uint32_t tx_data_offset; + + uint32_t tx_sgl_offset :5; + uint32_t tx_wqe_offset :27; + + uint32_t reserved[12]; +}; /* HW DATA */ + +struct mana_rxq; + +struct mana_cq { + struct gdma_queue *gdma_cq; + + /* Cache the CQ id (used to verify if each CQE comes to the right CQ. */ + uint32_t gdma_id; + + /* Type of the CQ: TX or RX */ + enum mana_cq_type type; + + /* Pointer to the mana_rxq that is pushing RX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_RX. + */ + struct mana_rxq *rxq; + + /* Pointer to the mana_txq that is pushing TX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_TX. + */ + struct mana_txq *txq; + + /* Pointer to a buffer which the CQ handler can copy the CQE's into. */ + struct gdma_comp *gdma_comp_buf; +}; + +#define GDMA_MAX_RQE_SGES 15 + +struct mana_recv_buf_oob { + /* A valid GDMA work request representing the data buffer. */ + struct gdma_wqe_request wqe_req; + + struct mbuf *mbuf; + bus_dmamap_t dma_map; + + /* SGL of the buffer going to be sent as part of the work request. */ + uint32_t num_sge; + struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. + */ + struct gdma_posted_wqe_info wqe_inf; +}; + +struct mana_rxq { + struct gdma_queue *gdma_rq; + /* Cache the gdma receive queue id */ + uint32_t gdma_id; + + /* Index of RQ in the vPort, not gdma receive queue id */ + uint32_t rxq_idx; + + uint32_t datasize; + + mana_handle_t rxobj; + + struct mana_cq rx_cq; + + struct ifnet *ndev; + struct lro_ctrl lro; + + /* Total number of receive buffers to be allocated */ + uint32_t num_rx_buf; + + uint32_t buf_index; + + struct mana_stats stats; + + /* MUST BE THE LAST MEMBER: + * Each receive buffer has an associated mana_recv_buf_oob. 
+ */ + struct mana_recv_buf_oob rx_oobs[]; +}; + +struct mana_tx_qp { + struct mana_txq txq; + + struct mana_cq tx_cq; + + mana_handle_t tx_object; +}; + +struct mana_port_stats { + counter_u64_t rx_packets; + counter_u64_t tx_packets; + + counter_u64_t rx_bytes; + counter_u64_t tx_bytes; + + counter_u64_t rx_drops; + counter_u64_t tx_drops; + + counter_u64_t stop_queue; + counter_u64_t wake_queue; +}; + +struct mana_context { + struct gdma_dev *gdma_dev; + + uint16_t num_ports; + + struct ifnet *ports[MAX_PORTS_IN_MANA_DEV]; +}; + +struct mana_port_context { + struct mana_context *ac; + struct ifnet *ndev; + struct ifmedia media; + + struct sx apc_lock; + + /* DMA tag used for queue bufs of the entire port */ + bus_dma_tag_t rx_buf_tag; + bus_dma_tag_t tx_buf_tag; + + uint8_t mac_addr[ETHER_ADDR_LEN]; + + struct mana_eq *eqs; + + enum TRI_STATE rss_state; + + mana_handle_t default_rxobj; + bool tx_shortform_allowed; + uint16_t tx_vp_offset; + + struct mana_tx_qp *tx_qp; + + /* Indirection Table for RX & TX. The values are queue indexes */ + uint32_t indir_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Indirection table containing RxObject Handles */ + mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Hash key used by the NIC */ + uint8_t hashkey[MANA_HASH_KEY_SIZE]; + + /* This points to an array of num_queues of RQ pointers. */ + struct mana_rxq **rxqs; + + /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */ + unsigned int max_queues; + unsigned int num_queues; + + mana_handle_t port_handle; + + uint16_t port_idx; + + uint16_t frame_size; + + bool port_is_up; + bool port_st_save; /* Saved port state */ + + bool enable_tx_altq; + bool bind_cleanup_thread_cpu; + + struct mana_port_stats port_stats; + + struct sysctl_oid_list *port_list; + struct sysctl_ctx_list que_sysctl_ctx; +}; + +#define MANA_APC_LOCK_INIT(apc) \ + sx_init(&(apc)->apc_lock, "MANA port lock") +#define MANA_APC_LOCK_DESTROY(apc) sx_destroy(&(apc)->apc_lock) +#define MANA_APC_LOCK_LOCK(apc) sx_xlock(&(apc)->apc_lock) +#define MANA_APC_LOCK_UNLOCK(apc) sx_unlock(&(apc)->apc_lock) + +int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, + bool update_hash, bool update_tab); + +int mana_alloc_queues(struct ifnet *ndev); +int mana_attach(struct ifnet *ndev); +int mana_detach(struct ifnet *ndev); + +int mana_probe(struct gdma_dev *gd); +void mana_remove(struct gdma_dev *gd); + +struct mana_obj_spec { + uint32_t queue_index; + uint64_t gdma_region; + uint32_t queue_size; + uint32_t attached_eq; + uint32_t modr_ctx_id; +}; + +enum mana_command_code { + MANA_QUERY_DEV_CONFIG = 0x20001, + MANA_QUERY_GF_STAT = 0x20002, + MANA_CONFIG_VPORT_TX = 0x20003, + MANA_CREATE_WQ_OBJ = 0x20004, + MANA_DESTROY_WQ_OBJ = 0x20005, + MANA_FENCE_RQ = 0x20006, + MANA_CONFIG_VPORT_RX = 0x20007, + MANA_QUERY_VPORT_CONFIG = 0x20008, +}; + +/* Query Device Configuration */ +struct mana_query_device_cfg_req { + struct gdma_req_hdr hdr; + + /* Driver Capability flags */ + uint64_t drv_cap_flags1; + uint64_t drv_cap_flags2; + uint64_t drv_cap_flags3; + uint64_t drv_cap_flags4; + + uint32_t proto_major_ver; + uint32_t proto_minor_ver; + uint32_t proto_micro_ver; + + uint32_t reserved; +}; /* HW DATA */ + +struct mana_query_device_cfg_resp { + struct gdma_resp_hdr hdr; + + uint64_t pf_cap_flags1; + uint64_t pf_cap_flags2; + uint64_t pf_cap_flags3; + uint64_t pf_cap_flags4; + + uint16_t max_num_vports; + uint16_t reserved; + uint32_t max_num_eqs; +}; /* HW DATA */ + +/* Query vPort Configuration */ +struct 
mana_query_vport_cfg_req { + struct gdma_req_hdr hdr; + uint32_t vport_index; +}; /* HW DATA */ + +struct mana_query_vport_cfg_resp { + struct gdma_resp_hdr hdr; + uint32_t max_num_sq; + uint32_t max_num_rq; + uint32_t num_indirection_ent; + uint32_t reserved1; + uint8_t mac_addr[6]; + uint8_t reserved2[2]; + mana_handle_t vport; +}; /* HW DATA */ + +/* Configure vPort */ +struct mana_config_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + uint32_t pdid; + uint32_t doorbell_pageid; +}; /* HW DATA */ + +struct mana_config_vport_resp { + struct gdma_resp_hdr hdr; + uint16_t tx_vport_offset; + uint8_t short_form_allowed; + uint8_t reserved; +}; /* HW DATA */ + +/* Create WQ Object */ +struct mana_create_wqobj_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + uint32_t wq_type; + uint32_t reserved; + uint64_t wq_gdma_region; + uint64_t cq_gdma_region; + uint32_t wq_size; + uint32_t cq_size; + uint32_t cq_moderation_ctx_id; + uint32_t cq_parent_qid; +}; /* HW DATA */ + +struct mana_create_wqobj_resp { + struct gdma_resp_hdr hdr; + uint32_t wq_id; + uint32_t cq_id; + mana_handle_t wq_obj; +}; /* HW DATA */ + +/* Destroy WQ Object */ +struct mana_destroy_wqobj_req { + struct gdma_req_hdr hdr; + uint32_t wq_type; + uint32_t reserved; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_destroy_wqobj_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Fence RQ */ +struct mana_fence_rq_req { + struct gdma_req_hdr hdr; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_fence_rq_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Configure vPort Rx Steering */ +struct mana_cfg_rx_steer_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + uint16_t num_indir_entries; + uint16_t indir_tab_offset; + uint32_t rx_enable; + uint32_t rss_enable; + uint8_t update_default_rxobj; + uint8_t update_hashkey; + uint8_t update_indir_tab; + uint8_t reserved; + mana_handle_t default_rxobj; + uint8_t hashkey[MANA_HASH_KEY_SIZE]; +}; /* HW DATA */ + +struct mana_cfg_rx_steer_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define MANA_MAX_NUM_QUEUES 16 + +#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) + +struct mana_tx_package { + struct gdma_wqe_request wqe_req; + struct gdma_sge sgl_array[MAX_MBUF_FRAGS]; + + struct mana_tx_oob tx_oob; + + struct gdma_posted_wqe_info wqe_info; +}; + +int mana_restart(struct mana_port_context *apc); + +#endif /* _MANA_H */ diff --git a/sys/dev/mana/mana_en.c b/sys/dev/mana/mana_en.c new file mode 100644 index 000000000000..e6cffb852d70 --- /dev/null +++ b/sys/dev/mana/mana_en.c @@ -0,0 +1,2699 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/time.h> +#include <sys/eventhandler.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/in_cksum.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#ifdef RSS +#include <net/rss_config.h> +#endif + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include "mana.h" +#include "mana_sysctl.h" + +static int mana_up(struct mana_port_context *apc); +static int mana_down(struct mana_port_context *apc); + +static void +mana_rss_key_fill(void *k, size_t size) +{ + static bool rss_key_generated = false; + static uint8_t rss_key[MANA_HASH_KEY_SIZE]; + + KASSERT(size <= MANA_HASH_KEY_SIZE, + ("Request more buytes than MANA RSS key can hold")); + + if (!rss_key_generated) { + arc4random_buf(rss_key, MANA_HASH_KEY_SIZE); + rss_key_generated = true; + } + memcpy(k, rss_key, size); +} + +static int +mana_ifmedia_change(struct ifnet *ifp __unused) +{ + return EOPNOTSUPP; +} + +static void +mana_ifmedia_status(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + + if (!apc) { + if_printf(ifp, "Port not available\n"); + return; + } + + MANA_APC_LOCK_LOCK(apc); + + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + if (!apc->port_is_up) { + MANA_APC_LOCK_UNLOCK(apc); + mana_info(NULL, "Port %u link is down\n", apc->port_idx); + return; + } + + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= IFM_100G_DR | IFM_FDX; + + MANA_APC_LOCK_UNLOCK(apc); +} + +static uint64_t +mana_get_counter(struct ifnet *ifp, ift_counter cnt) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + struct mana_port_stats *stats = &apc->port_stats; + + switch (cnt) { + case IFCOUNTER_IPACKETS: + return (counter_u64_fetch(stats->rx_packets)); + case IFCOUNTER_OPACKETS: + return (counter_u64_fetch(stats->tx_packets)); + case IFCOUNTER_IBYTES: + return (counter_u64_fetch(stats->rx_bytes)); + case IFCOUNTER_OBYTES: + return (counter_u64_fetch(stats->tx_bytes)); + case IFCOUNTER_IQDROPS: + return (counter_u64_fetch(stats->rx_drops)); + case IFCOUNTER_OQDROPS: + return (counter_u64_fetch(stats->tx_drops)); + default: + return (if_get_counter_default(ifp, cnt)); + } +} + +static void +mana_drain_eq_task(struct gdma_queue *queue) +{ + if (!queue || !queue->eq.cleanup_tq) + return; + + while (taskqueue_cancel(queue->eq.cleanup_tq, + &queue->eq.cleanup_task, NULL)) { + taskqueue_drain(queue->eq.cleanup_tq, + &queue->eq.cleanup_task); + } +} + +static void +mana_qflush(struct ifnet *ifp) +{ + if_qflush(ifp); +} + 
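/*
 * Illustration only: mana_drain_eq_task() above uses the common FreeBSD
 * idiom for stopping a task that may still be executing.  taskqueue_cancel(9)
 * returns non-zero (EBUSY) while the task is running, so the task is drained
 * and the cancel retried until it succeeds.  A generic sketch of the same
 * pattern (requires <sys/taskqueue.h>; the function name is hypothetical):
 */
static void
example_stop_task(struct taskqueue *tq, struct task *task)
{
	while (taskqueue_cancel(tq, task, NULL) != 0)
		taskqueue_drain(tq, task);
}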
+int +mana_restart(struct mana_port_context *apc) +{ + int rc = 0; + + MANA_APC_LOCK_LOCK(apc); + if (apc->port_is_up) + mana_down(apc); + + rc = mana_up(apc); + MANA_APC_LOCK_UNLOCK(apc); + + return (rc); +} + +static int +mana_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + struct ifrsskey *ifrk; + struct ifrsshash *ifrh; + struct ifreq *ifr; + uint16_t new_mtu; + int rc = 0; + + switch (command) { + case SIOCSIFMTU: + ifr = (struct ifreq *)data; + new_mtu = ifr->ifr_mtu; + if (ifp->if_mtu == new_mtu) + break; + if ((new_mtu + 18 > MAX_FRAME_SIZE) || + (new_mtu + 18 < MIN_FRAME_SIZE)) { + if_printf(ifp, "Invalid MTU. new_mtu: %d, " + "max allowed: %d, min allowed: %d\n", + new_mtu, MAX_FRAME_SIZE - 18, MIN_FRAME_SIZE - 18); + return EINVAL; + } + MANA_APC_LOCK_LOCK(apc); + if (apc->port_is_up) + mana_down(apc); + + apc->frame_size = new_mtu + 18; + if_setmtu(ifp, new_mtu); + mana_dbg(NULL, "Set MTU to %d\n", new_mtu); + + rc = mana_up(apc); + MANA_APC_LOCK_UNLOCK(apc); + break; + + case SIOCSIFFLAGS: + if (ifp->if_flags & IFF_UP) { + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + MANA_APC_LOCK_LOCK(apc); + if (!apc->port_is_up) + rc = mana_up(apc); + MANA_APC_LOCK_UNLOCK(apc); + } + } else { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + MANA_APC_LOCK_LOCK(apc); + if (apc->port_is_up) + mana_down(apc); + MANA_APC_LOCK_UNLOCK(apc); + } + } + break; + + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + case SIOCGIFXMEDIA: + ifr = (struct ifreq *)data; + rc = ifmedia_ioctl(ifp, ifr, &apc->media, command); + break; + + case SIOCGIFRSSKEY: + ifrk = (struct ifrsskey *)data; + ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; + ifrk->ifrk_keylen = MANA_HASH_KEY_SIZE; + memcpy(ifrk->ifrk_key, apc->hashkey, MANA_HASH_KEY_SIZE); + break; + + case SIOCGIFRSSHASH: + ifrh = (struct ifrsshash *)data; + ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; + ifrh->ifrh_types = + RSS_TYPE_TCP_IPV4 | + RSS_TYPE_UDP_IPV4 | + RSS_TYPE_TCP_IPV6 | + RSS_TYPE_UDP_IPV6; + break; + + default: + rc = ether_ioctl(ifp, command, data); + break; + } + + return (rc); +} + +static inline void +mana_alloc_counters(counter_u64_t *begin, int size) +{ + counter_u64_t *end = (counter_u64_t *)((char *)begin + size); + + for (; begin < end; ++begin) + *begin = counter_u64_alloc(M_WAITOK); +} + +static inline void +mana_free_counters(counter_u64_t *begin, int size) +{ + counter_u64_t *end = (counter_u64_t *)((char *)begin + size); + + for (; begin < end; ++begin) + counter_u64_free(*begin); +} + +static inline void +mana_reset_counters(counter_u64_t *begin, int size) +{ + counter_u64_t *end = (counter_u64_t *)((char *)begin + size); + + for (; begin < end; ++begin) + counter_u64_zero(*begin); +} + +static bool +mana_can_tx(struct gdma_queue *wq) +{ + return mana_gd_wq_avail_space(wq) >= MAX_TX_WQE_SIZE; +} + +static inline int +mana_tx_map_mbuf(struct mana_port_context *apc, + struct mana_send_buf_info *tx_info, + struct mbuf **m_head, struct mana_tx_package *tp, + struct mana_stats *tx_stats) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + bus_dma_segment_t segs[MAX_MBUF_FRAGS]; + struct mbuf *m = *m_head; + int err, nsegs, i; + + err = bus_dmamap_load_mbuf_sg(apc->tx_buf_tag, tx_info->dma_map, + m, segs, &nsegs, BUS_DMA_NOWAIT); + if (err == EFBIG) { + struct mbuf *m_new; + + counter_u64_add(tx_stats->collapse, 1); + m_new = m_collapse(m, M_NOWAIT, MAX_MBUF_FRAGS); + if (unlikely(m_new == NULL)) { + counter_u64_add(tx_stats->collapse_err, 1); + return ENOBUFS; + } else { + *m_head = m = m_new; 
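		/*
		 * m_collapse() may return a different mbuf chain, so the
		 * caller's pointer is updated before the DMA load is retried
		 * below; a later failure then frees the collapsed chain
		 * rather than the original one.
		 */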
+ } + + mana_warn(NULL, + "Too many segs in orig mbuf, m_collapse called\n"); + + err = bus_dmamap_load_mbuf_sg(apc->tx_buf_tag, + tx_info->dma_map, m, segs, &nsegs, BUS_DMA_NOWAIT); + } + if (!err) { + for (i = 0; i < nsegs; i++) { + tp->wqe_req.sgl[i].address = segs[i].ds_addr; + tp->wqe_req.sgl[i].mem_key = gd->gpa_mkey; + tp->wqe_req.sgl[i].size = segs[i].ds_len; + } + tp->wqe_req.num_sge = nsegs; + + tx_info->mbuf = *m_head; + + bus_dmamap_sync(apc->tx_buf_tag, tx_info->dma_map, + BUS_DMASYNC_PREWRITE); + } + + return err; +} + +static inline void +mana_tx_unmap_mbuf(struct mana_port_context *apc, + struct mana_send_buf_info *tx_info) +{ + bus_dmamap_sync(apc->tx_buf_tag, tx_info->dma_map, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(apc->tx_buf_tag, tx_info->dma_map); + if (tx_info->mbuf) { + m_freem(tx_info->mbuf); + tx_info->mbuf = NULL; + } +} + +static inline int +mana_load_rx_mbuf(struct mana_port_context *apc, struct mana_rxq *rxq, + struct mana_recv_buf_oob *rx_oob, bool alloc_mbuf) +{ + bus_dma_segment_t segs[1]; + struct mbuf *mbuf; + int nsegs, err; + uint32_t mlen; + + if (alloc_mbuf) { + mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxq->datasize); + if (unlikely(mbuf == NULL)) { + mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + if (unlikely(mbuf == NULL)) { + return ENOMEM; + } + mlen = MCLBYTES; + } else { + mlen = rxq->datasize; + } + + mbuf->m_pkthdr.len = mbuf->m_len = mlen; + } else { + if (rx_oob->mbuf) { + mbuf = rx_oob->mbuf; + mlen = rx_oob->mbuf->m_pkthdr.len; + } else { + return ENOMEM; + } + } + + err = bus_dmamap_load_mbuf_sg(apc->rx_buf_tag, rx_oob->dma_map, + mbuf, segs, &nsegs, BUS_DMA_NOWAIT); + + if (unlikely((err != 0) || (nsegs != 1))) { + mana_warn(NULL, "Failed to map mbuf, error: %d, " + "nsegs: %d\n", err, nsegs); + counter_u64_add(rxq->stats.dma_mapping_err, 1); + goto error; + } + + bus_dmamap_sync(apc->rx_buf_tag, rx_oob->dma_map, + BUS_DMASYNC_PREREAD); + + rx_oob->mbuf = mbuf; + rx_oob->num_sge = 1; + rx_oob->sgl[0].address = segs[0].ds_addr; + rx_oob->sgl[0].size = mlen; + rx_oob->sgl[0].mem_key = apc->ac->gdma_dev->gpa_mkey; + + return 0; + +error: + m_freem(mbuf); + return EFAULT; +} + +static inline void +mana_unload_rx_mbuf(struct mana_port_context *apc, struct mana_rxq *rxq, + struct mana_recv_buf_oob *rx_oob, bool free_mbuf) +{ + bus_dmamap_sync(apc->rx_buf_tag, rx_oob->dma_map, + BUS_DMASYNC_POSTREAD); + bus_dmamap_unload(apc->rx_buf_tag, rx_oob->dma_map); + + if (free_mbuf && rx_oob->mbuf) { + m_freem(rx_oob->mbuf); + rx_oob->mbuf = NULL; + } +} + + +/* Use couple mbuf PH_loc spaces for l3 and l4 protocal type */ +#define MANA_L3_PROTO(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[0]) +#define MANA_L4_PROTO(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[1]) + +#define MANA_TXQ_FULL (IFF_DRV_RUNNING | IFF_DRV_OACTIVE) + +static void +mana_xmit(struct mana_txq *txq) +{ + enum mana_tx_pkt_format pkt_fmt = MANA_SHORT_PKT_FMT; + struct mana_send_buf_info *tx_info; + struct ifnet *ndev = txq->ndev; + struct mbuf *mbuf; + struct mana_port_context *apc = if_getsoftc(ndev); + struct mana_port_stats *port_stats = &apc->port_stats; + struct gdma_dev *gd = apc->ac->gdma_dev; + uint64_t packets, bytes; + uint16_t next_to_use; + struct mana_tx_package pkg = {}; + struct mana_stats *tx_stats; + struct gdma_queue *gdma_sq; + struct gdma_queue *gdma_eq; + struct mana_cq *cq; + int err, len; + + gdma_sq = txq->gdma_sq; + cq = &apc->tx_qp[txq->idx].tx_cq; + gdma_eq = cq->gdma_cq->cq.parent; + tx_stats = &txq->stats; + + packets = 0; + bytes = 0; + next_to_use = 
txq->next_to_use; + + while ((mbuf = drbr_peek(ndev, txq->txq_br)) != NULL) { + if (!apc->port_is_up || + (if_getdrvflags(ndev) & MANA_TXQ_FULL) != IFF_DRV_RUNNING) { + drbr_putback(ndev, txq->txq_br, mbuf); + break; + } + + if (!mana_can_tx(gdma_sq)) { + /* SQ is full. Set the IFF_DRV_OACTIVE flag */ + if_setdrvflagbits(apc->ndev, IFF_DRV_OACTIVE, 0); + counter_u64_add(tx_stats->stop, 1); + uint64_t stops = counter_u64_fetch(tx_stats->stop); + uint64_t wakeups = counter_u64_fetch(tx_stats->wakeup); +#define MANA_TXQ_STOP_THRESHOLD 50 + if (stops > MANA_TXQ_STOP_THRESHOLD && wakeups > 0 && + stops > wakeups && txq->alt_txq_idx == txq->idx) { + txq->alt_txq_idx = + (txq->idx + (stops / wakeups)) + % apc->num_queues; + counter_u64_add(tx_stats->alt_chg, 1); + } + + drbr_putback(ndev, txq->txq_br, mbuf); + + taskqueue_enqueue(gdma_eq->eq.cleanup_tq, + &gdma_eq->eq.cleanup_task); + break; + } + + tx_info = &txq->tx_buf_info[next_to_use]; + + memset(&pkg, 0, sizeof(struct mana_tx_package)); + pkg.wqe_req.sgl = pkg.sgl_array; + + err = mana_tx_map_mbuf(apc, tx_info, &mbuf, &pkg, tx_stats); + if (unlikely(err)) { + mana_dbg(NULL, + "Failed to map tx mbuf, err %d\n", err); + + counter_u64_add(tx_stats->dma_mapping_err, 1); + + /* The mbuf is still there. Free it */ + m_freem(mbuf); + /* Advance the drbr queue */ + drbr_advance(ndev, txq->txq_br); + continue; + } + + pkg.tx_oob.s_oob.vcq_num = cq->gdma_id; + pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame; + + if (txq->vp_offset > MANA_SHORT_VPORT_OFFSET_MAX) { + pkg.tx_oob.l_oob.long_vp_offset = txq->vp_offset; + pkt_fmt = MANA_LONG_PKT_FMT; + } else { + pkg.tx_oob.s_oob.short_vp_offset = txq->vp_offset; + } + + pkg.tx_oob.s_oob.pkt_fmt = pkt_fmt; + + if (pkt_fmt == MANA_SHORT_PKT_FMT) + pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_short_oob); + else + pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_oob); + + pkg.wqe_req.inline_oob_data = &pkg.tx_oob; + pkg.wqe_req.flags = 0; + pkg.wqe_req.client_data_unit = 0; + + if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) { + if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + else + pkg.tx_oob.s_oob.is_outer_ipv6 = 1; + + pkg.tx_oob.s_oob.comp_iphdr_csum = 1; + pkg.tx_oob.s_oob.comp_tcp_csum = 1; + pkg.tx_oob.s_oob.trans_off = mbuf->m_pkthdr.l3hlen; + + pkg.wqe_req.client_data_unit = mbuf->m_pkthdr.tso_segsz; + pkg.wqe_req.flags = GDMA_WR_OOB_IN_SGL | GDMA_WR_PAD_BY_SGE0; + } else if (mbuf->m_pkthdr.csum_flags & + (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { + if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) { + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + pkg.tx_oob.s_oob.comp_iphdr_csum = 1; + } else { + pkg.tx_oob.s_oob.is_outer_ipv6 = 1; + } + + if (MANA_L4_PROTO(mbuf) == IPPROTO_TCP) { + pkg.tx_oob.s_oob.comp_tcp_csum = 1; + pkg.tx_oob.s_oob.trans_off = + mbuf->m_pkthdr.l3hlen; + } else { + pkg.tx_oob.s_oob.comp_udp_csum = 1; + } + } else if (mbuf->m_pkthdr.csum_flags & CSUM_IP) { + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + pkg.tx_oob.s_oob.comp_iphdr_csum = 1; + } else { + if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + else if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IPV6) + pkg.tx_oob.s_oob.is_outer_ipv6 = 1; + } + + len = mbuf->m_pkthdr.len; + + err = mana_gd_post_work_request(gdma_sq, &pkg.wqe_req, + (struct gdma_posted_wqe_info *)&tx_info->wqe_inf); + if (unlikely(err)) { + /* Should not happen */ + if_printf(ndev, "Failed to post TX OOB: %d\n", err); + + mana_tx_unmap_mbuf(apc, tx_info); + + drbr_advance(ndev, txq->txq_br); + continue; + } + 
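		/*
		 * The WQE was accepted by the SQ: advance the producer index,
		 * count the packet as an in-flight send, consume the drbr
		 * entry and ring the doorbell so the hardware picks up the
		 * new work request.
		 */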
+ next_to_use = + (next_to_use + 1) % MAX_SEND_BUFFERS_PER_QUEUE; + + atomic_inc_return(&txq->pending_sends); + + drbr_advance(ndev, txq->txq_br); + + mana_gd_wq_ring_doorbell(gd->gdma_context, gdma_sq); + + packets++; + bytes += len; + } + + counter_enter(); + counter_u64_add_protected(tx_stats->packets, packets); + counter_u64_add_protected(port_stats->tx_packets, packets); + counter_u64_add_protected(tx_stats->bytes, bytes); + counter_u64_add_protected(port_stats->tx_bytes, bytes); + counter_exit(); + + txq->next_to_use = next_to_use; +} + +static void +mana_xmit_taskfunc(void *arg, int pending) +{ + struct mana_txq *txq = (struct mana_txq *)arg; + struct ifnet *ndev = txq->ndev; + struct mana_port_context *apc = if_getsoftc(ndev); + + while (!drbr_empty(ndev, txq->txq_br) && apc->port_is_up && + (if_getdrvflags(ndev) & MANA_TXQ_FULL) == IFF_DRV_RUNNING) { + mtx_lock(&txq->txq_mtx); + mana_xmit(txq); + mtx_unlock(&txq->txq_mtx); + } +} + +#define PULLUP_HDR(m, len) \ +do { \ + if (unlikely((m)->m_len < (len))) { \ + (m) = m_pullup((m), (len)); \ + if ((m) == NULL) \ + return (NULL); \ + } \ +} while (0) + +/* + * If this function failed, the mbuf would be freed. + */ +static inline struct mbuf * +mana_tso_fixup(struct mbuf *mbuf) +{ + struct ether_vlan_header *eh = mtod(mbuf, struct ether_vlan_header *); + struct tcphdr *th; + uint16_t etype; + int ehlen; + + if (eh->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) { + etype = ntohs(eh->evl_proto); + ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + } else { + etype = ntohs(eh->evl_encap_proto); + ehlen = ETHER_HDR_LEN; + } + + if (etype == ETHERTYPE_IP) { + struct ip *ip; + int iphlen; + + PULLUP_HDR(mbuf, ehlen + sizeof(*ip)); + ip = mtodo(mbuf, ehlen); + iphlen = ip->ip_hl << 2; + mbuf->m_pkthdr.l3hlen = ehlen + iphlen; + + PULLUP_HDR(mbuf, ehlen + iphlen + sizeof(*th)); + th = mtodo(mbuf, ehlen + iphlen); + + ip->ip_len = 0; + ip->ip_sum = 0; + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + } else if (etype == ETHERTYPE_IPV6) { + struct ip6_hdr *ip6; + + PULLUP_HDR(mbuf, ehlen + sizeof(*ip6) + sizeof(*th)); + ip6 = mtodo(mbuf, ehlen); + if (ip6->ip6_nxt != IPPROTO_TCP) { + /* Realy something wrong, just return */ + mana_dbg(NULL, "TSO mbuf not TCP, freed.\n"); + m_freem(mbuf); + return NULL; + } + mbuf->m_pkthdr.l3hlen = ehlen + sizeof(*ip6); + + th = mtodo(mbuf, ehlen + sizeof(*ip6)); + + ip6->ip6_plen = 0; + th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); + } else { + /* CSUM_TSO is set but not IP protocol. */ + mana_warn(NULL, "TSO mbuf not right, freed.\n"); + m_freem(mbuf); + return NULL; + } + + MANA_L3_PROTO(mbuf) = etype; + + return (mbuf); +} + +/* + * If this function failed, the mbuf would be freed. 
+ */ +static inline struct mbuf * +mana_mbuf_csum_check(struct mbuf *mbuf) +{ + struct ether_vlan_header *eh = mtod(mbuf, struct ether_vlan_header *); + struct mbuf *mbuf_next; + uint16_t etype; + int offset; + int ehlen; + + if (eh->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) { + etype = ntohs(eh->evl_proto); + ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + } else { + etype = ntohs(eh->evl_encap_proto); + ehlen = ETHER_HDR_LEN; + } + + mbuf_next = m_getptr(mbuf, ehlen, &offset); + + MANA_L4_PROTO(mbuf) = 0; + if (etype == ETHERTYPE_IP) { + const struct ip *ip; + int iphlen; + + ip = (struct ip *)(mtodo(mbuf_next, offset)); + iphlen = ip->ip_hl << 2; + mbuf->m_pkthdr.l3hlen = ehlen + iphlen; + + MANA_L4_PROTO(mbuf) = ip->ip_p; + } else if (etype == ETHERTYPE_IPV6) { + const struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); + mbuf->m_pkthdr.l3hlen = ehlen + sizeof(*ip6); + + MANA_L4_PROTO(mbuf) = ip6->ip6_nxt; + } else { + MANA_L4_PROTO(mbuf) = 0; + } + + MANA_L3_PROTO(mbuf) = etype; + + return (mbuf); +} + +static int +mana_start_xmit(struct ifnet *ifp, struct mbuf *m) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + struct mana_txq *txq; + int is_drbr_empty; + uint16_t txq_id; + int err; + + if (unlikely((!apc->port_is_up) || + (if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) + return ENODEV; + + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + m = mana_tso_fixup(m); + if (unlikely(m == NULL)) { + counter_enter(); + counter_u64_add_protected(apc->port_stats.tx_drops, 1); + counter_exit(); + return EIO; + } + } else { + m = mana_mbuf_csum_check(m); + if (unlikely(m == NULL)) { + counter_enter(); + counter_u64_add_protected(apc->port_stats.tx_drops, 1); + counter_exit(); + return EIO; + } + } + + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { + uint32_t hash = m->m_pkthdr.flowid; + txq_id = apc->indir_table[(hash) & MANA_INDIRECT_TABLE_MASK] % + apc->num_queues; + } else { + txq_id = m->m_pkthdr.flowid % apc->num_queues; + } + + if (apc->enable_tx_altq) + txq_id = apc->tx_qp[txq_id].txq.alt_txq_idx; + + txq = &apc->tx_qp[txq_id].txq; + + is_drbr_empty = drbr_empty(ifp, txq->txq_br); + err = drbr_enqueue(ifp, txq->txq_br, m); + if (unlikely(err)) { + mana_warn(NULL, "txq %u failed to enqueue: %d\n", + txq_id, err); + taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task); + return err; + } + + if (is_drbr_empty && mtx_trylock(&txq->txq_mtx)) { + mana_xmit(txq); + mtx_unlock(&txq->txq_mtx); + } else { + taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task); + } + + return 0; +} + +static void +mana_cleanup_port_context(struct mana_port_context *apc) +{ + bus_dma_tag_destroy(apc->tx_buf_tag); + bus_dma_tag_destroy(apc->rx_buf_tag); + apc->rx_buf_tag = NULL; + + free(apc->rxqs, M_DEVBUF); + apc->rxqs = NULL; + + mana_free_counters((counter_u64_t *)&apc->port_stats, + sizeof(struct mana_port_stats)); +} + +static int +mana_init_port_context(struct mana_port_context *apc) +{ + device_t dev = apc->ac->gdma_dev->gdma_context->dev; + uint32_t tso_maxsize; + int err; + + tso_maxsize = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + + /* Create DMA tag for tx bufs */ + err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ + 1, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + tso_maxsize, /* maxsize */ + MAX_MBUF_FRAGS, /* nsegments */ + tso_maxsize, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg*/ + &apc->tx_buf_tag); + if 
(unlikely(err)) { + device_printf(dev, "Feiled to create TX DMA tag\n"); + return err; + } + + /* Create DMA tag for rx bufs */ + err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ + 64, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + MJUMPAGESIZE, /* maxsize */ + 1, /* nsegments */ + MJUMPAGESIZE, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg*/ + &apc->rx_buf_tag); + if (unlikely(err)) { + device_printf(dev, "Feiled to create RX DMA tag\n"); + return err; + } + + apc->rxqs = mallocarray(apc->num_queues, sizeof(struct mana_rxq *), + M_DEVBUF, M_WAITOK | M_ZERO); + + if (!apc->rxqs) { + bus_dma_tag_destroy(apc->tx_buf_tag); + bus_dma_tag_destroy(apc->rx_buf_tag); + apc->rx_buf_tag = NULL; + return ENOMEM; + } + + return 0; +} + +static int +mana_send_request(struct mana_context *ac, void *in_buf, + uint32_t in_len, void *out_buf, uint32_t out_len) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + struct gdma_resp_hdr *resp = out_buf; + struct gdma_req_hdr *req = in_buf; + device_t dev = gc->dev; + static atomic_t activity_id; + int err; + + req->dev_id = gc->mana.dev_id; + req->activity_id = atomic_inc_return(&activity_id); + + mana_dbg(NULL, "activity_id = %u\n", activity_id); + + err = mana_gd_send_request(gc, in_len, in_buf, out_len, + out_buf); + if (err || resp->status) { + device_printf(dev, "Failed to send mana message: %d, 0x%x\n", + err, resp->status); + return err ? err : EPROTO; + } + + if (req->dev_id.as_uint32 != resp->dev_id.as_uint32 || + req->activity_id != resp->activity_id) { + device_printf(dev, + "Unexpected mana message response: %x,%x,%x,%x\n", + req->dev_id.as_uint32, resp->dev_id.as_uint32, + req->activity_id, resp->activity_id); + return EPROTO; + } + + return 0; +} + +static int +mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr, + const enum mana_command_code expected_code, + const uint32_t min_size) +{ + if (resp_hdr->response.msg_type != expected_code) + return EPROTO; + + if (resp_hdr->response.msg_version < GDMA_MESSAGE_V1) + return EPROTO; + + if (resp_hdr->response.msg_size < min_size) + return EPROTO; + + return 0; +} + +static int +mana_query_device_cfg(struct mana_context *ac, uint32_t proto_major_ver, + uint32_t proto_minor_ver, uint32_t proto_micro_ver, + uint16_t *max_num_vports) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + struct mana_query_device_cfg_resp resp = {}; + struct mana_query_device_cfg_req req = {}; + device_t dev = gc->dev; + int err = 0; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG, + sizeof(req), sizeof(resp)); + req.proto_major_ver = proto_major_ver; + req.proto_minor_ver = proto_minor_ver; + req.proto_micro_ver = proto_micro_ver; + + err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp)); + if (err) { + device_printf(dev, "Failed to query config: %d", err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_DEV_CONFIG, + sizeof(resp)); + if (err || resp.hdr.status) { + device_printf(dev, "Invalid query result: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = EPROTO; + return err; + } + + *max_num_vports = resp.max_num_vports; + + mana_dbg(NULL, "mana max_num_vports from device = %d\n", + *max_num_vports); + + return 0; +} + +static int +mana_query_vport_cfg(struct mana_port_context *apc, uint32_t vport_index, + uint32_t *max_sq, uint32_t *max_rq, uint32_t *num_indir_entry) +{ + struct mana_query_vport_cfg_resp resp = {}; 
+ struct mana_query_vport_cfg_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_VPORT_CONFIG, + sizeof(req), sizeof(resp)); + + req.vport_index = vport_index; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) + return err; + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_VPORT_CONFIG, + sizeof(resp)); + if (err) + return err; + + if (resp.hdr.status) + return EPROTO; + + *max_sq = resp.max_num_sq; + *max_rq = resp.max_num_rq; + *num_indir_entry = resp.num_indirection_ent; + + apc->port_handle = resp.vport; + memcpy(apc->mac_addr, resp.mac_addr, ETHER_ADDR_LEN); + + return 0; +} + +static int +mana_cfg_vport(struct mana_port_context *apc, uint32_t protection_dom_id, + uint32_t doorbell_pg_id) +{ + struct mana_config_vport_resp resp = {}; + struct mana_config_vport_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX, + sizeof(req), sizeof(resp)); + req.vport = apc->port_handle; + req.pdid = protection_dom_id; + req.doorbell_pageid = doorbell_pg_id; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) { + if_printf(apc->ndev, "Failed to configure vPort: %d\n", err); + goto out; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_CONFIG_VPORT_TX, + sizeof(resp)); + if (err || resp.hdr.status) { + if_printf(apc->ndev, "Failed to configure vPort: %d, 0x%x\n", + err, resp.hdr.status); + if (!err) + err = EPROTO; + + goto out; + } + + apc->tx_shortform_allowed = resp.short_form_allowed; + apc->tx_vp_offset = resp.tx_vport_offset; +out: + return err; +} + +static int +mana_cfg_vport_steering(struct mana_port_context *apc, + enum TRI_STATE rx, + bool update_default_rxobj, bool update_key, + bool update_tab) +{ + uint16_t num_entries = MANA_INDIRECT_TABLE_SIZE; + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + struct ifnet *ndev = apc->ndev; + mana_handle_t *req_indir_tab; + uint32_t req_buf_size; + int err; + + req_buf_size = sizeof(*req) + sizeof(mana_handle_t) * num_entries; + req = malloc(req_buf_size, M_DEVBUF, M_WAITOK | M_ZERO); + if (!req) + return ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = apc->port_handle; + req->num_indir_entries = num_entries; + req->indir_tab_offset = sizeof(*req); + req->rx_enable = rx; + req->rss_enable = apc->rss_state; + req->update_default_rxobj = update_default_rxobj; + req->update_hashkey = update_key; + req->update_indir_tab = update_tab; + req->default_rxobj = apc->default_rxobj; + + if (update_key) + memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE); + + if (update_tab) { + req_indir_tab = (mana_handle_t *)(req + 1); + memcpy(req_indir_tab, apc->rxobj_table, + req->num_indir_entries * sizeof(mana_handle_t)); + } + + err = mana_send_request(apc->ac, req, req_buf_size, &resp, + sizeof(resp)); + if (err) { + if_printf(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_CONFIG_VPORT_RX, + sizeof(resp)); + if (err) { + if_printf(ndev, "vPort RX configuration failed: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + if_printf(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = EPROTO; + } +out: + free(req, M_DEVBUF); + return err; +} + +static int +mana_create_wq_obj(struct mana_port_context *apc, + mana_handle_t vport, + uint32_t wq_type, struct mana_obj_spec *wq_spec, + struct mana_obj_spec *cq_spec, + mana_handle_t *wq_obj) +{ + 
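	/*
	 * Send a MANA_CREATE_WQ_OBJ request binding the WQ and CQ GDMA
	 * regions to the vPort; on success the hardware returns the queue
	 * ids and a handle that is later passed to mana_destroy_wq_obj().
	 */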
struct mana_create_wqobj_resp resp = {}; + struct mana_create_wqobj_req req = {}; + struct ifnet *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_CREATE_WQ_OBJ, + sizeof(req), sizeof(resp)); + req.vport = vport; + req.wq_type = wq_type; + req.wq_gdma_region = wq_spec->gdma_region; + req.cq_gdma_region = cq_spec->gdma_region; + req.wq_size = wq_spec->queue_size; + req.cq_size = cq_spec->queue_size; + req.cq_moderation_ctx_id = cq_spec->modr_ctx_id; + req.cq_parent_qid = cq_spec->attached_eq; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) { + if_printf(ndev, "Failed to create WQ object: %d\n", err); + goto out; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_CREATE_WQ_OBJ, + sizeof(resp)); + if (err || resp.hdr.status) { + if_printf(ndev, "Failed to create WQ object: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = EPROTO; + goto out; + } + + if (resp.wq_obj == INVALID_MANA_HANDLE) { + if_printf(ndev, "Got an invalid WQ object handle\n"); + err = EPROTO; + goto out; + } + + *wq_obj = resp.wq_obj; + wq_spec->queue_index = resp.wq_id; + cq_spec->queue_index = resp.cq_id; + + return 0; +out: + return err; +} + +static void +mana_destroy_wq_obj(struct mana_port_context *apc, uint32_t wq_type, + mana_handle_t wq_obj) +{ + struct mana_destroy_wqobj_resp resp = {}; + struct mana_destroy_wqobj_req req = {}; + struct ifnet *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_DESTROY_WQ_OBJ, + sizeof(req), sizeof(resp)); + req.wq_type = wq_type; + req.wq_obj_handle = wq_obj; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) { + if_printf(ndev, "Failed to destroy WQ object: %d\n", err); + return; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_DESTROY_WQ_OBJ, + sizeof(resp)); + if (err || resp.hdr.status) + if_printf(ndev, "Failed to destroy WQ object: %d, 0x%x\n", + err, resp.hdr.status); +} + +static void +mana_init_cqe_poll_buf(struct gdma_comp *cqe_poll_buf) +{ + int i; + + for (i = 0; i < CQE_POLLING_BUFFER; i++) + memset(&cqe_poll_buf[i], 0, sizeof(struct gdma_comp)); +} + +static void +mana_destroy_eq(struct gdma_context *gc, struct mana_port_context *apc) +{ + struct gdma_queue *eq; + int i; + + if (!apc->eqs) + return; + + for (i = 0; i < apc->num_queues; i++) { + eq = apc->eqs[i].eq; + if (!eq) + continue; + + mana_gd_destroy_queue(gc, eq); + } + + free(apc->eqs, M_DEVBUF); + apc->eqs = NULL; +} + +static int +mana_create_eq(struct mana_port_context *apc) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct gdma_queue_spec spec = {}; + int err; + int i; + + apc->eqs = mallocarray(apc->num_queues, sizeof(struct mana_eq), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!apc->eqs) + return ENOMEM; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = EQ_SIZE; + spec.eq.callback = NULL; + spec.eq.context = apc->eqs; + spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + spec.eq.ndev = apc->ndev; + + for (i = 0; i < apc->num_queues; i++) { + mana_init_cqe_poll_buf(apc->eqs[i].cqe_poll); + + err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq); + if (err) + goto out; + } + + return 0; +out: + mana_destroy_eq(gd->gdma_context, apc); + return err; +} + +static int +mana_move_wq_tail(struct gdma_queue *wq, uint32_t num_units) +{ + uint32_t used_space_old; + uint32_t used_space_new; + + used_space_old = wq->head - wq->tail; + used_space_new = wq->head - (wq->tail + num_units); + + if (used_space_new > used_space_old) { + mana_err(NULL, + "WARNING: new 
used space %u greater than old one %u\n", + used_space_new, used_space_old); + return ERANGE; + } + + wq->tail += num_units; + return 0; +} + +static void +mana_poll_tx_cq(struct mana_cq *cq) +{ + struct gdma_comp *completions = cq->gdma_comp_buf; + struct gdma_posted_wqe_info *wqe_info; + struct mana_send_buf_info *tx_info; + unsigned int pkt_transmitted = 0; + unsigned int wqe_unit_cnt = 0; + struct mana_txq *txq = cq->txq; + struct mana_port_context *apc; + uint16_t next_to_complete; + struct ifnet *ndev; + int comp_read; + int txq_idx = txq->idx;; + int i; + int sa_drop = 0; + + struct gdma_queue *gdma_wq; + unsigned int avail_space; + bool txq_full = false; + + ndev = txq->ndev; + apc = if_getsoftc(ndev); + + comp_read = mana_gd_poll_cq(cq->gdma_cq, completions, + CQE_POLLING_BUFFER); + + next_to_complete = txq->next_to_complete; + + for (i = 0; i < comp_read; i++) { + struct mana_tx_comp_oob *cqe_oob; + + if (!completions[i].is_sq) { + mana_err(NULL, "WARNING: Not for SQ\n"); + return; + } + + cqe_oob = (struct mana_tx_comp_oob *)completions[i].cqe_data; + if (cqe_oob->cqe_hdr.client_type != + MANA_CQE_COMPLETION) { + mana_err(NULL, + "WARNING: Invalid CQE client type %u\n", + cqe_oob->cqe_hdr.client_type); + return; + } + + switch (cqe_oob->cqe_hdr.cqe_type) { + case CQE_TX_OKAY: + break; + + case CQE_TX_SA_DROP: + case CQE_TX_MTU_DROP: + case CQE_TX_INVALID_OOB: + case CQE_TX_INVALID_ETH_TYPE: + case CQE_TX_HDR_PROCESSING_ERROR: + case CQE_TX_VF_DISABLED: + case CQE_TX_VPORT_IDX_OUT_OF_RANGE: + case CQE_TX_VPORT_DISABLED: + case CQE_TX_VLAN_TAGGING_VIOLATION: + sa_drop ++; + mana_err(NULL, + "TX: txq %d CQE error %d, ntc = %d, " + "pending sends = %d: err ignored.\n", + txq_idx, cqe_oob->cqe_hdr.cqe_type, + next_to_complete, txq->pending_sends); + break; + + default: + /* If the CQE type is unexpected, log an error, + * and go through the error path. + */ + mana_err(NULL, + "ERROR: TX: Unexpected CQE type %d: HW BUG?\n", + cqe_oob->cqe_hdr.cqe_type); + return; + } + if (txq->gdma_txq_id != completions[i].wq_num) { + mana_dbg(NULL, + "txq gdma id not match completion wq num: " + "%d != %d\n", + txq->gdma_txq_id, completions[i].wq_num); + break; + } + + tx_info = &txq->tx_buf_info[next_to_complete]; + if (!tx_info->mbuf) { + mana_err(NULL, + "WARNING: txq %d Empty mbuf on tx_info: %u, " + "ntu = %u, pending_sends = %d, " + "transmitted = %d, sa_drop = %d, i = %d, comp_read = %d\n", + txq_idx, next_to_complete, txq->next_to_use, + txq->pending_sends, pkt_transmitted, sa_drop, + i, comp_read); + continue; + } + + wqe_info = &tx_info->wqe_inf; + wqe_unit_cnt += wqe_info->wqe_size_in_bu; + + mana_tx_unmap_mbuf(apc, tx_info); + mb(); + + next_to_complete = + (next_to_complete + 1) % MAX_SEND_BUFFERS_PER_QUEUE; + + pkt_transmitted++; + } + + txq->next_to_complete = next_to_complete; + + if (wqe_unit_cnt == 0) { + mana_err(NULL, + "WARNING: TX ring not proceeding!\n"); + return; + } + + mana_move_wq_tail(txq->gdma_sq, wqe_unit_cnt); + + /* Ensure tail updated before checking q stop */ + wmb(); + + gdma_wq = txq->gdma_sq; + avail_space = mana_gd_wq_avail_space(gdma_wq); + + + if ((if_getdrvflags(ndev) & MANA_TXQ_FULL) == MANA_TXQ_FULL) { + txq_full = true; + } + + /* Ensure checking txq_full before apc->port_is_up. 
*/ + rmb(); + + if (txq_full && apc->port_is_up && avail_space >= MAX_TX_WQE_SIZE) { + /* Grab the txq lock and re-test */ + mtx_lock(&txq->txq_mtx); + avail_space = mana_gd_wq_avail_space(gdma_wq); + + if ((if_getdrvflags(ndev) & MANA_TXQ_FULL) == MANA_TXQ_FULL && + apc->port_is_up && avail_space >= MAX_TX_WQE_SIZE) { + /* Clear the Q full flag */ + if_setdrvflagbits(apc->ndev, IFF_DRV_RUNNING, + IFF_DRV_OACTIVE); + counter_u64_add(txq->stats.wakeup, 1); + if (txq->alt_txq_idx != txq->idx) { + uint64_t stops = counter_u64_fetch(txq->stats.stop); + uint64_t wakeups = counter_u64_fetch(txq->stats.wakeup); + /* Reset alt_txq_idx back if it is not overloaded */ + if (stops < wakeups) { + txq->alt_txq_idx = txq->idx; + counter_u64_add(txq->stats.alt_reset, 1); + } + } + rmb(); + /* Schedule a tx enqueue task */ + taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task); + } + mtx_unlock(&txq->txq_mtx); + } + + if (atomic_sub_return(pkt_transmitted, &txq->pending_sends) < 0) + mana_err(NULL, + "WARNING: TX %d pending_sends error: %d\n", + txq->idx, txq->pending_sends); +} + +static void +mana_post_pkt_rxq(struct mana_rxq *rxq) +{ + struct mana_recv_buf_oob *recv_buf_oob; + uint32_t curr_index; + int err; + + curr_index = rxq->buf_index++; + if (rxq->buf_index == rxq->num_rx_buf) + rxq->buf_index = 0; + + recv_buf_oob = &rxq->rx_oobs[curr_index]; + + err = mana_gd_post_and_ring(rxq->gdma_rq, &recv_buf_oob->wqe_req, + &recv_buf_oob->wqe_inf); + if (err) { + mana_err(NULL, "WARNING: rxq %u post pkt err %d\n", + rxq->rxq_idx, err); + return; + } + + if (recv_buf_oob->wqe_inf.wqe_size_in_bu != 1) { + mana_err(NULL, "WARNING: rxq %u wqe_size_in_bu %u\n", + rxq->rxq_idx, recv_buf_oob->wqe_inf.wqe_size_in_bu); + } +} + +static void +mana_rx_mbuf(struct mbuf *mbuf, struct mana_rxcomp_oob *cqe, + struct mana_rxq *rxq) +{ + struct mana_stats *rx_stats = &rxq->stats; + struct ifnet *ndev = rxq->ndev; + uint32_t pkt_len = cqe->ppi[0].pkt_len; + uint16_t rxq_idx = rxq->rxq_idx; + struct mana_port_context *apc; + struct gdma_queue *eq; + bool do_lro = false; + bool do_if_input; + + apc = if_getsoftc(ndev); + eq = apc->eqs[rxq_idx].eq; + eq->eq.work_done++; + + if (!mbuf) { + return; + } + + mbuf->m_flags |= M_PKTHDR; + mbuf->m_pkthdr.len = pkt_len; + mbuf->m_len = pkt_len; + mbuf->m_pkthdr.rcvif = ndev; + + if ((ndev->if_capenable & IFCAP_RXCSUM || + ndev->if_capenable & IFCAP_RXCSUM_IPV6) && + (cqe->rx_iphdr_csum_succeed)) { + mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED; + mbuf->m_pkthdr.csum_flags |= CSUM_IP_VALID; + if (cqe->rx_tcp_csum_succeed || cqe->rx_udp_csum_succeed) { + mbuf->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + mbuf->m_pkthdr.csum_data = 0xffff; + + if (cqe->rx_tcp_csum_succeed) + do_lro = true; + } + } + + if (cqe->rx_hashtype != 0) { + mbuf->m_pkthdr.flowid = cqe->ppi[0].pkt_hash; + + uint16_t hashtype = cqe->rx_hashtype; + if (hashtype & NDIS_HASH_IPV4_MASK) { + hashtype &= NDIS_HASH_IPV4_MASK; + switch (hashtype) { + case NDIS_HASH_TCP_IPV4: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); + break; + case NDIS_HASH_UDP_IPV4: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); + } + } else if (hashtype & NDIS_HASH_IPV6_MASK) { + hashtype &= NDIS_HASH_IPV6_MASK; + switch (hashtype) { + case NDIS_HASH_TCP_IPV6: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); + break; + case NDIS_HASH_TCP_IPV6_EX: + M_HASHTYPE_SET(mbuf, + M_HASHTYPE_RSS_TCP_IPV6_EX); + break; + case NDIS_HASH_UDP_IPV6: + M_HASHTYPE_SET(mbuf, 
M_HASHTYPE_RSS_UDP_IPV6); + break; + case NDIS_HASH_UDP_IPV6_EX: + M_HASHTYPE_SET(mbuf, + M_HASHTYPE_RSS_UDP_IPV6_EX); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); + } + } else { + M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH); + } + } else { + mbuf->m_pkthdr.flowid = rxq_idx; + M_HASHTYPE_SET(mbuf, M_HASHTYPE_NONE); + } + + do_if_input = true; + if ((ndev->if_capenable & IFCAP_LRO) && do_lro) { + if (rxq->lro.lro_cnt != 0 && + tcp_lro_rx(&rxq->lro, mbuf, 0) == 0) + do_if_input = false; + } + if (do_if_input) { + ndev->if_input(ndev, mbuf); + } + + counter_enter(); + counter_u64_add_protected(rx_stats->packets, 1); + counter_u64_add_protected(apc->port_stats.rx_packets, 1); + counter_u64_add_protected(rx_stats->bytes, pkt_len); + counter_u64_add_protected(apc->port_stats.rx_bytes, pkt_len); + counter_exit(); +} + +static void +mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, + struct gdma_comp *cqe) +{ + struct mana_rxcomp_oob *oob = (struct mana_rxcomp_oob *)cqe->cqe_data; + struct mana_recv_buf_oob *rxbuf_oob; + struct ifnet *ndev = rxq->ndev; + struct mana_port_context *apc; + struct mbuf *old_mbuf; + uint32_t curr, pktlen; + int err; + + switch (oob->cqe_hdr.cqe_type) { + case CQE_RX_OKAY: + break; + + case CQE_RX_TRUNCATED: + if_printf(ndev, "Dropped a truncated packet\n"); + return; + + case CQE_RX_COALESCED_4: + if_printf(ndev, "RX coalescing is unsupported\n"); + return; + + case CQE_RX_OBJECT_FENCE: + if_printf(ndev, "RX Fencing is unsupported\n"); + return; + + default: + if_printf(ndev, "Unknown RX CQE type = %d\n", + oob->cqe_hdr.cqe_type); + return; + } + + if (oob->cqe_hdr.cqe_type != CQE_RX_OKAY) + return; + + pktlen = oob->ppi[0].pkt_len; + + if (pktlen == 0) { + /* data packets should never have packetlength of zero */ +#if defined(__amd64__) + if_printf(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%lx\n", + rxq->gdma_id, cq->gdma_id, rxq->rxobj); +#else + if_printf(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n", + rxq->gdma_id, cq->gdma_id, rxq->rxobj); +#endif + return; + } + + curr = rxq->buf_index; + rxbuf_oob = &rxq->rx_oobs[curr]; + if (rxbuf_oob->wqe_inf.wqe_size_in_bu != 1) { + mana_err(NULL, "WARNING: Rx Incorrect complete " + "WQE size %u\n", + rxbuf_oob->wqe_inf.wqe_size_in_bu); + } + + apc = if_getsoftc(ndev); + + old_mbuf = rxbuf_oob->mbuf; + + /* Unload DMA map for the old mbuf */ + mana_unload_rx_mbuf(apc, rxq, rxbuf_oob, false); + + /* Load a new mbuf to replace the old one */ + err = mana_load_rx_mbuf(apc, rxq, rxbuf_oob, true); + if (err) { + mana_dbg(NULL, + "failed to load rx mbuf, err = %d, packet dropped.\n", + err); + counter_u64_add(rxq->stats.mbuf_alloc_fail, 1); + /* + * Failed to load new mbuf, rxbuf_oob->mbuf is still + * pointing to the old one. Drop the packet. 
+ */ + old_mbuf = NULL; + /* Reload the existing mbuf */ + mana_load_rx_mbuf(apc, rxq, rxbuf_oob, false); + } + + mana_rx_mbuf(old_mbuf, oob, rxq); + + mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu); + + mana_post_pkt_rxq(rxq); +} + +static void +mana_poll_rx_cq(struct mana_cq *cq) +{ + struct gdma_comp *comp = cq->gdma_comp_buf; + int comp_read, i; + + comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER); + KASSERT(comp_read <= CQE_POLLING_BUFFER, + ("comp_read %d great than buf size %d", + comp_read, CQE_POLLING_BUFFER)); + + for (i = 0; i < comp_read; i++) { + if (comp[i].is_sq == true) { + mana_err(NULL, + "WARNING: CQE not for receive queue\n"); + return; + } + + /* verify recv cqe references the right rxq */ + if (comp[i].wq_num != cq->rxq->gdma_id) { + mana_err(NULL, + "WARNING: Received CQE %d not for " + "this receive queue %d\n", + comp[i].wq_num, cq->rxq->gdma_id); + return; + } + + mana_process_rx_cqe(cq->rxq, cq, &comp[i]); + } + + tcp_lro_flush_all(&cq->rxq->lro); +} + +static void +mana_cq_handler(void *context, struct gdma_queue *gdma_queue) +{ + struct mana_cq *cq = context; + + KASSERT(cq->gdma_cq == gdma_queue, + ("cq do not match %p, %p", cq->gdma_cq, gdma_queue)); + + if (cq->type == MANA_CQ_TYPE_RX) { + mana_poll_rx_cq(cq); + } else { + mana_poll_tx_cq(cq); + } + + mana_gd_arm_cq(gdma_queue); +} + +static void +mana_deinit_cq(struct mana_port_context *apc, struct mana_cq *cq) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + + if (!cq->gdma_cq) + return; + + mana_gd_destroy_queue(gd->gdma_context, cq->gdma_cq); +} + +static void +mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct mana_send_buf_info *txbuf_info; + uint32_t pending_sends; + int i; + + if (!txq->gdma_sq) + return; + + if ((pending_sends = atomic_read(&txq->pending_sends)) > 0) { + mana_err(NULL, + "WARNING: txq pending sends not zero: %u\n", + pending_sends); + } + + if (txq->next_to_use != txq->next_to_complete) { + mana_err(NULL, + "WARNING: txq buf not completed, " + "next use %u, next complete %u\n", + txq->next_to_use, txq->next_to_complete); + } + + /* Flush buf ring. 
Grab txq mtx lock */ + if (txq->txq_br) { + mtx_lock(&txq->txq_mtx); + drbr_flush(apc->ndev, txq->txq_br); + mtx_unlock(&txq->txq_mtx); + buf_ring_free(txq->txq_br, M_DEVBUF); + } + + /* Drain taskqueue */ + if (txq->enqueue_tq) { + while (taskqueue_cancel(txq->enqueue_tq, + &txq->enqueue_task, NULL)) { + taskqueue_drain(txq->enqueue_tq, + &txq->enqueue_task); + } + + taskqueue_free(txq->enqueue_tq); + } + + if (txq->tx_buf_info) { + /* Free all mbufs which are still in-flight */ + for (i = 0; i < MAX_SEND_BUFFERS_PER_QUEUE; i++) { + txbuf_info = &txq->tx_buf_info[i]; + if (txbuf_info->mbuf) { + mana_tx_unmap_mbuf(apc, txbuf_info); + } + } + + free(txq->tx_buf_info, M_DEVBUF); + } + + mana_free_counters((counter_u64_t *)&txq->stats, + sizeof(txq->stats)); + + mana_gd_destroy_queue(gd->gdma_context, txq->gdma_sq); + + mtx_destroy(&txq->txq_mtx); +} + +static void +mana_destroy_txq(struct mana_port_context *apc) +{ + int i; + + if (!apc->tx_qp) + return; + + for (i = 0; i < apc->num_queues; i++) { + mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object); + + mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); + + mana_deinit_txq(apc, &apc->tx_qp[i].txq); + } + + free(apc->tx_qp, M_DEVBUF); + apc->tx_qp = NULL; +} + +static int +mana_create_txq(struct mana_port_context *apc, struct ifnet *net) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct mana_obj_spec wq_spec; + struct mana_obj_spec cq_spec; + struct gdma_queue_spec spec; + struct gdma_context *gc; + struct mana_txq *txq; + struct mana_cq *cq; + uint32_t txq_size; + uint32_t cq_size; + int err; + int i; + + apc->tx_qp = mallocarray(apc->num_queues, sizeof(struct mana_tx_qp), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!apc->tx_qp) + return ENOMEM; + + /* The minimum size of the WQE is 32 bytes, hence + * MAX_SEND_BUFFERS_PER_QUEUE represents the maximum number of WQEs + * the SQ can store. This value is then used to size other queues + * to prevent overflow. 
+ */ + txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32; + KASSERT(IS_ALIGNED(txq_size, PAGE_SIZE), + ("txq size not page aligned")); + + cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE; + cq_size = ALIGN(cq_size, PAGE_SIZE); + + gc = gd->gdma_context; + + for (i = 0; i < apc->num_queues; i++) { + apc->tx_qp[i].tx_object = INVALID_MANA_HANDLE; + + /* Create SQ */ + txq = &apc->tx_qp[i].txq; + + txq->ndev = net; + txq->vp_offset = apc->tx_vp_offset; + txq->idx = i; + txq->alt_txq_idx = i; + + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_SQ; + spec.monitor_avl_buf = true; + spec.queue_size = txq_size; + err = mana_gd_create_mana_wq_cq(gd, &spec, &txq->gdma_sq); + if (err) + goto out; + + /* Create SQ's CQ */ + cq = &apc->tx_qp[i].tx_cq; + cq->gdma_comp_buf = apc->eqs[i].cqe_poll; + cq->type = MANA_CQ_TYPE_TX; + + cq->txq = txq; + + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_CQ; + spec.monitor_avl_buf = false; + spec.queue_size = cq_size; + spec.cq.callback = mana_cq_handler; + spec.cq.parent_eq = apc->eqs[i].eq; + spec.cq.context = cq; + err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); + if (err) + goto out; + + memset(&wq_spec, 0, sizeof(wq_spec)); + memset(&cq_spec, 0, sizeof(cq_spec)); + + wq_spec.gdma_region = txq->gdma_sq->mem_info.gdma_region; + wq_spec.queue_size = txq->gdma_sq->queue_size; + + cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region; + cq_spec.queue_size = cq->gdma_cq->queue_size; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = cq->gdma_cq->cq.parent->id; + + err = mana_create_wq_obj(apc, apc->port_handle, GDMA_SQ, + &wq_spec, &cq_spec, &apc->tx_qp[i].tx_object); + + if (err) + goto out; + + txq->gdma_sq->id = wq_spec.queue_index; + cq->gdma_cq->id = cq_spec.queue_index; + + txq->gdma_sq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + + txq->gdma_txq_id = txq->gdma_sq->id; + + cq->gdma_id = cq->gdma_cq->id; + + mana_dbg(NULL, + "txq %d, txq gdma id %d, txq cq gdma id %d\n", + i, txq->gdma_txq_id, cq->gdma_id);; + + if (cq->gdma_id >= gc->max_num_cqs) { + if_printf(net, "CQ id %u too large.\n", cq->gdma_id); + return EINVAL; + } + + gc->cq_table[cq->gdma_id] = cq->gdma_cq; + + /* Initialize tx specific data */ + txq->tx_buf_info = malloc(MAX_SEND_BUFFERS_PER_QUEUE * + sizeof(struct mana_send_buf_info), + M_DEVBUF, M_WAITOK | M_ZERO); + if (unlikely(txq->tx_buf_info == NULL)) { + if_printf(net, + "Failed to allocate tx buf info for SQ %u\n", + txq->gdma_sq->id); + err = ENOMEM; + goto out; + } + + + snprintf(txq->txq_mtx_name, nitems(txq->txq_mtx_name), + "mana:tx(%d)", i); + mtx_init(&txq->txq_mtx, txq->txq_mtx_name, NULL, MTX_DEF); + + txq->txq_br = buf_ring_alloc(4 * MAX_SEND_BUFFERS_PER_QUEUE, + M_DEVBUF, M_WAITOK, &txq->txq_mtx); + if (unlikely(txq->txq_br == NULL)) { + if_printf(net, + "Failed to allocate buf ring for SQ %u\n", + txq->gdma_sq->id); + err = ENOMEM; + goto out; + } + + /* Allocate taskqueue for deferred send */ + TASK_INIT(&txq->enqueue_task, 0, mana_xmit_taskfunc, txq); + txq->enqueue_tq = taskqueue_create_fast("mana_tx_enque", + M_NOWAIT, taskqueue_thread_enqueue, &txq->enqueue_tq); + if (unlikely(txq->enqueue_tq == NULL)) { + if_printf(net, + "Unable to create tx %d enqueue task queue\n", i); + err = ENOMEM; + goto out; + } + taskqueue_start_threads(&txq->enqueue_tq, 1, PI_NET, + "mana txq %d", i); + + mana_alloc_counters((counter_u64_t *)&txq->stats, + sizeof(txq->stats)); + + mana_gd_arm_cq(cq->gdma_cq); + } + + return 0; +out: + mana_destroy_txq(apc); 
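	/*
	 * Any queues created before the failure have been torn down above
	 * (WQ objects, CQs, buf rings and enqueue taskqueues); the error is
	 * propagated to the caller.
	 */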
+ return err; +} + +static void +mana_destroy_rxq(struct mana_port_context *apc, struct mana_rxq *rxq, + bool validate_state) +{ + struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; + struct mana_recv_buf_oob *rx_oob; + int i; + + if (!rxq) + return; + + if (validate_state) { + /* + * XXX Cancel and drain cleanup task queue here. + */ + ; + } + + mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); + + mana_deinit_cq(apc, &rxq->rx_cq); + + mana_free_counters((counter_u64_t *)&rxq->stats, + sizeof(rxq->stats)); + + /* Free LRO resources */ + tcp_lro_free(&rxq->lro); + + for (i = 0; i < rxq->num_rx_buf; i++) { + rx_oob = &rxq->rx_oobs[i]; + + if (rx_oob->mbuf) + mana_unload_rx_mbuf(apc, rxq, rx_oob, true); + + bus_dmamap_destroy(apc->rx_buf_tag, rx_oob->dma_map); + } + + if (rxq->gdma_rq) + mana_gd_destroy_queue(gc, rxq->gdma_rq); + + free(rxq, M_DEVBUF); +} + +#define MANA_WQE_HEADER_SIZE 16 +#define MANA_WQE_SGE_SIZE 16 + +static int +mana_alloc_rx_wqe(struct mana_port_context *apc, + struct mana_rxq *rxq, uint32_t *rxq_size, uint32_t *cq_size) +{ + struct mana_recv_buf_oob *rx_oob; + uint32_t buf_idx; + int err; + + if (rxq->datasize == 0 || rxq->datasize > PAGE_SIZE) { + mana_err(NULL, + "WARNING: Invalid rxq datasize %u\n", rxq->datasize); + } + + *rxq_size = 0; + *cq_size = 0; + + for (buf_idx = 0; buf_idx < rxq->num_rx_buf; buf_idx++) { + rx_oob = &rxq->rx_oobs[buf_idx]; + memset(rx_oob, 0, sizeof(*rx_oob)); + + err = bus_dmamap_create(apc->rx_buf_tag, 0, + &rx_oob->dma_map); + if (err) { + mana_err(NULL, + "Failed to create rx DMA map for buf %d\n", + buf_idx); + return err; + } + + err = mana_load_rx_mbuf(apc, rxq, rx_oob, true); + if (err) { + mana_err(NULL, + "Failed to create rx DMA map for buf %d\n", + buf_idx); + bus_dmamap_destroy(apc->rx_buf_tag, rx_oob->dma_map); + return err; + } + + rx_oob->wqe_req.sgl = rx_oob->sgl; + rx_oob->wqe_req.num_sge = rx_oob->num_sge; + rx_oob->wqe_req.inline_oob_size = 0; + rx_oob->wqe_req.inline_oob_data = NULL; + rx_oob->wqe_req.flags = 0; + rx_oob->wqe_req.client_data_unit = 0; + + *rxq_size += ALIGN(MANA_WQE_HEADER_SIZE + + MANA_WQE_SGE_SIZE * rx_oob->num_sge, 32); + *cq_size += COMP_ENTRY_SIZE; + } + + return 0; +} + +static int +mana_push_wqe(struct mana_rxq *rxq) +{ + struct mana_recv_buf_oob *rx_oob; + uint32_t buf_idx; + int err; + + for (buf_idx = 0; buf_idx < rxq->num_rx_buf; buf_idx++) { + rx_oob = &rxq->rx_oobs[buf_idx]; + + err = mana_gd_post_and_ring(rxq->gdma_rq, &rx_oob->wqe_req, + &rx_oob->wqe_inf); + if (err) + return ENOSPC; + } + + return 0; +} + +static struct mana_rxq * +mana_create_rxq(struct mana_port_context *apc, uint32_t rxq_idx, + struct mana_eq *eq, struct ifnet *ndev) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct mana_obj_spec wq_spec; + struct mana_obj_spec cq_spec; + struct gdma_queue_spec spec; + struct mana_cq *cq = NULL; + uint32_t cq_size, rq_size; + struct gdma_context *gc; + struct mana_rxq *rxq; + int err; + + gc = gd->gdma_context; + + rxq = malloc(sizeof(*rxq) + + RX_BUFFERS_PER_QUEUE * sizeof(struct mana_recv_buf_oob), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!rxq) + return NULL; + + rxq->ndev = ndev; + rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE; + rxq->rxq_idx = rxq_idx; + /* + * Minimum size is MCLBYTES(2048) bytes for a mbuf cluster. + * Now we just allow maxium size of 4096. 
+ */ + rxq->datasize = ALIGN(apc->frame_size, MCLBYTES); + if (rxq->datasize > MAX_FRAME_SIZE) + rxq->datasize = MAX_FRAME_SIZE; + + mana_dbg(NULL, "Setting rxq %d datasize %d\n", + rxq_idx, rxq->datasize); + + rxq->rxobj = INVALID_MANA_HANDLE; + + err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size); + if (err) + goto out; + + /* Create LRO for the RQ */ + if (ndev->if_capenable & IFCAP_LRO) { + err = tcp_lro_init(&rxq->lro); + if (err) { + if_printf(ndev, "Failed to create LRO for rxq %d\n", + rxq_idx); + } else { + rxq->lro.ifp = ndev; + } + } + + mana_alloc_counters((counter_u64_t *)&rxq->stats, + sizeof(rxq->stats)); + + rq_size = ALIGN(rq_size, PAGE_SIZE); + cq_size = ALIGN(cq_size, PAGE_SIZE); + + /* Create RQ */ + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_RQ; + spec.monitor_avl_buf = true; + spec.queue_size = rq_size; + err = mana_gd_create_mana_wq_cq(gd, &spec, &rxq->gdma_rq); + if (err) + goto out; + + /* Create RQ's CQ */ + cq = &rxq->rx_cq; + cq->gdma_comp_buf = eq->cqe_poll; + cq->type = MANA_CQ_TYPE_RX; + cq->rxq = rxq; + + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_CQ; + spec.monitor_avl_buf = false; + spec.queue_size = cq_size; + spec.cq.callback = mana_cq_handler; + spec.cq.parent_eq = eq->eq; + spec.cq.context = cq; + err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); + if (err) + goto out; + + memset(&wq_spec, 0, sizeof(wq_spec)); + memset(&cq_spec, 0, sizeof(cq_spec)); + wq_spec.gdma_region = rxq->gdma_rq->mem_info.gdma_region; + wq_spec.queue_size = rxq->gdma_rq->queue_size; + + cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region; + cq_spec.queue_size = cq->gdma_cq->queue_size; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = cq->gdma_cq->cq.parent->id; + + err = mana_create_wq_obj(apc, apc->port_handle, GDMA_RQ, + &wq_spec, &cq_spec, &rxq->rxobj); + if (err) + goto out; + + rxq->gdma_rq->id = wq_spec.queue_index; + cq->gdma_cq->id = cq_spec.queue_index; + + rxq->gdma_rq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + + rxq->gdma_id = rxq->gdma_rq->id; + cq->gdma_id = cq->gdma_cq->id; + + err = mana_push_wqe(rxq); + if (err) + goto out; + + if (cq->gdma_id >= gc->max_num_cqs) + goto out; + + gc->cq_table[cq->gdma_id] = cq->gdma_cq; + + mana_gd_arm_cq(cq->gdma_cq); +out: + if (!err) + return rxq; + + if_printf(ndev, "Failed to create RXQ: err = %d\n", err); + + mana_destroy_rxq(apc, rxq, false); + + if (cq) + mana_deinit_cq(apc, cq); + + return NULL; +} + +static int +mana_add_rx_queues(struct mana_port_context *apc, struct ifnet *ndev) +{ + struct mana_rxq *rxq; + int err = 0; + int i; + + for (i = 0; i < apc->num_queues; i++) { + rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev); + if (!rxq) { + err = ENOMEM; + goto out; + } + + apc->rxqs[i] = rxq; + } + + apc->default_rxobj = apc->rxqs[0]->rxobj; +out: + return err; +} + +static void +mana_destroy_vport(struct mana_port_context *apc) +{ + struct mana_rxq *rxq; + uint32_t rxq_idx; + struct mana_cq *rx_cq; + struct gdma_queue *cq, *eq; + + for (rxq_idx = 0; rxq_idx < apc->num_queues; rxq_idx++) { + rxq = apc->rxqs[rxq_idx]; + if (!rxq) + continue; + + rx_cq = &rxq->rx_cq; + if ((cq = rx_cq->gdma_cq) != NULL) { + eq = cq->cq.parent; + mana_drain_eq_task(eq); + } + + mana_destroy_rxq(apc, rxq, true); + apc->rxqs[rxq_idx] = NULL; + } + + mana_destroy_txq(apc); +} + +static int +mana_create_vport(struct mana_port_context *apc, struct ifnet *net) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + int err; + + apc->default_rxobj = 
INVALID_MANA_HANDLE; + + err = mana_cfg_vport(apc, gd->pdid, gd->doorbell); + if (err) + return err; + + return mana_create_txq(apc, net); +} + + +static void mana_rss_table_init(struct mana_port_context *apc) +{ + int i; + + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) + apc->indir_table[i] = i % apc->num_queues; +} + +int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx, + bool update_hash, bool update_tab) +{ + uint32_t queue_idx; + int i; + + if (update_tab) { + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + queue_idx = apc->indir_table[i]; + apc->rxobj_table[i] = apc->rxqs[queue_idx]->rxobj; + } + } + + return mana_cfg_vport_steering(apc, rx, true, update_hash, update_tab); +} + +static int +mana_init_port(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + uint32_t max_txq, max_rxq, max_queues; + int port_idx = apc->port_idx; + uint32_t num_indirect_entries; + int err; + + err = mana_init_port_context(apc); + if (err) + return err; + + err = mana_query_vport_cfg(apc, port_idx, &max_txq, &max_rxq, + &num_indirect_entries); + if (err) { + if_printf(ndev, "Failed to query info for vPort 0\n"); + goto reset_apc; + } + + max_queues = min_t(uint32_t, max_txq, max_rxq); + if (apc->max_queues > max_queues) + apc->max_queues = max_queues; + + if (apc->num_queues > apc->max_queues) + apc->num_queues = apc->max_queues; + + return 0; + +reset_apc: + bus_dma_tag_destroy(apc->rx_buf_tag); + apc->rx_buf_tag = NULL; + free(apc->rxqs, M_DEVBUF); + apc->rxqs = NULL; + return err; +} + +int +mana_alloc_queues(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + struct gdma_dev *gd = apc->ac->gdma_dev; + int err; + + err = mana_create_eq(apc); + if (err) + return err; + + err = mana_create_vport(apc, ndev); + if (err) + goto destroy_eq; + + err = mana_add_rx_queues(apc, ndev); + if (err) + goto destroy_vport; + + apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE; + + mana_rss_table_init(apc); + + err = mana_config_rss(apc, TRI_STATE_TRUE, true, true); + if (err) + goto destroy_vport; + + return 0; + +destroy_vport: + mana_destroy_vport(apc); +destroy_eq: + mana_destroy_eq(gd->gdma_context, apc); + return err; +} + +static int +mana_up(struct mana_port_context *apc) +{ + int err; + + mana_dbg(NULL, "mana_up called\n"); + + err = mana_alloc_queues(apc->ndev); + if (err) { + mana_err(NULL, "Faile alloc mana queues: %d\n", err); + return err; + } + + /* Add queue specific sysctl */ + mana_sysctl_add_queues(apc); + + apc->port_is_up = true; + + /* Ensure port state updated before txq state */ + wmb(); + + if_link_state_change(apc->ndev, LINK_STATE_UP); + if_setdrvflagbits(apc->ndev, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); + + return 0; +} + + +static void +mana_init(void *arg) +{ + struct mana_port_context *apc = (struct mana_port_context *)arg; + + MANA_APC_LOCK_LOCK(apc); + if (!apc->port_is_up) { + mana_up(apc); + } + MANA_APC_LOCK_UNLOCK(apc); +} + +static int +mana_dealloc_queues(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + struct mana_txq *txq; + int i, err; + + if (apc->port_is_up) + return EINVAL; + + /* No packet can be transmitted now since apc->port_is_up is false. + * There is still a tiny chance that mana_poll_tx_cq() can re-enable + * a txq because it may not timely see apc->port_is_up being cleared + * to false, but it doesn't matter since mana_start_xmit() drops any + * new packets due to apc->port_is_up being false. 
+ * + * Drain all the in-flight TX packets + */ + for (i = 0; i < apc->num_queues; i++) { + txq = &apc->tx_qp[i].txq; + + struct mana_cq *tx_cq = &apc->tx_qp[i].tx_cq; + struct gdma_queue *eq = NULL; + if (tx_cq->gdma_cq) + eq = tx_cq->gdma_cq->cq.parent; + if (eq) { + /* Stop EQ interrupt */ + eq->eq.do_not_ring_db = true; + /* Schedule a cleanup task */ + taskqueue_enqueue(eq->eq.cleanup_tq, + &eq->eq.cleanup_task); + } + + while (atomic_read(&txq->pending_sends) > 0) + usleep_range(1000, 2000); + } + + /* We're 100% sure the queues can no longer be woken up, because + * we're sure now mana_poll_tx_cq() can't be running. + */ + + apc->rss_state = TRI_STATE_FALSE; + err = mana_config_rss(apc, TRI_STATE_FALSE, false, false); + if (err) { + if_printf(ndev, "Failed to disable vPort: %d\n", err); + return err; + } + + /* TODO: Implement RX fencing */ + gdma_msleep(1000); + + mana_destroy_vport(apc); + + mana_destroy_eq(apc->ac->gdma_dev->gdma_context, apc); + + return 0; +} + +static int +mana_down(struct mana_port_context *apc) +{ + int err = 0; + + apc->port_st_save = apc->port_is_up; + apc->port_is_up = false; + + /* Ensure port state updated before txq state */ + wmb(); + + if (apc->port_st_save) { + if_setdrvflagbits(apc->ndev, IFF_DRV_OACTIVE, + IFF_DRV_RUNNING); + if_link_state_change(apc->ndev, LINK_STATE_DOWN); + + mana_sysctl_free_queues(apc); + + err = mana_dealloc_queues(apc->ndev); + if (err) { + if_printf(apc->ndev, + "Failed to bring down mana interface: %d\n", err); + } + } + + return err; +} + +int +mana_detach(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + int err; + + ether_ifdetach(ndev); + + if (!apc) + return 0; + + MANA_APC_LOCK_LOCK(apc); + err = mana_down(apc); + MANA_APC_LOCK_UNLOCK(apc); + + mana_cleanup_port_context(apc); + + MANA_APC_LOCK_DESTROY(apc); + + free(apc, M_DEVBUF); + + return err; +} + +static int +mana_probe_port(struct mana_context *ac, int port_idx, + struct ifnet **ndev_storage) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + struct mana_port_context *apc; + struct ifnet *ndev; + int err; + + ndev = if_alloc_dev(IFT_ETHER, gc->dev); + if (!ndev) { + mana_err(NULL, "Failed to allocate ifnet struct\n"); + return ENOMEM; + } + + *ndev_storage = ndev; + + apc = malloc(sizeof(*apc), M_DEVBUF, M_WAITOK | M_ZERO); + if (!apc) { + mana_err(NULL, "Failed to allocate port context\n"); + err = ENOMEM; + goto free_net; + } + + apc->ac = ac; + apc->ndev = ndev; + apc->max_queues = gc->max_num_queues; + apc->num_queues = min_t(unsigned int, + gc->max_num_queues, MANA_MAX_NUM_QUEUES); + apc->port_handle = INVALID_MANA_HANDLE; + apc->port_idx = port_idx; + apc->frame_size = DEFAULT_FRAME_SIZE; + + MANA_APC_LOCK_INIT(apc); + + if_initname(ndev, device_get_name(gc->dev), port_idx); + if_setdev(ndev,gc->dev); + if_setsoftc(ndev, apc); + + if_setflags(ndev, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); + if_setinitfn(ndev, mana_init); + if_settransmitfn(ndev, mana_start_xmit); + if_setqflushfn(ndev, mana_qflush); + if_setioctlfn(ndev, mana_ioctl); + if_setgetcounterfn(ndev, mana_get_counter); + + if_setmtu(ndev, ETHERMTU); + if_setbaudrate(ndev, IF_Gbps(100)); + + mana_rss_key_fill(apc->hashkey, MANA_HASH_KEY_SIZE); + + err = mana_init_port(ndev); + if (err) + goto reset_apc; + + ndev->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; + ndev->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; + ndev->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; + + ndev->if_capabilities |= IFCAP_LRO | IFCAP_LINKSTATE; + + /* Enable all 
available capabilities by default. */ + ndev->if_capenable = ndev->if_capabilities; + + /* TSO parameters */ + ndev->if_hw_tsomax = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ndev->if_hw_tsomaxsegcount = MAX_MBUF_FRAGS; + ndev->if_hw_tsomaxsegsize = PAGE_SIZE; + + ifmedia_init(&apc->media, IFM_IMASK, + mana_ifmedia_change, mana_ifmedia_status); + ifmedia_add(&apc->media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&apc->media, IFM_ETHER | IFM_AUTO); + + ether_ifattach(ndev, apc->mac_addr); + + /* Initialize statistics */ + mana_alloc_counters((counter_u64_t *)&apc->port_stats, + sizeof(struct mana_port_stats)); + mana_sysctl_add_port(apc); + + /* Tell the stack that the interface is not active */ + if_setdrvflagbits(ndev, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); + + return 0; + +reset_apc: + free(apc, M_DEVBUF); +free_net: + *ndev_storage = NULL; + if_printf(ndev, "Failed to probe vPort %d: %d\n", port_idx, err); + if_free(ndev); + return err; +} + +int mana_probe(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + device_t dev = gc->dev; + struct mana_context *ac; + int err; + int i; + + device_printf(dev, "%s protocol version: %d.%d.%d\n", DEVICE_NAME, + MANA_MAJOR_VERSION, MANA_MINOR_VERSION, MANA_MICRO_VERSION); + + err = mana_gd_register_device(gd); + if (err) + return err; + + ac = malloc(sizeof(*ac), M_DEVBUF, M_WAITOK | M_ZERO); + if (!ac) + return ENOMEM; + + ac->gdma_dev = gd; + ac->num_ports = 1; + gd->driver_data = ac; + + err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, + MANA_MICRO_VERSION, &ac->num_ports); + if (err) + goto out; + + if (ac->num_ports > MAX_PORTS_IN_MANA_DEV) + ac->num_ports = MAX_PORTS_IN_MANA_DEV; + + for (i = 0; i < ac->num_ports; i++) { + err = mana_probe_port(ac, i, &ac->ports[i]); + if (err) { + device_printf(dev, + "Failed to probe mana port %d\n", i); + break; + } + } + +out: + if (err) + mana_remove(gd); + + return err; +} + +void +mana_remove(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + struct mana_context *ac = gd->driver_data; + device_t dev = gc->dev; + struct ifnet *ndev; + int i; + + for (i = 0; i < ac->num_ports; i++) { + ndev = ac->ports[i]; + if (!ndev) { + if (i == 0) + device_printf(dev, "No net device to remove\n"); + goto out; + } + + mana_detach(ndev); + + if_free(ndev); + } +out: + mana_gd_deregister_device(gd); + gd->driver_data = NULL; + gd->gdma_context = NULL; + free(ac, M_DEVBUF); +} diff --git a/sys/dev/mana/mana_sysctl.c b/sys/dev/mana/mana_sysctl.c new file mode 100644 index 000000000000..f0821f05f2d2 --- /dev/null +++ b/sys/dev/mana/mana_sysctl.c @@ -0,0 +1,219 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + + +#include "mana_sysctl.h" + +static int mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS); + +int mana_log_level = MANA_ALERT | MANA_WARNING | MANA_INFO; + +SYSCTL_NODE(_hw, OID_AUTO, mana, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "MANA driver parameters"); + +/* + * Logging level for changing verbosity of the output + */ +SYSCTL_INT(_hw_mana, OID_AUTO, log_level, CTLFLAG_RWTUN, + &mana_log_level, 0, "Logging level indicating verbosity of the logs"); + +SYSCTL_CONST_STRING(_hw_mana, OID_AUTO, driver_version, CTLFLAG_RD, + DRV_MODULE_VERSION, "MANA driver version"); + +void +mana_sysctl_add_port(struct mana_port_context *apc) +{ + struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; + device_t dev = gc->dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + struct mana_port_stats *port_stats; + char node_name[32]; + + struct sysctl_oid *port_node, *stats_node; + struct sysctl_oid_list *stats_list; + + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + port_stats = &apc->port_stats; + + snprintf(node_name, 32, "port%d", apc->port_idx); + + port_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + node_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Port Name"); + apc->port_list = SYSCTL_CHILDREN(port_node); + + SYSCTL_ADD_BOOL(ctx, apc->port_list, OID_AUTO, + "enable_altq", CTLFLAG_RW, &apc->enable_tx_altq, 0, + "Choose alternative txq under heavy load"); + + SYSCTL_ADD_PROC(ctx, apc->port_list, OID_AUTO, + "bind_cleanup_thread_cpu", + CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE, + apc, 0, mana_sysctl_cleanup_thread_cpu, "I", + "Bind cleanup thread to a cpu. 
0 disables it."); + + stats_node = SYSCTL_ADD_NODE(ctx, apc->port_list, OID_AUTO, + "port_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Statistics of port"); + stats_list = SYSCTL_CHILDREN(stats_node); + + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_packets", + CTLFLAG_RD, &port_stats->rx_packets, "Packets received"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_packets", + CTLFLAG_RD, &port_stats->tx_packets, "Packets transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_bytes", + CTLFLAG_RD, &port_stats->rx_bytes, "Bytes received"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_bytes", + CTLFLAG_RD, &port_stats->tx_bytes, "Bytes transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_drops", + CTLFLAG_RD, &port_stats->rx_drops, "Receive packet drops"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_drops", + CTLFLAG_RD, &port_stats->tx_drops, "Transmit packet drops"); +} + +void +mana_sysctl_add_queues(struct mana_port_context *apc) +{ + struct sysctl_ctx_list *ctx = &apc->que_sysctl_ctx; + struct sysctl_oid_list *child = apc->port_list; + + struct sysctl_oid *queue_node, *tx_node, *rx_node; + struct sysctl_oid_list *queue_list, *tx_list, *rx_list; + struct mana_txq *txq; + struct mana_rxq *rxq; + struct mana_stats *tx_stats, *rx_stats; + char que_name[32]; + int i; + + sysctl_ctx_init(ctx); + + for (i = 0; i < apc->num_queues; i++) { + rxq = apc->rxqs[i]; + txq = &apc->tx_qp[i].txq; + + snprintf(que_name, 32, "queue%d", i); + + queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + que_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name"); + queue_list = SYSCTL_CHILDREN(queue_node); + + /* TX stats */ + tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, + "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX queue"); + tx_list = SYSCTL_CHILDREN(tx_node); + + tx_stats = &txq->stats; + + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "count", + CTLFLAG_RD, &tx_stats->packets, "Packets sent"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bytes", + CTLFLAG_RD, &tx_stats->bytes, "Bytes sent"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_wakeups", + CTLFLAG_RD, &tx_stats->wakeup, "Queue wakeups"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_stops", + CTLFLAG_RD, &tx_stats->stop, "Queue stops"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapse", + CTLFLAG_RD, &tx_stats->collapse, "Mbuf collapse count"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "mbuf_collapse_err", CTLFLAG_RD, + &tx_stats->collapse_err, "Mbuf collapse failures"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "dma_mapping_err", CTLFLAG_RD, + &tx_stats->dma_mapping_err, "DMA mapping failures"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "alt_chg", CTLFLAG_RD, + &tx_stats->alt_chg, "Switch to alternative txq"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "alt_reset", CTLFLAG_RD, + &tx_stats->alt_reset, "Reset to self txq"); + + /* RX stats */ + rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, + "rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX queue"); + rx_list = SYSCTL_CHILDREN(rx_node); + + rx_stats = &rxq->stats; + + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "count", + CTLFLAG_RD, &rx_stats->packets, "Packets received"); + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bytes", + CTLFLAG_RD, &rx_stats->bytes, "Bytes received"); + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, + "mbuf_alloc_fail", CTLFLAG_RD, + &rx_stats->mbuf_alloc_fail, "Failed mbuf allocs"); + 
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, + "dma_mapping_err", CTLFLAG_RD, + &rx_stats->dma_mapping_err, "DMA mapping errors"); + } +} + +/* + * Free all queues' sysctl trees attached to the port's tree. + */ +void +mana_sysctl_free_queues(struct mana_port_context *apc) +{ + sysctl_ctx_free(&apc->que_sysctl_ctx); +} + +static int +mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS) +{ + struct mana_port_context *apc = arg1; + bool bind_cpu = false; + uint8_t val; + int err; + + val = 0; + err = sysctl_wire_old_buffer(req, sizeof(val)); + if (err == 0) { + val = apc->bind_cleanup_thread_cpu; + err = sysctl_handle_8(oidp, &val, 0, req); + } + + if (err != 0 || req->newptr == NULL) + return (err); + + if (val != 0) + bind_cpu = true; + + if (bind_cpu != apc->bind_cleanup_thread_cpu) { + apc->bind_cleanup_thread_cpu = bind_cpu; + err = mana_restart(apc); + } + + return (err); +} diff --git a/sys/dev/mana/mana_sysctl.h b/sys/dev/mana/mana_sysctl.h new file mode 100644 index 000000000000..c47f4cd790a9 --- /dev/null +++ b/sys/dev/mana/mana_sysctl.h @@ -0,0 +1,48 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2015-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef MANA_SYSCTL_H +#define MANA_SYSCTL_H + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include "mana.h" + +void mana_sysctl_add_port(struct mana_port_context *apc); +void mana_sysctl_add_queues(struct mana_port_context *apc); +void mana_sysctl_free_queues(struct mana_port_context *apc); + +#endif /* !(MANA_SYSCTL_H) */ diff --git a/sys/dev/mana/shm_channel.c b/sys/dev/mana/shm_channel.c new file mode 100644 index 000000000000..17679626d822 --- /dev/null +++ b/sys/dev/mana/shm_channel.c @@ -0,0 +1,337 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> + +#include "mana.h" +#include "shm_channel.h" +#include "gdma_util.h" + +#define PAGE_FRAME_L48_WIDTH_BYTES 6 +#define PAGE_FRAME_L48_WIDTH_BITS (PAGE_FRAME_L48_WIDTH_BYTES * 8) +#define PAGE_FRAME_L48_MASK 0x0000FFFFFFFFFFFF +#define PAGE_FRAME_H4_WIDTH_BITS 4 +#define VECTOR_MASK 0xFFFF +#define SHMEM_VF_RESET_STATE ((uint32_t)-1) + +#define SMC_MSG_TYPE_ESTABLISH_HWC 1 +#define SMC_MSG_TYPE_ESTABLISH_HWC_VERSION 0 + +#define SMC_MSG_TYPE_DESTROY_HWC 2 +#define SMC_MSG_TYPE_DESTROY_HWC_VERSION 0 + +#define SMC_MSG_DIRECTION_REQUEST 0 +#define SMC_MSG_DIRECTION_RESPONSE 1 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +/* Shared memory channel protocol header + * + * msg_type: set on request and response; response matches request. + * msg_version: newer PF writes back older response (matching request) + * older PF acts on latest version known and sets that version in result + * (less than request). + * direction: 0 for request, VF->PF; 1 for response, PF->VF. + * status: 0 on request, + * operation result on response (success = 0, failure = 1 or greater). + * reset_vf: If set on either establish or destroy request, indicates perform + * FLR before/after the operation. + * owner_is_pf: 1 indicates PF owned, 0 indicates VF owned. + */ +union smc_proto_hdr { + uint32_t as_uint32; + + struct { + uint8_t msg_type : 3; + uint8_t msg_version : 3; + uint8_t reserved_1 : 1; + uint8_t direction : 1; + + uint8_t status; + + uint8_t reserved_2; + + uint8_t reset_vf : 1; + uint8_t reserved_3 : 6; + uint8_t owner_is_pf : 1; + }; +}; /* HW DATA */ + +#define SMC_APERTURE_BITS 256 +#define SMC_BASIC_UNIT (sizeof(uint32_t)) +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) + +static int +mana_smc_poll_register(void __iomem *base, bool reset) +{ + void __iomem *ptr = (uint8_t *)base + SMC_LAST_DWORD * SMC_BASIC_UNIT; + volatile uint32_t last_dword; + int i; + + /* Poll the hardware for the ownership bit. This should be pretty fast, + * but let's do it in a loop just in case the hardware or the PF + * driver are temporarily busy. 
+ */ + for (i = 0; i < 20 * 1000; i++) { + last_dword = readl(ptr); + + /* shmem reads as 0xFFFFFFFF in the reset case */ + if (reset && last_dword == SHMEM_VF_RESET_STATE) + return 0; + + /* If bit_31 is set, the PF currently owns the SMC. */ + if (!(last_dword & BIT(31))) + return 0; + + DELAY(1000); + } + + return ETIMEDOUT; +} + +static int +mana_smc_read_response(struct shm_channel *sc, uint32_t msg_type, + uint32_t msg_version, bool reset_vf) +{ + void __iomem *base = sc->base; + union smc_proto_hdr hdr; + int err; + + /* Wait for PF to respond. */ + err = mana_smc_poll_register(base, reset_vf); + if (err) + return err; + + hdr.as_uint32 = + readl((uint8_t *)base + SMC_LAST_DWORD * SMC_BASIC_UNIT); + mana_dbg(NULL, "shm response 0x%x\n", hdr.as_uint32); + + if (reset_vf && hdr.as_uint32 == SHMEM_VF_RESET_STATE) + return 0; + + /* Validate protocol fields from the PF driver */ + if (hdr.msg_type != msg_type || hdr.msg_version > msg_version || + hdr.direction != SMC_MSG_DIRECTION_RESPONSE) { + device_printf(sc->dev, + "Wrong SMC response 0x%x, type=%d, ver=%d\n", + hdr.as_uint32, msg_type, msg_version); + return EPROTO; + } + + /* Validate the operation result */ + if (hdr.status != 0) { + device_printf(sc->dev, + "SMC operation failed: 0x%x\n", hdr.status); + return EPROTO; + } + + return 0; +} + +void +mana_smc_init(struct shm_channel *sc, device_t dev, void __iomem *base) +{ + sc->dev = dev; + sc->base = base; +} + +int +mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, uint64_t eq_addr, + uint64_t cq_addr, uint64_t rq_addr, uint64_t sq_addr, + uint32_t eq_msix_index) +{ + union smc_proto_hdr *hdr; + uint16_t all_addr_h4bits = 0; + uint16_t frame_addr_seq = 0; + uint64_t frame_addr = 0; + uint8_t shm_buf[32]; + uint64_t *shmem; + uint32_t *dword; + uint8_t *ptr; + int err; + int i; + + /* Ensure VF already has possession of shared memory */ + err = mana_smc_poll_register(sc->base, false); + if (err) { + device_printf(sc->dev, + "Timeout when setting up HWC: %d\n", err); + return err; + } + + if (!IS_ALIGNED(eq_addr, PAGE_SIZE) || + !IS_ALIGNED(cq_addr, PAGE_SIZE) || + !IS_ALIGNED(rq_addr, PAGE_SIZE) || + !IS_ALIGNED(sq_addr, PAGE_SIZE)) + return EINVAL; + + if ((eq_msix_index & VECTOR_MASK) != eq_msix_index) + return EINVAL; + + /* Scheme for packing four addresses and extra info into 256 bits. + * + * Addresses must be page frame aligned, so only frame address bits + * are transferred. + * + * 52-bit frame addresses are split into the lower 48 bits and upper + * 4 bits. Lower 48 bits of 4 address are written sequentially from + * the start of the 256-bit shared memory region followed by 16 bits + * containing the upper 4 bits of the 4 addresses in sequence. + * + * A 16 bit EQ vector number fills out the next-to-last 32-bit dword. + * + * The final 32-bit dword is used for protocol control information as + * defined in smc_proto_hdr. 
+ */ + + memset(shm_buf, 0, sizeof(shm_buf)); + ptr = shm_buf; + + /* EQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(eq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* CQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(cq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* RQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(rq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* SQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(sq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* High 4 bits of the four frame addresses */ + *((uint16_t *)ptr) = all_addr_h4bits; + ptr += sizeof(uint16_t); + + /* EQ MSIX vector number */ + *((uint16_t *)ptr) = (uint16_t)eq_msix_index; + ptr += sizeof(uint16_t); + + /* 32-bit protocol header in final dword */ + *((uint32_t *)ptr) = 0; + + hdr = (union smc_proto_hdr *)ptr; + hdr->msg_type = SMC_MSG_TYPE_ESTABLISH_HWC; + hdr->msg_version = SMC_MSG_TYPE_ESTABLISH_HWC_VERSION; + hdr->direction = SMC_MSG_DIRECTION_REQUEST; + hdr->reset_vf = reset_vf; + + /* Write 256-message buffer to shared memory (final 32-bit write + * triggers HW to set possession bit to PF). + */ + dword = (uint32_t *)shm_buf; + for (i = 0; i < SMC_APERTURE_DWORDS; i++) { + mana_dbg(NULL, "write shm_buf %d, val: 0x%x\n", + i, *dword); + writel((char *)sc->base + i * SMC_BASIC_UNIT, *dword++); + } + + /* Read shmem response (polling for VF possession) and validate. + * For setup, waiting for response on shared memory is not strictly + * necessary, since wait occurs later for results to appear in EQE's. + */ + err = mana_smc_read_response(sc, SMC_MSG_TYPE_ESTABLISH_HWC, + SMC_MSG_TYPE_ESTABLISH_HWC_VERSION, reset_vf); + if (err) { + device_printf(sc->dev, + "Error when setting up HWC: %d\n", err); + return err; + } + + return 0; +} + +int +mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf) +{ + union smc_proto_hdr hdr = {}; + int err; + + /* Ensure already has possession of shared memory */ + err = mana_smc_poll_register(sc->base, false); + if (err) { + device_printf(sc->dev, "Timeout when tearing down HWC\n"); + return err; + } + + /* Set up protocol header for HWC destroy message */ + hdr.msg_type = SMC_MSG_TYPE_DESTROY_HWC; + hdr.msg_version = SMC_MSG_TYPE_DESTROY_HWC_VERSION; + hdr.direction = SMC_MSG_DIRECTION_REQUEST; + hdr.reset_vf = reset_vf; + + /* Write message in high 32 bits of 256-bit shared memory, causing HW + * to set possession bit to PF. + */ + writel((char *)sc->base + SMC_LAST_DWORD * SMC_BASIC_UNIT, + hdr.as_uint32); + + /* Read shmem response (polling for VF possession) and validate. + * For teardown, waiting for response is required to ensure hardware + * invalidates MST entries before software frees memory. 
+ */ + err = mana_smc_read_response(sc, SMC_MSG_TYPE_DESTROY_HWC, + SMC_MSG_TYPE_DESTROY_HWC_VERSION, reset_vf); + if (err) { + device_printf(sc->dev, + "Error when tearing down HWC: %d\n", err); + return err; + } + + return 0; +} diff --git a/sys/dev/mana/shm_channel.h b/sys/dev/mana/shm_channel.h new file mode 100644 index 000000000000..7836e513a77f --- /dev/null +++ b/sys/dev/mana/shm_channel.h @@ -0,0 +1,52 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _SHM_CHANNEL_H +#define _SHM_CHANNEL_H + +#define __iomem + +struct shm_channel { + device_t dev; + void __iomem *base; +}; + +void mana_smc_init(struct shm_channel *sc, device_t dev, void __iomem *base); + +int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, + uint64_t eq_addr, uint64_t cq_addr, uint64_t rq_addr, uint64_t sq_addr, + uint32_t eq_msix_index); + +int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf); + +#endif /* _SHM_CHANNEL_H */ diff --git a/sys/modules/Makefile b/sys/modules/Makefile index fdaea7fa4ec9..7ef3c5a477ce 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -231,6 +231,7 @@ SUBDIR= \ mac_stub \ mac_test \ ${_malo} \ + ${_mana} \ md \ mdio \ mem \ @@ -648,6 +649,7 @@ _ixv= ixv .if ${MK_SOURCELESS_UCODE} != "no" _lio= lio .endif +_mana= mana _nctgpio= nctgpio _ntb= ntb _ocs_fc= ocs_fc diff --git a/sys/modules/mana/Makefile b/sys/modules/mana/Makefile new file mode 100644 index 000000000000..3122c066e948 --- /dev/null +++ b/sys/modules/mana/Makefile @@ -0,0 +1,12 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/dev/mana + +KMOD = if_mana +SRCS = gdma_main.c mana_sysctl.c shm_channel.c +SRCS += mana_en.c gdma_util.c hw_channel.c +SRCS += device_if.h bus_if.h pci_if.h + +CFLAGS += -I${SRCTOP}/sys/dev/mana + +.include <bsd.kmod.mk> |
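As a closing note on the shared memory channel: the 256-bit message that mana_smc_setup_hwc() assembles byte by byte can be easier to follow when pictured as a fixed layout. The sketch below is illustrative only and is not part of the driver; the struct and field names are invented here, and it assumes a little-endian layout with no structure padding, as on amd64.

/*
 * Illustrative layout of the 32-byte (256-bit) SMC "establish HWC"
 * message built in mana_smc_setup_hwc().  The driver builds this
 * buffer by advancing a byte pointer rather than using a struct.
 */
struct smc_establish_hwc_layout {
	uint8_t		eq_pfn_lo48[6];	/* low 48 bits of the EQ page frame number */
	uint8_t		cq_pfn_lo48[6];	/* low 48 bits of the CQ page frame number */
	uint8_t		rq_pfn_lo48[6];	/* low 48 bits of the RQ page frame number */
	uint8_t		sq_pfn_lo48[6];	/* low 48 bits of the SQ page frame number */
	uint16_t	all_addr_h4bits;	/* upper 4 bits of each PFN, EQ in the lowest nibble */
	uint16_t	eq_msix_index;	/* EQ interrupt vector number */
	uint32_t	proto_hdr;	/* union smc_proto_hdr, written last */
};
_Static_assert(sizeof(struct smc_establish_hwc_layout) == 32,
    "SMC aperture is 256 bits");

The protocol header occupies the final dword because, as the driver comments note, the last 32-bit write is what hands ownership of the shared memory aperture back to the PF.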