Diffstat (limited to 'sys/dev/mana')
-rw-r--r--  sys/dev/mana/gdma.h          744
-rw-r--r--  sys/dev/mana/gdma_main.c    1961
-rw-r--r--  sys/dev/mana/gdma_util.c      96
-rw-r--r--  sys/dev/mana/gdma_util.h     206
-rw-r--r--  sys/dev/mana/hw_channel.c    950
-rw-r--r--  sys/dev/mana/hw_channel.h    222
-rw-r--r--  sys/dev/mana/mana.h          689
-rw-r--r--  sys/dev/mana/mana_en.c      2699
-rw-r--r--  sys/dev/mana/mana_sysctl.c   219
-rw-r--r--  sys/dev/mana/mana_sysctl.h    48
-rw-r--r--  sys/dev/mana/shm_channel.c   337
-rw-r--r--  sys/dev/mana/shm_channel.h    52
12 files changed, 8223 insertions, 0 deletions
diff --git a/sys/dev/mana/gdma.h b/sys/dev/mana/gdma.h
new file mode 100644
index 000000000000..097b2b65e545
--- /dev/null
+++ b/sys/dev/mana/gdma.h
@@ -0,0 +1,744 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _GDMA_H
+#define _GDMA_H
+
+#include <sys/bus.h>
+#include <sys/bus_dma.h>
+#include <sys/types.h>
+#include <sys/limits.h>
+#include <sys/sx.h>
+
+#include "gdma_util.h"
+#include "shm_channel.h"
+
+/* Structures labeled with "HW DATA" are exchanged with the hardware. All of
+ * them are naturally aligned and hence don't need __packed.
+ */
+
+#define GDMA_BAR0 0
+
+#define GDMA_IRQNAME_SZ 40
+
+struct gdma_bus {
+ bus_space_handle_t bar0_h;
+ bus_space_tag_t bar0_t;
+};
+
+struct gdma_msix_entry {
+ int entry;
+ int vector;
+};
+
+enum gdma_request_type {
+ GDMA_VERIFY_VF_DRIVER_VERSION = 1,
+ GDMA_QUERY_MAX_RESOURCES = 2,
+ GDMA_LIST_DEVICES = 3,
+ GDMA_REGISTER_DEVICE = 4,
+ GDMA_DEREGISTER_DEVICE = 5,
+ GDMA_GENERATE_TEST_EQE = 10,
+ GDMA_CREATE_QUEUE = 12,
+ GDMA_DISABLE_QUEUE = 13,
+ GDMA_CREATE_DMA_REGION = 25,
+ GDMA_DMA_REGION_ADD_PAGES = 26,
+ GDMA_DESTROY_DMA_REGION = 27,
+};
+
+enum gdma_queue_type {
+ GDMA_INVALID_QUEUE,
+ GDMA_SQ,
+ GDMA_RQ,
+ GDMA_CQ,
+ GDMA_EQ,
+};
+
+enum gdma_work_request_flags {
+ GDMA_WR_NONE = 0,
+ GDMA_WR_OOB_IN_SGL = BIT(0),
+ GDMA_WR_PAD_BY_SGE0 = BIT(1),
+};
+
+enum gdma_eqe_type {
+ GDMA_EQE_COMPLETION = 3,
+ GDMA_EQE_TEST_EVENT = 64,
+ GDMA_EQE_HWC_INIT_EQ_ID_DB = 129,
+ GDMA_EQE_HWC_INIT_DATA = 130,
+ GDMA_EQE_HWC_INIT_DONE = 131,
+};
+
+enum {
+ GDMA_DEVICE_NONE = 0,
+ GDMA_DEVICE_HWC = 1,
+ GDMA_DEVICE_MANA = 2,
+};
+
+
+struct gdma_resource {
+ /* Protect the bitmap */
+ struct mtx lock_spin;
+
+ /* The bitmap size in bits. */
+ uint32_t size;
+
+ /* The bitmap tracks the resources. */
+ unsigned long *map;
+};
+
+union gdma_doorbell_entry {
+ uint64_t as_uint64;
+
+ struct {
+ uint64_t id : 24;
+ uint64_t reserved : 8;
+ uint64_t tail_ptr : 31;
+ uint64_t arm : 1;
+ } cq;
+
+ struct {
+ uint64_t id : 24;
+ uint64_t wqe_cnt : 8;
+ uint64_t tail_ptr : 32;
+ } rq;
+
+ struct {
+ uint64_t id : 24;
+ uint64_t reserved : 8;
+ uint64_t tail_ptr : 32;
+ } sq;
+
+ struct {
+ uint64_t id : 16;
+ uint64_t reserved : 16;
+ uint64_t tail_ptr : 31;
+ uint64_t arm : 1;
+ } eq;
+}; /* HW DATA */
+
+struct gdma_msg_hdr {
+ uint32_t hdr_type;
+ uint32_t msg_type;
+ uint16_t msg_version;
+ uint16_t hwc_msg_id;
+ uint32_t msg_size;
+}; /* HW DATA */
+
+struct gdma_dev_id {
+ union {
+ struct {
+ uint16_t type;
+ uint16_t instance;
+ };
+
+ uint32_t as_uint32;
+ };
+}; /* HW DATA */
+
+struct gdma_req_hdr {
+ struct gdma_msg_hdr req;
+ struct gdma_msg_hdr resp; /* The expected response */
+ struct gdma_dev_id dev_id;
+ uint32_t activity_id;
+}; /* HW DATA */
+
+struct gdma_resp_hdr {
+ struct gdma_msg_hdr response;
+ struct gdma_dev_id dev_id;
+ uint32_t activity_id;
+ uint32_t status;
+ uint32_t reserved;
+}; /* HW DATA */
+
+struct gdma_general_req {
+ struct gdma_req_hdr hdr;
+}; /* HW DATA */
+
+#define GDMA_MESSAGE_V1 1
+
+struct gdma_general_resp {
+ struct gdma_resp_hdr hdr;
+}; /* HW DATA */
+
+#define GDMA_STANDARD_HEADER_TYPE 0
+
+static inline void
+mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, uint32_t code,
+ uint32_t req_size, uint32_t resp_size)
+{
+ hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE;
+ hdr->req.msg_type = code;
+ hdr->req.msg_version = GDMA_MESSAGE_V1;
+ hdr->req.msg_size = req_size;
+
+ hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE;
+ hdr->resp.msg_type = code;
+ hdr->resp.msg_version = GDMA_MESSAGE_V1;
+ hdr->resp.msg_size = resp_size;
+}
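For reference, the calling convention this helper supports looks like the minimal sketch below (the function name is hypothetical; the error handling mirrors the pattern used throughout gdma_main.c): the request/response pair is zero-initialized, the header records the message code and both sizes, and callers check resp.hdr.status in addition to the return value of mana_gd_send_request().

```c
/* Illustrative sketch, not part of the commit. */
static int
example_send_general_req(struct gdma_context *gc, uint32_t code)
{
	struct gdma_general_resp resp = {};
	struct gdma_general_req req = {};
	int err;

	mana_gd_init_req_hdr(&req.hdr, code, sizeof(req), sizeof(resp));

	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
	if (err != 0 || resp.hdr.status != 0)
		return (err != 0 ? err : EPROTO);

	return (0);
}
```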
+
+/* The 16-byte struct is part of the GDMA work queue entry (WQE). */
+struct gdma_sge {
+ uint64_t address;
+ uint32_t mem_key;
+ uint32_t size;
+}; /* HW DATA */
+
+struct gdma_wqe_request {
+ struct gdma_sge *sgl;
+ uint32_t num_sge;
+
+ uint32_t inline_oob_size;
+ const void *inline_oob_data;
+
+ uint32_t flags;
+ uint32_t client_data_unit;
+};
+
+enum gdma_page_type {
+ GDMA_PAGE_TYPE_4K,
+};
+
+#define GDMA_INVALID_DMA_REGION 0
+
+struct gdma_mem_info {
+ device_t dev;
+
+ bus_dma_tag_t dma_tag;
+ bus_dmamap_t dma_map;
+ bus_addr_t dma_handle; /* Physical address */
+ void *virt_addr; /* Virtual address */
+ uint64_t length;
+
+ /* Allocated by the PF driver */
+ uint64_t gdma_region;
+};
+
+#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8
+
+struct gdma_dev {
+ struct gdma_context *gdma_context;
+
+ struct gdma_dev_id dev_id;
+
+ uint32_t pdid;
+ uint32_t doorbell;
+ uint32_t gpa_mkey;
+
+ /* GDMA driver specific pointer */
+ void *driver_data;
+};
+
+#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE
+
+#define GDMA_CQE_SIZE 64
+#define GDMA_EQE_SIZE 16
+#define GDMA_MAX_SQE_SIZE 512
+#define GDMA_MAX_RQE_SIZE 256
+
+#define GDMA_COMP_DATA_SIZE 0x3C
+
+#define GDMA_EVENT_DATA_SIZE 0xC
+
+/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */
+#define GDMA_WQE_BU_SIZE 32
+
+#define INVALID_PDID UINT_MAX
+#define INVALID_DOORBELL UINT_MAX
+#define INVALID_MEM_KEY UINT_MAX
+#define INVALID_QUEUE_ID UINT_MAX
+#define INVALID_PCI_MSIX_INDEX UINT_MAX
+
+struct gdma_comp {
+ uint32_t cqe_data[GDMA_COMP_DATA_SIZE / 4];
+ uint32_t wq_num;
+ bool is_sq;
+};
+
+struct gdma_event {
+ uint32_t details[GDMA_EVENT_DATA_SIZE / 4];
+ uint8_t type;
+};
+
+struct gdma_queue;
+
+#define CQE_POLLING_BUFFER 512
+
+typedef void gdma_eq_callback(void *context, struct gdma_queue *q,
+ struct gdma_event *e);
+
+typedef void gdma_cq_callback(void *context, struct gdma_queue *q);
+
+/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE
+ * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the
+ * driver increases the 'head' in BUs rather than in bytes, and notifies
+ * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track
+ * the HW head, and increases the 'head' by 1 for every processed EQE/CQE.
+ *
+ * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is
+ * processed, the driver increases the 'tail' to indicate that WQEs have
+ * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ.
+ *
+ * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures
+ * that the EQ/CQ is big enough so they can't overflow, and the driver uses
+ * the owner bits mechanism to detect if the queue has become empty.
+ */
+struct gdma_queue {
+ struct gdma_dev *gdma_dev;
+
+ enum gdma_queue_type type;
+ uint32_t id;
+
+ struct gdma_mem_info mem_info;
+
+ void *queue_mem_ptr;
+ uint32_t queue_size;
+
+ bool monitor_avl_buf;
+
+ uint32_t head;
+ uint32_t tail;
+
+ /* Extra fields specific to EQ/CQ. */
+ union {
+ struct {
+ bool disable_needed;
+
+ gdma_eq_callback *callback;
+ void *context;
+
+ unsigned int msix_index;
+
+ uint32_t log2_throttle_limit;
+
+ struct task cleanup_task;
+ struct taskqueue *cleanup_tq;
+ int cpu;
+ bool do_not_ring_db;
+
+ int work_done;
+ int budget;
+ } eq;
+
+ struct {
+ gdma_cq_callback *callback;
+ void *context;
+
+ /* For CQ/EQ relationship */
+ struct gdma_queue *parent;
+ } cq;
+ };
+};
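As a concrete illustration of the head/tail convention described in the comment above (a sketch only; the helper name is hypothetical): the posting side advances 'head' by the WQE size in Basic Units, and the client gives the space back by advancing 'tail' by the same amount once the matching completion has been processed.

```c
/*
 * Illustrative sketch, not part of the commit: reclaiming SQ/RQ space
 * after the matching CQE has been processed.  wqe_size_in_bu is the
 * value reported in gdma_posted_wqe_info at post time.
 */
static inline void
example_wq_complete_wqe(struct gdma_queue *wq, uint32_t wqe_size_in_bu)
{
	/* head/tail count 32-byte Basic Units, not bytes. */
	wq->tail += wqe_size_in_bu;
}
```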
+
+struct gdma_queue_spec {
+ enum gdma_queue_type type;
+ bool monitor_avl_buf;
+ unsigned int queue_size;
+
+ /* Extra fields specific to EQ/CQ. */
+ union {
+ struct {
+ gdma_eq_callback *callback;
+ void *context;
+
+ unsigned long log2_throttle_limit;
+
+ /* Only used by the MANA device. */
+ struct ifnet *ndev;
+ } eq;
+
+ struct {
+ gdma_cq_callback *callback;
+ void *context;
+
+ struct gdma_queue *parent_eq;
+
+ } cq;
+ };
+};
+
+struct mana_eq {
+ struct gdma_queue *eq;
+ struct gdma_comp cqe_poll[CQE_POLLING_BUFFER];
+};
+
+struct gdma_irq_context {
+ struct gdma_msix_entry msix_e;
+ struct resource *res;
+ driver_intr_t *handler;
+ void *arg;
+ void *cookie;
+ bool requested;
+ int cpu;
+ char name[GDMA_IRQNAME_SZ];
+};
+
+struct gdma_context {
+ device_t dev;
+
+ struct gdma_bus gd_bus;
+
+ /* Per-vPort max number of queues */
+ unsigned int max_num_queues;
+ unsigned int max_num_msix;
+ unsigned int num_msix_usable;
+ struct gdma_resource msix_resource;
+ struct gdma_irq_context *irq_contexts;
+
+ /* This maps a CQ index to the queue structure. */
+ unsigned int max_num_cqs;
+ struct gdma_queue **cq_table;
+
+ /* Protect eq_test_event and test_event_eq_id */
+ struct sx eq_test_event_sx;
+ struct completion eq_test_event;
+ uint32_t test_event_eq_id;
+
+ struct resource *bar0;
+ struct resource *msix;
+ int msix_rid;
+ void __iomem *shm_base;
+ void __iomem *db_page_base;
+ uint32_t db_page_size;
+
+ /* Shared memory channel (used to bootstrap HWC) */
+ struct shm_channel shm_channel;
+
+ /* Hardware communication channel (HWC) */
+ struct gdma_dev hwc;
+
+ /* Azure network adapter */
+ struct gdma_dev mana;
+};
+
+#define MAX_NUM_GDMA_DEVICES 4
+
+static inline bool mana_gd_is_mana(struct gdma_dev *gd)
+{
+ return gd->dev_id.type == GDMA_DEVICE_MANA;
+}
+
+static inline bool mana_gd_is_hwc(struct gdma_dev *gd)
+{
+ return gd->dev_id.type == GDMA_DEVICE_HWC;
+}
+
+uint8_t *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, uint32_t wqe_offset);
+uint32_t mana_gd_wq_avail_space(struct gdma_queue *wq);
+
+int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq);
+
+int mana_gd_create_hwc_queue(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ struct gdma_queue **queue_ptr);
+
+int mana_gd_create_mana_eq(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ struct gdma_queue **queue_ptr);
+
+int mana_gd_create_mana_wq_cq(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ struct gdma_queue **queue_ptr);
+
+void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue);
+
+int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe);
+
+void mana_gd_arm_cq(struct gdma_queue *cq);
+
+struct gdma_wqe {
+ uint32_t reserved :24;
+ uint32_t last_vbytes :8;
+
+ union {
+ uint32_t flags;
+
+ struct {
+ uint32_t num_sge :8;
+ uint32_t inline_oob_size_div4 :3;
+ uint32_t client_oob_in_sgl :1;
+ uint32_t reserved1 :4;
+ uint32_t client_data_unit :14;
+ uint32_t reserved2 :2;
+ };
+ };
+}; /* HW DATA */
+
+#define INLINE_OOB_SMALL_SIZE 8
+#define INLINE_OOB_LARGE_SIZE 24
+
+#define MAX_TX_WQE_SIZE 512
+#define MAX_RX_WQE_SIZE 256
+
+struct gdma_cqe {
+ uint32_t cqe_data[GDMA_COMP_DATA_SIZE / 4];
+
+ union {
+ uint32_t as_uint32;
+
+ struct {
+ uint32_t wq_num :24;
+ uint32_t is_sq :1;
+ uint32_t reserved :4;
+ uint32_t owner_bits :3;
+ };
+ } cqe_info;
+}; /* HW DATA */
+
+#define GDMA_CQE_OWNER_BITS 3
+
+#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1)
+
+#define SET_ARM_BIT 1
+
+#define GDMA_EQE_OWNER_BITS 3
+
+union gdma_eqe_info {
+ uint32_t as_uint32;
+
+ struct {
+ uint32_t type : 8;
+ uint32_t reserved1 : 8;
+ uint32_t client_id : 2;
+ uint32_t reserved2 : 11;
+ uint32_t owner_bits : 3;
+ };
+}; /* HW DATA */
+
+#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1)
+#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries))
+
+struct gdma_eqe {
+ uint32_t details[GDMA_EVENT_DATA_SIZE / 4];
+ uint32_t eqe_info;
+}; /* HW DATA */
+
+#define GDMA_REG_DB_PAGE_OFFSET 8
+#define GDMA_REG_DB_PAGE_SIZE 0x10
+#define GDMA_REG_SHM_OFFSET 0x18
+
+struct gdma_posted_wqe_info {
+ uint32_t wqe_size_in_bu;
+};
+
+/* GDMA_GENERATE_TEST_EQE */
+struct gdma_generate_test_event_req {
+ struct gdma_req_hdr hdr;
+ uint32_t queue_index;
+}; /* HW DATA */
+
+/* GDMA_VERIFY_VF_DRIVER_VERSION */
+enum {
+ GDMA_PROTOCOL_V1 = 1,
+ GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1,
+ GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1,
+};
+
+struct gdma_verify_ver_req {
+ struct gdma_req_hdr hdr;
+
+ /* Mandatory fields required for protocol establishment */
+ uint64_t protocol_ver_min;
+ uint64_t protocol_ver_max;
+ uint64_t drv_cap_flags1;
+ uint64_t drv_cap_flags2;
+ uint64_t drv_cap_flags3;
+ uint64_t drv_cap_flags4;
+
+ /* Advisory fields */
+ uint64_t drv_ver;
+ uint32_t os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */
+ uint32_t reserved;
+ uint32_t os_ver_major;
+ uint32_t os_ver_minor;
+ uint32_t os_ver_build;
+ uint32_t os_ver_platform;
+ uint64_t reserved_2;
+ uint8_t os_ver_str1[128];
+ uint8_t os_ver_str2[128];
+ uint8_t os_ver_str3[128];
+ uint8_t os_ver_str4[128];
+}; /* HW DATA */
+
+struct gdma_verify_ver_resp {
+ struct gdma_resp_hdr hdr;
+ uint64_t gdma_protocol_ver;
+ uint64_t pf_cap_flags1;
+ uint64_t pf_cap_flags2;
+ uint64_t pf_cap_flags3;
+ uint64_t pf_cap_flags4;
+}; /* HW DATA */
+
+/* GDMA_QUERY_MAX_RESOURCES */
+struct gdma_query_max_resources_resp {
+ struct gdma_resp_hdr hdr;
+ uint32_t status;
+ uint32_t max_sq;
+ uint32_t max_rq;
+ uint32_t max_cq;
+ uint32_t max_eq;
+ uint32_t max_db;
+ uint32_t max_mst;
+ uint32_t max_cq_mod_ctx;
+ uint32_t max_mod_cq;
+ uint32_t max_msix;
+}; /* HW DATA */
+
+/* GDMA_LIST_DEVICES */
+struct gdma_list_devices_resp {
+ struct gdma_resp_hdr hdr;
+ uint32_t num_of_devs;
+ uint32_t reserved;
+ struct gdma_dev_id devs[64];
+}; /* HW DATA */
+
+/* GDMA_REGISTER_DEVICE */
+struct gdma_register_device_resp {
+ struct gdma_resp_hdr hdr;
+ uint32_t pdid;
+ uint32_t gpa_mkey;
+ uint32_t db_id;
+}; /* HW DATA */
+
+/* GDMA_CREATE_QUEUE */
+struct gdma_create_queue_req {
+ struct gdma_req_hdr hdr;
+ uint32_t type;
+ uint32_t reserved1;
+ uint32_t pdid;
+ uint32_t doolbell_id;
+ uint64_t gdma_region;
+ uint32_t reserved2;
+ uint32_t queue_size;
+ uint32_t log2_throttle_limit;
+ uint32_t eq_pci_msix_index;
+ uint32_t cq_mod_ctx_id;
+ uint32_t cq_parent_eq_id;
+ uint8_t rq_drop_on_overrun;
+ uint8_t rq_err_on_wqe_overflow;
+ uint8_t rq_chain_rec_wqes;
+ uint8_t sq_hw_db;
+ uint32_t reserved3;
+}; /* HW DATA */
+
+struct gdma_create_queue_resp {
+ struct gdma_resp_hdr hdr;
+ uint32_t queue_index;
+}; /* HW DATA */
+
+/* GDMA_DISABLE_QUEUE */
+struct gdma_disable_queue_req {
+ struct gdma_req_hdr hdr;
+ uint32_t type;
+ uint32_t queue_index;
+ uint32_t alloc_res_id_on_creation;
+}; /* HW DATA */
+
+/* GDMA_CREATE_DMA_REGION */
+struct gdma_create_dma_region_req {
+ struct gdma_req_hdr hdr;
+
+ /* The total size of the DMA region */
+ uint64_t length;
+
+ /* The offset in the first page */
+ uint32_t offset_in_page;
+
+ /* enum gdma_page_type */
+ uint32_t gdma_page_type;
+
+ /* The total number of pages */
+ uint32_t page_count;
+
+ /* If page_addr_list_len is smaller than page_count,
+ * the remaining page addresses will be added via the
+ * message GDMA_DMA_REGION_ADD_PAGES.
+ */
+ uint32_t page_addr_list_len;
+ uint64_t page_addr_list[];
+}; /* HW DATA */
+
+struct gdma_create_dma_region_resp {
+ struct gdma_resp_hdr hdr;
+ uint64_t gdma_region;
+}; /* HW DATA */
+
+/* GDMA_DMA_REGION_ADD_PAGES */
+struct gdma_dma_region_add_pages_req {
+ struct gdma_req_hdr hdr;
+
+ uint64_t gdma_region;
+
+ uint32_t page_addr_list_len;
+ uint32_t reserved3;
+
+ uint64_t page_addr_list[];
+}; /* HW DATA */
+
+/* GDMA_DESTROY_DMA_REGION */
+struct gdma_destroy_dma_region_req {
+ struct gdma_req_hdr hdr;
+
+ uint64_t gdma_region;
+}; /* HW DATA */
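The create/add-pages split mentioned above is not exercised by gdma_main.c, whose mana_gd_create_dma_region() rejects regions whose whole page list does not fit into one HWC message; a continuation message would be built roughly as sketched below. This is an assumption-laden sketch: the helper name is hypothetical and the use of the generic gdma_general_resp as the response type is an assumption.

```c
/* Illustrative sketch, not part of the commit. */
static int
example_add_remaining_pages(struct gdma_context *gc, uint64_t gdma_region,
    const uint64_t *page_addrs, uint32_t first, uint32_t count)
{
	struct gdma_dma_region_add_pages_req *req;
	struct gdma_general_resp resp = {};
	uint32_t req_size;
	int err;

	/* The caller must keep req_size within the HWC maximum msg size. */
	req_size = sizeof(*req) + count * sizeof(uint64_t);
	req = malloc(req_size, M_DEVBUF, M_WAITOK | M_ZERO);

	mana_gd_init_req_hdr(&req->hdr, GDMA_DMA_REGION_ADD_PAGES,
	    req_size, sizeof(resp));
	req->gdma_region = gdma_region;
	req->page_addr_list_len = count;
	memcpy(req->page_addr_list, &page_addrs[first],
	    count * sizeof(uint64_t));

	err = mana_gd_send_request(gc, req_size, req, sizeof(resp), &resp);
	if (err == 0 && resp.hdr.status != 0)
		err = EPROTO;

	free(req, M_DEVBUF);
	return (err);
}
```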
+
+int mana_gd_verify_vf_version(device_t dev);
+
+int mana_gd_register_device(struct gdma_dev *gd);
+int mana_gd_deregister_device(struct gdma_dev *gd);
+
+int mana_gd_post_work_request(struct gdma_queue *wq,
+ const struct gdma_wqe_request *wqe_req,
+ struct gdma_posted_wqe_info *wqe_info);
+
+int mana_gd_post_and_ring(struct gdma_queue *queue,
+ const struct gdma_wqe_request *wqe,
+ struct gdma_posted_wqe_info *wqe_info);
+
+int mana_gd_alloc_res_map(uint32_t res_avail, struct gdma_resource *r,
+ const char *lock_name);
+void mana_gd_free_res_map(struct gdma_resource *r);
+
+void mana_gd_wq_ring_doorbell(struct gdma_context *gc,
+ struct gdma_queue *queue);
+
+int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length,
+ struct gdma_mem_info *gmi);
+
+void mana_gd_free_memory(struct gdma_mem_info *gmi);
+
+void mana_gd_dma_map_paddr(void *arg, bus_dma_segment_t *segs,
+ int nseg, int error);
+
+int mana_gd_send_request(struct gdma_context *gc, uint32_t req_len,
+ const void *req, uint32_t resp_len, void *resp);
+#endif /* _GDMA_H */
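Taken together, the queue-creation interface above is spec-driven: a caller fills in a struct gdma_queue_spec and gets back an initialized struct gdma_queue, which is later torn down with mana_gd_destroy_queue(). A minimal sketch for a MANA event queue follows; the sizes, throttle limit, and names are illustrative placeholders, not values taken from the driver.

```c
/* Illustrative sketch, not part of the commit. */
static void
example_eq_callback(void *context, struct gdma_queue *q,
    struct gdma_event *e)
{
	/* Handle the GDMA_EQE_* events posted to this EQ. */
}

static int
example_create_eq(struct gdma_dev *gd, struct ifnet *ndev,
    struct gdma_queue **eq_out)
{
	struct gdma_queue_spec spec = {};

	spec.type = GDMA_EQ;
	spec.monitor_avl_buf = false;
	spec.queue_size = 8 * PAGE_SIZE;  /* placeholder; >= PAGE_SIZE, power of two */
	spec.eq.callback = example_eq_callback;
	spec.eq.context = gd;
	spec.eq.log2_throttle_limit = 4;  /* placeholder */
	spec.eq.ndev = ndev;              /* the port ifnet, used when binding the IRQ */

	/* On success the EQ is created, IRQ-bound and tested. */
	return (mana_gd_create_mana_eq(gd, &spec, eq_out));
}
```

Teardown goes through mana_gd_destroy_queue(), which flushes and disables the EQ, releases its MSI-X slot and cleanup taskqueue, and frees the DMA region and backing memory.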
diff --git a/sys/dev/mana/gdma_main.c b/sys/dev/mana/gdma_main.c
new file mode 100644
index 000000000000..910992ce17a4
--- /dev/null
+++ b/sys/dev/mana/gdma_main.c
@@ -0,0 +1,1961 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/time.h>
+#include <sys/eventhandler.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <machine/in_cksum.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include "gdma_util.h"
+#include "mana.h"
+
+
+static mana_vendor_id_t mana_id_table[] = {
+ { PCI_VENDOR_ID_MICROSOFT, PCI_DEV_ID_MANA_VF},
+ /* Last entry */
+ { 0, 0}
+};
+
+static inline uint32_t
+mana_gd_r32(struct gdma_context *g, uint64_t offset)
+{
+ uint32_t v = bus_space_read_4(g->gd_bus.bar0_t,
+ g->gd_bus.bar0_h, offset);
+ rmb();
+ return (v);
+}
+
+#if defined(__amd64__)
+static inline uint64_t
+mana_gd_r64(struct gdma_context *g, uint64_t offset)
+{
+ uint64_t v = bus_space_read_8(g->gd_bus.bar0_t,
+ g->gd_bus.bar0_h, offset);
+ rmb();
+ return (v);
+}
+#else
+static inline uint64_t
+mana_gd_r64(struct gdma_context *g, uint64_t offset)
+{
+ uint64_t v;
+ uint32_t *vp = (uint32_t *)&v;
+
+ *vp = mana_gd_r32(g, offset);
+ *(vp + 1) = mana_gd_r32(g, offset + 4);
+ rmb();
+ return (v);
+}
+#endif
+
+static int
+mana_gd_query_max_resources(device_t dev)
+{
+ struct gdma_context *gc = device_get_softc(dev);
+ struct gdma_query_max_resources_resp resp = {};
+ struct gdma_general_req req = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
+ sizeof(req), sizeof(resp));
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "Failed to query resource info: %d, 0x%x\n",
+ err, resp.hdr.status);
+ return err ? err : EPROTO;
+ }
+
+ mana_dbg(NULL, "max_msix %u, max_eq %u, max_cq %u, "
+ "max_sq %u, max_rq %u\n",
+ resp.max_msix, resp.max_eq, resp.max_cq,
+ resp.max_sq, resp.max_rq);
+
+ if (gc->num_msix_usable > resp.max_msix)
+ gc->num_msix_usable = resp.max_msix;
+
+ if (gc->num_msix_usable <= 1)
+ return ENOSPC;
+
+ gc->max_num_queues = mp_ncpus;
+ if (gc->max_num_queues > MANA_MAX_NUM_QUEUES)
+ gc->max_num_queues = MANA_MAX_NUM_QUEUES;
+
+ if (gc->max_num_queues > resp.max_eq)
+ gc->max_num_queues = resp.max_eq;
+
+ if (gc->max_num_queues > resp.max_cq)
+ gc->max_num_queues = resp.max_cq;
+
+ if (gc->max_num_queues > resp.max_sq)
+ gc->max_num_queues = resp.max_sq;
+
+ if (gc->max_num_queues > resp.max_rq)
+ gc->max_num_queues = resp.max_rq;
+
+ return 0;
+}
+
+static int
+mana_gd_detect_devices(device_t dev)
+{
+ struct gdma_context *gc = device_get_softc(dev);
+ struct gdma_list_devices_resp resp = {};
+ struct gdma_general_req req = {};
+ struct gdma_dev_id gd_dev;
+ uint32_t i, max_num_devs;
+ uint16_t dev_type;
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req),
+ sizeof(resp));
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "Failed to detect devices: %d, 0x%x\n", err,
+ resp.hdr.status);
+ return err ? err : EPROTO;
+ }
+
+ max_num_devs = min_t(uint32_t, MAX_NUM_GDMA_DEVICES, resp.num_of_devs);
+
+ for (i = 0; i < max_num_devs; i++) {
+ gd_dev = resp.devs[i];
+ dev_type = gd_dev.type;
+
+ mana_dbg(NULL, "gdma dev %d, type %u\n",
+ i, dev_type);
+
+ /* HWC is already detected in mana_hwc_create_channel(). */
+ if (dev_type == GDMA_DEVICE_HWC)
+ continue;
+
+ if (dev_type == GDMA_DEVICE_MANA) {
+ gc->mana.gdma_context = gc;
+ gc->mana.dev_id = gd_dev;
+ }
+ }
+
+ return gc->mana.dev_id.type == 0 ? ENODEV : 0;
+}
+
+int
+mana_gd_send_request(struct gdma_context *gc, uint32_t req_len,
+ const void *req, uint32_t resp_len, void *resp)
+{
+ struct hw_channel_context *hwc = gc->hwc.driver_data;
+
+ return mana_hwc_send_request(hwc, req_len, req, resp_len, resp);
+}
+
+void
+mana_gd_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
+{
+ bus_addr_t *paddr = arg;
+
+ if (error)
+ return;
+
+ KASSERT(nseg == 1, ("too many segments %d!", nseg));
+ *paddr = segs->ds_addr;
+}
+
+int
+mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length,
+ struct gdma_mem_info *gmi)
+{
+ bus_addr_t dma_handle;
+ void *buf;
+ int err;
+
+ if (!gc || !gmi)
+ return EINVAL;
+
+ if (length < PAGE_SIZE || (length != roundup_pow_of_two(length)))
+ return EINVAL;
+
+ err = bus_dma_tag_create(bus_get_dma_tag(gc->dev), /* parent */
+ PAGE_SIZE, 0, /* alignment, boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ length, /* maxsize */
+ 1, /* nsegments */
+ length, /* maxsegsize */
+ 0, /* flags */
+ NULL, NULL, /* lockfunc, lockfuncarg*/
+ &gmi->dma_tag);
+ if (err) {
+ device_printf(gc->dev,
+ "failed to create dma tag, err: %d\n", err);
+ return (err);
+ }
+
+ /*
+ * Must have BUS_DMA_ZERO flag to clear the dma memory.
+ * Otherwise the queue overflow detection mechanism does
+ * not work.
+ */
+ err = bus_dmamem_alloc(gmi->dma_tag, &buf,
+ BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &gmi->dma_map);
+ if (err) {
+ device_printf(gc->dev,
+ "failed to alloc dma mem, err: %d\n", err);
+ bus_dma_tag_destroy(gmi->dma_tag);
+ return (err);
+ }
+
+ err = bus_dmamap_load(gmi->dma_tag, gmi->dma_map, buf,
+ length, mana_gd_dma_map_paddr, &dma_handle, BUS_DMA_NOWAIT);
+ if (err) {
+ device_printf(gc->dev,
+ "failed to load dma mem, err: %d\n", err);
+ bus_dmamem_free(gmi->dma_tag, buf, gmi->dma_map);
+ bus_dma_tag_destroy(gmi->dma_tag);
+ return (err);
+ }
+
+ gmi->dev = gc->dev;
+ gmi->dma_handle = dma_handle;
+ gmi->virt_addr = buf;
+ gmi->length = length;
+
+ return 0;
+}
+
+void
+mana_gd_free_memory(struct gdma_mem_info *gmi)
+{
+ bus_dmamap_unload(gmi->dma_tag, gmi->dma_map);
+ bus_dmamem_free(gmi->dma_tag, gmi->virt_addr, gmi->dma_map);
+ bus_dma_tag_destroy(gmi->dma_tag);
+}
+
+static int
+mana_gd_create_hw_eq(struct gdma_context *gc,
+ struct gdma_queue *queue)
+{
+ struct gdma_create_queue_resp resp = {};
+ struct gdma_create_queue_req req = {};
+ int err;
+
+ if (queue->type != GDMA_EQ)
+ return EINVAL;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_QUEUE,
+ sizeof(req), sizeof(resp));
+
+ req.hdr.dev_id = queue->gdma_dev->dev_id;
+ req.type = queue->type;
+ req.pdid = queue->gdma_dev->pdid;
+ req.doolbell_id = queue->gdma_dev->doorbell;
+ req.gdma_region = queue->mem_info.gdma_region;
+ req.queue_size = queue->queue_size;
+ req.log2_throttle_limit = queue->eq.log2_throttle_limit;
+ req.eq_pci_msix_index = queue->eq.msix_index;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "Failed to create queue: %d, 0x%x\n",
+ err, resp.hdr.status);
+ return err ? err : EPROTO;
+ }
+
+ queue->id = resp.queue_index;
+ queue->eq.disable_needed = true;
+ queue->mem_info.gdma_region = GDMA_INVALID_DMA_REGION;
+ return 0;
+}
+
+static int
+mana_gd_disable_queue(struct gdma_queue *queue)
+{
+ struct gdma_context *gc = queue->gdma_dev->gdma_context;
+ struct gdma_disable_queue_req req = {};
+ struct gdma_general_resp resp = {};
+ int err;
+
+ if (queue->type != GDMA_EQ)
+ mana_warn(NULL, "Not event queue type 0x%x\n",
+ queue->type);
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_DISABLE_QUEUE,
+ sizeof(req), sizeof(resp));
+
+ req.hdr.dev_id = queue->gdma_dev->dev_id;
+ req.type = queue->type;
+ req.queue_index = queue->id;
+ req.alloc_res_id_on_creation = 1;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "Failed to disable queue: %d, 0x%x\n", err,
+ resp.hdr.status);
+ return err ? err : EPROTO;
+ }
+
+ return 0;
+}
+
+#define DOORBELL_OFFSET_SQ 0x0
+#define DOORBELL_OFFSET_RQ 0x400
+#define DOORBELL_OFFSET_CQ 0x800
+#define DOORBELL_OFFSET_EQ 0xFF8
+
+static void
+mana_gd_ring_doorbell(struct gdma_context *gc, uint32_t db_index,
+ enum gdma_queue_type q_type, uint32_t qid,
+ uint32_t tail_ptr, uint8_t num_req)
+{
+ union gdma_doorbell_entry e = {};
+ void __iomem *addr;
+
+ addr = (char *)gc->db_page_base + gc->db_page_size * db_index;
+ switch (q_type) {
+ case GDMA_EQ:
+ e.eq.id = qid;
+ e.eq.tail_ptr = tail_ptr;
+ e.eq.arm = num_req;
+
+ addr = (char *)addr + DOORBELL_OFFSET_EQ;
+ break;
+
+ case GDMA_CQ:
+ e.cq.id = qid;
+ e.cq.tail_ptr = tail_ptr;
+ e.cq.arm = num_req;
+
+ addr = (char *)addr + DOORBELL_OFFSET_CQ;
+ break;
+
+ case GDMA_RQ:
+ e.rq.id = qid;
+ e.rq.tail_ptr = tail_ptr;
+ e.rq.wqe_cnt = num_req;
+
+ addr = (char *)addr + DOORBELL_OFFSET_RQ;
+ break;
+
+ case GDMA_SQ:
+ e.sq.id = qid;
+ e.sq.tail_ptr = tail_ptr;
+
+ addr = (char *)addr + DOORBELL_OFFSET_SQ;
+ break;
+
+ default:
+ mana_warn(NULL, "Invalid queue type 0x%x\n", q_type);
+ return;
+ }
+
+ /* Ensure all writes are done before ringing the doorbell */
+ wmb();
+
+#if defined(__amd64__)
+ writeq(addr, e.as_uint64);
+#else
+ uint32_t *p = (uint32_t *)&e.as_uint64;
+ writel(addr, *p);
+ writel((char *)addr + 4, *(p + 1));
+#endif
+}
+
+void
+mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue)
+{
+ mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type,
+ queue->id, queue->head * GDMA_WQE_BU_SIZE, 1);
+}
+
+void
+mana_gd_arm_cq(struct gdma_queue *cq)
+{
+ struct gdma_context *gc = cq->gdma_dev->gdma_context;
+
+ uint32_t num_cqe = cq->queue_size / GDMA_CQE_SIZE;
+
+ uint32_t head = cq->head % (num_cqe << GDMA_CQE_OWNER_BITS);
+
+ mana_gd_ring_doorbell(gc, cq->gdma_dev->doorbell, cq->type, cq->id,
+ head, SET_ARM_BIT);
+}
+
+static void
+mana_gd_process_eqe(struct gdma_queue *eq)
+{
+ uint32_t head = eq->head % (eq->queue_size / GDMA_EQE_SIZE);
+ struct gdma_context *gc = eq->gdma_dev->gdma_context;
+ struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr;
+ union gdma_eqe_info eqe_info;
+ enum gdma_eqe_type type;
+ struct gdma_event event;
+ struct gdma_queue *cq;
+ struct gdma_eqe *eqe;
+ uint32_t cq_id;
+
+ eqe = &eq_eqe_ptr[head];
+ eqe_info.as_uint32 = eqe->eqe_info;
+ type = eqe_info.type;
+
+ switch (type) {
+ case GDMA_EQE_COMPLETION:
+ cq_id = eqe->details[0] & 0xFFFFFF;
+ if (cq_id >= gc->max_num_cqs) {
+ mana_warn(NULL,
+ "failed: cq_id %u > max_num_cqs %u\n",
+ cq_id, gc->max_num_cqs);
+ break;
+ }
+
+ cq = gc->cq_table[cq_id];
+ if (!cq || cq->type != GDMA_CQ || cq->id != cq_id) {
+ mana_warn(NULL,
+ "failed: invalid cq_id %u\n", cq_id);
+ break;
+ }
+
+ if (cq->cq.callback)
+ cq->cq.callback(cq->cq.context, cq);
+
+ break;
+
+ case GDMA_EQE_TEST_EVENT:
+ gc->test_event_eq_id = eq->id;
+
+ mana_dbg(NULL,
+ "EQE TEST EVENT received for EQ %u\n", eq->id);
+
+ complete(&gc->eq_test_event);
+ break;
+
+ case GDMA_EQE_HWC_INIT_EQ_ID_DB:
+ case GDMA_EQE_HWC_INIT_DATA:
+ case GDMA_EQE_HWC_INIT_DONE:
+ if (!eq->eq.callback)
+ break;
+
+ event.type = type;
+ memcpy(&event.details, &eqe->details, GDMA_EVENT_DATA_SIZE);
+ eq->eq.callback(eq->eq.context, eq, &event);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void
+mana_gd_process_eq_events(void *arg)
+{
+ uint32_t owner_bits, new_bits, old_bits;
+ union gdma_eqe_info eqe_info;
+ struct gdma_eqe *eq_eqe_ptr;
+ struct gdma_queue *eq = arg;
+ struct gdma_context *gc;
+ uint32_t head, num_eqe;
+ struct gdma_eqe *eqe;
+ unsigned int arm_bit;
+ int i, j;
+
+ gc = eq->gdma_dev->gdma_context;
+
+ num_eqe = eq->queue_size / GDMA_EQE_SIZE;
+ eq_eqe_ptr = eq->queue_mem_ptr;
+
+ bus_dmamap_sync(eq->mem_info.dma_tag, eq->mem_info.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ /* Process up to 5 EQEs at a time, and update the HW head. */
+ for (i = 0; i < 5; i++) {
+ eqe = &eq_eqe_ptr[eq->head % num_eqe];
+ eqe_info.as_uint32 = eqe->eqe_info;
+ owner_bits = eqe_info.owner_bits;
+
+ old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK;
+
+ /* No more entries */
+ if (owner_bits == old_bits)
+ break;
+
+ new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK;
+ if (owner_bits != new_bits) {
+ /* Something is wrong. Log it for debugging purposes. */
+ device_printf(gc->dev,
+ "EQ %d: overflow detected, "
+ "i = %d, eq->head = %u "
+ "got owner_bits = %u, new_bits = %u "
+ "eqe addr %p, eqe->eqe_info 0x%x, "
+ "eqe type = %x, reserved1 = %x, client_id = %x, "
+ "reserved2 = %x, owner_bits = %x\n",
+ eq->id, i, eq->head,
+ owner_bits, new_bits,
+ eqe, eqe->eqe_info,
+ eqe_info.type, eqe_info.reserved1,
+ eqe_info.client_id, eqe_info.reserved2,
+ eqe_info.owner_bits);
+
+ uint32_t *eqe_dump = (uint32_t *) eq_eqe_ptr;
+ for (j = 0; j < 20; j++) {
+ device_printf(gc->dev, "%p: %x\t%x\t%x\t%x\n",
+ &eqe_dump[j * 4], eqe_dump[j * 4], eqe_dump[j * 4 + 1],
+ eqe_dump[j * 4 + 2], eqe_dump[j * 4 + 3]);
+ }
+ break;
+ }
+
+ mana_gd_process_eqe(eq);
+
+ eq->head++;
+ }
+
+ bus_dmamap_sync(eq->mem_info.dma_tag, eq->mem_info.dma_map,
+ BUS_DMASYNC_PREREAD);
+
+ /* Always rearm the EQ for HWC. */
+ if (mana_gd_is_hwc(eq->gdma_dev)) {
+ arm_bit = SET_ARM_BIT;
+ } else if (eq->eq.work_done < eq->eq.budget &&
+ eq->eq.do_not_ring_db == false) {
+ arm_bit = SET_ARM_BIT;
+ } else {
+ arm_bit = 0;
+ }
+
+ head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS);
+
+ mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq->id,
+ head, arm_bit);
+}
+
+#define MANA_POLL_BUDGET 8
+#define MANA_RX_BUDGET 256
+
+static void
+mana_poll(void *arg, int pending)
+{
+ struct gdma_queue *eq = arg;
+ int i;
+
+ eq->eq.work_done = 0;
+ eq->eq.budget = MANA_RX_BUDGET;
+
+ for (i = 0; i < MANA_POLL_BUDGET; i++) {
+ /*
+ * If this is the last iteration, set the budget large enough
+ * that the EQ gets armed in any case.
+ */
+ if (i == (MANA_POLL_BUDGET - 1))
+ eq->eq.budget = CQE_POLLING_BUFFER + 1;
+
+ mana_gd_process_eq_events(eq);
+
+ if (eq->eq.work_done < eq->eq.budget)
+ break;
+
+ eq->eq.work_done = 0;
+ }
+}
+
+static void
+mana_gd_schedule_task(void *arg)
+{
+ struct gdma_queue *eq = arg;
+
+ taskqueue_enqueue(eq->eq.cleanup_tq, &eq->eq.cleanup_task);
+}
+
+static int
+mana_gd_register_irq(struct gdma_queue *queue,
+ const struct gdma_queue_spec *spec)
+{
+ static int mana_last_bind_cpu = -1;
+ struct gdma_dev *gd = queue->gdma_dev;
+ bool is_mana = mana_gd_is_mana(gd);
+ struct gdma_irq_context *gic;
+ struct gdma_context *gc;
+ struct gdma_resource *r;
+ unsigned int msi_index;
+ int err;
+
+ gc = gd->gdma_context;
+ r = &gc->msix_resource;
+
+ mtx_lock_spin(&r->lock_spin);
+
+ msi_index = find_first_zero_bit(r->map, r->size);
+ if (msi_index >= r->size) {
+ err = ENOSPC;
+ } else {
+ bitmap_set(r->map, msi_index, 1);
+ queue->eq.msix_index = msi_index;
+ err = 0;
+ }
+
+ mtx_unlock_spin(&r->lock_spin);
+
+ if (err)
+ return err;
+
+ if (unlikely(msi_index >= gc->num_msix_usable)) {
+ device_printf(gc->dev,
+ "chose an invalid msix index %d, usable %d\n",
+ msi_index, gc->num_msix_usable);
+ return ENOSPC;
+ }
+
+ gic = &gc->irq_contexts[msi_index];
+
+ if (is_mana) {
+ struct mana_port_context *apc = if_getsoftc(spec->eq.ndev);
+ queue->eq.do_not_ring_db = false;
+
+ NET_TASK_INIT(&queue->eq.cleanup_task, 0, mana_poll, queue);
+ queue->eq.cleanup_tq =
+ taskqueue_create_fast("mana eq cleanup",
+ M_WAITOK, taskqueue_thread_enqueue,
+ &queue->eq.cleanup_tq);
+
+ if (mana_last_bind_cpu < 0)
+ mana_last_bind_cpu = CPU_FIRST();
+ queue->eq.cpu = mana_last_bind_cpu;
+ mana_last_bind_cpu = CPU_NEXT(mana_last_bind_cpu);
+
+ /* XXX The thread name is not ideal. However, the taskqueue
+ * threads must be started here; otherwise the EQ test
+ * will have no handler.
+ */
+ if (apc->bind_cleanup_thread_cpu) {
+ cpuset_t cpu_mask;
+ CPU_SETOF(queue->eq.cpu, &cpu_mask);
+ taskqueue_start_threads_cpuset(&queue->eq.cleanup_tq,
+ 1, PI_NET, &cpu_mask,
+ "mana eq poll msix %u on cpu %d",
+ msi_index, queue->eq.cpu);
+ } else {
+
+ taskqueue_start_threads(&queue->eq.cleanup_tq, 1,
+ PI_NET, "mana eq poll on msix %u", msi_index);
+ }
+ }
+
+ if (unlikely(gic->handler || gic->arg)) {
+ device_printf(gc->dev,
+ "interrupt handler or arg already assigned, "
+ "msix index: %d\n", msi_index);
+ }
+
+ gic->arg = queue;
+
+ if (is_mana)
+ gic->handler = mana_gd_schedule_task;
+ else
+ gic->handler = mana_gd_process_eq_events;
+
+ mana_dbg(NULL, "registered msix index %d vector %d irq %ju\n",
+ msi_index, gic->msix_e.vector, rman_get_start(gic->res));
+
+ return 0;
+}
+
+static void
+mana_gd_deregiser_irq(struct gdma_queue *queue)
+{
+ struct gdma_dev *gd = queue->gdma_dev;
+ struct gdma_irq_context *gic;
+ struct gdma_context *gc;
+ struct gdma_resource *r;
+ unsigned int msix_index;
+
+ gc = gd->gdma_context;
+ r = &gc->msix_resource;
+
+ /* At most mp_ncpus + 1 interrupts are used. */
+ msix_index = queue->eq.msix_index;
+ if (unlikely(msix_index >= gc->num_msix_usable))
+ return;
+
+ gic = &gc->irq_contexts[msix_index];
+ gic->handler = NULL;
+ gic->arg = NULL;
+
+ mtx_lock_spin(&r->lock_spin);
+ bitmap_clear(r->map, msix_index, 1);
+ mtx_unlock_spin(&r->lock_spin);
+
+ queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
+
+ mana_dbg(NULL, "deregistered msix index %d vector %d irq %ju\n",
+ msix_index, gic->msix_e.vector, rman_get_start(gic->res));
+}
+
+int
+mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq)
+{
+ struct gdma_generate_test_event_req req = {};
+ struct gdma_general_resp resp = {};
+ device_t dev = gc->dev;
+ int err;
+
+ sx_xlock(&gc->eq_test_event_sx);
+
+ init_completion(&gc->eq_test_event);
+ gc->test_event_eq_id = INVALID_QUEUE_ID;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_GENERATE_TEST_EQE,
+ sizeof(req), sizeof(resp));
+
+ req.hdr.dev_id = eq->gdma_dev->dev_id;
+ req.queue_index = eq->id;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req,
+ sizeof(resp), &resp);
+ if (err) {
+ device_printf(dev, "test_eq failed: %d\n", err);
+ goto out;
+ }
+
+ err = EPROTO;
+
+ if (resp.hdr.status) {
+ device_printf(dev, "test_eq failed: 0x%x\n",
+ resp.hdr.status);
+ goto out;
+ }
+
+ if (wait_for_completion_timeout(&gc->eq_test_event, 30 * hz)) {
+ device_printf(dev, "test_eq timed out on queue %d\n",
+ eq->id);
+ goto out;
+ }
+
+ if (eq->id != gc->test_event_eq_id) {
+ device_printf(dev,
+ "test_eq got an event on wrong queue %d (%d)\n",
+ gc->test_event_eq_id, eq->id);
+ goto out;
+ }
+
+ err = 0;
+out:
+ sx_xunlock(&gc->eq_test_event_sx);
+ return err;
+}
+
+static void
+mana_gd_destroy_eq(struct gdma_context *gc, bool flush_events,
+ struct gdma_queue *queue)
+{
+ int err;
+
+ if (flush_events) {
+ err = mana_gd_test_eq(gc, queue);
+ if (err)
+ device_printf(gc->dev,
+ "Failed to flush EQ: %d\n", err);
+ }
+
+ mana_gd_deregiser_irq(queue);
+
+ if (mana_gd_is_mana(queue->gdma_dev)) {
+ while (taskqueue_cancel(queue->eq.cleanup_tq,
+ &queue->eq.cleanup_task, NULL))
+ taskqueue_drain(queue->eq.cleanup_tq,
+ &queue->eq.cleanup_task);
+
+ taskqueue_free(queue->eq.cleanup_tq);
+ }
+
+ if (queue->eq.disable_needed)
+ mana_gd_disable_queue(queue);
+}
+
+static int mana_gd_create_eq(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ bool create_hwq, struct gdma_queue *queue)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ device_t dev = gc->dev;
+ uint32_t log2_num_entries;
+ int err;
+
+ queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
+
+ log2_num_entries = ilog2(queue->queue_size / GDMA_EQE_SIZE);
+
+ if (spec->eq.log2_throttle_limit > log2_num_entries) {
+ device_printf(dev,
+ "EQ throttling limit (%lu) > maximum EQE (%u)\n",
+ spec->eq.log2_throttle_limit, log2_num_entries);
+ return EINVAL;
+ }
+
+ err = mana_gd_register_irq(queue, spec);
+ if (err) {
+ device_printf(dev, "Failed to register irq: %d\n", err);
+ return err;
+ }
+
+ queue->eq.callback = spec->eq.callback;
+ queue->eq.context = spec->eq.context;
+ queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries);
+ queue->eq.log2_throttle_limit = spec->eq.log2_throttle_limit ?: 1;
+
+ if (create_hwq) {
+ err = mana_gd_create_hw_eq(gc, queue);
+ if (err)
+ goto out;
+
+ err = mana_gd_test_eq(gc, queue);
+ if (err)
+ goto out;
+ }
+
+ return 0;
+out:
+ device_printf(dev, "Failed to create EQ: %d\n", err);
+ mana_gd_destroy_eq(gc, false, queue);
+ return err;
+}
+
+static void
+mana_gd_create_cq(const struct gdma_queue_spec *spec,
+ struct gdma_queue *queue)
+{
+ uint32_t log2_num_entries = ilog2(spec->queue_size / GDMA_CQE_SIZE);
+
+ queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries);
+ queue->cq.parent = spec->cq.parent_eq;
+ queue->cq.context = spec->cq.context;
+ queue->cq.callback = spec->cq.callback;
+}
+
+static void
+mana_gd_destroy_cq(struct gdma_context *gc,
+ struct gdma_queue *queue)
+{
+ uint32_t id = queue->id;
+
+ if (id >= gc->max_num_cqs)
+ return;
+
+ if (!gc->cq_table[id])
+ return;
+
+ gc->cq_table[id] = NULL;
+}
+
+int mana_gd_create_hwc_queue(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ struct gdma_queue **queue_ptr)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ struct gdma_mem_info *gmi;
+ struct gdma_queue *queue;
+ int err;
+
+ queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!queue)
+ return ENOMEM;
+
+ gmi = &queue->mem_info;
+ err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
+ if (err)
+ goto free_q;
+
+ queue->head = 0;
+ queue->tail = 0;
+ queue->queue_mem_ptr = gmi->virt_addr;
+ queue->queue_size = spec->queue_size;
+ queue->monitor_avl_buf = spec->monitor_avl_buf;
+ queue->type = spec->type;
+ queue->gdma_dev = gd;
+
+ if (spec->type == GDMA_EQ)
+ err = mana_gd_create_eq(gd, spec, false, queue);
+ else if (spec->type == GDMA_CQ)
+ mana_gd_create_cq(spec, queue);
+
+ if (err)
+ goto out;
+
+ *queue_ptr = queue;
+ return 0;
+out:
+ mana_gd_free_memory(gmi);
+free_q:
+ free(queue, M_DEVBUF);
+ return err;
+}
+
+static void
+mana_gd_destroy_dma_region(struct gdma_context *gc, uint64_t gdma_region)
+{
+ struct gdma_destroy_dma_region_req req = {};
+ struct gdma_general_resp resp = {};
+ int err;
+
+ if (gdma_region == GDMA_INVALID_DMA_REGION)
+ return;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DMA_REGION, sizeof(req),
+ sizeof(resp));
+ req.gdma_region = gdma_region;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
+ &resp);
+ if (err || resp.hdr.status)
+ device_printf(gc->dev,
+ "Failed to destroy DMA region: %d, 0x%x\n",
+ err, resp.hdr.status);
+}
+
+static int
+mana_gd_create_dma_region(struct gdma_dev *gd,
+ struct gdma_mem_info *gmi)
+{
+ unsigned int num_page = gmi->length / PAGE_SIZE;
+ struct gdma_create_dma_region_req *req = NULL;
+ struct gdma_create_dma_region_resp resp = {};
+ struct gdma_context *gc = gd->gdma_context;
+ struct hw_channel_context *hwc;
+ uint32_t length = gmi->length;
+ uint32_t req_msg_size;
+ int err;
+ int i;
+
+ if (length < PAGE_SIZE || !is_power_of_2(length)) {
+ mana_err(NULL, "gmi size incorrect: %u\n", length);
+ return EINVAL;
+ }
+
+ if (offset_in_page((uint64_t)gmi->virt_addr) != 0) {
+ mana_err(NULL, "gmi not page aligned: %p\n",
+ gmi->virt_addr);
+ return EINVAL;
+ }
+
+ hwc = gc->hwc.driver_data;
+ req_msg_size = sizeof(*req) + num_page * sizeof(uint64_t);
+ if (req_msg_size > hwc->max_req_msg_size) {
+ mana_err(NULL, "req msg size too large: %u, %u\n",
+ req_msg_size, hwc->max_req_msg_size);
+ return EINVAL;
+ }
+
+ req = malloc(req_msg_size, M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!req)
+ return ENOMEM;
+
+ mana_gd_init_req_hdr(&req->hdr, GDMA_CREATE_DMA_REGION,
+ req_msg_size, sizeof(resp));
+ req->length = length;
+ req->offset_in_page = 0;
+ req->gdma_page_type = GDMA_PAGE_TYPE_4K;
+ req->page_count = num_page;
+ req->page_addr_list_len = num_page;
+
+ for (i = 0; i < num_page; i++)
+ req->page_addr_list[i] = gmi->dma_handle + i * PAGE_SIZE;
+
+ err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp);
+ if (err)
+ goto out;
+
+ if (resp.hdr.status || resp.gdma_region == GDMA_INVALID_DMA_REGION) {
+ device_printf(gc->dev, "Failed to create DMA region: 0x%x\n",
+ resp.hdr.status);
+ err = EPROTO;
+ goto out;
+ }
+
+ gmi->gdma_region = resp.gdma_region;
+out:
+ free(req, M_DEVBUF);
+ return err;
+}
+
+int
+mana_gd_create_mana_eq(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ struct gdma_queue **queue_ptr)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ struct gdma_mem_info *gmi;
+ struct gdma_queue *queue;
+ int err;
+
+ if (spec->type != GDMA_EQ)
+ return EINVAL;
+
+ queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!queue)
+ return ENOMEM;
+
+ gmi = &queue->mem_info;
+ err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
+ if (err)
+ goto free_q;
+
+ err = mana_gd_create_dma_region(gd, gmi);
+ if (err)
+ goto out;
+
+ queue->head = 0;
+ queue->tail = 0;
+ queue->queue_mem_ptr = gmi->virt_addr;
+ queue->queue_size = spec->queue_size;
+ queue->monitor_avl_buf = spec->monitor_avl_buf;
+ queue->type = spec->type;
+ queue->gdma_dev = gd;
+
+ err = mana_gd_create_eq(gd, spec, true, queue);
+ if (err)
+ goto out;
+
+ *queue_ptr = queue;
+ return 0;
+
+out:
+ mana_gd_free_memory(gmi);
+free_q:
+ free(queue, M_DEVBUF);
+ return err;
+}
+
+int mana_gd_create_mana_wq_cq(struct gdma_dev *gd,
+ const struct gdma_queue_spec *spec,
+ struct gdma_queue **queue_ptr)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ struct gdma_mem_info *gmi;
+ struct gdma_queue *queue;
+ int err;
+
+ if (spec->type != GDMA_CQ && spec->type != GDMA_SQ &&
+ spec->type != GDMA_RQ)
+ return EINVAL;
+
+ queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!queue)
+ return ENOMEM;
+
+ gmi = &queue->mem_info;
+ err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
+ if (err)
+ goto free_q;
+
+ err = mana_gd_create_dma_region(gd, gmi);
+ if (err)
+ goto out;
+
+ queue->head = 0;
+ queue->tail = 0;
+ queue->queue_mem_ptr = gmi->virt_addr;
+ queue->queue_size = spec->queue_size;
+ queue->monitor_avl_buf = spec->monitor_avl_buf;
+ queue->type = spec->type;
+ queue->gdma_dev = gd;
+
+ if (spec->type == GDMA_CQ)
+ mana_gd_create_cq(spec, queue);
+
+ *queue_ptr = queue;
+ return 0;
+
+out:
+ mana_gd_free_memory(gmi);
+free_q:
+ free(queue, M_DEVBUF);
+ return err;
+}
+
+void
+mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue)
+{
+ struct gdma_mem_info *gmi = &queue->mem_info;
+
+ switch (queue->type) {
+ case GDMA_EQ:
+ mana_gd_destroy_eq(gc, queue->eq.disable_needed, queue);
+ break;
+
+ case GDMA_CQ:
+ mana_gd_destroy_cq(gc, queue);
+ break;
+
+ case GDMA_RQ:
+ break;
+
+ case GDMA_SQ:
+ break;
+
+ default:
+ device_printf(gc->dev,
+ "Can't destroy unknown queue: type = %d\n",
+ queue->type);
+ return;
+ }
+
+ mana_gd_destroy_dma_region(gc, gmi->gdma_region);
+ mana_gd_free_memory(gmi);
+ free(queue, M_DEVBUF);
+}
+
+int
+mana_gd_verify_vf_version(device_t dev)
+{
+ struct gdma_context *gc = device_get_softc(dev);
+ struct gdma_verify_ver_resp resp = {};
+ struct gdma_verify_ver_req req = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION,
+ sizeof(req), sizeof(resp));
+
+ req.protocol_ver_min = GDMA_PROTOCOL_FIRST;
+ req.protocol_ver_max = GDMA_PROTOCOL_LAST;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "VfVerifyVersionOutput: %d, status=0x%x\n",
+ err, resp.hdr.status);
+ return err ? err : EPROTO;
+ }
+
+ return 0;
+}
+
+int
+mana_gd_register_device(struct gdma_dev *gd)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ struct gdma_register_device_resp resp = {};
+ struct gdma_general_req req = {};
+ int err;
+
+ gd->pdid = INVALID_PDID;
+ gd->doorbell = INVALID_DOORBELL;
+ gd->gpa_mkey = INVALID_MEM_KEY;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_REGISTER_DEVICE, sizeof(req),
+ sizeof(resp));
+
+ req.hdr.dev_id = gd->dev_id;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "gdma_register_device_resp failed: %d, 0x%x\n",
+ err, resp.hdr.status);
+ return err ? err : EPROTO;
+ }
+
+ gd->pdid = resp.pdid;
+ gd->gpa_mkey = resp.gpa_mkey;
+ gd->doorbell = resp.db_id;
+
+ mana_dbg(NULL, "mana device pdid %u, gpa_mkey %u, doorbell %u \n",
+ gd->pdid, gd->gpa_mkey, gd->doorbell);
+
+ return 0;
+}
+
+int
+mana_gd_deregister_device(struct gdma_dev *gd)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ struct gdma_general_resp resp = {};
+ struct gdma_general_req req = {};
+ int err;
+
+ if (gd->pdid == INVALID_PDID)
+ return EINVAL;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
+ sizeof(resp));
+
+ req.hdr.dev_id = gd->dev_id;
+
+ err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+ if (err || resp.hdr.status) {
+ device_printf(gc->dev,
+ "Failed to deregister device: %d, 0x%x\n",
+ err, resp.hdr.status);
+ if (!err)
+ err = EPROTO;
+ }
+
+ gd->pdid = INVALID_PDID;
+ gd->doorbell = INVALID_DOORBELL;
+ gd->gpa_mkey = INVALID_MEM_KEY;
+
+ return err;
+}
+
+uint32_t
+mana_gd_wq_avail_space(struct gdma_queue *wq)
+{
+ uint32_t used_space = (wq->head - wq->tail) * GDMA_WQE_BU_SIZE;
+ uint32_t wq_size = wq->queue_size;
+
+ if (used_space > wq_size) {
+ mana_warn(NULL, "failed: used space %u > queue size %u\n",
+ used_space, wq_size);
+ }
+
+ return wq_size - used_space;
+}
+
+uint8_t *
+mana_gd_get_wqe_ptr(const struct gdma_queue *wq, uint32_t wqe_offset)
+{
+ uint32_t offset =
+ (wqe_offset * GDMA_WQE_BU_SIZE) & (wq->queue_size - 1);
+
+ if ((offset + GDMA_WQE_BU_SIZE) > wq->queue_size) {
+ mana_warn(NULL, "failed: write end out of queue bound %u, "
+ "queue size %u\n",
+ offset + GDMA_WQE_BU_SIZE, wq->queue_size);
+ }
+
+ return (uint8_t *)wq->queue_mem_ptr + offset;
+}
+
+static uint32_t
+mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req,
+ enum gdma_queue_type q_type,
+ uint32_t client_oob_size, uint32_t sgl_data_size,
+ uint8_t *wqe_ptr)
+{
+ bool oob_in_sgl = !!(wqe_req->flags & GDMA_WR_OOB_IN_SGL);
+ bool pad_data = !!(wqe_req->flags & GDMA_WR_PAD_BY_SGE0);
+ struct gdma_wqe *header = (struct gdma_wqe *)wqe_ptr;
+ uint8_t *ptr;
+
+ memset(header, 0, sizeof(struct gdma_wqe));
+ header->num_sge = wqe_req->num_sge;
+ header->inline_oob_size_div4 = client_oob_size / sizeof(uint32_t);
+
+ if (oob_in_sgl) {
+ if (!pad_data || wqe_req->num_sge < 2) {
+ mana_warn(NULL, "no pad_data or num_sge < 2\n");
+ }
+
+ header->client_oob_in_sgl = 1;
+
+ if (pad_data)
+ header->last_vbytes = wqe_req->sgl[0].size;
+ }
+
+ if (q_type == GDMA_SQ)
+ header->client_data_unit = wqe_req->client_data_unit;
+
+ /*
+ * The size of gdma_wqe + client_oob_size must be less than or equal
+ * to one Basic Unit (i.e. 32 bytes), so the pointer can't go beyond
+ * the queue memory buffer boundary.
+ */
+ ptr = wqe_ptr + sizeof(header);
+
+ if (wqe_req->inline_oob_data && wqe_req->inline_oob_size > 0) {
+ memcpy(ptr, wqe_req->inline_oob_data, wqe_req->inline_oob_size);
+
+ if (client_oob_size > wqe_req->inline_oob_size)
+ memset(ptr + wqe_req->inline_oob_size, 0,
+ client_oob_size - wqe_req->inline_oob_size);
+ }
+
+ return sizeof(header) + client_oob_size;
+}
+
+static void
+mana_gd_write_sgl(struct gdma_queue *wq, uint8_t *wqe_ptr,
+ const struct gdma_wqe_request *wqe_req)
+{
+ uint32_t sgl_size = sizeof(struct gdma_sge) * wqe_req->num_sge;
+ const uint8_t *address = (uint8_t *)wqe_req->sgl;
+ uint8_t *base_ptr, *end_ptr;
+ uint32_t size_to_end;
+
+ base_ptr = wq->queue_mem_ptr;
+ end_ptr = base_ptr + wq->queue_size;
+ size_to_end = (uint32_t)(end_ptr - wqe_ptr);
+
+ if (size_to_end < sgl_size) {
+ memcpy(wqe_ptr, address, size_to_end);
+
+ wqe_ptr = base_ptr;
+ address += size_to_end;
+ sgl_size -= size_to_end;
+ }
+
+ memcpy(wqe_ptr, address, sgl_size);
+}
+
+int
+mana_gd_post_work_request(struct gdma_queue *wq,
+ const struct gdma_wqe_request *wqe_req,
+ struct gdma_posted_wqe_info *wqe_info)
+{
+ uint32_t client_oob_size = wqe_req->inline_oob_size;
+ struct gdma_context *gc;
+ uint32_t sgl_data_size;
+ uint32_t max_wqe_size;
+ uint32_t wqe_size;
+ uint8_t *wqe_ptr;
+
+ if (wqe_req->num_sge == 0)
+ return EINVAL;
+
+ if (wq->type == GDMA_RQ) {
+ if (client_oob_size != 0)
+ return EINVAL;
+
+ client_oob_size = INLINE_OOB_SMALL_SIZE;
+
+ max_wqe_size = GDMA_MAX_RQE_SIZE;
+ } else {
+ if (client_oob_size != INLINE_OOB_SMALL_SIZE &&
+ client_oob_size != INLINE_OOB_LARGE_SIZE)
+ return EINVAL;
+
+ max_wqe_size = GDMA_MAX_SQE_SIZE;
+ }
+
+ sgl_data_size = sizeof(struct gdma_sge) * wqe_req->num_sge;
+ wqe_size = ALIGN(sizeof(struct gdma_wqe) + client_oob_size +
+ sgl_data_size, GDMA_WQE_BU_SIZE);
+ if (wqe_size > max_wqe_size)
+ return EINVAL;
+
+ if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq)) {
+ gc = wq->gdma_dev->gdma_context;
+ device_printf(gc->dev, "unsuccessful flow control!\n");
+ return ENOSPC;
+ }
+
+ if (wqe_info)
+ wqe_info->wqe_size_in_bu = wqe_size / GDMA_WQE_BU_SIZE;
+
+ wqe_ptr = mana_gd_get_wqe_ptr(wq, wq->head);
+ wqe_ptr += mana_gd_write_client_oob(wqe_req, wq->type, client_oob_size,
+ sgl_data_size, wqe_ptr);
+ if (wqe_ptr >= (uint8_t *)wq->queue_mem_ptr + wq->queue_size)
+ wqe_ptr -= wq->queue_size;
+
+ mana_gd_write_sgl(wq, wqe_ptr, wqe_req);
+
+ wq->head += wqe_size / GDMA_WQE_BU_SIZE;
+
+ bus_dmamap_sync(wq->mem_info.dma_tag, wq->mem_info.dma_map,
+ BUS_DMASYNC_PREWRITE);
+
+ return 0;
+}
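As a worked example of the size checks above: with the small inline OOB and two SGEs, a send WQE takes sizeof(struct gdma_wqe) + 8 + 2 * 16 = 48 bytes, which rounds up to 64 bytes, i.e. wqe_size_in_bu == 2; with the small OOB, at most 31 SGEs fit under GDMA_MAX_SQE_SIZE, since 8 + 8 + 31 * 16 = 512. A sketch of the computation (hypothetical helper name):

```c
/* Illustrative sketch, not part of the commit. */
static inline uint32_t
example_sq_wqe_size(uint32_t num_sge, uint32_t inline_oob_size)
{
	/* e.g. num_sge = 2, inline_oob_size = 8:  ALIGN(48, 32) == 64 */
	return (ALIGN(sizeof(struct gdma_wqe) + inline_oob_size +
	    num_sge * sizeof(struct gdma_sge), GDMA_WQE_BU_SIZE));
}
```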
+
+int
+mana_gd_post_and_ring(struct gdma_queue *queue,
+ const struct gdma_wqe_request *wqe_req,
+ struct gdma_posted_wqe_info *wqe_info)
+{
+ struct gdma_context *gc = queue->gdma_dev->gdma_context;
+ int err;
+
+ err = mana_gd_post_work_request(queue, wqe_req, wqe_info);
+ if (err)
+ return err;
+
+ mana_gd_wq_ring_doorbell(gc, queue);
+
+ return 0;
+}
+
+static int
+mana_gd_read_cqe(struct gdma_queue *cq, struct gdma_comp *comp)
+{
+ unsigned int num_cqe = cq->queue_size / sizeof(struct gdma_cqe);
+ struct gdma_cqe *cq_cqe = cq->queue_mem_ptr;
+ uint32_t owner_bits, new_bits, old_bits;
+ struct gdma_cqe *cqe;
+
+ cqe = &cq_cqe[cq->head % num_cqe];
+ owner_bits = cqe->cqe_info.owner_bits;
+
+ old_bits = (cq->head / num_cqe - 1) & GDMA_CQE_OWNER_MASK;
+ /* Return 0 if no more entries. */
+ if (owner_bits == old_bits)
+ return 0;
+
+ new_bits = (cq->head / num_cqe) & GDMA_CQE_OWNER_MASK;
+ /* Return -1 if overflow detected. */
+ if (owner_bits != new_bits)
+ return -1;
+
+ comp->wq_num = cqe->cqe_info.wq_num;
+ comp->is_sq = cqe->cqe_info.is_sq;
+ memcpy(comp->cqe_data, cqe->cqe_data, GDMA_COMP_DATA_SIZE);
+
+ return 1;
+}
+
+int
+mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe)
+{
+ int cqe_idx;
+ int ret;
+
+ bus_dmamap_sync(cq->mem_info.dma_tag, cq->mem_info.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ for (cqe_idx = 0; cqe_idx < num_cqe; cqe_idx++) {
+ ret = mana_gd_read_cqe(cq, &comp[cqe_idx]);
+
+ if (ret < 0) {
+ cq->head -= cqe_idx;
+ return ret;
+ }
+
+ if (ret == 0)
+ break;
+
+ cq->head++;
+ }
+
+ return cqe_idx;
+}
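For reference, a typical CQ consumer drains completions from its callback in batches and then re-arms the queue; the real consumers live in hw_channel.c and mana_en.c, so the sketch below is illustrative only (the batch size and names are hypothetical).

```c
/* Illustrative sketch, not part of the commit. */
#define EXAMPLE_COMP_BATCH	8	/* the driver preallocates CQE_POLLING_BUFFER entries */

static void
example_cq_callback(void *context, struct gdma_queue *cq)
{
	struct gdma_comp comps[EXAMPLE_COMP_BATCH];
	int i, n;

	do {
		/* Returns the number of CQEs read, or -1 on overflow. */
		n = mana_gd_poll_cq(cq, comps, EXAMPLE_COMP_BATCH);

		for (i = 0; i < n; i++) {
			/* comps[i].wq_num / comps[i].is_sq name the WQ; */
			/* comps[i].cqe_data is the protocol payload.    */
		}
	} while (n == EXAMPLE_COMP_BATCH);

	/* Ask for an interrupt when the next CQE arrives. */
	mana_gd_arm_cq(cq);
}
```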
+
+static void
+mana_gd_intr(void *arg)
+{
+ struct gdma_irq_context *gic = arg;
+
+ if (gic->handler) {
+ gic->handler(gic->arg);
+ }
+}
+
+int
+mana_gd_alloc_res_map(uint32_t res_avail,
+ struct gdma_resource *r, const char *lock_name)
+{
+ int n = howmany(res_avail, BITS_PER_LONG);
+
+ r->map =
+ malloc(n * sizeof(unsigned long), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!r->map)
+ return ENOMEM;
+
+ r->size = res_avail;
+ mtx_init(&r->lock_spin, lock_name, NULL, MTX_SPIN);
+
+ mana_dbg(NULL,
+ "total res %u, total number of unsigned longs %u\n",
+ r->size, n);
+ return (0);
+}
+
+void
+mana_gd_free_res_map(struct gdma_resource *r)
+{
+ if (!r || !r->map)
+ return;
+
+ free(r->map, M_DEVBUF);
+ r->map = NULL;
+ r->size = 0;
+}
+
+static void
+mana_gd_init_registers(struct gdma_context *gc)
+{
+ uint64_t bar0_va = rman_get_bushandle(gc->bar0);
+
+ gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF;
+
+ gc->db_page_base =
+ (void *) (bar0_va + mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET));
+
+ gc->shm_base =
+ (void *) (bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET));
+
+ mana_dbg(NULL, "db_page_size 0x%x, db_page_base %p,"
+ " shm_base %p\n",
+ gc->db_page_size, gc->db_page_base, gc->shm_base);
+}
+
+static struct resource *
+mana_gd_alloc_bar(device_t dev, int bar)
+{
+ struct resource *res = NULL;
+ struct pci_map *pm;
+ int rid, type;
+
+ if (bar < 0 || bar > PCIR_MAX_BAR_0)
+ goto alloc_bar_out;
+
+ pm = pci_find_bar(dev, PCIR_BAR(bar));
+ if (!pm)
+ goto alloc_bar_out;
+
+ if (PCI_BAR_IO(pm->pm_value))
+ type = SYS_RES_IOPORT;
+ else
+ type = SYS_RES_MEMORY;
+ if (type < 0)
+ goto alloc_bar_out;
+
+ rid = PCIR_BAR(bar);
+ res = bus_alloc_resource_any(dev, type, &rid, RF_ACTIVE);
+#if defined(__amd64__)
+ if (res)
+ mana_dbg(NULL, "bar %d: rid 0x%x, type 0x%jx,"
+ " handle 0x%jx\n",
+ bar, rid, res->r_bustag, res->r_bushandle);
+#endif
+
+alloc_bar_out:
+ return (res);
+}
+
+static void
+mana_gd_free_pci_res(struct gdma_context *gc)
+{
+ if (!gc || !gc->dev)
+ return;
+
+ if (gc->bar0 != NULL) {
+ bus_release_resource(gc->dev, SYS_RES_MEMORY,
+ PCIR_BAR(GDMA_BAR0), gc->bar0);
+ }
+
+ if (gc->msix != NULL) {
+ bus_release_resource(gc->dev, SYS_RES_MEMORY,
+ gc->msix_rid, gc->msix);
+ }
+}
+
+static int
+mana_gd_setup_irqs(device_t dev)
+{
+ unsigned int max_queues_per_port = mp_ncpus;
+ struct gdma_context *gc = device_get_softc(dev);
+ struct gdma_irq_context *gic;
+ unsigned int max_irqs;
+ int nvec;
+ int rc, rcc, i;
+
+ if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
+ max_queues_per_port = MANA_MAX_NUM_QUEUES;
+
+ max_irqs = max_queues_per_port * MAX_PORTS_IN_MANA_DEV;
+
+	/* Need 1 interrupt for the Hardware Communication Channel (HWC) */
+ max_irqs++;
+
+ nvec = max_irqs;
+ rc = pci_alloc_msix(dev, &nvec);
+ if (unlikely(rc != 0)) {
+ device_printf(dev,
+ "Failed to allocate MSIX, vectors %d, error: %d\n",
+ nvec, rc);
+ rc = ENOSPC;
+ goto err_setup_irq_alloc;
+ }
+
+ if (nvec != max_irqs) {
+ if (nvec == 1) {
+ device_printf(dev,
+			    "Not enough MSI-X vectors allocated: %d\n",
+ nvec);
+ rc = ENOSPC;
+ goto err_setup_irq_release;
+ }
+ device_printf(dev, "Allocated only %d MSI-x (%d requested)\n",
+ nvec, max_irqs);
+ }
+
+ gc->irq_contexts = malloc(nvec * sizeof(struct gdma_irq_context),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!gc->irq_contexts) {
+ rc = ENOMEM;
+ goto err_setup_irq_release;
+ }
+
+ for (i = 0; i < nvec; i++) {
+ gic = &gc->irq_contexts[i];
+ gic->msix_e.entry = i;
+ /* Vector starts from 1. */
+ gic->msix_e.vector = i + 1;
+ gic->handler = NULL;
+ gic->arg = NULL;
+
+ gic->res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
+ &gic->msix_e.vector, RF_ACTIVE | RF_SHAREABLE);
+ if (unlikely(gic->res == NULL)) {
+ rc = ENOMEM;
+ device_printf(dev, "could not allocate resource "
+ "for irq vector %d\n", gic->msix_e.vector);
+ goto err_setup_irq;
+ }
+
+ rc = bus_setup_intr(dev, gic->res,
+ INTR_TYPE_NET | INTR_MPSAFE, NULL, mana_gd_intr,
+ gic, &gic->cookie);
+ if (unlikely(rc != 0)) {
+ device_printf(dev, "failed to register interrupt "
+ "handler for irq %ju vector %d: error %d\n",
+ rman_get_start(gic->res), gic->msix_e.vector, rc);
+ goto err_setup_irq;
+ }
+ gic->requested = true;
+
+ mana_dbg(NULL, "added msix vector %d irq %ju\n",
+ gic->msix_e.vector, rman_get_start(gic->res));
+ }
+
+ rc = mana_gd_alloc_res_map(nvec, &gc->msix_resource,
+ "gdma msix res lock");
+ if (rc != 0) {
+ device_printf(dev, "failed to allocate memory "
+ "for msix bitmap\n");
+ goto err_setup_irq;
+ }
+
+ gc->max_num_msix = nvec;
+ gc->num_msix_usable = nvec;
+
+ mana_dbg(NULL, "setup %d msix interrupts\n", nvec);
+
+ return (0);
+
+err_setup_irq:
+ for (; i >= 0; i--) {
+ gic = &gc->irq_contexts[i];
+ rcc = 0;
+
+ /*
+ * If gic->requested is true, we need to free both intr and
+ * resources.
+ */
+ if (gic->requested)
+ rcc = bus_teardown_intr(dev, gic->res, gic->cookie);
+ if (unlikely(rcc != 0))
+ device_printf(dev, "could not release "
+ "irq vector %d, error: %d\n",
+ gic->msix_e.vector, rcc);
+
+ rcc = 0;
+ if (gic->res != NULL) {
+ rcc = bus_release_resource(dev, SYS_RES_IRQ,
+ gic->msix_e.vector, gic->res);
+ }
+ if (unlikely(rcc != 0))
+ device_printf(dev, "dev has no parent while "
+ "releasing resource for irq vector %d\n",
+ gic->msix_e.vector);
+ gic->requested = false;
+ gic->res = NULL;
+ }
+
+ free(gc->irq_contexts, M_DEVBUF);
+ gc->irq_contexts = NULL;
+err_setup_irq_release:
+ pci_release_msi(dev);
+err_setup_irq_alloc:
+ return (rc);
+}
+
+static void
+mana_gd_remove_irqs(device_t dev)
+{
+ struct gdma_context *gc = device_get_softc(dev);
+ struct gdma_irq_context *gic;
+ int rc, i;
+
+ mana_gd_free_res_map(&gc->msix_resource);
+
+ for (i = 0; i < gc->max_num_msix; i++) {
+ gic = &gc->irq_contexts[i];
+ if (gic->requested) {
+ rc = bus_teardown_intr(dev, gic->res, gic->cookie);
+ if (unlikely(rc != 0)) {
+ device_printf(dev, "failed to tear down "
+ "irq vector %d, error: %d\n",
+ gic->msix_e.vector, rc);
+ }
+ gic->requested = false;
+ }
+
+ if (gic->res != NULL) {
+ rc = bus_release_resource(dev, SYS_RES_IRQ,
+ gic->msix_e.vector, gic->res);
+ if (unlikely(rc != 0)) {
+ device_printf(dev, "dev has no parent while "
+ "releasing resource for irq vector %d\n",
+ gic->msix_e.vector);
+ }
+ gic->res = NULL;
+ }
+ }
+
+ gc->max_num_msix = 0;
+ gc->num_msix_usable = 0;
+ free(gc->irq_contexts, M_DEVBUF);
+ gc->irq_contexts = NULL;
+
+ pci_release_msi(dev);
+}
+
+static int
+mana_gd_probe(device_t dev)
+{
+ mana_vendor_id_t *ent;
+ char adapter_name[60];
+ uint16_t pci_vendor_id = 0;
+ uint16_t pci_device_id = 0;
+
+ pci_vendor_id = pci_get_vendor(dev);
+ pci_device_id = pci_get_device(dev);
+
+ ent = mana_id_table;
+ while (ent->vendor_id != 0) {
+ if ((pci_vendor_id == ent->vendor_id) &&
+ (pci_device_id == ent->device_id)) {
+ mana_dbg(NULL, "vendor=%x device=%x\n",
+ pci_vendor_id, pci_device_id);
+
+ sprintf(adapter_name, DEVICE_DESC);
+ device_set_desc_copy(dev, adapter_name);
+ return (BUS_PROBE_DEFAULT);
+ }
+
+ ent++;
+ }
+
+ return (ENXIO);
+}
+
+/**
+ * mana_gd_attach - Device Initialization Routine
+ * @dev: device information struct
+ *
+ * Returns 0 on success, an error code otherwise.
+ *
+ * mana_gd_attach initializes a GDMA adapter identified by a device structure.
+ **/
+static int
+mana_gd_attach(device_t dev)
+{
+ struct gdma_context *gc;
+ int msix_rid;
+ int rc;
+
+ gc = device_get_softc(dev);
+ gc->dev = dev;
+
+ pci_enable_io(dev, SYS_RES_IOPORT);
+ pci_enable_io(dev, SYS_RES_MEMORY);
+
+ pci_enable_busmaster(dev);
+
+ gc->bar0 = mana_gd_alloc_bar(dev, GDMA_BAR0);
+ if (unlikely(gc->bar0 == NULL)) {
+ device_printf(dev,
+ "unable to allocate bus resource for bar0!\n");
+ rc = ENOMEM;
+ goto err_disable_dev;
+ }
+
+	/* Store bar0 tag and handle for quick access */
+ gc->gd_bus.bar0_t = rman_get_bustag(gc->bar0);
+ gc->gd_bus.bar0_h = rman_get_bushandle(gc->bar0);
+
+ /* Map MSI-x vector table */
+ msix_rid = pci_msix_table_bar(dev);
+
+ mana_dbg(NULL, "msix_rid 0x%x\n", msix_rid);
+
+ gc->msix = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
+ &msix_rid, RF_ACTIVE);
+ if (unlikely(gc->msix == NULL)) {
+ device_printf(dev,
+ "unable to allocate bus resource for msix!\n");
+ rc = ENOMEM;
+ goto err_free_pci_res;
+ }
+ gc->msix_rid = msix_rid;
+
+ if (unlikely(gc->gd_bus.bar0_h == 0)) {
+ device_printf(dev, "failed to map bar0!\n");
+ rc = ENXIO;
+ goto err_free_pci_res;
+ }
+
+ mana_gd_init_registers(gc);
+
+ mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
+
+ rc = mana_gd_setup_irqs(dev);
+ if (rc) {
+ goto err_free_pci_res;
+ }
+
+ sx_init(&gc->eq_test_event_sx, "gdma test event sx");
+
+ rc = mana_hwc_create_channel(gc);
+ if (rc) {
+ mana_dbg(NULL, "Failed to create hwc channel\n");
+ if (rc == EIO)
+ goto err_clean_up_gdma;
+ else
+ goto err_remove_irq;
+ }
+
+ rc = mana_gd_verify_vf_version(dev);
+ if (rc) {
+ mana_dbg(NULL, "Failed to verify vf\n");
+ goto err_clean_up_gdma;
+ }
+
+ rc = mana_gd_query_max_resources(dev);
+ if (rc) {
+ mana_dbg(NULL, "Failed to query max resources\n");
+ goto err_clean_up_gdma;
+ }
+
+ rc = mana_gd_detect_devices(dev);
+ if (rc) {
+ mana_dbg(NULL, "Failed to detect mana device\n");
+ goto err_clean_up_gdma;
+ }
+
+ rc = mana_probe(&gc->mana);
+ if (rc) {
+ mana_dbg(NULL, "Failed to probe mana device\n");
+ goto err_clean_up_gdma;
+ }
+
+ return (0);
+
+err_clean_up_gdma:
+ mana_hwc_destroy_channel(gc);
+ if (gc->cq_table)
+ free(gc->cq_table, M_DEVBUF);
+ gc->cq_table = NULL;
+err_remove_irq:
+ mana_gd_remove_irqs(dev);
+err_free_pci_res:
+ mana_gd_free_pci_res(gc);
+err_disable_dev:
+ pci_disable_busmaster(dev);
+
+	return (rc);
+}
+
+/**
+ * mana_gd_detach - Device Removal Routine
+ * @dev: device information struct
+ *
+ * mana_gd_detach is called by the device subsystem to alert the driver
+ * that it should release a PCI device.
+ **/
+static int
+mana_gd_detach(device_t dev)
+{
+ struct gdma_context *gc = device_get_softc(dev);
+
+ mana_remove(&gc->mana);
+
+ mana_hwc_destroy_channel(gc);
+ free(gc->cq_table, M_DEVBUF);
+ gc->cq_table = NULL;
+
+ mana_gd_remove_irqs(dev);
+
+ mana_gd_free_pci_res(gc);
+
+ pci_disable_busmaster(dev);
+
+ return (bus_generic_detach(dev));
+}
+
+
+/*********************************************************************
+ * FreeBSD Device Interface Entry Points
+ *********************************************************************/
+
+static device_method_t mana_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, mana_gd_probe),
+ DEVMETHOD(device_attach, mana_gd_attach),
+ DEVMETHOD(device_detach, mana_gd_detach),
+ DEVMETHOD_END
+};
+
+static driver_t mana_driver = {
+ "mana", mana_methods, sizeof(struct gdma_context),
+};
+
+devclass_t mana_devclass;
+DRIVER_MODULE(mana, pci, mana_driver, mana_devclass, 0, 0);
+MODULE_PNP_INFO("U16:vendor;U16:device", pci, mana, mana_id_table,
+ nitems(mana_id_table) - 1);
+MODULE_DEPEND(mana, pci, 1, 1, 1);
+MODULE_DEPEND(mana, ether, 1, 1, 1);
+
+/*********************************************************************/
diff --git a/sys/dev/mana/gdma_util.c b/sys/dev/mana/gdma_util.c
new file mode 100644
index 000000000000..304caa28ec7a
--- /dev/null
+++ b/sys/dev/mana/gdma_util.c
@@ -0,0 +1,96 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+#include "gdma_util.h"
+
+
+void
+init_completion(struct completion *c)
+{
+ memset(c, 0, sizeof(*c));
+ mtx_init(&c->lock, "gdma_completion", NULL, MTX_DEF);
+ c->done = 0;
+}
+
+void
+free_completion(struct completion *c)
+{
+ mtx_destroy(&c->lock);
+}
+
+void
+complete(struct completion *c)
+{
+ mtx_lock(&c->lock);
+ c->done++;
+ mtx_unlock(&c->lock);
+ wakeup(c);
+}
+
+void
+wait_for_completion(struct completion *c)
+{
+ mtx_lock(&c->lock);
+ while (c->done == 0)
+ mtx_sleep(c, &c->lock, 0, "gdma_wfc", 0);
+ c->done--;
+ mtx_unlock(&c->lock);
+}
+
+/*
+ * Return: 0 if completed, a non-zero value if timed out.
+ */
+int
+wait_for_completion_timeout(struct completion *c, int timeout)
+{
+ int ret;
+
+ mtx_lock(&c->lock);
+
+ if (c->done == 0)
+ mtx_sleep(c, &c->lock, 0, "gdma_wfc", timeout);
+
+ if (c->done > 0) {
+ c->done--;
+ ret = 0;
+ } else {
+ ret = 1;
+ }
+
+ mtx_unlock(&c->lock);
+
+ return (ret);
+}
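+
+/*
+ * Minimal usage sketch (not compiled): the waiting thread initializes the
+ * completion, kicks off asynchronous work whose callback calls complete(),
+ * and then blocks with a timeout.  example_start_io() is a hypothetical
+ * placeholder for such asynchronous work.
+ */
+#if 0
+static int
+example_wait_for_io(void)
+{
+	struct completion comp;
+	int err;
+
+	init_completion(&comp);
+
+	/* The callback of this asynchronous work calls complete(&comp). */
+	example_start_io(&comp);
+
+	/* Wait up to 30 seconds; a non-zero return means timeout. */
+	if (wait_for_completion_timeout(&comp, 30 * hz))
+		err = ETIMEDOUT;
+	else
+		err = 0;
+
+	free_completion(&comp);
+	return (err);
+}
+#endif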
diff --git a/sys/dev/mana/gdma_util.h b/sys/dev/mana/gdma_util.h
new file mode 100644
index 000000000000..da2dfe54f1b9
--- /dev/null
+++ b/sys/dev/mana/gdma_util.h
@@ -0,0 +1,206 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _GDMA_UTIL_H_
+#define _GDMA_UTIL_H_
+
+#include <sys/types.h>
+#include <sys/param.h>
+
+/* Log Levels */
+#define MANA_ALERT (1 << 0) /* Error and alert messages. */
+#define MANA_WARNING (1 << 1) /* Warnings about unexpected conditions. */
+#define MANA_INFO (1 << 2) /* Provides additional driver info. */
+#define MANA_DBG (1 << 3) /* Driver output for debugging. */
+
+extern int mana_log_level;
+
+#define mana_trace_raw(ctx, level, fmt, args...) \
+ do { \
+ ((void)(ctx)); \
+ if (((level) & mana_log_level) != (level)) \
+ break; \
+ printf(fmt, ##args); \
+ } while (0)
+
+#define mana_trace(ctx, level, fmt, args...) \
+ mana_trace_raw(ctx, level, "%s() [TID:%d]: " \
+ fmt, __func__, curthread->td_tid, ##args)
+
+
+#define mana_dbg(ctx, format, arg...) \
+ mana_trace(ctx, MANA_DBG, format, ##arg)
+#define mana_info(ctx, format, arg...) \
+ mana_trace(ctx, MANA_INFO, format, ##arg)
+#define mana_warn(ctx, format, arg...) \
+ mana_trace(ctx, MANA_WARNING, format, ##arg)
+#define mana_err(ctx, format, arg...) \
+ mana_trace(ctx, MANA_ALERT, format, ##arg)
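+
+/*
+ * Usage sketch: with MANA_DBG set in mana_log_level, a call such as
+ * mana_dbg(NULL, "created queue %u\n", qid) prints the message prefixed
+ * with the calling function name and thread ID; with the bit clear,
+ * nothing is printed.
+ */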
+
+#define unlikely(x) __predict_false(!!(x))
+#define likely(x) __predict_true(!!(x))
+
+
+#define BITS_PER_LONG (sizeof(long) * NBBY)
+
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
+#define BITMAP_LAST_WORD_MASK(n) (~0UL >> (BITS_PER_LONG - (n)))
+#define BITS_TO_LONGS(n) howmany((n), BITS_PER_LONG)
+#define BIT_MASK(nr) (1UL << ((nr) & (BITS_PER_LONG - 1)))
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+
+#undef ALIGN
+#define ALIGN(x, y) roundup2((x), (y))
+#define IS_ALIGNED(x, a) (((x) & ((__typeof(x))(a) - 1)) == 0)
+
+#define BIT(n) (1ULL << (n))
+
+#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
+#define offset_in_page(x) ((x) & PAGE_MASK)
+
+#define min_t(type, _x, _y) \
+ ((type)(_x) < (type)(_y) ? (type)(_x) : (type)(_y))
+
+#define test_bit(i, a) \
+ ((((volatile const unsigned long *)(a))[BIT_WORD(i)]) & BIT_MASK(i))
+
+typedef volatile uint32_t atomic_t;
+
+#define atomic_add_return(v, p) (atomic_fetchadd_int(p, v) + (v))
+#define atomic_sub_return(v, p) (atomic_fetchadd_int(p, -(v)) - (v))
+#define atomic_inc_return(p) atomic_add_return(1, p)
+#define atomic_dec_return(p) atomic_sub_return(1, p)
+#define atomic_read(p) atomic_add_return(0, p)
+
+#define usleep_range(_1, _2) \
+ pause_sbt("gdma-usleep-range", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE)
+
+static inline void
+gdma_msleep(unsigned int ms)
+{
+ if (ms == 0)
+ ms = 1;
+ pause_sbt("gdma-msleep", mstosbt(ms), 0, C_HARDCLOCK);
+}
+
+static inline void
+bitmap_set(unsigned long *map, unsigned int start, int nr)
+{
+ const unsigned int size = start + nr;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+ map += BIT_WORD(start);
+
+ while (nr - bits_to_set >= 0) {
+ *map |= mask_to_set;
+ nr -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = ~0UL;
+ map++;
+ }
+
+ if (nr) {
+ mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+ *map |= mask_to_set;
+ }
+}
+
+static inline void
+bitmap_clear(unsigned long *map, unsigned int start, int nr)
+{
+ const unsigned int size = start + nr;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+ map += BIT_WORD(start);
+
+ while (nr - bits_to_clear >= 0) {
+ *map &= ~mask_to_clear;
+ nr -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ map++;
+ }
+
+ if (nr) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ *map &= ~mask_to_clear;
+ }
+}
+
+static inline unsigned long
+find_first_zero_bit(const unsigned long *p, unsigned long max)
+{
+ unsigned long i, n;
+
+ for (i = 0; i < max / BITS_PER_LONG + 1; i++) {
+ n = ~p[i];
+ if (n != 0)
+ return (i * BITS_PER_LONG + ffsl(n) - 1);
+ }
+ return (max);
+}
+
+static inline unsigned long
+ilog2(unsigned long x)
+{
+	unsigned long log = 0;
+ while (x >>= 1)
+ log++;
+ return (log);
+}
+
+static inline unsigned long
+roundup_pow_of_two(unsigned long x)
+{
+ return (1UL << flsl(x - 1));
+}
+
+static inline int
+is_power_of_2(unsigned long n)
+{
+ return (n == roundup_pow_of_two(n));
+}
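+
+/*
+ * Worked examples (sketch) for the helpers above: ilog2(8) == 3,
+ * roundup_pow_of_two(5) == 8, roundup_pow_of_two(8) == 8, and
+ * is_power_of_2(6) == 0 while is_power_of_2(64) != 0.
+ */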
+
+struct completion {
+ unsigned int done;
+ struct mtx lock;
+};
+
+void init_completion(struct completion *c);
+void free_completion(struct completion *c);
+void complete(struct completion *c);
+void wait_for_completion(struct completion *c);
+int wait_for_completion_timeout(struct completion *c, int timeout);
+#endif /* _GDMA_UTIL_H_ */
diff --git a/sys/dev/mana/hw_channel.c b/sys/dev/mana/hw_channel.c
new file mode 100644
index 000000000000..1949f1d2e049
--- /dev/null
+++ b/sys/dev/mana/hw_channel.c
@@ -0,0 +1,950 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+#include "mana.h"
+#include "hw_channel.h"
+
+static int
+mana_hwc_get_msg_index(struct hw_channel_context *hwc, uint16_t *msg_id)
+{
+ struct gdma_resource *r = &hwc->inflight_msg_res;
+ uint32_t index;
+
+ sema_wait(&hwc->sema);
+
+ mtx_lock_spin(&r->lock_spin);
+
+ index = find_first_zero_bit(hwc->inflight_msg_res.map,
+ hwc->inflight_msg_res.size);
+
+ bitmap_set(hwc->inflight_msg_res.map, index, 1);
+
+ mtx_unlock_spin(&r->lock_spin);
+
+ *msg_id = index;
+
+ return 0;
+}
+
+static void
+mana_hwc_put_msg_index(struct hw_channel_context *hwc, uint16_t msg_id)
+{
+ struct gdma_resource *r = &hwc->inflight_msg_res;
+
+ mtx_lock_spin(&r->lock_spin);
+ bitmap_clear(hwc->inflight_msg_res.map, msg_id, 1);
+ mtx_unlock_spin(&r->lock_spin);
+
+ sema_post(&hwc->sema);
+}
+
+static int
+mana_hwc_verify_resp_msg(const struct hwc_caller_ctx *caller_ctx,
+ const struct gdma_resp_hdr *resp_msg,
+ uint32_t resp_len)
+{
+ if (resp_len < sizeof(*resp_msg))
+ return EPROTO;
+
+ if (resp_len > caller_ctx->output_buflen)
+ return EPROTO;
+
+ return 0;
+}
+
+static void
+mana_hwc_handle_resp(struct hw_channel_context *hwc, uint32_t resp_len,
+ const struct gdma_resp_hdr *resp_msg)
+{
+ struct hwc_caller_ctx *ctx;
+ int err;
+
+ if (!test_bit(resp_msg->response.hwc_msg_id,
+ hwc->inflight_msg_res.map)) {
+ device_printf(hwc->dev, "hwc_rx: invalid msg_id = %u\n",
+ resp_msg->response.hwc_msg_id);
+ return;
+ }
+
+ ctx = hwc->caller_ctx + resp_msg->response.hwc_msg_id;
+ err = mana_hwc_verify_resp_msg(ctx, resp_msg, resp_len);
+ if (err)
+ goto out;
+
+ ctx->status_code = resp_msg->status;
+
+ memcpy(ctx->output_buf, resp_msg, resp_len);
+out:
+ ctx->error = err;
+ complete(&ctx->comp_event);
+}
+
+static int
+mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq,
+ struct hwc_work_request *req)
+{
+ device_t dev = hwc_rxq->hwc->dev;
+ struct gdma_sge *sge;
+ int err;
+
+ sge = &req->sge;
+ sge->address = (uint64_t)req->buf_sge_addr;
+ sge->mem_key = hwc_rxq->msg_buf->gpa_mkey;
+ sge->size = req->buf_len;
+
+ memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request));
+ req->wqe_req.sgl = sge;
+ req->wqe_req.num_sge = 1;
+ req->wqe_req.client_data_unit = 0;
+
+ err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL);
+ if (err)
+ device_printf(dev,
+ "Failed to post WQE on HWC RQ: %d\n", err);
+ return err;
+}
+
+static void
+mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
+ struct gdma_event *event)
+{
+ struct hw_channel_context *hwc = ctx;
+ struct gdma_dev *gd = hwc->gdma_dev;
+ union hwc_init_type_data type_data;
+ union hwc_init_eq_id_db eq_db;
+ uint32_t type, val;
+
+ switch (event->type) {
+ case GDMA_EQE_HWC_INIT_EQ_ID_DB:
+ eq_db.as_uint32 = event->details[0];
+ hwc->cq->gdma_eq->id = eq_db.eq_id;
+ gd->doorbell = eq_db.doorbell;
+ break;
+
+ case GDMA_EQE_HWC_INIT_DATA:
+ type_data.as_uint32 = event->details[0];
+ type = type_data.type;
+ val = type_data.value;
+
+ switch (type) {
+ case HWC_INIT_DATA_CQID:
+ hwc->cq->gdma_cq->id = val;
+ break;
+
+ case HWC_INIT_DATA_RQID:
+ hwc->rxq->gdma_wq->id = val;
+ break;
+
+ case HWC_INIT_DATA_SQID:
+ hwc->txq->gdma_wq->id = val;
+ break;
+
+ case HWC_INIT_DATA_QUEUE_DEPTH:
+ hwc->hwc_init_q_depth_max = (uint16_t)val;
+ break;
+
+ case HWC_INIT_DATA_MAX_REQUEST:
+ hwc->hwc_init_max_req_msg_size = val;
+ break;
+
+ case HWC_INIT_DATA_MAX_RESPONSE:
+ hwc->hwc_init_max_resp_msg_size = val;
+ break;
+
+ case HWC_INIT_DATA_MAX_NUM_CQS:
+ gd->gdma_context->max_num_cqs = val;
+ break;
+
+ case HWC_INIT_DATA_PDID:
+ hwc->gdma_dev->pdid = val;
+ break;
+
+ case HWC_INIT_DATA_GPA_MKEY:
+ hwc->rxq->msg_buf->gpa_mkey = val;
+ hwc->txq->msg_buf->gpa_mkey = val;
+ break;
+ }
+
+ break;
+
+ case GDMA_EQE_HWC_INIT_DONE:
+ complete(&hwc->hwc_init_eqe_comp);
+ break;
+
+ default:
+ /* Ignore unknown events, which should never happen. */
+ break;
+ }
+}
+
+static void
+mana_hwc_rx_event_handler(void *ctx, uint32_t gdma_rxq_id,
+ const struct hwc_rx_oob *rx_oob)
+{
+ struct hw_channel_context *hwc = ctx;
+ struct hwc_wq *hwc_rxq = hwc->rxq;
+ struct hwc_work_request *rx_req;
+ struct gdma_resp_hdr *resp;
+ struct gdma_wqe *dma_oob;
+ struct gdma_queue *rq;
+ struct gdma_sge *sge;
+ uint64_t rq_base_addr;
+ uint64_t rx_req_idx;
+ uint8_t *wqe;
+
+ if (hwc_rxq->gdma_wq->id != gdma_rxq_id) {
+ mana_warn(NULL, "unmatched rx queue %u != %u\n",
+ hwc_rxq->gdma_wq->id, gdma_rxq_id);
+ return;
+ }
+
+
+ rq = hwc_rxq->gdma_wq;
+ wqe = mana_gd_get_wqe_ptr(rq, rx_oob->wqe_offset / GDMA_WQE_BU_SIZE);
+ dma_oob = (struct gdma_wqe *)wqe;
+
+ bus_dmamap_sync(rq->mem_info.dma_tag, rq->mem_info.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ sge = (struct gdma_sge *)(wqe + 8 + dma_oob->inline_oob_size_div4 * 4);
+
+ /* Select the RX work request for virtual address and for reposting. */
+ rq_base_addr = hwc_rxq->msg_buf->mem_info.dma_handle;
+ rx_req_idx = (sge->address - rq_base_addr) / hwc->max_req_msg_size;
+
+ bus_dmamap_sync(hwc_rxq->msg_buf->mem_info.dma_tag,
+ hwc_rxq->msg_buf->mem_info.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ rx_req = &hwc_rxq->msg_buf->reqs[rx_req_idx];
+ resp = (struct gdma_resp_hdr *)rx_req->buf_va;
+
+ if (resp->response.hwc_msg_id >= hwc->num_inflight_msg) {
+ device_printf(hwc->dev, "HWC RX: wrong msg_id=%u\n",
+ resp->response.hwc_msg_id);
+ return;
+ }
+
+ mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, resp);
+
+	/* Stop using 'resp' at this point: the underlying buffer is posted
+	 * back to the HW by mana_hwc_post_rx_wqe() below.
+	 */
+ resp = NULL;
+
+ bus_dmamap_sync(hwc_rxq->msg_buf->mem_info.dma_tag,
+ hwc_rxq->msg_buf->mem_info.dma_map,
+ BUS_DMASYNC_PREREAD);
+
+ mana_hwc_post_rx_wqe(hwc_rxq, rx_req);
+}
+
+static void
+mana_hwc_tx_event_handler(void *ctx, uint32_t gdma_txq_id,
+ const struct hwc_rx_oob *rx_oob)
+{
+ struct hw_channel_context *hwc = ctx;
+ struct hwc_wq *hwc_txq = hwc->txq;
+
+	if (!hwc_txq) {
+		mana_warn(NULL, "hwc tx queue is not initialized\n");
+		return;
+	}
+
+	if (hwc_txq->gdma_wq->id != gdma_txq_id) {
+		mana_warn(NULL, "unmatched tx queue %u != %u\n",
+		    hwc_txq->gdma_wq->id, gdma_txq_id);
+	}
+
+ bus_dmamap_sync(hwc_txq->gdma_wq->mem_info.dma_tag,
+ hwc_txq->gdma_wq->mem_info.dma_map,
+ BUS_DMASYNC_POSTWRITE);
+}
+
+static int
+mana_hwc_create_gdma_wq(struct hw_channel_context *hwc,
+ enum gdma_queue_type type, uint64_t queue_size,
+ struct gdma_queue **queue)
+{
+ struct gdma_queue_spec spec = {};
+
+ if (type != GDMA_SQ && type != GDMA_RQ)
+ return EINVAL;
+
+ spec.type = type;
+ spec.monitor_avl_buf = false;
+ spec.queue_size = queue_size;
+
+ return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue);
+}
+
+static int
+mana_hwc_create_gdma_cq(struct hw_channel_context *hwc,
+ uint64_t queue_size,
+ void *ctx, gdma_cq_callback *cb,
+ struct gdma_queue *parent_eq,
+ struct gdma_queue **queue)
+{
+ struct gdma_queue_spec spec = {};
+
+ spec.type = GDMA_CQ;
+ spec.monitor_avl_buf = false;
+ spec.queue_size = queue_size;
+ spec.cq.context = ctx;
+ spec.cq.callback = cb;
+ spec.cq.parent_eq = parent_eq;
+
+ return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue);
+}
+
+static int
+mana_hwc_create_gdma_eq(struct hw_channel_context *hwc,
+ uint64_t queue_size,
+ void *ctx, gdma_eq_callback *cb,
+ struct gdma_queue **queue)
+{
+ struct gdma_queue_spec spec = {};
+
+ spec.type = GDMA_EQ;
+ spec.monitor_avl_buf = false;
+ spec.queue_size = queue_size;
+ spec.eq.context = ctx;
+ spec.eq.callback = cb;
+ spec.eq.log2_throttle_limit = DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ;
+
+ return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue);
+}
+
+static void
+mana_hwc_comp_event(void *ctx, struct gdma_queue *q_self)
+{
+ struct hwc_rx_oob comp_data = {};
+ struct gdma_comp *completions;
+ struct hwc_cq *hwc_cq = ctx;
+ int comp_read, i;
+
+ completions = hwc_cq->comp_buf;
+ comp_read = mana_gd_poll_cq(q_self, completions, hwc_cq->queue_depth);
+
+ for (i = 0; i < comp_read; ++i) {
+ comp_data = *(struct hwc_rx_oob *)completions[i].cqe_data;
+
+ if (completions[i].is_sq)
+ hwc_cq->tx_event_handler(hwc_cq->tx_event_ctx,
+ completions[i].wq_num,
+ &comp_data);
+ else
+ hwc_cq->rx_event_handler(hwc_cq->rx_event_ctx,
+ completions[i].wq_num,
+ &comp_data);
+ }
+
+ bus_dmamap_sync(q_self->mem_info.dma_tag, q_self->mem_info.dma_map,
+ BUS_DMASYNC_POSTREAD);
+
+ mana_gd_arm_cq(q_self);
+}
+
+static void
+mana_hwc_destroy_cq(struct gdma_context *gc, struct hwc_cq *hwc_cq)
+{
+ if (!hwc_cq)
+ return;
+
+ if (hwc_cq->comp_buf)
+ free(hwc_cq->comp_buf, M_DEVBUF);
+
+ if (hwc_cq->gdma_cq)
+ mana_gd_destroy_queue(gc, hwc_cq->gdma_cq);
+
+ if (hwc_cq->gdma_eq)
+ mana_gd_destroy_queue(gc, hwc_cq->gdma_eq);
+
+ free(hwc_cq, M_DEVBUF);
+}
+
+static int
+mana_hwc_create_cq(struct hw_channel_context *hwc,
+ uint16_t q_depth,
+ gdma_eq_callback *callback, void *ctx,
+ hwc_rx_event_handler_t *rx_ev_hdlr, void *rx_ev_ctx,
+ hwc_tx_event_handler_t *tx_ev_hdlr, void *tx_ev_ctx,
+ struct hwc_cq **hwc_cq_ptr)
+{
+ struct gdma_queue *eq, *cq;
+ struct gdma_comp *comp_buf;
+ struct hwc_cq *hwc_cq;
+ uint32_t eq_size, cq_size;
+ int err;
+
+ eq_size = roundup_pow_of_two(GDMA_EQE_SIZE * q_depth);
+ if (eq_size < MINIMUM_SUPPORTED_PAGE_SIZE)
+ eq_size = MINIMUM_SUPPORTED_PAGE_SIZE;
+
+ cq_size = roundup_pow_of_two(GDMA_CQE_SIZE * q_depth);
+ if (cq_size < MINIMUM_SUPPORTED_PAGE_SIZE)
+ cq_size = MINIMUM_SUPPORTED_PAGE_SIZE;
+
+ hwc_cq = malloc(sizeof(*hwc_cq), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!hwc_cq)
+ return ENOMEM;
+
+ err = mana_hwc_create_gdma_eq(hwc, eq_size, ctx, callback, &eq);
+ if (err) {
+ device_printf(hwc->dev,
+ "Failed to create HWC EQ for RQ: %d\n", err);
+ goto out;
+ }
+ hwc_cq->gdma_eq = eq;
+
+ err = mana_hwc_create_gdma_cq(hwc, cq_size, hwc_cq,
+ mana_hwc_comp_event, eq, &cq);
+ if (err) {
+ device_printf(hwc->dev,
+ "Failed to create HWC CQ for RQ: %d\n", err);
+ goto out;
+ }
+ hwc_cq->gdma_cq = cq;
+
+ comp_buf = mallocarray(q_depth, sizeof(struct gdma_comp),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!comp_buf) {
+ err = ENOMEM;
+ goto out;
+ }
+
+ hwc_cq->hwc = hwc;
+ hwc_cq->comp_buf = comp_buf;
+ hwc_cq->queue_depth = q_depth;
+ hwc_cq->rx_event_handler = rx_ev_hdlr;
+ hwc_cq->rx_event_ctx = rx_ev_ctx;
+ hwc_cq->tx_event_handler = tx_ev_hdlr;
+ hwc_cq->tx_event_ctx = tx_ev_ctx;
+
+ *hwc_cq_ptr = hwc_cq;
+ return 0;
+out:
+ mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc_cq);
+ return err;
+}
+
+static int
+mana_hwc_alloc_dma_buf(struct hw_channel_context *hwc, uint16_t q_depth,
+ uint32_t max_msg_size,
+ struct hwc_dma_buf **dma_buf_ptr)
+{
+ struct gdma_context *gc = hwc->gdma_dev->gdma_context;
+ struct hwc_work_request *hwc_wr;
+ struct hwc_dma_buf *dma_buf;
+ struct gdma_mem_info *gmi;
+ uint32_t buf_size;
+ uint8_t *base_pa;
+ void *virt_addr;
+ uint16_t i;
+ int err;
+
+ dma_buf = malloc(sizeof(*dma_buf) +
+ q_depth * sizeof(struct hwc_work_request),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!dma_buf)
+ return ENOMEM;
+
+ dma_buf->num_reqs = q_depth;
+
+ buf_size = ALIGN(q_depth * max_msg_size, PAGE_SIZE);
+
+ gmi = &dma_buf->mem_info;
+ err = mana_gd_alloc_memory(gc, buf_size, gmi);
+ if (err) {
+ device_printf(hwc->dev,
+ "Failed to allocate DMA buffer: %d\n", err);
+ goto out;
+ }
+
+ virt_addr = dma_buf->mem_info.virt_addr;
+ base_pa = (uint8_t *)dma_buf->mem_info.dma_handle;
+
+ for (i = 0; i < q_depth; i++) {
+ hwc_wr = &dma_buf->reqs[i];
+
+ hwc_wr->buf_va = (char *)virt_addr + i * max_msg_size;
+ hwc_wr->buf_sge_addr = base_pa + i * max_msg_size;
+
+ hwc_wr->buf_len = max_msg_size;
+ }
+
+ *dma_buf_ptr = dma_buf;
+ return 0;
+out:
+ free(dma_buf, M_DEVBUF);
+ return err;
+}
+
+static void
+mana_hwc_dealloc_dma_buf(struct hw_channel_context *hwc,
+ struct hwc_dma_buf *dma_buf)
+{
+ if (!dma_buf)
+ return;
+
+ mana_gd_free_memory(&dma_buf->mem_info);
+
+ free(dma_buf, M_DEVBUF);
+}
+
+static void
+mana_hwc_destroy_wq(struct hw_channel_context *hwc,
+ struct hwc_wq *hwc_wq)
+{
+ if (!hwc_wq)
+ return;
+
+ mana_hwc_dealloc_dma_buf(hwc, hwc_wq->msg_buf);
+
+ if (hwc_wq->gdma_wq)
+ mana_gd_destroy_queue(hwc->gdma_dev->gdma_context,
+ hwc_wq->gdma_wq);
+
+ free(hwc_wq, M_DEVBUF);
+}
+
+static int
+mana_hwc_create_wq(struct hw_channel_context *hwc,
+ enum gdma_queue_type q_type, uint16_t q_depth,
+ uint32_t max_msg_size, struct hwc_cq *hwc_cq,
+ struct hwc_wq **hwc_wq_ptr)
+{
+ struct gdma_queue *queue;
+ struct hwc_wq *hwc_wq;
+ uint32_t queue_size;
+ int err;
+
+ if (q_type != GDMA_SQ && q_type != GDMA_RQ) {
+ /* XXX should fail and return error? */
+ mana_warn(NULL, "Invalid q_type %u\n", q_type);
+ }
+
+ if (q_type == GDMA_RQ)
+ queue_size = roundup_pow_of_two(GDMA_MAX_RQE_SIZE * q_depth);
+ else
+ queue_size = roundup_pow_of_two(GDMA_MAX_SQE_SIZE * q_depth);
+
+ if (queue_size < MINIMUM_SUPPORTED_PAGE_SIZE)
+ queue_size = MINIMUM_SUPPORTED_PAGE_SIZE;
+
+ hwc_wq = malloc(sizeof(*hwc_wq), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!hwc_wq)
+ return ENOMEM;
+
+ err = mana_hwc_create_gdma_wq(hwc, q_type, queue_size, &queue);
+ if (err)
+ goto out;
+
+ err = mana_hwc_alloc_dma_buf(hwc, q_depth, max_msg_size,
+ &hwc_wq->msg_buf);
+ if (err)
+ goto out;
+
+ hwc_wq->hwc = hwc;
+ hwc_wq->gdma_wq = queue;
+ hwc_wq->queue_depth = q_depth;
+ hwc_wq->hwc_cq = hwc_cq;
+
+ *hwc_wq_ptr = hwc_wq;
+ return 0;
+out:
+ if (err)
+ mana_hwc_destroy_wq(hwc, hwc_wq);
+ return err;
+}
+
+static int
+mana_hwc_post_tx_wqe(const struct hwc_wq *hwc_txq,
+ struct hwc_work_request *req,
+ uint32_t dest_virt_rq_id, uint32_t dest_virt_rcq_id,
+ bool dest_pf)
+{
+ device_t dev = hwc_txq->hwc->dev;
+ struct hwc_tx_oob *tx_oob;
+ struct gdma_sge *sge;
+ int err;
+
+ if (req->msg_size == 0 || req->msg_size > req->buf_len) {
+ device_printf(dev, "wrong msg_size: %u, buf_len: %u\n",
+ req->msg_size, req->buf_len);
+ return EINVAL;
+ }
+
+ tx_oob = &req->tx_oob;
+
+ tx_oob->vrq_id = dest_virt_rq_id;
+ tx_oob->dest_vfid = 0;
+ tx_oob->vrcq_id = dest_virt_rcq_id;
+ tx_oob->vscq_id = hwc_txq->hwc_cq->gdma_cq->id;
+ tx_oob->loopback = false;
+ tx_oob->lso_override = false;
+ tx_oob->dest_pf = dest_pf;
+ tx_oob->vsq_id = hwc_txq->gdma_wq->id;
+
+ sge = &req->sge;
+ sge->address = (uint64_t)req->buf_sge_addr;
+ sge->mem_key = hwc_txq->msg_buf->gpa_mkey;
+ sge->size = req->msg_size;
+
+ memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request));
+ req->wqe_req.sgl = sge;
+ req->wqe_req.num_sge = 1;
+ req->wqe_req.inline_oob_size = sizeof(struct hwc_tx_oob);
+ req->wqe_req.inline_oob_data = tx_oob;
+ req->wqe_req.client_data_unit = 0;
+
+ err = mana_gd_post_and_ring(hwc_txq->gdma_wq, &req->wqe_req, NULL);
+ if (err)
+ device_printf(dev,
+ "Failed to post WQE on HWC SQ: %d\n", err);
+ return err;
+}
+
+static int
+mana_hwc_init_inflight_msg(struct hw_channel_context *hwc, uint16_t num_msg)
+{
+ int err;
+
+ sema_init(&hwc->sema, num_msg, "gdma hwc sema");
+
+ err = mana_gd_alloc_res_map(num_msg, &hwc->inflight_msg_res,
+ "gdma hwc res lock");
+ if (err)
+ device_printf(hwc->dev,
+ "Failed to init inflight_msg_res: %d\n", err);
+
+ return (err);
+}
+
+static int
+mana_hwc_test_channel(struct hw_channel_context *hwc, uint16_t q_depth,
+ uint32_t max_req_msg_size, uint32_t max_resp_msg_size)
+{
+ struct gdma_context *gc = hwc->gdma_dev->gdma_context;
+ struct hwc_wq *hwc_rxq = hwc->rxq;
+ struct hwc_work_request *req;
+ struct hwc_caller_ctx *ctx;
+ int err;
+ int i;
+
+ /* Post all WQEs on the RQ */
+ for (i = 0; i < q_depth; i++) {
+ req = &hwc_rxq->msg_buf->reqs[i];
+ err = mana_hwc_post_rx_wqe(hwc_rxq, req);
+ if (err)
+ return err;
+ }
+
+ ctx = malloc(q_depth * sizeof(struct hwc_caller_ctx),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!ctx)
+ return ENOMEM;
+
+ for (i = 0; i < q_depth; ++i)
+ init_completion(&ctx[i].comp_event);
+
+ hwc->caller_ctx = ctx;
+
+ return mana_gd_test_eq(gc, hwc->cq->gdma_eq);
+}
+
+static int
+mana_hwc_establish_channel(struct gdma_context *gc, uint16_t *q_depth,
+ uint32_t *max_req_msg_size,
+ uint32_t *max_resp_msg_size)
+{
+ struct hw_channel_context *hwc = gc->hwc.driver_data;
+ struct gdma_queue *rq = hwc->rxq->gdma_wq;
+ struct gdma_queue *sq = hwc->txq->gdma_wq;
+ struct gdma_queue *eq = hwc->cq->gdma_eq;
+ struct gdma_queue *cq = hwc->cq->gdma_cq;
+ int err;
+
+ init_completion(&hwc->hwc_init_eqe_comp);
+
+ err = mana_smc_setup_hwc(&gc->shm_channel, false,
+ eq->mem_info.dma_handle,
+ cq->mem_info.dma_handle,
+ rq->mem_info.dma_handle,
+ sq->mem_info.dma_handle,
+ eq->eq.msix_index);
+ if (err)
+ return err;
+
+ if (wait_for_completion_timeout(&hwc->hwc_init_eqe_comp, 60 * hz))
+ return ETIMEDOUT;
+
+ *q_depth = hwc->hwc_init_q_depth_max;
+ *max_req_msg_size = hwc->hwc_init_max_req_msg_size;
+ *max_resp_msg_size = hwc->hwc_init_max_resp_msg_size;
+
+ if (cq->id >= gc->max_num_cqs) {
+		mana_warn(NULL, "invalid cq id %u >= %u\n",
+ cq->id, gc->max_num_cqs);
+ return EPROTO;
+ }
+
+ gc->cq_table = malloc(gc->max_num_cqs * sizeof(struct gdma_queue *),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!gc->cq_table)
+ return ENOMEM;
+
+ gc->cq_table[cq->id] = cq;
+
+ return 0;
+}
+
+static int
+mana_hwc_init_queues(struct hw_channel_context *hwc, uint16_t q_depth,
+ uint32_t max_req_msg_size, uint32_t max_resp_msg_size)
+{
+ struct hwc_wq *hwc_rxq = NULL;
+ struct hwc_wq *hwc_txq = NULL;
+ struct hwc_cq *hwc_cq = NULL;
+ int err;
+
+ err = mana_hwc_init_inflight_msg(hwc, q_depth);
+ if (err)
+ return err;
+
+ /* CQ is shared by SQ and RQ, so CQ's queue depth is the sum of SQ
+ * queue depth and RQ queue depth.
+ */
+ err = mana_hwc_create_cq(hwc, q_depth * 2,
+ mana_hwc_init_event_handler, hwc,
+ mana_hwc_rx_event_handler, hwc,
+ mana_hwc_tx_event_handler, hwc, &hwc_cq);
+ if (err) {
+ device_printf(hwc->dev, "Failed to create HWC CQ: %d\n", err);
+ goto out;
+ }
+ hwc->cq = hwc_cq;
+
+ err = mana_hwc_create_wq(hwc, GDMA_RQ, q_depth, max_req_msg_size,
+ hwc_cq, &hwc_rxq);
+ if (err) {
+ device_printf(hwc->dev, "Failed to create HWC RQ: %d\n", err);
+ goto out;
+ }
+ hwc->rxq = hwc_rxq;
+
+ err = mana_hwc_create_wq(hwc, GDMA_SQ, q_depth, max_resp_msg_size,
+ hwc_cq, &hwc_txq);
+ if (err) {
+ device_printf(hwc->dev, "Failed to create HWC SQ: %d\n", err);
+ goto out;
+ }
+ hwc->txq = hwc_txq;
+
+ hwc->num_inflight_msg = q_depth;
+ hwc->max_req_msg_size = max_req_msg_size;
+
+ return 0;
+out:
+ if (hwc_txq)
+ mana_hwc_destroy_wq(hwc, hwc_txq);
+
+ if (hwc_rxq)
+ mana_hwc_destroy_wq(hwc, hwc_rxq);
+
+ if (hwc_cq)
+ mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc_cq);
+
+ mana_gd_free_res_map(&hwc->inflight_msg_res);
+ return err;
+}
+
+int
+mana_hwc_create_channel(struct gdma_context *gc)
+{
+ uint32_t max_req_msg_size, max_resp_msg_size;
+ struct gdma_dev *gd = &gc->hwc;
+ struct hw_channel_context *hwc;
+ uint16_t q_depth_max;
+ int err;
+
+ hwc = malloc(sizeof(*hwc), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!hwc)
+ return ENOMEM;
+
+ gd->gdma_context = gc;
+ gd->driver_data = hwc;
+ hwc->gdma_dev = gd;
+ hwc->dev = gc->dev;
+
+ /* HWC's instance number is always 0. */
+ gd->dev_id.as_uint32 = 0;
+ gd->dev_id.type = GDMA_DEVICE_HWC;
+
+ gd->pdid = INVALID_PDID;
+ gd->doorbell = INVALID_DOORBELL;
+
+ err = mana_hwc_init_queues(hwc, HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH,
+ HW_CHANNEL_MAX_REQUEST_SIZE,
+ HW_CHANNEL_MAX_RESPONSE_SIZE);
+ if (err) {
+ device_printf(hwc->dev, "Failed to initialize HWC: %d\n",
+ err);
+ goto out;
+ }
+
+ err = mana_hwc_establish_channel(gc, &q_depth_max, &max_req_msg_size,
+ &max_resp_msg_size);
+ if (err) {
+ device_printf(hwc->dev, "Failed to establish HWC: %d\n", err);
+ goto out;
+ }
+
+ err = mana_hwc_test_channel(gc->hwc.driver_data,
+ HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH,
+ max_req_msg_size, max_resp_msg_size);
+ if (err) {
+ /* Test failed, but the channel has been established */
+ device_printf(hwc->dev, "Failed to test HWC: %d\n", err);
+ return EIO;
+ }
+
+ return 0;
+out:
+ free(hwc, M_DEVBUF);
+ return (err);
+}
+
+void
+mana_hwc_destroy_channel(struct gdma_context *gc)
+{
+ struct hw_channel_context *hwc = gc->hwc.driver_data;
+ struct hwc_caller_ctx *ctx;
+
+ mana_smc_teardown_hwc(&gc->shm_channel, false);
+
+ ctx = hwc->caller_ctx;
+ free(ctx, M_DEVBUF);
+ hwc->caller_ctx = NULL;
+
+ mana_hwc_destroy_wq(hwc, hwc->txq);
+ hwc->txq = NULL;
+
+ mana_hwc_destroy_wq(hwc, hwc->rxq);
+ hwc->rxq = NULL;
+
+ mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc->cq);
+ hwc->cq = NULL;
+
+ mana_gd_free_res_map(&hwc->inflight_msg_res);
+
+ hwc->num_inflight_msg = 0;
+
+ if (hwc->gdma_dev->pdid != INVALID_PDID) {
+ hwc->gdma_dev->doorbell = INVALID_DOORBELL;
+ hwc->gdma_dev->pdid = INVALID_PDID;
+ }
+
+ free(hwc, M_DEVBUF);
+ gc->hwc.driver_data = NULL;
+ gc->hwc.gdma_context = NULL;
+}
+
+int
+mana_hwc_send_request(struct hw_channel_context *hwc, uint32_t req_len,
+ const void *req, uint32_t resp_len, void *resp)
+{
+ struct hwc_work_request *tx_wr;
+ struct hwc_wq *txq = hwc->txq;
+ struct gdma_req_hdr *req_msg;
+ struct hwc_caller_ctx *ctx;
+ uint16_t msg_id;
+ int err;
+
+ mana_hwc_get_msg_index(hwc, &msg_id);
+
+ tx_wr = &txq->msg_buf->reqs[msg_id];
+
+ if (req_len > tx_wr->buf_len) {
+ device_printf(hwc->dev,
+ "HWC: req msg size: %d > %d\n", req_len,
+ tx_wr->buf_len);
+ err = EINVAL;
+ goto out;
+ }
+
+ ctx = hwc->caller_ctx + msg_id;
+ ctx->output_buf = resp;
+ ctx->output_buflen = resp_len;
+
+ req_msg = (struct gdma_req_hdr *)tx_wr->buf_va;
+ if (req)
+ memcpy(req_msg, req, req_len);
+
+ req_msg->req.hwc_msg_id = msg_id;
+
+ tx_wr->msg_size = req_len;
+
+ err = mana_hwc_post_tx_wqe(txq, tx_wr, 0, 0, false);
+ if (err) {
+ device_printf(hwc->dev,
+ "HWC: Failed to post send WQE: %d\n", err);
+ goto out;
+ }
+
+ if (wait_for_completion_timeout(&ctx->comp_event, 30 * hz)) {
+ device_printf(hwc->dev, "HWC: Request timed out!\n");
+ err = ETIMEDOUT;
+ goto out;
+ }
+
+ if (ctx->error) {
+ err = ctx->error;
+ goto out;
+ }
+
+ if (ctx->status_code) {
+ device_printf(hwc->dev,
+ "HWC: Failed hw_channel req: 0x%x\n", ctx->status_code);
+ err = EPROTO;
+ goto out;
+ }
+out:
+ mana_hwc_put_msg_index(hwc, msg_id);
+ return err;
+}
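+
+/*
+ * Minimal caller sketch (not compiled): a higher-level request starts with
+ * a struct gdma_req_hdr and the response with a struct gdma_resp_hdr; the
+ * caller checks both the return value and the response status.  The
+ * example_query_req/example_query_resp layouts and example_init_req_hdr()
+ * are hypothetical placeholders.
+ */
+#if 0
+struct example_query_req {
+	struct gdma_req_hdr hdr;
+	uint32_t flags;
+};
+
+struct example_query_resp {
+	struct gdma_resp_hdr hdr;
+	uint32_t value;
+};
+
+static int
+example_query(struct hw_channel_context *hwc, uint32_t *value)
+{
+	struct example_query_resp resp = {};
+	struct example_query_req req = {};
+	int err;
+
+	/* Fill in the message type/size fields of req.hdr (elided). */
+	example_init_req_hdr(&req.hdr, sizeof(req), sizeof(resp));
+
+	err = mana_hwc_send_request(hwc, sizeof(req), &req,
+	    sizeof(resp), &resp);
+	if (err != 0)
+		return (err);
+
+	if (resp.hdr.status != 0)
+		return (EPROTO);
+
+	*value = resp.value;
+	return (0);
+}
+#endif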
diff --git a/sys/dev/mana/hw_channel.h b/sys/dev/mana/hw_channel.h
new file mode 100644
index 000000000000..368cc1ecd5f9
--- /dev/null
+++ b/sys/dev/mana/hw_channel.h
@@ -0,0 +1,222 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _HW_CHANNEL_H
+#define _HW_CHANNEL_H
+
+#include <sys/sema.h>
+
+#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4
+
+#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000
+#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000
+
+#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1
+
+#define HWC_INIT_DATA_CQID 1
+#define HWC_INIT_DATA_RQID 2
+#define HWC_INIT_DATA_SQID 3
+#define HWC_INIT_DATA_QUEUE_DEPTH 4
+#define HWC_INIT_DATA_MAX_REQUEST 5
+#define HWC_INIT_DATA_MAX_RESPONSE 6
+#define HWC_INIT_DATA_MAX_NUM_CQS 7
+#define HWC_INIT_DATA_PDID 8
+#define HWC_INIT_DATA_GPA_MKEY 9
+
+/* Structures labeled with "HW DATA" are exchanged with the hardware. All of
+ * them are naturally aligned and hence don't need __packed.
+ */
+
+union hwc_init_eq_id_db {
+ uint32_t as_uint32;
+
+ struct {
+ uint32_t eq_id : 16;
+ uint32_t doorbell: 16;
+ };
+}; /* HW DATA */
+
+union hwc_init_type_data {
+ uint32_t as_uint32;
+
+ struct {
+ uint32_t value : 24;
+ uint32_t type : 8;
+ };
+}; /* HW DATA */
+
+struct hwc_rx_oob {
+ uint32_t type : 6;
+ uint32_t eom : 1;
+ uint32_t som : 1;
+ uint32_t vendor_err : 8;
+ uint32_t reserved1 : 16;
+
+ uint32_t src_virt_wq : 24;
+ uint32_t src_vfid : 8;
+
+ uint32_t reserved2;
+
+ union {
+ uint32_t wqe_addr_low;
+ uint32_t wqe_offset;
+ };
+
+ uint32_t wqe_addr_high;
+
+ uint32_t client_data_unit : 14;
+ uint32_t reserved3 : 18;
+
+ uint32_t tx_oob_data_size;
+
+ uint32_t chunk_offset : 21;
+ uint32_t reserved4 : 11;
+}; /* HW DATA */
+
+struct hwc_tx_oob {
+ uint32_t reserved1;
+
+ uint32_t reserved2;
+
+ uint32_t vrq_id : 24;
+ uint32_t dest_vfid : 8;
+
+ uint32_t vrcq_id : 24;
+ uint32_t reserved3 : 8;
+
+ uint32_t vscq_id : 24;
+ uint32_t loopback : 1;
+ uint32_t lso_override: 1;
+ uint32_t dest_pf : 1;
+ uint32_t reserved4 : 5;
+
+ uint32_t vsq_id : 24;
+ uint32_t reserved5 : 8;
+}; /* HW DATA */
+
+struct hwc_work_request {
+ void *buf_va;
+ void *buf_sge_addr;
+ uint32_t buf_len;
+ uint32_t msg_size;
+
+ struct gdma_wqe_request wqe_req;
+ struct hwc_tx_oob tx_oob;
+
+ struct gdma_sge sge;
+};
+
+/* hwc_dma_buf represents the array of in-flight WQEs.
+ * mem_info, i.e. the GDMA-mapped memory, is partitioned among the
+ * in-flight WQEs.
+ * The number of WQEs is determined by the number of in-flight messages.
+ */
+struct hwc_dma_buf {
+ struct gdma_mem_info mem_info;
+
+ uint32_t gpa_mkey;
+
+ uint32_t num_reqs;
+ struct hwc_work_request reqs[];
+};
+
+typedef void hwc_rx_event_handler_t(void *ctx, uint32_t gdma_rxq_id,
+ const struct hwc_rx_oob *rx_oob);
+
+typedef void hwc_tx_event_handler_t(void *ctx, uint32_t gdma_txq_id,
+ const struct hwc_rx_oob *rx_oob);
+
+struct hwc_cq {
+ struct hw_channel_context *hwc;
+
+ struct gdma_queue *gdma_cq;
+ struct gdma_queue *gdma_eq;
+ struct gdma_comp *comp_buf;
+ uint16_t queue_depth;
+
+ hwc_rx_event_handler_t *rx_event_handler;
+ void *rx_event_ctx;
+
+ hwc_tx_event_handler_t *tx_event_handler;
+ void *tx_event_ctx;
+};
+
+struct hwc_wq {
+ struct hw_channel_context *hwc;
+
+ struct gdma_queue *gdma_wq;
+ struct hwc_dma_buf *msg_buf;
+ uint16_t queue_depth;
+
+ struct hwc_cq *hwc_cq;
+};
+
+struct hwc_caller_ctx {
+ struct completion comp_event;
+ void *output_buf;
+ uint32_t output_buflen;
+
+ uint32_t error; /* Error code */
+ uint32_t status_code;
+};
+
+struct hw_channel_context {
+ struct gdma_dev *gdma_dev;
+ device_t dev;
+
+ uint16_t num_inflight_msg;
+ uint32_t max_req_msg_size;
+
+ uint16_t hwc_init_q_depth_max;
+ uint32_t hwc_init_max_req_msg_size;
+ uint32_t hwc_init_max_resp_msg_size;
+
+ struct completion hwc_init_eqe_comp;
+
+ struct hwc_wq *rxq;
+ struct hwc_wq *txq;
+ struct hwc_cq *cq;
+
+ struct sema sema;
+ struct gdma_resource inflight_msg_res;
+
+ struct hwc_caller_ctx *caller_ctx;
+};
+
+int mana_hwc_create_channel(struct gdma_context *gc);
+void mana_hwc_destroy_channel(struct gdma_context *gc);
+
+int mana_hwc_send_request(struct hw_channel_context *hwc, uint32_t req_len,
+ const void *req, uint32_t resp_len, void *resp);
+
+#endif /* _HW_CHANNEL_H */
diff --git a/sys/dev/mana/mana.h b/sys/dev/mana/mana.h
new file mode 100644
index 000000000000..683ab67a6abd
--- /dev/null
+++ b/sys/dev/mana/mana.h
@@ -0,0 +1,689 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _MANA_H
+#define _MANA_H
+
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/counter.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_media.h>
+#include <netinet/tcp_lro.h>
+
+#include "gdma.h"
+#include "hw_channel.h"
+
+
+/* Microsoft Azure Network Adapter (MANA)'s definitions
+ *
+ * Structures labeled with "HW DATA" are exchanged with the hardware. All of
+ * them are naturally aligned and hence don't need __packed.
+ */
+/* MANA protocol version */
+#define MANA_MAJOR_VERSION 0
+#define MANA_MINOR_VERSION 1
+#define MANA_MICRO_VERSION 1
+
+#define DRV_MODULE_NAME "mana"
+
+#ifndef DRV_MODULE_VERSION
+#define DRV_MODULE_VERSION \
+ __XSTRING(MANA_MAJOR_VERSION) "." \
+ __XSTRING(MANA_MINOR_VERSION) "." \
+ __XSTRING(MANA_MICRO_VERSION)
+#endif
+#define DEVICE_NAME "Microsoft Azure Network Adapter (MANA)"
+#define DEVICE_DESC "MANA adapter"
+
+/*
+ * Supported PCI vendor and devices IDs
+ */
+#ifndef PCI_VENDOR_ID_MICROSOFT
+#define PCI_VENDOR_ID_MICROSOFT 0x1414
+#endif
+
+#define PCI_DEV_ID_MANA_VF 0x00ba
+
+typedef struct _mana_vendor_id_t {
+ uint16_t vendor_id;
+ uint16_t device_id;
+} mana_vendor_id_t;
+
+typedef uint64_t mana_handle_t;
+#define INVALID_MANA_HANDLE ((mana_handle_t)-1)
+
+enum TRI_STATE {
+ TRI_STATE_UNKNOWN = -1,
+ TRI_STATE_FALSE = 0,
+ TRI_STATE_TRUE = 1
+};
+
+/* The number of hardware indirection table entries must be a power of 2 */
+#define MANA_INDIRECT_TABLE_SIZE 64
+#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1)
+
+/* The Toeplitz hash key's length in bytes: should be a multiple of 8 */
+#define MANA_HASH_KEY_SIZE 40
+
+#define COMP_ENTRY_SIZE 64
+
+#define MIN_FRAME_SIZE 146
+#define ADAPTER_MTU_SIZE 1500
+#define DEFAULT_FRAME_SIZE (ADAPTER_MTU_SIZE + 14)
+#define MAX_FRAME_SIZE 4096
+
+#define RX_BUFFERS_PER_QUEUE 512
+
+#define MAX_SEND_BUFFERS_PER_QUEUE 256
+
+#define EQ_SIZE (8 * PAGE_SIZE)
+#define LOG2_EQ_THROTTLE 3
+
+#if 1 /* XXX */
+#define MAX_PORTS_IN_MANA_DEV 1
+#else
+#define MAX_PORTS_IN_MANA_DEV 16
+#endif
+
+struct mana_send_buf_info {
+ struct mbuf *mbuf;
+ bus_dmamap_t dma_map;
+
+ /* Required to store the result of mana_gd_post_work_request.
+ * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the
+ * work queue when the WQE is consumed.
+ */
+ struct gdma_posted_wqe_info wqe_inf;
+};
+
+struct mana_stats {
+ counter_u64_t packets; /* rx, tx */
+ counter_u64_t bytes; /* rx, tx */
+ counter_u64_t stop; /* tx */
+ counter_u64_t wakeup; /* tx */
+ counter_u64_t collapse; /* tx */
+ counter_u64_t collapse_err; /* tx */
+ counter_u64_t dma_mapping_err; /* rx, tx */
+ counter_u64_t mbuf_alloc_fail; /* rx */
+ counter_u64_t alt_chg; /* tx */
+ counter_u64_t alt_reset; /* tx */
+};
+
+struct mana_txq {
+ struct gdma_queue *gdma_sq;
+
+ union {
+ uint32_t gdma_txq_id;
+ struct {
+ uint32_t reserved1 :10;
+ uint32_t vsq_frame :14;
+ uint32_t reserved2 :8;
+ };
+ };
+
+ uint16_t vp_offset;
+
+ struct ifnet *ndev;
+ /* Store index to the array of tx_qp in port structure */
+ int idx;
+ /* The alternative txq idx when this txq is under heavy load */
+ int alt_txq_idx;
+
+ /* The mbufs are sent to the HW and we are waiting for the CQEs. */
+ struct mana_send_buf_info *tx_buf_info;
+ uint16_t next_to_use;
+ uint16_t next_to_complete;
+
+ atomic_t pending_sends;
+
+ struct buf_ring *txq_br;
+ struct mtx txq_mtx;
+ char txq_mtx_name[16];
+
+ struct task enqueue_task;
+ struct taskqueue *enqueue_tq;
+
+ struct mana_stats stats;
+};
+
+
+/*
+ * The max WQE size is 512B. The first 8B is the GDMA Out of Band (OOB);
+ * next comes the Client OOB, which is either 8B or 24B. Thus, with the
+ * 8B Client OOB, the max space for SGL entries in a single WQE is
+ * 512 - 8 - 8 = 496B. Since each SGL entry is 16B, the max number of SGLs
+ * in a WQE is 496 / 16 = 31. Save one for emergency use and set
+ * MAX_MBUF_FRAGS to 30.
+ */
+#define MAX_MBUF_FRAGS 30
+#define MANA_TSO_MAXSEG_SZ PAGE_SIZE
+
+/* mbuf data and frags dma mappings */
+struct mana_mbuf_head {
+ bus_addr_t dma_handle[MAX_MBUF_FRAGS + 1];
+
+ uint32_t size[MAX_MBUF_FRAGS + 1];
+};
+
+#define MANA_HEADROOM sizeof(struct mana_mbuf_head)
+
+enum mana_tx_pkt_format {
+ MANA_SHORT_PKT_FMT = 0,
+ MANA_LONG_PKT_FMT = 1,
+};
+
+struct mana_tx_short_oob {
+ uint32_t pkt_fmt :2;
+ uint32_t is_outer_ipv4 :1;
+ uint32_t is_outer_ipv6 :1;
+ uint32_t comp_iphdr_csum :1;
+ uint32_t comp_tcp_csum :1;
+ uint32_t comp_udp_csum :1;
+ uint32_t supress_txcqe_gen :1;
+ uint32_t vcq_num :24;
+
+ uint32_t trans_off :10; /* Transport header offset */
+ uint32_t vsq_frame :14;
+ uint32_t short_vp_offset :8;
+}; /* HW DATA */
+
+struct mana_tx_long_oob {
+ uint32_t is_encap :1;
+ uint32_t inner_is_ipv6 :1;
+ uint32_t inner_tcp_opt :1;
+ uint32_t inject_vlan_pri_tag :1;
+ uint32_t reserved1 :12;
+ uint32_t pcp :3; /* 802.1Q */
+ uint32_t dei :1; /* 802.1Q */
+ uint32_t vlan_id :12; /* 802.1Q */
+
+ uint32_t inner_frame_offset :10;
+ uint32_t inner_ip_rel_offset :6;
+ uint32_t long_vp_offset :12;
+ uint32_t reserved2 :4;
+
+ uint32_t reserved3;
+ uint32_t reserved4;
+}; /* HW DATA */
+
+struct mana_tx_oob {
+ struct mana_tx_short_oob s_oob;
+ struct mana_tx_long_oob l_oob;
+}; /* HW DATA */
+
+enum mana_cq_type {
+ MANA_CQ_TYPE_RX,
+ MANA_CQ_TYPE_TX,
+};
+
+enum mana_cqe_type {
+ CQE_INVALID = 0,
+ CQE_RX_OKAY = 1,
+ CQE_RX_COALESCED_4 = 2,
+ CQE_RX_OBJECT_FENCE = 3,
+ CQE_RX_TRUNCATED = 4,
+
+ CQE_TX_OKAY = 32,
+ CQE_TX_SA_DROP = 33,
+ CQE_TX_MTU_DROP = 34,
+ CQE_TX_INVALID_OOB = 35,
+ CQE_TX_INVALID_ETH_TYPE = 36,
+ CQE_TX_HDR_PROCESSING_ERROR = 37,
+ CQE_TX_VF_DISABLED = 38,
+ CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39,
+ CQE_TX_VPORT_DISABLED = 40,
+ CQE_TX_VLAN_TAGGING_VIOLATION = 41,
+};
+
+#define MANA_CQE_COMPLETION 1
+
+struct mana_cqe_header {
+ uint32_t cqe_type :6;
+ uint32_t client_type :2;
+ uint32_t vendor_err :24;
+}; /* HW DATA */
+
+/* NDIS HASH Types */
+#define NDIS_HASH_IPV4 BIT(0)
+#define NDIS_HASH_TCP_IPV4 BIT(1)
+#define NDIS_HASH_UDP_IPV4 BIT(2)
+#define NDIS_HASH_IPV6 BIT(3)
+#define NDIS_HASH_TCP_IPV6 BIT(4)
+#define NDIS_HASH_UDP_IPV6 BIT(5)
+#define NDIS_HASH_IPV6_EX BIT(6)
+#define NDIS_HASH_TCP_IPV6_EX BIT(7)
+#define NDIS_HASH_UDP_IPV6_EX BIT(8)
+
+#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX)
+#define MANA_HASH_L4 \
+ (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \
+ NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
+
+#define NDIS_HASH_IPV4_L3_MASK (NDIS_HASH_IPV4)
+#define NDIS_HASH_IPV4_L4_MASK (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4)
+#define NDIS_HASH_IPV6_L3_MASK (NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX)
+#define NDIS_HASH_IPV6_L4_MASK \
+ (NDIS_HASH_TCP_IPV6 | NDIS_HASH_UDP_IPV6 | \
+ NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
+#define NDIS_HASH_IPV4_MASK \
+ (NDIS_HASH_IPV4_L3_MASK | NDIS_HASH_IPV4_L4_MASK)
+#define NDIS_HASH_IPV6_MASK \
+ (NDIS_HASH_IPV6_L3_MASK | NDIS_HASH_IPV6_L4_MASK)
+
+
+struct mana_rxcomp_perpkt_info {
+ uint32_t pkt_len :16;
+ uint32_t reserved1 :16;
+ uint32_t reserved2;
+ uint32_t pkt_hash;
+}; /* HW DATA */
+
+#define MANA_RXCOMP_OOB_NUM_PPI 4
+
+/* Receive completion OOB */
+struct mana_rxcomp_oob {
+ struct mana_cqe_header cqe_hdr;
+
+ uint32_t rx_vlan_id :12;
+ uint32_t rx_vlantag_present :1;
+ uint32_t rx_outer_iphdr_csum_succeed :1;
+ uint32_t rx_outer_iphdr_csum_fail :1;
+ uint32_t reserved1 :1;
+ uint32_t rx_hashtype :9;
+ uint32_t rx_iphdr_csum_succeed :1;
+ uint32_t rx_iphdr_csum_fail :1;
+ uint32_t rx_tcp_csum_succeed :1;
+ uint32_t rx_tcp_csum_fail :1;
+ uint32_t rx_udp_csum_succeed :1;
+ uint32_t rx_udp_csum_fail :1;
+ uint32_t reserved2 :1;
+
+ struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI];
+
+ uint32_t rx_wqe_offset;
+}; /* HW DATA */
+
+struct mana_tx_comp_oob {
+ struct mana_cqe_header cqe_hdr;
+
+ uint32_t tx_data_offset;
+
+ uint32_t tx_sgl_offset :5;
+ uint32_t tx_wqe_offset :27;
+
+ uint32_t reserved[12];
+}; /* HW DATA */
+
+struct mana_rxq;
+
+struct mana_cq {
+ struct gdma_queue *gdma_cq;
+
+	/* Cache the CQ id (used to verify that each CQE comes to the right CQ). */
+ uint32_t gdma_id;
+
+ /* Type of the CQ: TX or RX */
+ enum mana_cq_type type;
+
+ /* Pointer to the mana_rxq that is pushing RX CQEs to the queue.
+	 * Must be non-NULL if and only if the type is MANA_CQ_TYPE_RX.
+ */
+ struct mana_rxq *rxq;
+
+	/* Pointer to the mana_txq that is pushing TX CQEs to this queue.
+	 * Non-NULL if and only if type is MANA_CQ_TYPE_TX.
+	 */
+ struct mana_txq *txq;
+
+ /* Pointer to a buffer which the CQ handler can copy the CQE's into. */
+ struct gdma_comp *gdma_comp_buf;
+};
+
+#define GDMA_MAX_RQE_SGES 15
+
+struct mana_recv_buf_oob {
+ /* A valid GDMA work request representing the data buffer. */
+ struct gdma_wqe_request wqe_req;
+
+ struct mbuf *mbuf;
+ bus_dmamap_t dma_map;
+
+ /* SGL of the buffer going to be sent as part of the work request. */
+ uint32_t num_sge;
+ struct gdma_sge sgl[GDMA_MAX_RQE_SGES];
+
+ /* Required to store the result of mana_gd_post_work_request.
+ * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the
+ * work queue when the WQE is consumed.
+ */
+ struct gdma_posted_wqe_info wqe_inf;
+};
+
+struct mana_rxq {
+ struct gdma_queue *gdma_rq;
+ /* Cache the gdma receive queue id */
+ uint32_t gdma_id;
+
+ /* Index of RQ in the vPort, not gdma receive queue id */
+ uint32_t rxq_idx;
+
+ uint32_t datasize;
+
+ mana_handle_t rxobj;
+
+ struct mana_cq rx_cq;
+
+ struct ifnet *ndev;
+ struct lro_ctrl lro;
+
+ /* Total number of receive buffers to be allocated */
+ uint32_t num_rx_buf;
+
+ uint32_t buf_index;
+
+ struct mana_stats stats;
+
+ /* MUST BE THE LAST MEMBER:
+ * Each receive buffer has an associated mana_recv_buf_oob.
+ */
+ struct mana_recv_buf_oob rx_oobs[];
+};
+
+struct mana_tx_qp {
+ struct mana_txq txq;
+
+ struct mana_cq tx_cq;
+
+ mana_handle_t tx_object;
+};
+
+struct mana_port_stats {
+ counter_u64_t rx_packets;
+ counter_u64_t tx_packets;
+
+ counter_u64_t rx_bytes;
+ counter_u64_t tx_bytes;
+
+ counter_u64_t rx_drops;
+ counter_u64_t tx_drops;
+
+ counter_u64_t stop_queue;
+ counter_u64_t wake_queue;
+};
+
+struct mana_context {
+ struct gdma_dev *gdma_dev;
+
+ uint16_t num_ports;
+
+ struct ifnet *ports[MAX_PORTS_IN_MANA_DEV];
+};
+
+struct mana_port_context {
+ struct mana_context *ac;
+ struct ifnet *ndev;
+ struct ifmedia media;
+
+ struct sx apc_lock;
+
+ /* DMA tag used for queue bufs of the entire port */
+ bus_dma_tag_t rx_buf_tag;
+ bus_dma_tag_t tx_buf_tag;
+
+ uint8_t mac_addr[ETHER_ADDR_LEN];
+
+ struct mana_eq *eqs;
+
+ enum TRI_STATE rss_state;
+
+ mana_handle_t default_rxobj;
+ bool tx_shortform_allowed;
+ uint16_t tx_vp_offset;
+
+ struct mana_tx_qp *tx_qp;
+
+ /* Indirection Table for RX & TX. The values are queue indexes */
+ uint32_t indir_table[MANA_INDIRECT_TABLE_SIZE];
+
+ /* Indirection table containing RxObject Handles */
+ mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE];
+
+ /* Hash key used by the NIC */
+ uint8_t hashkey[MANA_HASH_KEY_SIZE];
+
+ /* This points to an array of num_queues of RQ pointers. */
+ struct mana_rxq **rxqs;
+
+ /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
+ unsigned int max_queues;
+ unsigned int num_queues;
+
+ mana_handle_t port_handle;
+
+ uint16_t port_idx;
+
+ uint16_t frame_size;
+
+ bool port_is_up;
+ bool port_st_save; /* Saved port state */
+
+ bool enable_tx_altq;
+ bool bind_cleanup_thread_cpu;
+
+ struct mana_port_stats port_stats;
+
+ struct sysctl_oid_list *port_list;
+ struct sysctl_ctx_list que_sysctl_ctx;
+};
+
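+/*
+ * apc_lock serializes port state transitions (up/down) and
+ * reconfiguration paths such as MTU changes and mana_restart().
+ */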
+#define MANA_APC_LOCK_INIT(apc) \
+ sx_init(&(apc)->apc_lock, "MANA port lock")
+#define MANA_APC_LOCK_DESTROY(apc) sx_destroy(&(apc)->apc_lock)
+#define MANA_APC_LOCK_LOCK(apc) sx_xlock(&(apc)->apc_lock)
+#define MANA_APC_LOCK_UNLOCK(apc) sx_unlock(&(apc)->apc_lock)
+
+int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx,
+ bool update_hash, bool update_tab);
+
+int mana_alloc_queues(struct ifnet *ndev);
+int mana_attach(struct ifnet *ndev);
+int mana_detach(struct ifnet *ndev);
+
+int mana_probe(struct gdma_dev *gd);
+void mana_remove(struct gdma_dev *gd);
+
+struct mana_obj_spec {
+ uint32_t queue_index;
+ uint64_t gdma_region;
+ uint32_t queue_size;
+ uint32_t attached_eq;
+ uint32_t modr_ctx_id;
+};
+
+enum mana_command_code {
+ MANA_QUERY_DEV_CONFIG = 0x20001,
+ MANA_QUERY_GF_STAT = 0x20002,
+ MANA_CONFIG_VPORT_TX = 0x20003,
+ MANA_CREATE_WQ_OBJ = 0x20004,
+ MANA_DESTROY_WQ_OBJ = 0x20005,
+ MANA_FENCE_RQ = 0x20006,
+ MANA_CONFIG_VPORT_RX = 0x20007,
+ MANA_QUERY_VPORT_CONFIG = 0x20008,
+};
+
+/* Query Device Configuration */
+struct mana_query_device_cfg_req {
+ struct gdma_req_hdr hdr;
+
+ /* Driver Capability flags */
+ uint64_t drv_cap_flags1;
+ uint64_t drv_cap_flags2;
+ uint64_t drv_cap_flags3;
+ uint64_t drv_cap_flags4;
+
+ uint32_t proto_major_ver;
+ uint32_t proto_minor_ver;
+ uint32_t proto_micro_ver;
+
+ uint32_t reserved;
+}; /* HW DATA */
+
+struct mana_query_device_cfg_resp {
+ struct gdma_resp_hdr hdr;
+
+ uint64_t pf_cap_flags1;
+ uint64_t pf_cap_flags2;
+ uint64_t pf_cap_flags3;
+ uint64_t pf_cap_flags4;
+
+ uint16_t max_num_vports;
+ uint16_t reserved;
+ uint32_t max_num_eqs;
+}; /* HW DATA */
+
+/* Query vPort Configuration */
+struct mana_query_vport_cfg_req {
+ struct gdma_req_hdr hdr;
+ uint32_t vport_index;
+}; /* HW DATA */
+
+struct mana_query_vport_cfg_resp {
+ struct gdma_resp_hdr hdr;
+ uint32_t max_num_sq;
+ uint32_t max_num_rq;
+ uint32_t num_indirection_ent;
+ uint32_t reserved1;
+ uint8_t mac_addr[6];
+ uint8_t reserved2[2];
+ mana_handle_t vport;
+}; /* HW DATA */
+
+/* Configure vPort */
+struct mana_config_vport_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+ uint32_t pdid;
+ uint32_t doorbell_pageid;
+}; /* HW DATA */
+
+struct mana_config_vport_resp {
+ struct gdma_resp_hdr hdr;
+ uint16_t tx_vport_offset;
+ uint8_t short_form_allowed;
+ uint8_t reserved;
+}; /* HW DATA */
+
+/* Create WQ Object */
+struct mana_create_wqobj_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+ uint32_t wq_type;
+ uint32_t reserved;
+ uint64_t wq_gdma_region;
+ uint64_t cq_gdma_region;
+ uint32_t wq_size;
+ uint32_t cq_size;
+ uint32_t cq_moderation_ctx_id;
+ uint32_t cq_parent_qid;
+}; /* HW DATA */
+
+struct mana_create_wqobj_resp {
+ struct gdma_resp_hdr hdr;
+ uint32_t wq_id;
+ uint32_t cq_id;
+ mana_handle_t wq_obj;
+}; /* HW DATA */
+
+/* Destroy WQ Object */
+struct mana_destroy_wqobj_req {
+ struct gdma_req_hdr hdr;
+ uint32_t wq_type;
+ uint32_t reserved;
+ mana_handle_t wq_obj_handle;
+}; /* HW DATA */
+
+struct mana_destroy_wqobj_resp {
+ struct gdma_resp_hdr hdr;
+}; /* HW DATA */
+
+/* Fence RQ */
+struct mana_fence_rq_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t wq_obj_handle;
+}; /* HW DATA */
+
+struct mana_fence_rq_resp {
+ struct gdma_resp_hdr hdr;
+}; /* HW DATA */
+
+/* Configure vPort Rx Steering */
+struct mana_cfg_rx_steer_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+ uint16_t num_indir_entries;
+ uint16_t indir_tab_offset;
+ uint32_t rx_enable;
+ uint32_t rss_enable;
+ uint8_t update_default_rxobj;
+ uint8_t update_hashkey;
+ uint8_t update_indir_tab;
+ uint8_t reserved;
+ mana_handle_t default_rxobj;
+ uint8_t hashkey[MANA_HASH_KEY_SIZE];
+}; /* HW DATA */
+
+struct mana_cfg_rx_steer_resp {
+ struct gdma_resp_hdr hdr;
+}; /* HW DATA */
+
+#define MANA_MAX_NUM_QUEUES 16
+
+#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1)
+
+struct mana_tx_package {
+ struct gdma_wqe_request wqe_req;
+ struct gdma_sge sgl_array[MAX_MBUF_FRAGS];
+
+ struct mana_tx_oob tx_oob;
+
+ struct gdma_posted_wqe_info wqe_info;
+};
+
+int mana_restart(struct mana_port_context *apc);
+
+#endif /* _MANA_H */
diff --git a/sys/dev/mana/mana_en.c b/sys/dev/mana/mana_en.c
new file mode 100644
index 000000000000..e6cffb852d70
--- /dev/null
+++ b/sys/dev/mana/mana_en.c
@@ -0,0 +1,2699 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/time.h>
+#include <sys/eventhandler.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <machine/in_cksum.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#ifdef RSS
+#include <net/rss_config.h>
+#endif
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include "mana.h"
+#include "mana_sysctl.h"
+
+static int mana_up(struct mana_port_context *apc);
+static int mana_down(struct mana_port_context *apc);
+
+static void
+mana_rss_key_fill(void *k, size_t size)
+{
+ static bool rss_key_generated = false;
+ static uint8_t rss_key[MANA_HASH_KEY_SIZE];
+
+ KASSERT(size <= MANA_HASH_KEY_SIZE,
+	    ("Requested more bytes than the MANA RSS key can hold"));
+
+ if (!rss_key_generated) {
+ arc4random_buf(rss_key, MANA_HASH_KEY_SIZE);
+ rss_key_generated = true;
+ }
+ memcpy(k, rss_key, size);
+}
+
+static int
+mana_ifmedia_change(struct ifnet *ifp __unused)
+{
+ return EOPNOTSUPP;
+}
+
+static void
+mana_ifmedia_status(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ struct mana_port_context *apc = if_getsoftc(ifp);
+
+ if (!apc) {
+ if_printf(ifp, "Port not available\n");
+ return;
+ }
+
+ MANA_APC_LOCK_LOCK(apc);
+
+ ifmr->ifm_status = IFM_AVALID;
+ ifmr->ifm_active = IFM_ETHER;
+
+ if (!apc->port_is_up) {
+ MANA_APC_LOCK_UNLOCK(apc);
+ mana_info(NULL, "Port %u link is down\n", apc->port_idx);
+ return;
+ }
+
+ ifmr->ifm_status |= IFM_ACTIVE;
+ ifmr->ifm_active |= IFM_100G_DR | IFM_FDX;
+
+ MANA_APC_LOCK_UNLOCK(apc);
+}
+
+static uint64_t
+mana_get_counter(struct ifnet *ifp, ift_counter cnt)
+{
+ struct mana_port_context *apc = if_getsoftc(ifp);
+ struct mana_port_stats *stats = &apc->port_stats;
+
+ switch (cnt) {
+ case IFCOUNTER_IPACKETS:
+ return (counter_u64_fetch(stats->rx_packets));
+ case IFCOUNTER_OPACKETS:
+ return (counter_u64_fetch(stats->tx_packets));
+ case IFCOUNTER_IBYTES:
+ return (counter_u64_fetch(stats->rx_bytes));
+ case IFCOUNTER_OBYTES:
+ return (counter_u64_fetch(stats->tx_bytes));
+ case IFCOUNTER_IQDROPS:
+ return (counter_u64_fetch(stats->rx_drops));
+ case IFCOUNTER_OQDROPS:
+ return (counter_u64_fetch(stats->tx_drops));
+ default:
+ return (if_get_counter_default(ifp, cnt));
+ }
+}
+
+static void
+mana_drain_eq_task(struct gdma_queue *queue)
+{
+ if (!queue || !queue->eq.cleanup_tq)
+ return;
+
+ while (taskqueue_cancel(queue->eq.cleanup_tq,
+ &queue->eq.cleanup_task, NULL)) {
+ taskqueue_drain(queue->eq.cleanup_tq,
+ &queue->eq.cleanup_task);
+ }
+}
+
+static void
+mana_qflush(struct ifnet *ifp)
+{
+ if_qflush(ifp);
+}
+
+int
+mana_restart(struct mana_port_context *apc)
+{
+ int rc = 0;
+
+ MANA_APC_LOCK_LOCK(apc);
+ if (apc->port_is_up)
+ mana_down(apc);
+
+ rc = mana_up(apc);
+ MANA_APC_LOCK_UNLOCK(apc);
+
+ return (rc);
+}
+
+static int
+mana_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
+{
+ struct mana_port_context *apc = if_getsoftc(ifp);
+ struct ifrsskey *ifrk;
+ struct ifrsshash *ifrh;
+ struct ifreq *ifr;
+ uint16_t new_mtu;
+ int rc = 0;
+
+ switch (command) {
+ case SIOCSIFMTU:
+ ifr = (struct ifreq *)data;
+ new_mtu = ifr->ifr_mtu;
+ if (ifp->if_mtu == new_mtu)
+ break;
+ if ((new_mtu + 18 > MAX_FRAME_SIZE) ||
+ (new_mtu + 18 < MIN_FRAME_SIZE)) {
+ if_printf(ifp, "Invalid MTU. new_mtu: %d, "
+ "max allowed: %d, min allowed: %d\n",
+ new_mtu, MAX_FRAME_SIZE - 18, MIN_FRAME_SIZE - 18);
+ return EINVAL;
+ }
+ MANA_APC_LOCK_LOCK(apc);
+ if (apc->port_is_up)
+ mana_down(apc);
+
+ apc->frame_size = new_mtu + 18;
+ if_setmtu(ifp, new_mtu);
+ mana_dbg(NULL, "Set MTU to %d\n", new_mtu);
+
+ rc = mana_up(apc);
+ MANA_APC_LOCK_UNLOCK(apc);
+ break;
+
+ case SIOCSIFFLAGS:
+ if (ifp->if_flags & IFF_UP) {
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ MANA_APC_LOCK_LOCK(apc);
+ if (!apc->port_is_up)
+ rc = mana_up(apc);
+ MANA_APC_LOCK_UNLOCK(apc);
+ }
+ } else {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ MANA_APC_LOCK_LOCK(apc);
+ if (apc->port_is_up)
+ mana_down(apc);
+ MANA_APC_LOCK_UNLOCK(apc);
+ }
+ }
+ break;
+
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ case SIOCGIFXMEDIA:
+ ifr = (struct ifreq *)data;
+ rc = ifmedia_ioctl(ifp, ifr, &apc->media, command);
+ break;
+
+ case SIOCGIFRSSKEY:
+ ifrk = (struct ifrsskey *)data;
+ ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
+ ifrk->ifrk_keylen = MANA_HASH_KEY_SIZE;
+ memcpy(ifrk->ifrk_key, apc->hashkey, MANA_HASH_KEY_SIZE);
+ break;
+
+ case SIOCGIFRSSHASH:
+ ifrh = (struct ifrsshash *)data;
+ ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
+ ifrh->ifrh_types =
+ RSS_TYPE_TCP_IPV4 |
+ RSS_TYPE_UDP_IPV4 |
+ RSS_TYPE_TCP_IPV6 |
+ RSS_TYPE_UDP_IPV6;
+ break;
+
+ default:
+ rc = ether_ioctl(ifp, command, data);
+ break;
+ }
+
+ return (rc);
+}
+
+static inline void
+mana_alloc_counters(counter_u64_t *begin, int size)
+{
+ counter_u64_t *end = (counter_u64_t *)((char *)begin + size);
+
+ for (; begin < end; ++begin)
+ *begin = counter_u64_alloc(M_WAITOK);
+}
+
+static inline void
+mana_free_counters(counter_u64_t *begin, int size)
+{
+ counter_u64_t *end = (counter_u64_t *)((char *)begin + size);
+
+ for (; begin < end; ++begin)
+ counter_u64_free(*begin);
+}
+
+static inline void
+mana_reset_counters(counter_u64_t *begin, int size)
+{
+ counter_u64_t *end = (counter_u64_t *)((char *)begin + size);
+
+ for (; begin < end; ++begin)
+ counter_u64_zero(*begin);
+}
+
+static bool
+mana_can_tx(struct gdma_queue *wq)
+{
+ return mana_gd_wq_avail_space(wq) >= MAX_TX_WQE_SIZE;
+}
+
+static inline int
+mana_tx_map_mbuf(struct mana_port_context *apc,
+ struct mana_send_buf_info *tx_info,
+ struct mbuf **m_head, struct mana_tx_package *tp,
+ struct mana_stats *tx_stats)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ bus_dma_segment_t segs[MAX_MBUF_FRAGS];
+ struct mbuf *m = *m_head;
+ int err, nsegs, i;
+
+ err = bus_dmamap_load_mbuf_sg(apc->tx_buf_tag, tx_info->dma_map,
+ m, segs, &nsegs, BUS_DMA_NOWAIT);
+ if (err == EFBIG) {
+ struct mbuf *m_new;
+
+ counter_u64_add(tx_stats->collapse, 1);
+ m_new = m_collapse(m, M_NOWAIT, MAX_MBUF_FRAGS);
+ if (unlikely(m_new == NULL)) {
+ counter_u64_add(tx_stats->collapse_err, 1);
+ return ENOBUFS;
+ } else {
+ *m_head = m = m_new;
+ }
+
+ mana_warn(NULL,
+ "Too many segs in orig mbuf, m_collapse called\n");
+
+ err = bus_dmamap_load_mbuf_sg(apc->tx_buf_tag,
+ tx_info->dma_map, m, segs, &nsegs, BUS_DMA_NOWAIT);
+ }
+ if (!err) {
+ for (i = 0; i < nsegs; i++) {
+ tp->wqe_req.sgl[i].address = segs[i].ds_addr;
+ tp->wqe_req.sgl[i].mem_key = gd->gpa_mkey;
+ tp->wqe_req.sgl[i].size = segs[i].ds_len;
+ }
+ tp->wqe_req.num_sge = nsegs;
+
+ tx_info->mbuf = *m_head;
+
+ bus_dmamap_sync(apc->tx_buf_tag, tx_info->dma_map,
+ BUS_DMASYNC_PREWRITE);
+ }
+
+ return err;
+}
+
+static inline void
+mana_tx_unmap_mbuf(struct mana_port_context *apc,
+ struct mana_send_buf_info *tx_info)
+{
+ bus_dmamap_sync(apc->tx_buf_tag, tx_info->dma_map,
+ BUS_DMASYNC_POSTWRITE);
+ bus_dmamap_unload(apc->tx_buf_tag, tx_info->dma_map);
+ if (tx_info->mbuf) {
+ m_freem(tx_info->mbuf);
+ tx_info->mbuf = NULL;
+ }
+}
+
+static inline int
+mana_load_rx_mbuf(struct mana_port_context *apc, struct mana_rxq *rxq,
+ struct mana_recv_buf_oob *rx_oob, bool alloc_mbuf)
+{
+ bus_dma_segment_t segs[1];
+ struct mbuf *mbuf;
+ int nsegs, err;
+ uint32_t mlen;
+
+ if (alloc_mbuf) {
+ mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxq->datasize);
+ if (unlikely(mbuf == NULL)) {
+ mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ if (unlikely(mbuf == NULL)) {
+ return ENOMEM;
+ }
+ mlen = MCLBYTES;
+ } else {
+ mlen = rxq->datasize;
+ }
+
+ mbuf->m_pkthdr.len = mbuf->m_len = mlen;
+ } else {
+ if (rx_oob->mbuf) {
+ mbuf = rx_oob->mbuf;
+ mlen = rx_oob->mbuf->m_pkthdr.len;
+ } else {
+ return ENOMEM;
+ }
+ }
+
+ err = bus_dmamap_load_mbuf_sg(apc->rx_buf_tag, rx_oob->dma_map,
+ mbuf, segs, &nsegs, BUS_DMA_NOWAIT);
+
+ if (unlikely((err != 0) || (nsegs != 1))) {
+ mana_warn(NULL, "Failed to map mbuf, error: %d, "
+ "nsegs: %d\n", err, nsegs);
+ counter_u64_add(rxq->stats.dma_mapping_err, 1);
+ goto error;
+ }
+
+ bus_dmamap_sync(apc->rx_buf_tag, rx_oob->dma_map,
+ BUS_DMASYNC_PREREAD);
+
+ rx_oob->mbuf = mbuf;
+ rx_oob->num_sge = 1;
+ rx_oob->sgl[0].address = segs[0].ds_addr;
+ rx_oob->sgl[0].size = mlen;
+ rx_oob->sgl[0].mem_key = apc->ac->gdma_dev->gpa_mkey;
+
+ return 0;
+
+error:
+ m_freem(mbuf);
+ return EFAULT;
+}
+
+static inline void
+mana_unload_rx_mbuf(struct mana_port_context *apc, struct mana_rxq *rxq,
+ struct mana_recv_buf_oob *rx_oob, bool free_mbuf)
+{
+ bus_dmamap_sync(apc->rx_buf_tag, rx_oob->dma_map,
+ BUS_DMASYNC_POSTREAD);
+ bus_dmamap_unload(apc->rx_buf_tag, rx_oob->dma_map);
+
+ if (free_mbuf && rx_oob->mbuf) {
+ m_freem(rx_oob->mbuf);
+ rx_oob->mbuf = NULL;
+ }
+}
+
+
+/* Use a couple of mbuf PH_loc spaces for the L3 and L4 protocol types */
+#define MANA_L3_PROTO(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[0])
+#define MANA_L4_PROTO(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[1])
+
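+/*
+ * Both IFF_DRV_RUNNING and IFF_DRV_OACTIVE set means the interface is
+ * up but transmit is temporarily stopped because the send queue is full.
+ */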
+#define MANA_TXQ_FULL (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)
+
+static void
+mana_xmit(struct mana_txq *txq)
+{
+ enum mana_tx_pkt_format pkt_fmt = MANA_SHORT_PKT_FMT;
+ struct mana_send_buf_info *tx_info;
+ struct ifnet *ndev = txq->ndev;
+ struct mbuf *mbuf;
+ struct mana_port_context *apc = if_getsoftc(ndev);
+ struct mana_port_stats *port_stats = &apc->port_stats;
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ uint64_t packets, bytes;
+ uint16_t next_to_use;
+ struct mana_tx_package pkg = {};
+ struct mana_stats *tx_stats;
+ struct gdma_queue *gdma_sq;
+ struct gdma_queue *gdma_eq;
+ struct mana_cq *cq;
+ int err, len;
+
+ gdma_sq = txq->gdma_sq;
+ cq = &apc->tx_qp[txq->idx].tx_cq;
+ gdma_eq = cq->gdma_cq->cq.parent;
+ tx_stats = &txq->stats;
+
+ packets = 0;
+ bytes = 0;
+ next_to_use = txq->next_to_use;
+
+ while ((mbuf = drbr_peek(ndev, txq->txq_br)) != NULL) {
+ if (!apc->port_is_up ||
+ (if_getdrvflags(ndev) & MANA_TXQ_FULL) != IFF_DRV_RUNNING) {
+ drbr_putback(ndev, txq->txq_br, mbuf);
+ break;
+ }
+
+ if (!mana_can_tx(gdma_sq)) {
+ /* SQ is full. Set the IFF_DRV_OACTIVE flag */
+ if_setdrvflagbits(apc->ndev, IFF_DRV_OACTIVE, 0);
+ counter_u64_add(tx_stats->stop, 1);
+ uint64_t stops = counter_u64_fetch(tx_stats->stop);
+ uint64_t wakeups = counter_u64_fetch(tx_stats->wakeup);
+#define MANA_TXQ_STOP_THRESHOLD 50
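+			/*
+			 * If this txq has been stopped many more times than
+			 * it has been woken up, start steering traffic hashed
+			 * to it to an alternate txq to spread the load.
+			 */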
+ if (stops > MANA_TXQ_STOP_THRESHOLD && wakeups > 0 &&
+ stops > wakeups && txq->alt_txq_idx == txq->idx) {
+ txq->alt_txq_idx =
+ (txq->idx + (stops / wakeups))
+ % apc->num_queues;
+ counter_u64_add(tx_stats->alt_chg, 1);
+ }
+
+ drbr_putback(ndev, txq->txq_br, mbuf);
+
+ taskqueue_enqueue(gdma_eq->eq.cleanup_tq,
+ &gdma_eq->eq.cleanup_task);
+ break;
+ }
+
+ tx_info = &txq->tx_buf_info[next_to_use];
+
+ memset(&pkg, 0, sizeof(struct mana_tx_package));
+ pkg.wqe_req.sgl = pkg.sgl_array;
+
+ err = mana_tx_map_mbuf(apc, tx_info, &mbuf, &pkg, tx_stats);
+ if (unlikely(err)) {
+ mana_dbg(NULL,
+ "Failed to map tx mbuf, err %d\n", err);
+
+ counter_u64_add(tx_stats->dma_mapping_err, 1);
+
+ /* The mbuf is still there. Free it */
+ m_freem(mbuf);
+ /* Advance the drbr queue */
+ drbr_advance(ndev, txq->txq_br);
+ continue;
+ }
+
+ pkg.tx_oob.s_oob.vcq_num = cq->gdma_id;
+ pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame;
+
+ if (txq->vp_offset > MANA_SHORT_VPORT_OFFSET_MAX) {
+ pkg.tx_oob.l_oob.long_vp_offset = txq->vp_offset;
+ pkt_fmt = MANA_LONG_PKT_FMT;
+ } else {
+ pkg.tx_oob.s_oob.short_vp_offset = txq->vp_offset;
+ }
+
+ pkg.tx_oob.s_oob.pkt_fmt = pkt_fmt;
+
+ if (pkt_fmt == MANA_SHORT_PKT_FMT)
+ pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_short_oob);
+ else
+ pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_oob);
+
+ pkg.wqe_req.inline_oob_data = &pkg.tx_oob;
+ pkg.wqe_req.flags = 0;
+ pkg.wqe_req.client_data_unit = 0;
+
+ if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) {
+ if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP)
+ pkg.tx_oob.s_oob.is_outer_ipv4 = 1;
+ else
+ pkg.tx_oob.s_oob.is_outer_ipv6 = 1;
+
+ pkg.tx_oob.s_oob.comp_iphdr_csum = 1;
+ pkg.tx_oob.s_oob.comp_tcp_csum = 1;
+ pkg.tx_oob.s_oob.trans_off = mbuf->m_pkthdr.l3hlen;
+
+ pkg.wqe_req.client_data_unit = mbuf->m_pkthdr.tso_segsz;
+ pkg.wqe_req.flags = GDMA_WR_OOB_IN_SGL | GDMA_WR_PAD_BY_SGE0;
+ } else if (mbuf->m_pkthdr.csum_flags &
+ (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
+ if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) {
+ pkg.tx_oob.s_oob.is_outer_ipv4 = 1;
+ pkg.tx_oob.s_oob.comp_iphdr_csum = 1;
+ } else {
+ pkg.tx_oob.s_oob.is_outer_ipv6 = 1;
+ }
+
+ if (MANA_L4_PROTO(mbuf) == IPPROTO_TCP) {
+ pkg.tx_oob.s_oob.comp_tcp_csum = 1;
+ pkg.tx_oob.s_oob.trans_off =
+ mbuf->m_pkthdr.l3hlen;
+ } else {
+ pkg.tx_oob.s_oob.comp_udp_csum = 1;
+ }
+ } else if (mbuf->m_pkthdr.csum_flags & CSUM_IP) {
+ pkg.tx_oob.s_oob.is_outer_ipv4 = 1;
+ pkg.tx_oob.s_oob.comp_iphdr_csum = 1;
+ } else {
+ if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP)
+ pkg.tx_oob.s_oob.is_outer_ipv4 = 1;
+ else if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IPV6)
+ pkg.tx_oob.s_oob.is_outer_ipv6 = 1;
+ }
+
+ len = mbuf->m_pkthdr.len;
+
+ err = mana_gd_post_work_request(gdma_sq, &pkg.wqe_req,
+ (struct gdma_posted_wqe_info *)&tx_info->wqe_inf);
+ if (unlikely(err)) {
+ /* Should not happen */
+ if_printf(ndev, "Failed to post TX OOB: %d\n", err);
+
+ mana_tx_unmap_mbuf(apc, tx_info);
+
+ drbr_advance(ndev, txq->txq_br);
+ continue;
+ }
+
+ next_to_use =
+ (next_to_use + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+
+ atomic_inc_return(&txq->pending_sends);
+
+ drbr_advance(ndev, txq->txq_br);
+
+ mana_gd_wq_ring_doorbell(gd->gdma_context, gdma_sq);
+
+ packets++;
+ bytes += len;
+ }
+
+ counter_enter();
+ counter_u64_add_protected(tx_stats->packets, packets);
+ counter_u64_add_protected(port_stats->tx_packets, packets);
+ counter_u64_add_protected(tx_stats->bytes, bytes);
+ counter_u64_add_protected(port_stats->tx_bytes, bytes);
+ counter_exit();
+
+ txq->next_to_use = next_to_use;
+}
+
+static void
+mana_xmit_taskfunc(void *arg, int pending)
+{
+ struct mana_txq *txq = (struct mana_txq *)arg;
+ struct ifnet *ndev = txq->ndev;
+ struct mana_port_context *apc = if_getsoftc(ndev);
+
+ while (!drbr_empty(ndev, txq->txq_br) && apc->port_is_up &&
+ (if_getdrvflags(ndev) & MANA_TXQ_FULL) == IFF_DRV_RUNNING) {
+ mtx_lock(&txq->txq_mtx);
+ mana_xmit(txq);
+ mtx_unlock(&txq->txq_mtx);
+ }
+}
+
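+/*
+ * Make sure at least 'len' bytes are contiguous at the front of the
+ * mbuf; on failure m_pullup() has already freed the chain, so just
+ * return NULL to the caller.
+ */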
+#define PULLUP_HDR(m, len) \
+do { \
+ if (unlikely((m)->m_len < (len))) { \
+ (m) = m_pullup((m), (len)); \
+ if ((m) == NULL) \
+ return (NULL); \
+ } \
+} while (0)
+
+/*
+ * If this function failed, the mbuf would be freed.
+ */
+static inline struct mbuf *
+mana_tso_fixup(struct mbuf *mbuf)
+{
+ struct ether_vlan_header *eh = mtod(mbuf, struct ether_vlan_header *);
+ struct tcphdr *th;
+ uint16_t etype;
+ int ehlen;
+
+ if (eh->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) {
+ etype = ntohs(eh->evl_proto);
+ ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+ } else {
+ etype = ntohs(eh->evl_encap_proto);
+ ehlen = ETHER_HDR_LEN;
+ }
+
+ if (etype == ETHERTYPE_IP) {
+ struct ip *ip;
+ int iphlen;
+
+ PULLUP_HDR(mbuf, ehlen + sizeof(*ip));
+ ip = mtodo(mbuf, ehlen);
+ iphlen = ip->ip_hl << 2;
+ mbuf->m_pkthdr.l3hlen = ehlen + iphlen;
+
+ PULLUP_HDR(mbuf, ehlen + iphlen + sizeof(*th));
+ th = mtodo(mbuf, ehlen + iphlen);
+
+ ip->ip_len = 0;
+ ip->ip_sum = 0;
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(IPPROTO_TCP));
+ } else if (etype == ETHERTYPE_IPV6) {
+ struct ip6_hdr *ip6;
+
+ PULLUP_HDR(mbuf, ehlen + sizeof(*ip6) + sizeof(*th));
+ ip6 = mtodo(mbuf, ehlen);
+ if (ip6->ip6_nxt != IPPROTO_TCP) {
+			/* Something is really wrong, just return */
+ mana_dbg(NULL, "TSO mbuf not TCP, freed.\n");
+ m_freem(mbuf);
+ return NULL;
+ }
+ mbuf->m_pkthdr.l3hlen = ehlen + sizeof(*ip6);
+
+ th = mtodo(mbuf, ehlen + sizeof(*ip6));
+
+ ip6->ip6_plen = 0;
+ th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
+ } else {
+ /* CSUM_TSO is set but not IP protocol. */
+ mana_warn(NULL, "TSO mbuf not right, freed.\n");
+ m_freem(mbuf);
+ return NULL;
+ }
+
+ MANA_L3_PROTO(mbuf) = etype;
+
+ return (mbuf);
+}
+
+/*
+ * If this function failed, the mbuf would be freed.
+ */
+static inline struct mbuf *
+mana_mbuf_csum_check(struct mbuf *mbuf)
+{
+ struct ether_vlan_header *eh = mtod(mbuf, struct ether_vlan_header *);
+ struct mbuf *mbuf_next;
+ uint16_t etype;
+ int offset;
+ int ehlen;
+
+ if (eh->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) {
+ etype = ntohs(eh->evl_proto);
+ ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+ } else {
+ etype = ntohs(eh->evl_encap_proto);
+ ehlen = ETHER_HDR_LEN;
+ }
+
+ mbuf_next = m_getptr(mbuf, ehlen, &offset);
+
+ MANA_L4_PROTO(mbuf) = 0;
+ if (etype == ETHERTYPE_IP) {
+ const struct ip *ip;
+ int iphlen;
+
+ ip = (struct ip *)(mtodo(mbuf_next, offset));
+ iphlen = ip->ip_hl << 2;
+ mbuf->m_pkthdr.l3hlen = ehlen + iphlen;
+
+ MANA_L4_PROTO(mbuf) = ip->ip_p;
+ } else if (etype == ETHERTYPE_IPV6) {
+ const struct ip6_hdr *ip6;
+
+ ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
+ mbuf->m_pkthdr.l3hlen = ehlen + sizeof(*ip6);
+
+ MANA_L4_PROTO(mbuf) = ip6->ip6_nxt;
+ } else {
+ MANA_L4_PROTO(mbuf) = 0;
+ }
+
+ MANA_L3_PROTO(mbuf) = etype;
+
+ return (mbuf);
+}
+
+static int
+mana_start_xmit(struct ifnet *ifp, struct mbuf *m)
+{
+ struct mana_port_context *apc = if_getsoftc(ifp);
+ struct mana_txq *txq;
+ int is_drbr_empty;
+ uint16_t txq_id;
+ int err;
+
+ if (unlikely((!apc->port_is_up) ||
+ (if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
+ return ENODEV;
+
+ if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+ m = mana_tso_fixup(m);
+ if (unlikely(m == NULL)) {
+ counter_enter();
+ counter_u64_add_protected(apc->port_stats.tx_drops, 1);
+ counter_exit();
+ return EIO;
+ }
+ } else {
+ m = mana_mbuf_csum_check(m);
+ if (unlikely(m == NULL)) {
+ counter_enter();
+ counter_u64_add_protected(apc->port_stats.tx_drops, 1);
+ counter_exit();
+ return EIO;
+ }
+ }
+
+ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
+ uint32_t hash = m->m_pkthdr.flowid;
+ txq_id = apc->indir_table[(hash) & MANA_INDIRECT_TABLE_MASK] %
+ apc->num_queues;
+ } else {
+ txq_id = m->m_pkthdr.flowid % apc->num_queues;
+ }
+
+ if (apc->enable_tx_altq)
+ txq_id = apc->tx_qp[txq_id].txq.alt_txq_idx;
+
+ txq = &apc->tx_qp[txq_id].txq;
+
+ is_drbr_empty = drbr_empty(ifp, txq->txq_br);
+ err = drbr_enqueue(ifp, txq->txq_br, m);
+ if (unlikely(err)) {
+ mana_warn(NULL, "txq %u failed to enqueue: %d\n",
+ txq_id, err);
+ taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task);
+ return err;
+ }
+
+ if (is_drbr_empty && mtx_trylock(&txq->txq_mtx)) {
+ mana_xmit(txq);
+ mtx_unlock(&txq->txq_mtx);
+ } else {
+ taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task);
+ }
+
+ return 0;
+}
+
+static void
+mana_cleanup_port_context(struct mana_port_context *apc)
+{
+ bus_dma_tag_destroy(apc->tx_buf_tag);
+ bus_dma_tag_destroy(apc->rx_buf_tag);
+ apc->rx_buf_tag = NULL;
+
+ free(apc->rxqs, M_DEVBUF);
+ apc->rxqs = NULL;
+
+ mana_free_counters((counter_u64_t *)&apc->port_stats,
+ sizeof(struct mana_port_stats));
+}
+
+static int
+mana_init_port_context(struct mana_port_context *apc)
+{
+ device_t dev = apc->ac->gdma_dev->gdma_context->dev;
+ uint32_t tso_maxsize;
+ int err;
+
+ tso_maxsize = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ -
+ (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+
+ /* Create DMA tag for tx bufs */
+ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
+ 1, 0, /* alignment, boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ tso_maxsize, /* maxsize */
+ MAX_MBUF_FRAGS, /* nsegments */
+ tso_maxsize, /* maxsegsize */
+ 0, /* flags */
+ NULL, NULL, /* lockfunc, lockfuncarg*/
+ &apc->tx_buf_tag);
+ if (unlikely(err)) {
+		device_printf(dev, "Failed to create TX DMA tag\n");
+ return err;
+ }
+
+ /* Create DMA tag for rx bufs */
+ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
+ 64, 0, /* alignment, boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ MJUMPAGESIZE, /* maxsize */
+ 1, /* nsegments */
+ MJUMPAGESIZE, /* maxsegsize */
+ 0, /* flags */
+ NULL, NULL, /* lockfunc, lockfuncarg*/
+ &apc->rx_buf_tag);
+ if (unlikely(err)) {
+		device_printf(dev, "Failed to create RX DMA tag\n");
+ return err;
+ }
+
+ apc->rxqs = mallocarray(apc->num_queues, sizeof(struct mana_rxq *),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+
+ if (!apc->rxqs) {
+ bus_dma_tag_destroy(apc->tx_buf_tag);
+ bus_dma_tag_destroy(apc->rx_buf_tag);
+ apc->rx_buf_tag = NULL;
+ return ENOMEM;
+ }
+
+ return 0;
+}
+
+static int
+mana_send_request(struct mana_context *ac, void *in_buf,
+ uint32_t in_len, void *out_buf, uint32_t out_len)
+{
+ struct gdma_context *gc = ac->gdma_dev->gdma_context;
+ struct gdma_resp_hdr *resp = out_buf;
+ struct gdma_req_hdr *req = in_buf;
+ device_t dev = gc->dev;
+ static atomic_t activity_id;
+ int err;
+
+ req->dev_id = gc->mana.dev_id;
+ req->activity_id = atomic_inc_return(&activity_id);
+
+ mana_dbg(NULL, "activity_id = %u\n", activity_id);
+
+ err = mana_gd_send_request(gc, in_len, in_buf, out_len,
+ out_buf);
+ if (err || resp->status) {
+ device_printf(dev, "Failed to send mana message: %d, 0x%x\n",
+ err, resp->status);
+ return err ? err : EPROTO;
+ }
+
+ if (req->dev_id.as_uint32 != resp->dev_id.as_uint32 ||
+ req->activity_id != resp->activity_id) {
+ device_printf(dev,
+ "Unexpected mana message response: %x,%x,%x,%x\n",
+ req->dev_id.as_uint32, resp->dev_id.as_uint32,
+ req->activity_id, resp->activity_id);
+ return EPROTO;
+ }
+
+ return 0;
+}
+
+static int
+mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr,
+ const enum mana_command_code expected_code,
+ const uint32_t min_size)
+{
+ if (resp_hdr->response.msg_type != expected_code)
+ return EPROTO;
+
+ if (resp_hdr->response.msg_version < GDMA_MESSAGE_V1)
+ return EPROTO;
+
+ if (resp_hdr->response.msg_size < min_size)
+ return EPROTO;
+
+ return 0;
+}
+
+static int
+mana_query_device_cfg(struct mana_context *ac, uint32_t proto_major_ver,
+ uint32_t proto_minor_ver, uint32_t proto_micro_ver,
+ uint16_t *max_num_vports)
+{
+ struct gdma_context *gc = ac->gdma_dev->gdma_context;
+ struct mana_query_device_cfg_resp resp = {};
+ struct mana_query_device_cfg_req req = {};
+ device_t dev = gc->dev;
+ int err = 0;
+
+ mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG,
+ sizeof(req), sizeof(resp));
+ req.proto_major_ver = proto_major_ver;
+ req.proto_minor_ver = proto_minor_ver;
+ req.proto_micro_ver = proto_micro_ver;
+
+ err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp));
+ if (err) {
+ device_printf(dev, "Failed to query config: %d", err);
+ return err;
+ }
+
+ err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_DEV_CONFIG,
+ sizeof(resp));
+ if (err || resp.hdr.status) {
+ device_printf(dev, "Invalid query result: %d, 0x%x\n", err,
+ resp.hdr.status);
+ if (!err)
+ err = EPROTO;
+ return err;
+ }
+
+ *max_num_vports = resp.max_num_vports;
+
+ mana_dbg(NULL, "mana max_num_vports from device = %d\n",
+ *max_num_vports);
+
+ return 0;
+}
+
+static int
+mana_query_vport_cfg(struct mana_port_context *apc, uint32_t vport_index,
+ uint32_t *max_sq, uint32_t *max_rq, uint32_t *num_indir_entry)
+{
+ struct mana_query_vport_cfg_resp resp = {};
+ struct mana_query_vport_cfg_req req = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_VPORT_CONFIG,
+ sizeof(req), sizeof(resp));
+
+ req.vport_index = vport_index;
+
+ err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+ sizeof(resp));
+ if (err)
+ return err;
+
+ err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_VPORT_CONFIG,
+ sizeof(resp));
+ if (err)
+ return err;
+
+ if (resp.hdr.status)
+ return EPROTO;
+
+ *max_sq = resp.max_num_sq;
+ *max_rq = resp.max_num_rq;
+ *num_indir_entry = resp.num_indirection_ent;
+
+ apc->port_handle = resp.vport;
+ memcpy(apc->mac_addr, resp.mac_addr, ETHER_ADDR_LEN);
+
+ return 0;
+}
+
+static int
+mana_cfg_vport(struct mana_port_context *apc, uint32_t protection_dom_id,
+ uint32_t doorbell_pg_id)
+{
+ struct mana_config_vport_resp resp = {};
+ struct mana_config_vport_req req = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX,
+ sizeof(req), sizeof(resp));
+ req.vport = apc->port_handle;
+ req.pdid = protection_dom_id;
+ req.doorbell_pageid = doorbell_pg_id;
+
+ err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+ sizeof(resp));
+ if (err) {
+ if_printf(apc->ndev, "Failed to configure vPort: %d\n", err);
+ goto out;
+ }
+
+ err = mana_verify_resp_hdr(&resp.hdr, MANA_CONFIG_VPORT_TX,
+ sizeof(resp));
+ if (err || resp.hdr.status) {
+ if_printf(apc->ndev, "Failed to configure vPort: %d, 0x%x\n",
+ err, resp.hdr.status);
+ if (!err)
+ err = EPROTO;
+
+ goto out;
+ }
+
+ apc->tx_shortform_allowed = resp.short_form_allowed;
+ apc->tx_vp_offset = resp.tx_vport_offset;
+out:
+ return err;
+}
+
+static int
+mana_cfg_vport_steering(struct mana_port_context *apc,
+ enum TRI_STATE rx,
+ bool update_default_rxobj, bool update_key,
+ bool update_tab)
+{
+ uint16_t num_entries = MANA_INDIRECT_TABLE_SIZE;
+ struct mana_cfg_rx_steer_req *req = NULL;
+ struct mana_cfg_rx_steer_resp resp = {};
+ struct ifnet *ndev = apc->ndev;
+ mana_handle_t *req_indir_tab;
+ uint32_t req_buf_size;
+ int err;
+
+ req_buf_size = sizeof(*req) + sizeof(mana_handle_t) * num_entries;
+ req = malloc(req_buf_size, M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!req)
+ return ENOMEM;
+
+ mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size,
+ sizeof(resp));
+
+ req->vport = apc->port_handle;
+ req->num_indir_entries = num_entries;
+ req->indir_tab_offset = sizeof(*req);
+ req->rx_enable = rx;
+ req->rss_enable = apc->rss_state;
+ req->update_default_rxobj = update_default_rxobj;
+ req->update_hashkey = update_key;
+ req->update_indir_tab = update_tab;
+ req->default_rxobj = apc->default_rxobj;
+
+ if (update_key)
+ memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE);
+
+ if (update_tab) {
+ req_indir_tab = (mana_handle_t *)(req + 1);
+ memcpy(req_indir_tab, apc->rxobj_table,
+ req->num_indir_entries * sizeof(mana_handle_t));
+ }
+
+ err = mana_send_request(apc->ac, req, req_buf_size, &resp,
+ sizeof(resp));
+ if (err) {
+ if_printf(ndev, "Failed to configure vPort RX: %d\n", err);
+ goto out;
+ }
+
+ err = mana_verify_resp_hdr(&resp.hdr, MANA_CONFIG_VPORT_RX,
+ sizeof(resp));
+ if (err) {
+ if_printf(ndev, "vPort RX configuration failed: %d\n", err);
+ goto out;
+ }
+
+ if (resp.hdr.status) {
+ if_printf(ndev, "vPort RX configuration failed: 0x%x\n",
+ resp.hdr.status);
+ err = EPROTO;
+ }
+out:
+ free(req, M_DEVBUF);
+ return err;
+}
+
+static int
+mana_create_wq_obj(struct mana_port_context *apc,
+ mana_handle_t vport,
+ uint32_t wq_type, struct mana_obj_spec *wq_spec,
+ struct mana_obj_spec *cq_spec,
+ mana_handle_t *wq_obj)
+{
+ struct mana_create_wqobj_resp resp = {};
+ struct mana_create_wqobj_req req = {};
+ struct ifnet *ndev = apc->ndev;
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, MANA_CREATE_WQ_OBJ,
+ sizeof(req), sizeof(resp));
+ req.vport = vport;
+ req.wq_type = wq_type;
+ req.wq_gdma_region = wq_spec->gdma_region;
+ req.cq_gdma_region = cq_spec->gdma_region;
+ req.wq_size = wq_spec->queue_size;
+ req.cq_size = cq_spec->queue_size;
+ req.cq_moderation_ctx_id = cq_spec->modr_ctx_id;
+ req.cq_parent_qid = cq_spec->attached_eq;
+
+ err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+ sizeof(resp));
+ if (err) {
+ if_printf(ndev, "Failed to create WQ object: %d\n", err);
+ goto out;
+ }
+
+ err = mana_verify_resp_hdr(&resp.hdr, MANA_CREATE_WQ_OBJ,
+ sizeof(resp));
+ if (err || resp.hdr.status) {
+ if_printf(ndev, "Failed to create WQ object: %d, 0x%x\n", err,
+ resp.hdr.status);
+ if (!err)
+ err = EPROTO;
+ goto out;
+ }
+
+ if (resp.wq_obj == INVALID_MANA_HANDLE) {
+ if_printf(ndev, "Got an invalid WQ object handle\n");
+ err = EPROTO;
+ goto out;
+ }
+
+ *wq_obj = resp.wq_obj;
+ wq_spec->queue_index = resp.wq_id;
+ cq_spec->queue_index = resp.cq_id;
+
+ return 0;
+out:
+ return err;
+}
+
+static void
+mana_destroy_wq_obj(struct mana_port_context *apc, uint32_t wq_type,
+ mana_handle_t wq_obj)
+{
+ struct mana_destroy_wqobj_resp resp = {};
+ struct mana_destroy_wqobj_req req = {};
+ struct ifnet *ndev = apc->ndev;
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, MANA_DESTROY_WQ_OBJ,
+ sizeof(req), sizeof(resp));
+ req.wq_type = wq_type;
+ req.wq_obj_handle = wq_obj;
+
+ err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+ sizeof(resp));
+ if (err) {
+ if_printf(ndev, "Failed to destroy WQ object: %d\n", err);
+ return;
+ }
+
+ err = mana_verify_resp_hdr(&resp.hdr, MANA_DESTROY_WQ_OBJ,
+ sizeof(resp));
+ if (err || resp.hdr.status)
+ if_printf(ndev, "Failed to destroy WQ object: %d, 0x%x\n",
+ err, resp.hdr.status);
+}
+
+static void
+mana_init_cqe_poll_buf(struct gdma_comp *cqe_poll_buf)
+{
+ int i;
+
+ for (i = 0; i < CQE_POLLING_BUFFER; i++)
+ memset(&cqe_poll_buf[i], 0, sizeof(struct gdma_comp));
+}
+
+static void
+mana_destroy_eq(struct gdma_context *gc, struct mana_port_context *apc)
+{
+ struct gdma_queue *eq;
+ int i;
+
+ if (!apc->eqs)
+ return;
+
+ for (i = 0; i < apc->num_queues; i++) {
+ eq = apc->eqs[i].eq;
+ if (!eq)
+ continue;
+
+ mana_gd_destroy_queue(gc, eq);
+ }
+
+ free(apc->eqs, M_DEVBUF);
+ apc->eqs = NULL;
+}
+
+static int
+mana_create_eq(struct mana_port_context *apc)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ struct gdma_queue_spec spec = {};
+ int err;
+ int i;
+
+ apc->eqs = mallocarray(apc->num_queues, sizeof(struct mana_eq),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!apc->eqs)
+ return ENOMEM;
+
+ spec.type = GDMA_EQ;
+ spec.monitor_avl_buf = false;
+ spec.queue_size = EQ_SIZE;
+ spec.eq.callback = NULL;
+ spec.eq.context = apc->eqs;
+ spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
+ spec.eq.ndev = apc->ndev;
+
+ for (i = 0; i < apc->num_queues; i++) {
+ mana_init_cqe_poll_buf(apc->eqs[i].cqe_poll);
+
+ err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
+ if (err)
+ goto out;
+ }
+
+ return 0;
+out:
+ mana_destroy_eq(gd->gdma_context, apc);
+ return err;
+}
+
+static int
+mana_move_wq_tail(struct gdma_queue *wq, uint32_t num_units)
+{
+ uint32_t used_space_old;
+ uint32_t used_space_new;
+
+ used_space_old = wq->head - wq->tail;
+ used_space_new = wq->head - (wq->tail + num_units);
+
+ if (used_space_new > used_space_old) {
+ mana_err(NULL,
+ "WARNING: new used space %u greater than old one %u\n",
+ used_space_new, used_space_old);
+ return ERANGE;
+ }
+
+ wq->tail += num_units;
+ return 0;
+}
+
+static void
+mana_poll_tx_cq(struct mana_cq *cq)
+{
+ struct gdma_comp *completions = cq->gdma_comp_buf;
+ struct gdma_posted_wqe_info *wqe_info;
+ struct mana_send_buf_info *tx_info;
+ unsigned int pkt_transmitted = 0;
+ unsigned int wqe_unit_cnt = 0;
+ struct mana_txq *txq = cq->txq;
+ struct mana_port_context *apc;
+ uint16_t next_to_complete;
+ struct ifnet *ndev;
+ int comp_read;
+	int txq_idx = txq->idx;
+ int i;
+ int sa_drop = 0;
+
+ struct gdma_queue *gdma_wq;
+ unsigned int avail_space;
+ bool txq_full = false;
+
+ ndev = txq->ndev;
+ apc = if_getsoftc(ndev);
+
+ comp_read = mana_gd_poll_cq(cq->gdma_cq, completions,
+ CQE_POLLING_BUFFER);
+
+ next_to_complete = txq->next_to_complete;
+
+ for (i = 0; i < comp_read; i++) {
+ struct mana_tx_comp_oob *cqe_oob;
+
+ if (!completions[i].is_sq) {
+ mana_err(NULL, "WARNING: Not for SQ\n");
+ return;
+ }
+
+ cqe_oob = (struct mana_tx_comp_oob *)completions[i].cqe_data;
+ if (cqe_oob->cqe_hdr.client_type !=
+ MANA_CQE_COMPLETION) {
+ mana_err(NULL,
+ "WARNING: Invalid CQE client type %u\n",
+ cqe_oob->cqe_hdr.client_type);
+ return;
+ }
+
+ switch (cqe_oob->cqe_hdr.cqe_type) {
+ case CQE_TX_OKAY:
+ break;
+
+ case CQE_TX_SA_DROP:
+ case CQE_TX_MTU_DROP:
+ case CQE_TX_INVALID_OOB:
+ case CQE_TX_INVALID_ETH_TYPE:
+ case CQE_TX_HDR_PROCESSING_ERROR:
+ case CQE_TX_VF_DISABLED:
+ case CQE_TX_VPORT_IDX_OUT_OF_RANGE:
+ case CQE_TX_VPORT_DISABLED:
+ case CQE_TX_VLAN_TAGGING_VIOLATION:
+			sa_drop++;
+ mana_err(NULL,
+ "TX: txq %d CQE error %d, ntc = %d, "
+ "pending sends = %d: err ignored.\n",
+ txq_idx, cqe_oob->cqe_hdr.cqe_type,
+ next_to_complete, txq->pending_sends);
+ break;
+
+ default:
+ /* If the CQE type is unexpected, log an error,
+ * and go through the error path.
+ */
+ mana_err(NULL,
+ "ERROR: TX: Unexpected CQE type %d: HW BUG?\n",
+ cqe_oob->cqe_hdr.cqe_type);
+ return;
+ }
+ if (txq->gdma_txq_id != completions[i].wq_num) {
+ mana_dbg(NULL,
+			    "txq gdma id does not match completion wq num: "
+ "%d != %d\n",
+ txq->gdma_txq_id, completions[i].wq_num);
+ break;
+ }
+
+ tx_info = &txq->tx_buf_info[next_to_complete];
+ if (!tx_info->mbuf) {
+ mana_err(NULL,
+ "WARNING: txq %d Empty mbuf on tx_info: %u, "
+ "ntu = %u, pending_sends = %d, "
+ "transmitted = %d, sa_drop = %d, i = %d, comp_read = %d\n",
+ txq_idx, next_to_complete, txq->next_to_use,
+ txq->pending_sends, pkt_transmitted, sa_drop,
+ i, comp_read);
+ continue;
+ }
+
+ wqe_info = &tx_info->wqe_inf;
+ wqe_unit_cnt += wqe_info->wqe_size_in_bu;
+
+ mana_tx_unmap_mbuf(apc, tx_info);
+ mb();
+
+ next_to_complete =
+ (next_to_complete + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+
+ pkt_transmitted++;
+ }
+
+ txq->next_to_complete = next_to_complete;
+
+ if (wqe_unit_cnt == 0) {
+ mana_err(NULL,
+ "WARNING: TX ring not proceeding!\n");
+ return;
+ }
+
+ mana_move_wq_tail(txq->gdma_sq, wqe_unit_cnt);
+
+ /* Ensure tail updated before checking q stop */
+ wmb();
+
+ gdma_wq = txq->gdma_sq;
+ avail_space = mana_gd_wq_avail_space(gdma_wq);
+
+
+ if ((if_getdrvflags(ndev) & MANA_TXQ_FULL) == MANA_TXQ_FULL) {
+ txq_full = true;
+ }
+
+ /* Ensure checking txq_full before apc->port_is_up. */
+ rmb();
+
+ if (txq_full && apc->port_is_up && avail_space >= MAX_TX_WQE_SIZE) {
+ /* Grab the txq lock and re-test */
+ mtx_lock(&txq->txq_mtx);
+ avail_space = mana_gd_wq_avail_space(gdma_wq);
+
+ if ((if_getdrvflags(ndev) & MANA_TXQ_FULL) == MANA_TXQ_FULL &&
+ apc->port_is_up && avail_space >= MAX_TX_WQE_SIZE) {
+ /* Clear the Q full flag */
+ if_setdrvflagbits(apc->ndev, IFF_DRV_RUNNING,
+ IFF_DRV_OACTIVE);
+ counter_u64_add(txq->stats.wakeup, 1);
+ if (txq->alt_txq_idx != txq->idx) {
+ uint64_t stops = counter_u64_fetch(txq->stats.stop);
+ uint64_t wakeups = counter_u64_fetch(txq->stats.wakeup);
+ /* Reset alt_txq_idx back if it is not overloaded */
+ if (stops < wakeups) {
+ txq->alt_txq_idx = txq->idx;
+ counter_u64_add(txq->stats.alt_reset, 1);
+ }
+ }
+ rmb();
+ /* Schedule a tx enqueue task */
+ taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task);
+ }
+ mtx_unlock(&txq->txq_mtx);
+ }
+
+ if (atomic_sub_return(pkt_transmitted, &txq->pending_sends) < 0)
+ mana_err(NULL,
+ "WARNING: TX %d pending_sends error: %d\n",
+ txq->idx, txq->pending_sends);
+}
+
+static void
+mana_post_pkt_rxq(struct mana_rxq *rxq)
+{
+ struct mana_recv_buf_oob *recv_buf_oob;
+ uint32_t curr_index;
+ int err;
+
+ curr_index = rxq->buf_index++;
+ if (rxq->buf_index == rxq->num_rx_buf)
+ rxq->buf_index = 0;
+
+ recv_buf_oob = &rxq->rx_oobs[curr_index];
+
+ err = mana_gd_post_and_ring(rxq->gdma_rq, &recv_buf_oob->wqe_req,
+ &recv_buf_oob->wqe_inf);
+ if (err) {
+ mana_err(NULL, "WARNING: rxq %u post pkt err %d\n",
+ rxq->rxq_idx, err);
+ return;
+ }
+
+ if (recv_buf_oob->wqe_inf.wqe_size_in_bu != 1) {
+ mana_err(NULL, "WARNING: rxq %u wqe_size_in_bu %u\n",
+ rxq->rxq_idx, recv_buf_oob->wqe_inf.wqe_size_in_bu);
+ }
+}
+
+static void
+mana_rx_mbuf(struct mbuf *mbuf, struct mana_rxcomp_oob *cqe,
+ struct mana_rxq *rxq)
+{
+ struct mana_stats *rx_stats = &rxq->stats;
+ struct ifnet *ndev = rxq->ndev;
+ uint32_t pkt_len = cqe->ppi[0].pkt_len;
+ uint16_t rxq_idx = rxq->rxq_idx;
+ struct mana_port_context *apc;
+ struct gdma_queue *eq;
+ bool do_lro = false;
+ bool do_if_input;
+
+ apc = if_getsoftc(ndev);
+ eq = apc->eqs[rxq_idx].eq;
+ eq->eq.work_done++;
+
+ if (!mbuf) {
+ return;
+ }
+
+ mbuf->m_flags |= M_PKTHDR;
+ mbuf->m_pkthdr.len = pkt_len;
+ mbuf->m_len = pkt_len;
+ mbuf->m_pkthdr.rcvif = ndev;
+
+ if ((ndev->if_capenable & IFCAP_RXCSUM ||
+ ndev->if_capenable & IFCAP_RXCSUM_IPV6) &&
+ (cqe->rx_iphdr_csum_succeed)) {
+ mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
+ mbuf->m_pkthdr.csum_flags |= CSUM_IP_VALID;
+ if (cqe->rx_tcp_csum_succeed || cqe->rx_udp_csum_succeed) {
+ mbuf->m_pkthdr.csum_flags |=
+ (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ mbuf->m_pkthdr.csum_data = 0xffff;
+
+ if (cqe->rx_tcp_csum_succeed)
+ do_lro = true;
+ }
+ }
+
+ if (cqe->rx_hashtype != 0) {
+ mbuf->m_pkthdr.flowid = cqe->ppi[0].pkt_hash;
+
+ uint16_t hashtype = cqe->rx_hashtype;
+ if (hashtype & NDIS_HASH_IPV4_MASK) {
+ hashtype &= NDIS_HASH_IPV4_MASK;
+ switch (hashtype) {
+ case NDIS_HASH_TCP_IPV4:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
+ break;
+ case NDIS_HASH_UDP_IPV4:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
+ break;
+ default:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
+ }
+ } else if (hashtype & NDIS_HASH_IPV6_MASK) {
+ hashtype &= NDIS_HASH_IPV6_MASK;
+ switch (hashtype) {
+ case NDIS_HASH_TCP_IPV6:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
+ break;
+ case NDIS_HASH_TCP_IPV6_EX:
+ M_HASHTYPE_SET(mbuf,
+ M_HASHTYPE_RSS_TCP_IPV6_EX);
+ break;
+ case NDIS_HASH_UDP_IPV6:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
+ break;
+ case NDIS_HASH_UDP_IPV6_EX:
+ M_HASHTYPE_SET(mbuf,
+ M_HASHTYPE_RSS_UDP_IPV6_EX);
+ break;
+ default:
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
+ }
+ } else {
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
+ }
+ } else {
+ mbuf->m_pkthdr.flowid = rxq_idx;
+ M_HASHTYPE_SET(mbuf, M_HASHTYPE_NONE);
+ }
+
+ do_if_input = true;
+ if ((ndev->if_capenable & IFCAP_LRO) && do_lro) {
+ if (rxq->lro.lro_cnt != 0 &&
+ tcp_lro_rx(&rxq->lro, mbuf, 0) == 0)
+ do_if_input = false;
+ }
+ if (do_if_input) {
+ ndev->if_input(ndev, mbuf);
+ }
+
+ counter_enter();
+ counter_u64_add_protected(rx_stats->packets, 1);
+ counter_u64_add_protected(apc->port_stats.rx_packets, 1);
+ counter_u64_add_protected(rx_stats->bytes, pkt_len);
+ counter_u64_add_protected(apc->port_stats.rx_bytes, pkt_len);
+ counter_exit();
+}
+
+static void
+mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
+ struct gdma_comp *cqe)
+{
+ struct mana_rxcomp_oob *oob = (struct mana_rxcomp_oob *)cqe->cqe_data;
+ struct mana_recv_buf_oob *rxbuf_oob;
+ struct ifnet *ndev = rxq->ndev;
+ struct mana_port_context *apc;
+ struct mbuf *old_mbuf;
+ uint32_t curr, pktlen;
+ int err;
+
+ switch (oob->cqe_hdr.cqe_type) {
+ case CQE_RX_OKAY:
+ break;
+
+ case CQE_RX_TRUNCATED:
+ if_printf(ndev, "Dropped a truncated packet\n");
+ return;
+
+ case CQE_RX_COALESCED_4:
+ if_printf(ndev, "RX coalescing is unsupported\n");
+ return;
+
+ case CQE_RX_OBJECT_FENCE:
+ if_printf(ndev, "RX Fencing is unsupported\n");
+ return;
+
+ default:
+ if_printf(ndev, "Unknown RX CQE type = %d\n",
+ oob->cqe_hdr.cqe_type);
+ return;
+ }
+
+ if (oob->cqe_hdr.cqe_type != CQE_RX_OKAY)
+ return;
+
+ pktlen = oob->ppi[0].pkt_len;
+
+ if (pktlen == 0) {
+		/* Data packets should never have a packet length of zero */
+#if defined(__amd64__)
+ if_printf(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%lx\n",
+ rxq->gdma_id, cq->gdma_id, rxq->rxobj);
+#else
+ if_printf(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n",
+ rxq->gdma_id, cq->gdma_id, rxq->rxobj);
+#endif
+ return;
+ }
+
+ curr = rxq->buf_index;
+ rxbuf_oob = &rxq->rx_oobs[curr];
+ if (rxbuf_oob->wqe_inf.wqe_size_in_bu != 1) {
+ mana_err(NULL, "WARNING: Rx Incorrect complete "
+ "WQE size %u\n",
+ rxbuf_oob->wqe_inf.wqe_size_in_bu);
+ }
+
+ apc = if_getsoftc(ndev);
+
+ old_mbuf = rxbuf_oob->mbuf;
+
+ /* Unload DMA map for the old mbuf */
+ mana_unload_rx_mbuf(apc, rxq, rxbuf_oob, false);
+
+ /* Load a new mbuf to replace the old one */
+ err = mana_load_rx_mbuf(apc, rxq, rxbuf_oob, true);
+ if (err) {
+ mana_dbg(NULL,
+ "failed to load rx mbuf, err = %d, packet dropped.\n",
+ err);
+ counter_u64_add(rxq->stats.mbuf_alloc_fail, 1);
+ /*
+ * Failed to load new mbuf, rxbuf_oob->mbuf is still
+ * pointing to the old one. Drop the packet.
+ */
+ old_mbuf = NULL;
+ /* Reload the existing mbuf */
+ mana_load_rx_mbuf(apc, rxq, rxbuf_oob, false);
+ }
+
+ mana_rx_mbuf(old_mbuf, oob, rxq);
+
+ mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
+
+ mana_post_pkt_rxq(rxq);
+}
+
+static void
+mana_poll_rx_cq(struct mana_cq *cq)
+{
+ struct gdma_comp *comp = cq->gdma_comp_buf;
+ int comp_read, i;
+
+ comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER);
+ KASSERT(comp_read <= CQE_POLLING_BUFFER,
+	    ("comp_read %d greater than buf size %d",
+ comp_read, CQE_POLLING_BUFFER));
+
+ for (i = 0; i < comp_read; i++) {
+ if (comp[i].is_sq == true) {
+ mana_err(NULL,
+ "WARNING: CQE not for receive queue\n");
+ return;
+ }
+
+ /* verify recv cqe references the right rxq */
+ if (comp[i].wq_num != cq->rxq->gdma_id) {
+ mana_err(NULL,
+ "WARNING: Received CQE %d not for "
+ "this receive queue %d\n",
+ comp[i].wq_num, cq->rxq->gdma_id);
+ return;
+ }
+
+ mana_process_rx_cqe(cq->rxq, cq, &comp[i]);
+ }
+
+ tcp_lro_flush_all(&cq->rxq->lro);
+}
+
+static void
+mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
+{
+ struct mana_cq *cq = context;
+
+ KASSERT(cq->gdma_cq == gdma_queue,
+ ("cq do not match %p, %p", cq->gdma_cq, gdma_queue));
+
+ if (cq->type == MANA_CQ_TYPE_RX) {
+ mana_poll_rx_cq(cq);
+ } else {
+ mana_poll_tx_cq(cq);
+ }
+
+ mana_gd_arm_cq(gdma_queue);
+}
+
+static void
+mana_deinit_cq(struct mana_port_context *apc, struct mana_cq *cq)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+
+ if (!cq->gdma_cq)
+ return;
+
+ mana_gd_destroy_queue(gd->gdma_context, cq->gdma_cq);
+}
+
+static void
+mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ struct mana_send_buf_info *txbuf_info;
+ uint32_t pending_sends;
+ int i;
+
+ if (!txq->gdma_sq)
+ return;
+
+ if ((pending_sends = atomic_read(&txq->pending_sends)) > 0) {
+ mana_err(NULL,
+ "WARNING: txq pending sends not zero: %u\n",
+ pending_sends);
+ }
+
+ if (txq->next_to_use != txq->next_to_complete) {
+ mana_err(NULL,
+ "WARNING: txq buf not completed, "
+ "next use %u, next complete %u\n",
+ txq->next_to_use, txq->next_to_complete);
+ }
+
+ /* Flush buf ring. Grab txq mtx lock */
+ if (txq->txq_br) {
+ mtx_lock(&txq->txq_mtx);
+ drbr_flush(apc->ndev, txq->txq_br);
+ mtx_unlock(&txq->txq_mtx);
+ buf_ring_free(txq->txq_br, M_DEVBUF);
+ }
+
+ /* Drain taskqueue */
+ if (txq->enqueue_tq) {
+ while (taskqueue_cancel(txq->enqueue_tq,
+ &txq->enqueue_task, NULL)) {
+ taskqueue_drain(txq->enqueue_tq,
+ &txq->enqueue_task);
+ }
+
+ taskqueue_free(txq->enqueue_tq);
+ }
+
+ if (txq->tx_buf_info) {
+ /* Free all mbufs which are still in-flight */
+ for (i = 0; i < MAX_SEND_BUFFERS_PER_QUEUE; i++) {
+ txbuf_info = &txq->tx_buf_info[i];
+ if (txbuf_info->mbuf) {
+ mana_tx_unmap_mbuf(apc, txbuf_info);
+ }
+ }
+
+ free(txq->tx_buf_info, M_DEVBUF);
+ }
+
+ mana_free_counters((counter_u64_t *)&txq->stats,
+ sizeof(txq->stats));
+
+ mana_gd_destroy_queue(gd->gdma_context, txq->gdma_sq);
+
+ mtx_destroy(&txq->txq_mtx);
+}
+
+static void
+mana_destroy_txq(struct mana_port_context *apc)
+{
+ int i;
+
+ if (!apc->tx_qp)
+ return;
+
+ for (i = 0; i < apc->num_queues; i++) {
+ mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object);
+
+ mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq);
+
+ mana_deinit_txq(apc, &apc->tx_qp[i].txq);
+ }
+
+ free(apc->tx_qp, M_DEVBUF);
+ apc->tx_qp = NULL;
+}
+
+static int
+mana_create_txq(struct mana_port_context *apc, struct ifnet *net)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ struct mana_obj_spec wq_spec;
+ struct mana_obj_spec cq_spec;
+ struct gdma_queue_spec spec;
+ struct gdma_context *gc;
+ struct mana_txq *txq;
+ struct mana_cq *cq;
+ uint32_t txq_size;
+ uint32_t cq_size;
+ int err;
+ int i;
+
+ apc->tx_qp = mallocarray(apc->num_queues, sizeof(struct mana_tx_qp),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!apc->tx_qp)
+ return ENOMEM;
+
+ /* The minimum size of the WQE is 32 bytes, hence
+ * MAX_SEND_BUFFERS_PER_QUEUE represents the maximum number of WQEs
+ * the SQ can store. This value is then used to size other queues
+ * to prevent overflow.
+ */
+ txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32;
+ KASSERT(IS_ALIGNED(txq_size, PAGE_SIZE),
+ ("txq size not page aligned"));
+
+ cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE;
+ cq_size = ALIGN(cq_size, PAGE_SIZE);
+
+ gc = gd->gdma_context;
+
+ for (i = 0; i < apc->num_queues; i++) {
+ apc->tx_qp[i].tx_object = INVALID_MANA_HANDLE;
+
+ /* Create SQ */
+ txq = &apc->tx_qp[i].txq;
+
+ txq->ndev = net;
+ txq->vp_offset = apc->tx_vp_offset;
+ txq->idx = i;
+ txq->alt_txq_idx = i;
+
+ memset(&spec, 0, sizeof(spec));
+ spec.type = GDMA_SQ;
+ spec.monitor_avl_buf = true;
+ spec.queue_size = txq_size;
+ err = mana_gd_create_mana_wq_cq(gd, &spec, &txq->gdma_sq);
+ if (err)
+ goto out;
+
+ /* Create SQ's CQ */
+ cq = &apc->tx_qp[i].tx_cq;
+ cq->gdma_comp_buf = apc->eqs[i].cqe_poll;
+ cq->type = MANA_CQ_TYPE_TX;
+
+ cq->txq = txq;
+
+ memset(&spec, 0, sizeof(spec));
+ spec.type = GDMA_CQ;
+ spec.monitor_avl_buf = false;
+ spec.queue_size = cq_size;
+ spec.cq.callback = mana_cq_handler;
+ spec.cq.parent_eq = apc->eqs[i].eq;
+ spec.cq.context = cq;
+ err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
+ if (err)
+ goto out;
+
+ memset(&wq_spec, 0, sizeof(wq_spec));
+ memset(&cq_spec, 0, sizeof(cq_spec));
+
+ wq_spec.gdma_region = txq->gdma_sq->mem_info.gdma_region;
+ wq_spec.queue_size = txq->gdma_sq->queue_size;
+
+ cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region;
+ cq_spec.queue_size = cq->gdma_cq->queue_size;
+ cq_spec.modr_ctx_id = 0;
+ cq_spec.attached_eq = cq->gdma_cq->cq.parent->id;
+
+ err = mana_create_wq_obj(apc, apc->port_handle, GDMA_SQ,
+ &wq_spec, &cq_spec, &apc->tx_qp[i].tx_object);
+
+ if (err)
+ goto out;
+
+ txq->gdma_sq->id = wq_spec.queue_index;
+ cq->gdma_cq->id = cq_spec.queue_index;
+
+ txq->gdma_sq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION;
+ cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION;
+
+ txq->gdma_txq_id = txq->gdma_sq->id;
+
+ cq->gdma_id = cq->gdma_cq->id;
+
+ mana_dbg(NULL,
+ "txq %d, txq gdma id %d, txq cq gdma id %d\n",
+		    i, txq->gdma_txq_id, cq->gdma_id);
+
+ if (cq->gdma_id >= gc->max_num_cqs) {
+ if_printf(net, "CQ id %u too large.\n", cq->gdma_id);
+ return EINVAL;
+ }
+
+ gc->cq_table[cq->gdma_id] = cq->gdma_cq;
+
+ /* Initialize tx specific data */
+ txq->tx_buf_info = malloc(MAX_SEND_BUFFERS_PER_QUEUE *
+ sizeof(struct mana_send_buf_info),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (unlikely(txq->tx_buf_info == NULL)) {
+ if_printf(net,
+ "Failed to allocate tx buf info for SQ %u\n",
+ txq->gdma_sq->id);
+ err = ENOMEM;
+ goto out;
+ }
+
+
+ snprintf(txq->txq_mtx_name, nitems(txq->txq_mtx_name),
+ "mana:tx(%d)", i);
+ mtx_init(&txq->txq_mtx, txq->txq_mtx_name, NULL, MTX_DEF);
+
+ txq->txq_br = buf_ring_alloc(4 * MAX_SEND_BUFFERS_PER_QUEUE,
+ M_DEVBUF, M_WAITOK, &txq->txq_mtx);
+ if (unlikely(txq->txq_br == NULL)) {
+ if_printf(net,
+ "Failed to allocate buf ring for SQ %u\n",
+ txq->gdma_sq->id);
+ err = ENOMEM;
+ goto out;
+ }
+
+ /* Allocate taskqueue for deferred send */
+ TASK_INIT(&txq->enqueue_task, 0, mana_xmit_taskfunc, txq);
+ txq->enqueue_tq = taskqueue_create_fast("mana_tx_enque",
+ M_NOWAIT, taskqueue_thread_enqueue, &txq->enqueue_tq);
+ if (unlikely(txq->enqueue_tq == NULL)) {
+ if_printf(net,
+ "Unable to create tx %d enqueue task queue\n", i);
+ err = ENOMEM;
+ goto out;
+ }
+ taskqueue_start_threads(&txq->enqueue_tq, 1, PI_NET,
+ "mana txq %d", i);
+
+ mana_alloc_counters((counter_u64_t *)&txq->stats,
+ sizeof(txq->stats));
+
+ mana_gd_arm_cq(cq->gdma_cq);
+ }
+
+ return 0;
+out:
+ mana_destroy_txq(apc);
+ return err;
+}
+
+static void
+mana_destroy_rxq(struct mana_port_context *apc, struct mana_rxq *rxq,
+ bool validate_state)
+{
+ struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
+ struct mana_recv_buf_oob *rx_oob;
+ int i;
+
+ if (!rxq)
+ return;
+
+ if (validate_state) {
+ /*
+ * XXX Cancel and drain cleanup task queue here.
+ */
+ ;
+ }
+
+ mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
+
+ mana_deinit_cq(apc, &rxq->rx_cq);
+
+ mana_free_counters((counter_u64_t *)&rxq->stats,
+ sizeof(rxq->stats));
+
+ /* Free LRO resources */
+ tcp_lro_free(&rxq->lro);
+
+ for (i = 0; i < rxq->num_rx_buf; i++) {
+ rx_oob = &rxq->rx_oobs[i];
+
+ if (rx_oob->mbuf)
+ mana_unload_rx_mbuf(apc, rxq, rx_oob, true);
+
+ bus_dmamap_destroy(apc->rx_buf_tag, rx_oob->dma_map);
+ }
+
+ if (rxq->gdma_rq)
+ mana_gd_destroy_queue(gc, rxq->gdma_rq);
+
+ free(rxq, M_DEVBUF);
+}
+
+#define MANA_WQE_HEADER_SIZE 16
+#define MANA_WQE_SGE_SIZE 16
+
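+/*
+ * For each receive OOB: create its DMA map, load an mbuf, and build the
+ * WQE request.  Also report the total RQ and CQ space required: each WQE
+ * takes the 16-byte header plus 16 bytes per SGE, rounded up to 32 bytes,
+ * and each buffer contributes one completion entry.
+ */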
+static int
+mana_alloc_rx_wqe(struct mana_port_context *apc,
+ struct mana_rxq *rxq, uint32_t *rxq_size, uint32_t *cq_size)
+{
+ struct mana_recv_buf_oob *rx_oob;
+ uint32_t buf_idx;
+ int err;
+
+ if (rxq->datasize == 0 || rxq->datasize > PAGE_SIZE) {
+ mana_err(NULL,
+ "WARNING: Invalid rxq datasize %u\n", rxq->datasize);
+ }
+
+ *rxq_size = 0;
+ *cq_size = 0;
+
+ for (buf_idx = 0; buf_idx < rxq->num_rx_buf; buf_idx++) {
+ rx_oob = &rxq->rx_oobs[buf_idx];
+ memset(rx_oob, 0, sizeof(*rx_oob));
+
+ err = bus_dmamap_create(apc->rx_buf_tag, 0,
+ &rx_oob->dma_map);
+ if (err) {
+ mana_err(NULL,
+ "Failed to create rx DMA map for buf %d\n",
+ buf_idx);
+ return err;
+ }
+
+ err = mana_load_rx_mbuf(apc, rxq, rx_oob, true);
+ if (err) {
+			mana_err(NULL,
+			    "Failed to load rx mbuf for buf %d\n",
+			    buf_idx);
+ bus_dmamap_destroy(apc->rx_buf_tag, rx_oob->dma_map);
+ return err;
+ }
+
+ rx_oob->wqe_req.sgl = rx_oob->sgl;
+ rx_oob->wqe_req.num_sge = rx_oob->num_sge;
+ rx_oob->wqe_req.inline_oob_size = 0;
+ rx_oob->wqe_req.inline_oob_data = NULL;
+ rx_oob->wqe_req.flags = 0;
+ rx_oob->wqe_req.client_data_unit = 0;
+
+ *rxq_size += ALIGN(MANA_WQE_HEADER_SIZE +
+ MANA_WQE_SGE_SIZE * rx_oob->num_sge, 32);
+ *cq_size += COMP_ENTRY_SIZE;
+ }
+
+ return 0;
+}
+
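+/*
+ * Post the receive WQEs prepared by mana_alloc_rx_wqe() to the RQ and
+ * ring the doorbell so the hardware can start filling the buffers.
+ */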
+static int
+mana_push_wqe(struct mana_rxq *rxq)
+{
+ struct mana_recv_buf_oob *rx_oob;
+ uint32_t buf_idx;
+ int err;
+
+ for (buf_idx = 0; buf_idx < rxq->num_rx_buf; buf_idx++) {
+ rx_oob = &rxq->rx_oobs[buf_idx];
+
+ err = mana_gd_post_and_ring(rxq->gdma_rq, &rx_oob->wqe_req,
+ &rx_oob->wqe_inf);
+ if (err)
+ return ENOSPC;
+ }
+
+ return 0;
+}
+
+static struct mana_rxq *
+mana_create_rxq(struct mana_port_context *apc, uint32_t rxq_idx,
+ struct mana_eq *eq, struct ifnet *ndev)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ struct mana_obj_spec wq_spec;
+ struct mana_obj_spec cq_spec;
+ struct gdma_queue_spec spec;
+ struct mana_cq *cq = NULL;
+ uint32_t cq_size, rq_size;
+ struct gdma_context *gc;
+ struct mana_rxq *rxq;
+ int err;
+
+ gc = gd->gdma_context;
+
+ rxq = malloc(sizeof(*rxq) +
+ RX_BUFFERS_PER_QUEUE * sizeof(struct mana_recv_buf_oob),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!rxq)
+ return NULL;
+
+ rxq->ndev = ndev;
+ rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE;
+ rxq->rxq_idx = rxq_idx;
+ /*
+	 * The receive buffer is an mbuf cluster, so the minimum size is
+	 * MCLBYTES (2048); the maximum currently allowed is 4096
+	 * (MAX_FRAME_SIZE).
+ */
+ rxq->datasize = ALIGN(apc->frame_size, MCLBYTES);
+ if (rxq->datasize > MAX_FRAME_SIZE)
+ rxq->datasize = MAX_FRAME_SIZE;
+
+ mana_dbg(NULL, "Setting rxq %d datasize %d\n",
+ rxq_idx, rxq->datasize);
+
+ rxq->rxobj = INVALID_MANA_HANDLE;
+
+ err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
+ if (err)
+ goto out;
+
+ /* Create LRO for the RQ */
+ if (ndev->if_capenable & IFCAP_LRO) {
+ err = tcp_lro_init(&rxq->lro);
+ if (err) {
+ if_printf(ndev, "Failed to create LRO for rxq %d\n",
+ rxq_idx);
+ } else {
+ rxq->lro.ifp = ndev;
+ }
+ }
+
+ mana_alloc_counters((counter_u64_t *)&rxq->stats,
+ sizeof(rxq->stats));
+
+ rq_size = ALIGN(rq_size, PAGE_SIZE);
+ cq_size = ALIGN(cq_size, PAGE_SIZE);
+
+ /* Create RQ */
+ memset(&spec, 0, sizeof(spec));
+ spec.type = GDMA_RQ;
+ spec.monitor_avl_buf = true;
+ spec.queue_size = rq_size;
+ err = mana_gd_create_mana_wq_cq(gd, &spec, &rxq->gdma_rq);
+ if (err)
+ goto out;
+
+ /* Create RQ's CQ */
+ cq = &rxq->rx_cq;
+ cq->gdma_comp_buf = eq->cqe_poll;
+ cq->type = MANA_CQ_TYPE_RX;
+ cq->rxq = rxq;
+
+ memset(&spec, 0, sizeof(spec));
+ spec.type = GDMA_CQ;
+ spec.monitor_avl_buf = false;
+ spec.queue_size = cq_size;
+ spec.cq.callback = mana_cq_handler;
+ spec.cq.parent_eq = eq->eq;
+ spec.cq.context = cq;
+ err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
+ if (err)
+ goto out;
+
+ memset(&wq_spec, 0, sizeof(wq_spec));
+ memset(&cq_spec, 0, sizeof(cq_spec));
+ wq_spec.gdma_region = rxq->gdma_rq->mem_info.gdma_region;
+ wq_spec.queue_size = rxq->gdma_rq->queue_size;
+
+ cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region;
+ cq_spec.queue_size = cq->gdma_cq->queue_size;
+ cq_spec.modr_ctx_id = 0;
+ cq_spec.attached_eq = cq->gdma_cq->cq.parent->id;
+
+ err = mana_create_wq_obj(apc, apc->port_handle, GDMA_RQ,
+ &wq_spec, &cq_spec, &rxq->rxobj);
+ if (err)
+ goto out;
+
+ rxq->gdma_rq->id = wq_spec.queue_index;
+ cq->gdma_cq->id = cq_spec.queue_index;
+
+ rxq->gdma_rq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION;
+ cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION;
+
+ rxq->gdma_id = rxq->gdma_rq->id;
+ cq->gdma_id = cq->gdma_cq->id;
+
+ err = mana_push_wqe(rxq);
+ if (err)
+ goto out;
+
+	if (cq->gdma_id >= gc->max_num_cqs) {
+		err = EINVAL;
+		goto out;
+	}
+
+ gc->cq_table[cq->gdma_id] = cq->gdma_cq;
+
+ mana_gd_arm_cq(cq->gdma_cq);
+out:
+ if (!err)
+ return rxq;
+
+ if_printf(ndev, "Failed to create RXQ: err = %d\n", err);
+
+ mana_destroy_rxq(apc, rxq, false);
+
+ if (cq)
+ mana_deinit_cq(apc, cq);
+
+ return NULL;
+}
+
+static int
+mana_add_rx_queues(struct mana_port_context *apc, struct ifnet *ndev)
+{
+ struct mana_rxq *rxq;
+ int err = 0;
+ int i;
+
+ for (i = 0; i < apc->num_queues; i++) {
+ rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
+ if (!rxq) {
+ err = ENOMEM;
+ goto out;
+ }
+
+ apc->rxqs[i] = rxq;
+ }
+
+ apc->default_rxobj = apc->rxqs[0]->rxobj;
+out:
+ return err;
+}
+
+static void
+mana_destroy_vport(struct mana_port_context *apc)
+{
+ struct mana_rxq *rxq;
+ uint32_t rxq_idx;
+ struct mana_cq *rx_cq;
+ struct gdma_queue *cq, *eq;
+
+ for (rxq_idx = 0; rxq_idx < apc->num_queues; rxq_idx++) {
+ rxq = apc->rxqs[rxq_idx];
+ if (!rxq)
+ continue;
+
+ rx_cq = &rxq->rx_cq;
+ if ((cq = rx_cq->gdma_cq) != NULL) {
+ eq = cq->cq.parent;
+ mana_drain_eq_task(eq);
+ }
+
+ mana_destroy_rxq(apc, rxq, true);
+ apc->rxqs[rxq_idx] = NULL;
+ }
+
+ mana_destroy_txq(apc);
+}
+
+static int
+mana_create_vport(struct mana_port_context *apc, struct ifnet *net)
+{
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ int err;
+
+ apc->default_rxobj = INVALID_MANA_HANDLE;
+
+ err = mana_cfg_vport(apc, gd->pdid, gd->doorbell);
+ if (err)
+ return err;
+
+ return mana_create_txq(apc, net);
+}
+
+static void
+mana_rss_table_init(struct mana_port_context *apc)
+{
+ int i;
+
+ for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++)
+ apc->indir_table[i] = i % apc->num_queues;
+}
+
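+/*
+ * Apply the RSS configuration to the vPort: optionally refresh the RX
+ * object table from the indirection table, then program the steering,
+ * hash, and table settings via mana_cfg_vport_steering().
+ */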
+int
+mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx,
+ bool update_hash, bool update_tab)
+{
+ uint32_t queue_idx;
+ int i;
+
+ if (update_tab) {
+ for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
+ queue_idx = apc->indir_table[i];
+ apc->rxobj_table[i] = apc->rxqs[queue_idx]->rxobj;
+ }
+ }
+
+ return mana_cfg_vport_steering(apc, rx, true, update_hash, update_tab);
+}
+
+static int
+mana_init_port(struct ifnet *ndev)
+{
+ struct mana_port_context *apc = if_getsoftc(ndev);
+ uint32_t max_txq, max_rxq, max_queues;
+ int port_idx = apc->port_idx;
+ uint32_t num_indirect_entries;
+ int err;
+
+ err = mana_init_port_context(apc);
+ if (err)
+ return err;
+
+ err = mana_query_vport_cfg(apc, port_idx, &max_txq, &max_rxq,
+ &num_indirect_entries);
+ if (err) {
+ if_printf(ndev, "Failed to query info for vPort 0\n");
+ goto reset_apc;
+ }
+
+ max_queues = min_t(uint32_t, max_txq, max_rxq);
+ if (apc->max_queues > max_queues)
+ apc->max_queues = max_queues;
+
+ if (apc->num_queues > apc->max_queues)
+ apc->num_queues = apc->max_queues;
+
+ return 0;
+
+reset_apc:
+ bus_dma_tag_destroy(apc->rx_buf_tag);
+ apc->rx_buf_tag = NULL;
+ free(apc->rxqs, M_DEVBUF);
+ apc->rxqs = NULL;
+ return err;
+}
+
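+/*
+ * Queue bring-up order: the EQs first, then the vPort (TX queues and
+ * their CQs), then the RX queues, and finally the RSS configuration.
+ */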
+int
+mana_alloc_queues(struct ifnet *ndev)
+{
+ struct mana_port_context *apc = if_getsoftc(ndev);
+ struct gdma_dev *gd = apc->ac->gdma_dev;
+ int err;
+
+ err = mana_create_eq(apc);
+ if (err)
+ return err;
+
+ err = mana_create_vport(apc, ndev);
+ if (err)
+ goto destroy_eq;
+
+ err = mana_add_rx_queues(apc, ndev);
+ if (err)
+ goto destroy_vport;
+
+ apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE;
+
+ mana_rss_table_init(apc);
+
+ err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
+ if (err)
+ goto destroy_vport;
+
+ return 0;
+
+destroy_vport:
+ mana_destroy_vport(apc);
+destroy_eq:
+ mana_destroy_eq(gd->gdma_context, apc);
+ return err;
+}
+
+static int
+mana_up(struct mana_port_context *apc)
+{
+ int err;
+
+ mana_dbg(NULL, "mana_up called\n");
+
+ err = mana_alloc_queues(apc->ndev);
+ if (err) {
+		mana_err(NULL, "Failed to allocate mana queues: %d\n", err);
+ return err;
+ }
+
+ /* Add queue specific sysctl */
+ mana_sysctl_add_queues(apc);
+
+ apc->port_is_up = true;
+
+ /* Ensure port state updated before txq state */
+ wmb();
+
+ if_link_state_change(apc->ndev, LINK_STATE_UP);
+ if_setdrvflagbits(apc->ndev, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
+
+ return 0;
+}
+
+static void
+mana_init(void *arg)
+{
+ struct mana_port_context *apc = (struct mana_port_context *)arg;
+
+ MANA_APC_LOCK_LOCK(apc);
+ if (!apc->port_is_up) {
+ mana_up(apc);
+ }
+ MANA_APC_LOCK_UNLOCK(apc);
+}
+
+static int
+mana_dealloc_queues(struct ifnet *ndev)
+{
+ struct mana_port_context *apc = if_getsoftc(ndev);
+ struct mana_txq *txq;
+ int i, err;
+
+ if (apc->port_is_up)
+ return EINVAL;
+
+	/* No packet can be transmitted now since apc->port_is_up is false.
+	 * There is still a tiny chance that mana_poll_tx_cq() can re-enable
+	 * a txq because it may not see apc->port_is_up being cleared to
+	 * false in time, but that doesn't matter since mana_start_xmit()
+	 * drops any new packets while apc->port_is_up is false.
+	 *
+	 * Drain all the in-flight TX packets.
+	 */
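+	/*
+	 * Stopping the EQ doorbell and kicking the cleanup task below lets
+	 * any outstanding TX completions be processed so pending_sends can
+	 * drain to zero.
+	 */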
+ for (i = 0; i < apc->num_queues; i++) {
+ txq = &apc->tx_qp[i].txq;
+
+ struct mana_cq *tx_cq = &apc->tx_qp[i].tx_cq;
+ struct gdma_queue *eq = NULL;
+ if (tx_cq->gdma_cq)
+ eq = tx_cq->gdma_cq->cq.parent;
+ if (eq) {
+ /* Stop EQ interrupt */
+ eq->eq.do_not_ring_db = true;
+ /* Schedule a cleanup task */
+ taskqueue_enqueue(eq->eq.cleanup_tq,
+ &eq->eq.cleanup_task);
+ }
+
+ while (atomic_read(&txq->pending_sends) > 0)
+ usleep_range(1000, 2000);
+ }
+
+ /* We're 100% sure the queues can no longer be woken up, because
+ * we're sure now mana_poll_tx_cq() can't be running.
+ */
+
+ apc->rss_state = TRI_STATE_FALSE;
+ err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
+ if (err) {
+ if_printf(ndev, "Failed to disable vPort: %d\n", err);
+ return err;
+ }
+
+ /* TODO: Implement RX fencing */
+ gdma_msleep(1000);
+
+ mana_destroy_vport(apc);
+
+ mana_destroy_eq(apc->ac->gdma_dev->gdma_context, apc);
+
+ return 0;
+}
+
+static int
+mana_down(struct mana_port_context *apc)
+{
+ int err = 0;
+
+ apc->port_st_save = apc->port_is_up;
+ apc->port_is_up = false;
+
+ /* Ensure port state updated before txq state */
+ wmb();
+
+ if (apc->port_st_save) {
+ if_setdrvflagbits(apc->ndev, IFF_DRV_OACTIVE,
+ IFF_DRV_RUNNING);
+ if_link_state_change(apc->ndev, LINK_STATE_DOWN);
+
+ mana_sysctl_free_queues(apc);
+
+ err = mana_dealloc_queues(apc->ndev);
+ if (err) {
+ if_printf(apc->ndev,
+ "Failed to bring down mana interface: %d\n", err);
+ }
+ }
+
+ return err;
+}
+
+int
+mana_detach(struct ifnet *ndev)
+{
+ struct mana_port_context *apc = if_getsoftc(ndev);
+ int err;
+
+ ether_ifdetach(ndev);
+
+ if (!apc)
+ return 0;
+
+ MANA_APC_LOCK_LOCK(apc);
+ err = mana_down(apc);
+ MANA_APC_LOCK_UNLOCK(apc);
+
+ mana_cleanup_port_context(apc);
+
+ MANA_APC_LOCK_DESTROY(apc);
+
+ free(apc, M_DEVBUF);
+
+ return err;
+}
+
+static int
+mana_probe_port(struct mana_context *ac, int port_idx,
+ struct ifnet **ndev_storage)
+{
+ struct gdma_context *gc = ac->gdma_dev->gdma_context;
+ struct mana_port_context *apc;
+ struct ifnet *ndev;
+ int err;
+
+ ndev = if_alloc_dev(IFT_ETHER, gc->dev);
+ if (!ndev) {
+ mana_err(NULL, "Failed to allocate ifnet struct\n");
+ return ENOMEM;
+ }
+
+ *ndev_storage = ndev;
+
+ apc = malloc(sizeof(*apc), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!apc) {
+ mana_err(NULL, "Failed to allocate port context\n");
+ err = ENOMEM;
+ goto free_net;
+ }
+
+ apc->ac = ac;
+ apc->ndev = ndev;
+ apc->max_queues = gc->max_num_queues;
+ apc->num_queues = min_t(unsigned int,
+ gc->max_num_queues, MANA_MAX_NUM_QUEUES);
+ apc->port_handle = INVALID_MANA_HANDLE;
+ apc->port_idx = port_idx;
+ apc->frame_size = DEFAULT_FRAME_SIZE;
+
+ MANA_APC_LOCK_INIT(apc);
+
+ if_initname(ndev, device_get_name(gc->dev), port_idx);
+	if_setdev(ndev, gc->dev);
+ if_setsoftc(ndev, apc);
+
+ if_setflags(ndev, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
+ if_setinitfn(ndev, mana_init);
+ if_settransmitfn(ndev, mana_start_xmit);
+ if_setqflushfn(ndev, mana_qflush);
+ if_setioctlfn(ndev, mana_ioctl);
+ if_setgetcounterfn(ndev, mana_get_counter);
+
+ if_setmtu(ndev, ETHERMTU);
+ if_setbaudrate(ndev, IF_Gbps(100));
+
+ mana_rss_key_fill(apc->hashkey, MANA_HASH_KEY_SIZE);
+
+ err = mana_init_port(ndev);
+ if (err)
+ goto reset_apc;
+
+ ndev->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6;
+ ndev->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6;
+ ndev->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6;
+
+ ndev->if_capabilities |= IFCAP_LRO | IFCAP_LINKSTATE;
+
+ /* Enable all available capabilities by default. */
+ ndev->if_capenable = ndev->if_capabilities;
+
+ /* TSO parameters */
+ ndev->if_hw_tsomax = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ -
+ (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+ ndev->if_hw_tsomaxsegcount = MAX_MBUF_FRAGS;
+ ndev->if_hw_tsomaxsegsize = PAGE_SIZE;
+
+ ifmedia_init(&apc->media, IFM_IMASK,
+ mana_ifmedia_change, mana_ifmedia_status);
+ ifmedia_add(&apc->media, IFM_ETHER | IFM_AUTO, 0, NULL);
+ ifmedia_set(&apc->media, IFM_ETHER | IFM_AUTO);
+
+ ether_ifattach(ndev, apc->mac_addr);
+
+ /* Initialize statistics */
+ mana_alloc_counters((counter_u64_t *)&apc->port_stats,
+ sizeof(struct mana_port_stats));
+ mana_sysctl_add_port(apc);
+
+ /* Tell the stack that the interface is not active */
+ if_setdrvflagbits(ndev, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
+
+ return 0;
+
+reset_apc:
+ free(apc, M_DEVBUF);
+free_net:
+ *ndev_storage = NULL;
+ if_printf(ndev, "Failed to probe vPort %d: %d\n", port_idx, err);
+ if_free(ndev);
+ return err;
+}
+
+int
+mana_probe(struct gdma_dev *gd)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ device_t dev = gc->dev;
+ struct mana_context *ac;
+ int err;
+ int i;
+
+ device_printf(dev, "%s protocol version: %d.%d.%d\n", DEVICE_NAME,
+ MANA_MAJOR_VERSION, MANA_MINOR_VERSION, MANA_MICRO_VERSION);
+
+ err = mana_gd_register_device(gd);
+ if (err)
+ return err;
+
+ ac = malloc(sizeof(*ac), M_DEVBUF, M_WAITOK | M_ZERO);
+ if (!ac)
+ return ENOMEM;
+
+ ac->gdma_dev = gd;
+ ac->num_ports = 1;
+ gd->driver_data = ac;
+
+ err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
+ MANA_MICRO_VERSION, &ac->num_ports);
+ if (err)
+ goto out;
+
+ if (ac->num_ports > MAX_PORTS_IN_MANA_DEV)
+ ac->num_ports = MAX_PORTS_IN_MANA_DEV;
+
+ for (i = 0; i < ac->num_ports; i++) {
+ err = mana_probe_port(ac, i, &ac->ports[i]);
+ if (err) {
+ device_printf(dev,
+ "Failed to probe mana port %d\n", i);
+ break;
+ }
+ }
+
+out:
+ if (err)
+ mana_remove(gd);
+
+ return err;
+}
+
+void
+mana_remove(struct gdma_dev *gd)
+{
+ struct gdma_context *gc = gd->gdma_context;
+ struct mana_context *ac = gd->driver_data;
+ device_t dev = gc->dev;
+ struct ifnet *ndev;
+ int i;
+
+ for (i = 0; i < ac->num_ports; i++) {
+ ndev = ac->ports[i];
+ if (!ndev) {
+ if (i == 0)
+ device_printf(dev, "No net device to remove\n");
+ goto out;
+ }
+
+ mana_detach(ndev);
+
+ if_free(ndev);
+ }
+out:
+ mana_gd_deregister_device(gd);
+ gd->driver_data = NULL;
+ gd->gdma_context = NULL;
+ free(ac, M_DEVBUF);
+}
diff --git a/sys/dev/mana/mana_sysctl.c b/sys/dev/mana/mana_sysctl.c
new file mode 100644
index 000000000000..f0821f05f2d2
--- /dev/null
+++ b/sys/dev/mana/mana_sysctl.c
@@ -0,0 +1,219 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "mana_sysctl.h"
+
+static int mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS);
+
+int mana_log_level = MANA_ALERT | MANA_WARNING | MANA_INFO;
+
+SYSCTL_NODE(_hw, OID_AUTO, mana, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "MANA driver parameters");
+
+/*
+ * Logging level for changing verbosity of the output
+ */
+SYSCTL_INT(_hw_mana, OID_AUTO, log_level, CTLFLAG_RWTUN,
+ &mana_log_level, 0, "Logging level indicating verbosity of the logs");
+
+SYSCTL_CONST_STRING(_hw_mana, OID_AUTO, driver_version, CTLFLAG_RD,
+ DRV_MODULE_VERSION, "MANA driver version");
+
+void
+mana_sysctl_add_port(struct mana_port_context *apc)
+{
+ struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
+ device_t dev = gc->dev;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *tree;
+ struct sysctl_oid_list *child;
+ struct mana_port_stats *port_stats;
+ char node_name[32];
+
+ struct sysctl_oid *port_node, *stats_node;
+ struct sysctl_oid_list *stats_list;
+
+ ctx = device_get_sysctl_ctx(dev);
+ tree = device_get_sysctl_tree(dev);
+ child = SYSCTL_CHILDREN(tree);
+
+ port_stats = &apc->port_stats;
+
+ snprintf(node_name, 32, "port%d", apc->port_idx);
+
+ port_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
+ node_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Port Name");
+ apc->port_list = SYSCTL_CHILDREN(port_node);
+
+ SYSCTL_ADD_BOOL(ctx, apc->port_list, OID_AUTO,
+ "enable_altq", CTLFLAG_RW, &apc->enable_tx_altq, 0,
+ "Choose alternative txq under heavy load");
+
+ SYSCTL_ADD_PROC(ctx, apc->port_list, OID_AUTO,
+ "bind_cleanup_thread_cpu",
+ CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ apc, 0, mana_sysctl_cleanup_thread_cpu, "I",
+ "Bind cleanup thread to a cpu. 0 disables it.");
+
+ stats_node = SYSCTL_ADD_NODE(ctx, apc->port_list, OID_AUTO,
+ "port_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+ "Statistics of port");
+ stats_list = SYSCTL_CHILDREN(stats_node);
+
+ SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_packets",
+ CTLFLAG_RD, &port_stats->rx_packets, "Packets received");
+ SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_packets",
+ CTLFLAG_RD, &port_stats->tx_packets, "Packets transmitted");
+ SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_bytes",
+ CTLFLAG_RD, &port_stats->rx_bytes, "Bytes received");
+ SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_bytes",
+ CTLFLAG_RD, &port_stats->tx_bytes, "Bytes transmitted");
+ SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_drops",
+ CTLFLAG_RD, &port_stats->rx_drops, "Receive packet drops");
+ SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_drops",
+ CTLFLAG_RD, &port_stats->tx_drops, "Transmit packet drops");
+}
+
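+/*
+ * The per-queue nodes hang off the port's sysctl list but use their own
+ * context (que_sysctl_ctx), so mana_sysctl_free_queues() can tear them
+ * down without touching the rest of the port's tree.
+ */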
+void
+mana_sysctl_add_queues(struct mana_port_context *apc)
+{
+ struct sysctl_ctx_list *ctx = &apc->que_sysctl_ctx;
+ struct sysctl_oid_list *child = apc->port_list;
+
+ struct sysctl_oid *queue_node, *tx_node, *rx_node;
+ struct sysctl_oid_list *queue_list, *tx_list, *rx_list;
+ struct mana_txq *txq;
+ struct mana_rxq *rxq;
+ struct mana_stats *tx_stats, *rx_stats;
+ char que_name[32];
+ int i;
+
+ sysctl_ctx_init(ctx);
+
+ for (i = 0; i < apc->num_queues; i++) {
+ rxq = apc->rxqs[i];
+ txq = &apc->tx_qp[i].txq;
+
+ snprintf(que_name, 32, "queue%d", i);
+
+ queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
+ que_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
+ queue_list = SYSCTL_CHILDREN(queue_node);
+
+ /* TX stats */
+ tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
+ "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX queue");
+ tx_list = SYSCTL_CHILDREN(tx_node);
+
+ tx_stats = &txq->stats;
+
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "count",
+ CTLFLAG_RD, &tx_stats->packets, "Packets sent");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bytes",
+ CTLFLAG_RD, &tx_stats->bytes, "Bytes sent");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_wakeups",
+ CTLFLAG_RD, &tx_stats->wakeup, "Queue wakeups");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_stops",
+ CTLFLAG_RD, &tx_stats->stop, "Queue stops");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapse",
+ CTLFLAG_RD, &tx_stats->collapse, "Mbuf collapse count");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "mbuf_collapse_err", CTLFLAG_RD,
+ &tx_stats->collapse_err, "Mbuf collapse failures");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "dma_mapping_err", CTLFLAG_RD,
+ &tx_stats->dma_mapping_err, "DMA mapping failures");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "alt_chg", CTLFLAG_RD,
+ &tx_stats->alt_chg, "Switch to alternative txq");
+ SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+ "alt_reset", CTLFLAG_RD,
+ &tx_stats->alt_reset, "Reset to self txq");
+
+ /* RX stats */
+ rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
+ "rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX queue");
+ rx_list = SYSCTL_CHILDREN(rx_node);
+
+ rx_stats = &rxq->stats;
+
+ SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "count",
+ CTLFLAG_RD, &rx_stats->packets, "Packets received");
+ SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bytes",
+ CTLFLAG_RD, &rx_stats->bytes, "Bytes received");
+ SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
+ "mbuf_alloc_fail", CTLFLAG_RD,
+ &rx_stats->mbuf_alloc_fail, "Failed mbuf allocs");
+ SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
+ "dma_mapping_err", CTLFLAG_RD,
+ &rx_stats->dma_mapping_err, "DMA mapping errors");
+ }
+}
+
+/*
+ * Free all queues' sysctl trees attached to the port's tree.
+ */
+void
+mana_sysctl_free_queues(struct mana_port_context *apc)
+{
+ sysctl_ctx_free(&apc->que_sysctl_ctx);
+}
+
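+/*
+ * Handler for the "bind_cleanup_thread_cpu" sysctl: report the current
+ * setting and, when a new value is written, update the flag and restart
+ * the port via mana_restart() so it takes effect.
+ */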
+static int
+mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS)
+{
+ struct mana_port_context *apc = arg1;
+ bool bind_cpu = false;
+ uint8_t val;
+ int err;
+
+ val = 0;
+ err = sysctl_wire_old_buffer(req, sizeof(val));
+ if (err == 0) {
+ val = apc->bind_cleanup_thread_cpu;
+ err = sysctl_handle_8(oidp, &val, 0, req);
+ }
+
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val != 0)
+ bind_cpu = true;
+
+ if (bind_cpu != apc->bind_cleanup_thread_cpu) {
+ apc->bind_cleanup_thread_cpu = bind_cpu;
+ err = mana_restart(apc);
+ }
+
+ return (err);
+}
diff --git a/sys/dev/mana/mana_sysctl.h b/sys/dev/mana/mana_sysctl.h
new file mode 100644
index 000000000000..c47f4cd790a9
--- /dev/null
+++ b/sys/dev/mana/mana_sysctl.h
@@ -0,0 +1,48 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2015-2020 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef MANA_SYSCTL_H
+#define MANA_SYSCTL_H
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include "mana.h"
+
+void mana_sysctl_add_port(struct mana_port_context *apc);
+void mana_sysctl_add_queues(struct mana_port_context *apc);
+void mana_sysctl_free_queues(struct mana_port_context *apc);
+
+#endif /* !(MANA_SYSCTL_H) */
diff --git a/sys/dev/mana/shm_channel.c b/sys/dev/mana/shm_channel.c
new file mode 100644
index 000000000000..17679626d822
--- /dev/null
+++ b/sys/dev/mana/shm_channel.c
@@ -0,0 +1,337 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include "mana.h"
+#include "shm_channel.h"
+#include "gdma_util.h"
+
+#define PAGE_FRAME_L48_WIDTH_BYTES 6
+#define PAGE_FRAME_L48_WIDTH_BITS (PAGE_FRAME_L48_WIDTH_BYTES * 8)
+#define PAGE_FRAME_L48_MASK 0x0000FFFFFFFFFFFF
+#define PAGE_FRAME_H4_WIDTH_BITS 4
+#define VECTOR_MASK 0xFFFF
+#define SHMEM_VF_RESET_STATE ((uint32_t)-1)
+
+#define SMC_MSG_TYPE_ESTABLISH_HWC 1
+#define SMC_MSG_TYPE_ESTABLISH_HWC_VERSION 0
+
+#define SMC_MSG_TYPE_DESTROY_HWC 2
+#define SMC_MSG_TYPE_DESTROY_HWC_VERSION 0
+
+#define SMC_MSG_DIRECTION_REQUEST 0
+#define SMC_MSG_DIRECTION_RESPONSE 1
+
+/* Structures labeled with "HW DATA" are exchanged with the hardware. All of
+ * them are naturally aligned and hence don't need __packed.
+ */
+
+/* Shared memory channel protocol header
+ *
+ * msg_type: set on request and response; response matches request.
+ * msg_version: a newer PF echoes back the (older) version of the request;
+ *              an older PF acts on the latest version it knows and sets
+ *              that version (less than the request) in the response.
+ * direction: 0 for request, VF->PF; 1 for response, PF->VF.
+ * status: 0 on request,
+ *         operation result on response (success = 0, failure = 1 or greater).
+ * reset_vf: if set on an establish or destroy request, an FLR is performed
+ *           before/after the operation.
+ * owner_is_pf: 1 indicates PF owned, 0 indicates VF owned.
+ */
+union smc_proto_hdr {
+ uint32_t as_uint32;
+
+ struct {
+ uint8_t msg_type : 3;
+ uint8_t msg_version : 3;
+ uint8_t reserved_1 : 1;
+ uint8_t direction : 1;
+
+ uint8_t status;
+
+ uint8_t reserved_2;
+
+ uint8_t reset_vf : 1;
+ uint8_t reserved_3 : 6;
+ uint8_t owner_is_pf : 1;
+ };
+}; /* HW DATA */
+
+#define SMC_APERTURE_BITS 256
+#define SMC_BASIC_UNIT (sizeof(uint32_t))
+#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8))
+#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1)
+
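+/*
+ * The shared memory aperture is 256 bits, accessed as 8 32-bit dwords.
+ * The last dword carries the protocol header; bit 31 of that dword is
+ * the ownership bit polled below.
+ */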
+static int
+mana_smc_poll_register(void __iomem *base, bool reset)
+{
+ void __iomem *ptr = (uint8_t *)base + SMC_LAST_DWORD * SMC_BASIC_UNIT;
+ volatile uint32_t last_dword;
+ int i;
+
+ /* Poll the hardware for the ownership bit. This should be pretty fast,
+ * but let's do it in a loop just in case the hardware or the PF
+ * driver are temporarily busy.
+ */
+ for (i = 0; i < 20 * 1000; i++) {
+ last_dword = readl(ptr);
+
+ /* shmem reads as 0xFFFFFFFF in the reset case */
+ if (reset && last_dword == SHMEM_VF_RESET_STATE)
+ return 0;
+
+ /* If bit_31 is set, the PF currently owns the SMC. */
+ if (!(last_dword & BIT(31)))
+ return 0;
+
+ DELAY(1000);
+ }
+
+ return ETIMEDOUT;
+}
+
+static int
+mana_smc_read_response(struct shm_channel *sc, uint32_t msg_type,
+ uint32_t msg_version, bool reset_vf)
+{
+ void __iomem *base = sc->base;
+ union smc_proto_hdr hdr;
+ int err;
+
+ /* Wait for PF to respond. */
+ err = mana_smc_poll_register(base, reset_vf);
+ if (err)
+ return err;
+
+ hdr.as_uint32 =
+ readl((uint8_t *)base + SMC_LAST_DWORD * SMC_BASIC_UNIT);
+ mana_dbg(NULL, "shm response 0x%x\n", hdr.as_uint32);
+
+ if (reset_vf && hdr.as_uint32 == SHMEM_VF_RESET_STATE)
+ return 0;
+
+ /* Validate protocol fields from the PF driver */
+ if (hdr.msg_type != msg_type || hdr.msg_version > msg_version ||
+ hdr.direction != SMC_MSG_DIRECTION_RESPONSE) {
+ device_printf(sc->dev,
+ "Wrong SMC response 0x%x, type=%d, ver=%d\n",
+ hdr.as_uint32, msg_type, msg_version);
+ return EPROTO;
+ }
+
+ /* Validate the operation result */
+ if (hdr.status != 0) {
+ device_printf(sc->dev,
+ "SMC operation failed: 0x%x\n", hdr.status);
+ return EPROTO;
+ }
+
+ return 0;
+}
+
+void
+mana_smc_init(struct shm_channel *sc, device_t dev, void __iomem *base)
+{
+ sc->dev = dev;
+ sc->base = base;
+}
+
+int
+mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, uint64_t eq_addr,
+ uint64_t cq_addr, uint64_t rq_addr, uint64_t sq_addr,
+ uint32_t eq_msix_index)
+{
+ union smc_proto_hdr *hdr;
+ uint16_t all_addr_h4bits = 0;
+ uint16_t frame_addr_seq = 0;
+ uint64_t frame_addr = 0;
+ uint8_t shm_buf[32];
+ uint64_t *shmem;
+ uint32_t *dword;
+ uint8_t *ptr;
+ int err;
+ int i;
+
+ /* Ensure VF already has possession of shared memory */
+ err = mana_smc_poll_register(sc->base, false);
+ if (err) {
+ device_printf(sc->dev,
+ "Timeout when setting up HWC: %d\n", err);
+ return err;
+ }
+
+ if (!IS_ALIGNED(eq_addr, PAGE_SIZE) ||
+ !IS_ALIGNED(cq_addr, PAGE_SIZE) ||
+ !IS_ALIGNED(rq_addr, PAGE_SIZE) ||
+ !IS_ALIGNED(sq_addr, PAGE_SIZE))
+ return EINVAL;
+
+ if ((eq_msix_index & VECTOR_MASK) != eq_msix_index)
+ return EINVAL;
+
+ /* Scheme for packing four addresses and extra info into 256 bits.
+ *
+ * Addresses must be page frame aligned, so only frame address bits
+ * are transferred.
+ *
+ * 52-bit frame addresses are split into the lower 48 bits and upper
+	 * 4 bits. The lower 48 bits of the 4 addresses are written
+	 * sequentially from the start of the 256-bit shared memory region,
+	 * followed by 16 bits containing the upper 4 bits of the 4 addresses
+	 * in sequence.
+	 *
+	 * A 16-bit EQ vector number fills out the next-to-last 32-bit dword.
+ *
+ * The final 32-bit dword is used for protocol control information as
+ * defined in smc_proto_hdr.
+ */
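+	/*
+	 * Worked example (4 KB pages): a queue at physical address
+	 * 0x123456789a000 has page frame number 0x123456789a.  Its low
+	 * 48 bits are stored in the next 6 bytes of shm_buf, and its top
+	 * 4 bits (bits 48-51 of the frame number, zero here) are OR'ed
+	 * into all_addr_h4bits at bit position frame_addr_seq * 4.
+	 */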
+
+ memset(shm_buf, 0, sizeof(shm_buf));
+ ptr = shm_buf;
+
+ /* EQ addr: low 48 bits of frame address */
+ shmem = (uint64_t *)ptr;
+ frame_addr = PHYS_PFN(eq_addr);
+ *shmem = frame_addr & PAGE_FRAME_L48_MASK;
+ all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) <<
+ (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS);
+ ptr += PAGE_FRAME_L48_WIDTH_BYTES;
+
+ /* CQ addr: low 48 bits of frame address */
+ shmem = (uint64_t *)ptr;
+ frame_addr = PHYS_PFN(cq_addr);
+ *shmem = frame_addr & PAGE_FRAME_L48_MASK;
+ all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) <<
+ (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS);
+ ptr += PAGE_FRAME_L48_WIDTH_BYTES;
+
+ /* RQ addr: low 48 bits of frame address */
+ shmem = (uint64_t *)ptr;
+ frame_addr = PHYS_PFN(rq_addr);
+ *shmem = frame_addr & PAGE_FRAME_L48_MASK;
+ all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) <<
+ (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS);
+ ptr += PAGE_FRAME_L48_WIDTH_BYTES;
+
+ /* SQ addr: low 48 bits of frame address */
+ shmem = (uint64_t *)ptr;
+ frame_addr = PHYS_PFN(sq_addr);
+ *shmem = frame_addr & PAGE_FRAME_L48_MASK;
+ all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) <<
+ (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS);
+ ptr += PAGE_FRAME_L48_WIDTH_BYTES;
+
+ /* High 4 bits of the four frame addresses */
+ *((uint16_t *)ptr) = all_addr_h4bits;
+ ptr += sizeof(uint16_t);
+
+ /* EQ MSIX vector number */
+ *((uint16_t *)ptr) = (uint16_t)eq_msix_index;
+ ptr += sizeof(uint16_t);
+
+ /* 32-bit protocol header in final dword */
+ *((uint32_t *)ptr) = 0;
+
+ hdr = (union smc_proto_hdr *)ptr;
+ hdr->msg_type = SMC_MSG_TYPE_ESTABLISH_HWC;
+ hdr->msg_version = SMC_MSG_TYPE_ESTABLISH_HWC_VERSION;
+ hdr->direction = SMC_MSG_DIRECTION_REQUEST;
+ hdr->reset_vf = reset_vf;
+
+	/* Write the 256-bit message buffer to shared memory (the final
+	 * 32-bit write triggers the HW to set the possession bit to PF).
+ */
+ dword = (uint32_t *)shm_buf;
+ for (i = 0; i < SMC_APERTURE_DWORDS; i++) {
+ mana_dbg(NULL, "write shm_buf %d, val: 0x%x\n",
+ i, *dword);
+ writel((char *)sc->base + i * SMC_BASIC_UNIT, *dword++);
+ }
+
+ /* Read shmem response (polling for VF possession) and validate.
+	 * For setup, waiting for the response on shared memory is not
+	 * strictly necessary, since the driver later waits for the results
+	 * to appear in EQEs.
+ */
+ err = mana_smc_read_response(sc, SMC_MSG_TYPE_ESTABLISH_HWC,
+ SMC_MSG_TYPE_ESTABLISH_HWC_VERSION, reset_vf);
+ if (err) {
+ device_printf(sc->dev,
+ "Error when setting up HWC: %d\n", err);
+ return err;
+ }
+
+ return 0;
+}
+
+int
+mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf)
+{
+ union smc_proto_hdr hdr = {};
+ int err;
+
+	/* Ensure VF already has possession of shared memory */
+ err = mana_smc_poll_register(sc->base, false);
+ if (err) {
+ device_printf(sc->dev, "Timeout when tearing down HWC\n");
+ return err;
+ }
+
+ /* Set up protocol header for HWC destroy message */
+ hdr.msg_type = SMC_MSG_TYPE_DESTROY_HWC;
+ hdr.msg_version = SMC_MSG_TYPE_DESTROY_HWC_VERSION;
+ hdr.direction = SMC_MSG_DIRECTION_REQUEST;
+ hdr.reset_vf = reset_vf;
+
+ /* Write message in high 32 bits of 256-bit shared memory, causing HW
+ * to set possession bit to PF.
+ */
+ writel((char *)sc->base + SMC_LAST_DWORD * SMC_BASIC_UNIT,
+ hdr.as_uint32);
+
+ /* Read shmem response (polling for VF possession) and validate.
+ * For teardown, waiting for response is required to ensure hardware
+ * invalidates MST entries before software frees memory.
+ */
+ err = mana_smc_read_response(sc, SMC_MSG_TYPE_DESTROY_HWC,
+ SMC_MSG_TYPE_DESTROY_HWC_VERSION, reset_vf);
+ if (err) {
+ device_printf(sc->dev,
+ "Error when tearing down HWC: %d\n", err);
+ return err;
+ }
+
+ return 0;
+}
diff --git a/sys/dev/mana/shm_channel.h b/sys/dev/mana/shm_channel.h
new file mode 100644
index 000000000000..7836e513a77f
--- /dev/null
+++ b/sys/dev/mana/shm_channel.h
@@ -0,0 +1,52 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _SHM_CHANNEL_H
+#define _SHM_CHANNEL_H
+
+#define __iomem
+
+struct shm_channel {
+ device_t dev;
+ void __iomem *base;
+};
+
+void mana_smc_init(struct shm_channel *sc, device_t dev, void __iomem *base);
+
+int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf,
+ uint64_t eq_addr, uint64_t cq_addr, uint64_t rq_addr, uint64_t sq_addr,
+ uint32_t eq_msix_index);
+
+int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf);
+
+#endif /* _SHM_CHANNEL_H */