author    Hans Petter Selasky <hselasky@FreeBSD.org>  2018-05-29 14:04:57 +0000
committer Hans Petter Selasky <hselasky@FreeBSD.org>  2018-05-29 14:04:57 +0000
commit    38535d6cab17b86db2806866ab9b7a2a30c1ab90 (patch)
tree      2584a59ce27b6eb2b6fff0e6c4bdd910ba0e8b76 /sys/dev
parent    9c7c97c0fff62c3d801f36bd32bb98d5189c862f (diff)
Add support for hardware rate limiting to mlx5en(4).
The hardware rate limiting feature is enabled by the RATELIMIT kernel option. Please refer to the txrtlmt option in ifconfig(8) and the SO_MAX_PACING_RATE socket option for more information. This feature is compatible with hardware TCP segmentation offload (TSO). A set of sysctl(8) knobs under dev.mce.<N>.rate_limit is provided to set up the rate limit table and to fine-tune various rate limit related parameters.

Sponsored by:	Mellanox Technologies
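For illustration, a minimal userspace sketch of enabling pacing on a socket (not part of this commit; it assumes SO_MAX_PACING_RATE takes a 32-bit value in bytes per second, which the driver converts to bits per second in mlx5e_rl_worker() below):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Pace a connected TCP socket at roughly 100 Mbit/s. */
    static int
    set_pacing_rate(int fd)
    {
            uint32_t rate = 100000000 / 8;  /* bytes per second */

            if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
                &rate, sizeof(rate)) == -1) {
                    perror("setsockopt(SO_MAX_PACING_RATE)");
                    return (-1);
            }
            return (0);
    }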
Notes: svn path=/head/; revision=334332
Diffstat (limited to 'sys/dev')
-rw-r--r--  sys/dev/mlx5/driver.h                 38
-rw-r--r--  sys/dev/mlx5/mlx5_core/mlx5_main.c    18
-rw-r--r--  sys/dev/mlx5/mlx5_core/mlx5_rl.c     206
-rw-r--r--  sys/dev/mlx5/mlx5_en/en.h              8
-rw-r--r--  sys/dev/mlx5/mlx5_en/en_rl.h         174
-rw-r--r--  sys/dev/mlx5/mlx5_en/mlx5_en_main.c   34
-rw-r--r--  sys/dev/mlx5/mlx5_en/mlx5_en_rl.c   1539
-rw-r--r--  sys/dev/mlx5/mlx5_en/mlx5_en_tx.c     37
8 files changed, 2053 insertions(+), 1 deletion(-)
diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h
index 4a82fde934e7..70e1927b7287 100644
--- a/sys/dev/mlx5/driver.h
+++ b/sys/dev/mlx5/driver.h
@@ -28,6 +28,8 @@
#ifndef MLX5_DRIVER_H
#define MLX5_DRIVER_H
+#include "opt_ratelimit.h"
+
#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/pci.h>
@@ -500,7 +502,11 @@ struct mlx5_core_health {
struct delayed_work recover_work;
};
+#ifdef RATELIMIT
+#define MLX5_CQ_LINEAR_ARRAY_SIZE (128 * 1024)
+#else
#define MLX5_CQ_LINEAR_ARRAY_SIZE 1024
+#endif
struct mlx5_cq_linear_array_entry {
spinlock_t lock;
@@ -540,6 +546,23 @@ struct mlx5_irq_info {
char name[MLX5_MAX_IRQ_NAME];
};
+#ifdef RATELIMIT
+struct mlx5_rl_entry {
+ u32 rate;
+ u16 burst;
+ u16 index;
+ u32 refcount;
+};
+
+struct mlx5_rl_table {
+ struct mutex rl_lock;
+ u16 max_size;
+ u32 max_rate;
+ u32 min_rate;
+ struct mlx5_rl_entry *rl_entry;
+};
+#endif
+
struct mlx5_priv {
char name[MLX5_MAX_NAME_LEN];
struct mlx5_eq_table eq_table;
@@ -592,6 +615,9 @@ struct mlx5_priv {
struct list_head ctx_list;
spinlock_t ctx_lock;
unsigned long pci_dev_data;
+#ifdef RATELIMIT
+ struct mlx5_rl_table rl_table;
+#endif
};
enum mlx5_device_state {
@@ -1084,5 +1110,17 @@ static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev)
{
return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF);
}
+#ifdef RATELIMIT
+int mlx5_init_rl_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
+int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 *index);
+void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst);
+bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst);
+
+static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev)
+{
+ return !!(dev->priv.rl_table.max_size);
+}
+#endif
#endif /* MLX5_DRIVER_H */
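For orientation, the table API declared above is reference counted: mlx5_rl_add_rate() either reuses an existing (rate, burst) entry or programs a free one, and mlx5_rl_remove_rate() drops the reference, clearing the hardware entry on last release. A hedged in-kernel usage sketch (hypothetical caller, not part of this commit; rates are in the firmware's 1000 bit/s units, as noted in mlx5e_rl_set_default_params() below):

    #include <dev/mlx5/driver.h>

    #ifdef RATELIMIT
    static int
    example_use_rate(struct mlx5_core_dev *dev, u32 rate_kbit, u32 burst)
    {
            u16 index;
            int err;

            if (!mlx5_rl_is_supported(dev))
                    return (-EOPNOTSUPP);
            if (!mlx5_rl_is_in_range(dev, rate_kbit, burst))
                    return (-ERANGE);

            /* take a reference; "index" identifies the rate in SQ contexts */
            err = mlx5_rl_add_rate(dev, rate_kbit, burst, &index);
            if (err)
                    return (err);

            /* ... set packet_pacing_rate_limit_index = index on a SQ ... */

            /* drop the reference when the SQ no longer uses the rate */
            mlx5_rl_remove_rate(dev, rate_kbit, burst);
            return (0);
    }
    #endif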
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_main.c b/sys/dev/mlx5/mlx5_core/mlx5_main.c
index c7406d1413a2..25b789dc8aa4 100644
--- a/sys/dev/mlx5/mlx5_core/mlx5_main.c
+++ b/sys/dev/mlx5/mlx5_core/mlx5_main.c
@@ -905,8 +905,23 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
mlx5_init_srq_table(dev);
mlx5_init_mr_table(dev);
+#ifdef RATELIMIT
+ err = mlx5_init_rl_table(dev);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to init rate limiting\n");
+ goto err_tables_cleanup;
+ }
+#endif
return 0;
+#ifdef RATELIMIT
+err_tables_cleanup:
+ mlx5_cleanup_mr_table(dev);
+ mlx5_cleanup_srq_table(dev);
+ mlx5_cleanup_qp_table(dev);
+ mlx5_cleanup_cq_table(dev);
+#endif
+
err_eq_cleanup:
mlx5_eq_cleanup(dev);
@@ -916,6 +931,9 @@ out:
static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
{
+#ifdef RATELIMIT
+ mlx5_cleanup_rl_table(dev);
+#endif
mlx5_cleanup_mr_table(dev);
mlx5_cleanup_srq_table(dev);
mlx5_cleanup_qp_table(dev);
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_rl.c b/sys/dev/mlx5/mlx5_core/mlx5_rl.c
new file mode 100644
index 000000000000..f3d4cbecfc20
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_core/mlx5_rl.c
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2013-2017, Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <dev/mlx5/driver.h>
+#include "mlx5_core.h"
+
+#ifdef RATELIMIT
+
+/* Find an entry where we can register the given rate.
+ * If the rate already exists, return the entry where it is registered;
+ * otherwise return the first available entry.
+ * If the table is full, return NULL.
+ */
+static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
+ u32 rate, u16 burst)
+{
+ struct mlx5_rl_entry *ret_entry = NULL;
+ struct mlx5_rl_entry *entry;
+ u16 i;
+
+ for (i = 0; i < table->max_size; i++) {
+ entry = table->rl_entry + i;
+ if (entry->rate == rate && entry->burst == burst)
+ return entry;
+ if (ret_entry == NULL && entry->rate == 0)
+ ret_entry = entry;
+ }
+
+ return ret_entry;
+}
+
+static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
+ u32 rate, u32 burst, u16 index)
+{
+ u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0};
+ u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
+
+ MLX5_SET(set_rate_limit_in, in, opcode,
+ MLX5_CMD_OP_SET_RATE_LIMIT);
+ MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
+ MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
+
+ if (MLX5_CAP_QOS(dev, packet_pacing_burst_bound))
+ MLX5_SET(set_rate_limit_in, in, burst_upper_bound, burst);
+
+ return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst)
+{
+ const struct mlx5_rl_table *table = &dev->priv.rl_table;
+
+ return (rate <= table->max_rate && rate >= table->min_rate &&
+ burst <= 65535);
+}
+EXPORT_SYMBOL(mlx5_rl_is_in_range);
+
+int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 *index)
+{
+ struct mlx5_rl_table *table = &dev->priv.rl_table;
+ struct mlx5_rl_entry *entry;
+ int err = 0;
+
+ mutex_lock(&table->rl_lock);
+
+ if (!rate || !mlx5_rl_is_in_range(dev, rate, burst)) {
+ mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n",
+ rate, table->min_rate, table->max_rate);
+ err = -ERANGE;
+ goto out;
+ }
+
+ entry = find_rl_entry(table, rate, burst);
+ if (!entry) {
+ mlx5_core_err(dev, "Max number of %u rates reached\n",
+ table->max_size);
+ err = -ENOSPC;
+ goto out;
+ }
+ if (entry->refcount == 0xFFFFFFFFU) {
+ /* out of refcounts */
+ err = -ENOMEM;
+ goto out;
+ } else if (entry->refcount != 0) {
+ /* rate already configured */
+ entry->refcount++;
+ } else {
+ /* new rate limit */
+ err = mlx5_set_rate_limit_cmd(dev, rate, burst, entry->index);
+ if (err) {
+ mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
+ rate, err);
+ goto out;
+ }
+ entry->rate = rate;
+ entry->burst = burst;
+ entry->refcount = 1;
+ }
+ *index = entry->index;
+
+out:
+ mutex_unlock(&table->rl_lock);
+ return err;
+}
+EXPORT_SYMBOL(mlx5_rl_add_rate);
+
+void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst)
+{
+ struct mlx5_rl_table *table = &dev->priv.rl_table;
+ struct mlx5_rl_entry *entry = NULL;
+
+ /* 0 is a reserved value for unlimited rate */
+ if (rate == 0)
+ return;
+
+ mutex_lock(&table->rl_lock);
+ entry = find_rl_entry(table, rate, burst);
+ if (!entry || !entry->refcount) {
+ mlx5_core_warn(dev, "Rate %u is not configured\n", rate);
+ goto out;
+ }
+
+ entry->refcount--;
+ if (!entry->refcount) {
+ /* need to remove rate */
+ mlx5_set_rate_limit_cmd(dev, 0, 0, entry->index);
+ entry->rate = 0;
+ entry->burst = 0;
+ }
+
+out:
+ mutex_unlock(&table->rl_lock);
+}
+EXPORT_SYMBOL(mlx5_rl_remove_rate);
+
+int mlx5_init_rl_table(struct mlx5_core_dev *dev)
+{
+ struct mlx5_rl_table *table = &dev->priv.rl_table;
+ int i;
+
+ mutex_init(&table->rl_lock);
+ if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) {
+ table->max_size = 0;
+ return 0;
+ }
+
+ /* First entry is reserved for unlimited rate */
+ table->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
+ table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate);
+ table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate);
+
+ table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry),
+ GFP_KERNEL);
+ if (!table->rl_entry)
+ return -ENOMEM;
+
+ /* The index represents the index in HW rate limit table
+ * Index 0 is reserved for unlimited rate
+ */
+ for (i = 0; i < table->max_size; i++)
+ table->rl_entry[i].index = i + 1;
+
+ return 0;
+}
+
+void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+{
+ struct mlx5_rl_table *table = &dev->priv.rl_table;
+ int i;
+
+ /* Clear all configured rates */
+ for (i = 0; i < table->max_size; i++)
+ if (table->rl_entry[i].rate)
+ mlx5_set_rate_limit_cmd(dev, 0, 0,
+ table->rl_entry[i].index);
+
+ kfree(dev->priv.rl_table.rl_entry);
+}
+
+#endif
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index b5000c32eafd..9afe61389a8d 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -49,6 +49,7 @@
#include <netinet/udp.h>
#include <net/ethernet.h>
#include <sys/buf_ring.h>
+#include <sys/kthread.h>
#include "opt_rss.h"
@@ -711,6 +712,10 @@ struct mlx5e_flow_tables {
struct mlx5e_flow_table inner_rss;
};
+#ifdef RATELIMIT
+#include "en_rl.h"
+#endif
+
#define MLX5E_TSTMP_PREC 10
struct mlx5e_clbr_point {
@@ -778,6 +783,9 @@ struct mlx5e_priv {
int media_active_last;
struct callout watchdog;
+#ifdef RATELIMIT
+ struct mlx5e_rl_priv_data rl;
+#endif
struct callout tstmp_clbr;
int clbr_done;
diff --git a/sys/dev/mlx5/mlx5_en/en_rl.h b/sys/dev/mlx5/mlx5_en/en_rl.h
new file mode 100644
index 000000000000..4e2c6c539857
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_en/en_rl.h
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __MLX5_EN_RL_H__
+#define __MLX5_EN_RL_H__
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/interrupt.h>
+#include <sys/unistd.h>
+
+#include <sys/queue.h>
+
+#define MLX5E_RL_MAX_WORKERS 128 /* limited by Toeplitz hash */
+#define MLX5E_RL_MAX_TX_RATES (64 * 1024) /* software limit */
+#define MLX5E_RL_DEF_SQ_PER_WORKER (12 * 1024) /* software limit */
+#define MLX5E_RL_MAX_SQS (120 * 1024) /* software limit */
+
+#define MLX5E_RL_TX_COAL_USEC_DEFAULT 32
+#define MLX5E_RL_TX_COAL_PKTS_DEFAULT 4
+#define MLX5E_RL_TX_COAL_MODE_DEFAULT 0
+#define MLX5E_RL_TX_COMP_FACT_DEFAULT 1
+
+#define MLX5E_RL_WORKER_LOCK(rlw) mtx_lock(&(rlw)->mtx)
+#define MLX5E_RL_WORKER_UNLOCK(rlw) mtx_unlock(&(rlw)->mtx)
+
+#define MLX5E_RL_RLOCK(rl) sx_slock(&(rl)->rl_sxlock)
+#define MLX5E_RL_RUNLOCK(rl) sx_sunlock(&(rl)->rl_sxlock)
+
+#define MLX5E_RL_WLOCK(rl) sx_xlock(&(rl)->rl_sxlock)
+#define MLX5E_RL_WUNLOCK(rl) sx_xunlock(&(rl)->rl_sxlock)
+
+#define MLX5E_RL_PARAMS(m) \
+ m(+1, u64 tx_queue_size, "tx_queue_size", "Default send queue size") \
+ m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining TX packets") \
+ m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of TX packets to join") \
+ m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
+ m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
+ m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
+ m(+1, u64 tx_worker_threads_max, "tx_worker_threads_max", "Max number of TX worker threads") \
+ m(+1, u64 tx_worker_threads_def, "tx_worker_threads_def", "Default number of TX worker threads") \
+ m(+1, u64 tx_channels_per_worker_max, "tx_channels_per_worker_max", "Max number of TX channels per worker") \
+ m(+1, u64 tx_channels_per_worker_def, "tx_channels_per_worker_def", "Default number of TX channels per worker") \
+ m(+1, u64 tx_rates_max, "tx_rates_max", "Max number of TX rates") \
+ m(+1, u64 tx_rates_def, "tx_rates_def", "Default number of TX rates") \
+ m(+1, u64 tx_limit_min, "tx_limit_min", "Minimum TX rate in bits/s") \
+ m(+1, u64 tx_limit_max, "tx_limit_max", "Maximum TX rate in bits/s") \
+ m(+1, u64 tx_burst_size, "tx_burst_size", "Current burst size in number of packets. A value of zero means use firmware default.") \
+ m(+1, u64 tx_burst_size_max, "tx_burst_size_max", "Maximum burst size in number of packets") \
+ m(+1, u64 tx_burst_size_min, "tx_burst_size_min", "Minimum burst size in number of packets")
+
+#define MLX5E_RL_PARAMS_NUM (0 MLX5E_RL_PARAMS(MLX5E_STATS_COUNT))
+
+#define MLX5E_RL_STATS(m) \
+ m(+1, u64 tx_allocate_resource_failure, "tx_allocate_resource_failure", "Number of times firmware resource allocation failed") \
+ m(+1, u64 tx_add_new_rate_failure, "tx_add_new_rate_failure", "Number of times adding a new firmware rate failed") \
+ m(+1, u64 tx_modify_rate_failure, "tx_modify_rate_failure", "Number of times modifying a firmware rate failed") \
+ m(+1, u64 tx_active_connections, "tx_active_connections", "Number of active connections") \
+ m(+1, u64 tx_open_queues, "tx_open_queues", "Number of open TX queues") \
+ m(+1, u64 tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available")
+
+#define MLX5E_RL_STATS_NUM (0 MLX5E_RL_STATS(MLX5E_STATS_COUNT))
+
+#define MLX5E_RL_TABLE_PARAMS(m) \
+ m(+1, u64 tx_limit_add, "tx_limit_add", "Add TX rate limit in bits/s to empty slot") \
+ m(+1, u64 tx_limit_clr, "tx_limit_clr", "Clear all TX rates in table") \
+ m(+1, u64 tx_allowed_deviation, "tx_allowed_deviation", "Relative rate deviation allowed in 1/1000") \
+ m(+1, u64 tx_allowed_deviation_min, "tx_allowed_deviation_min", "Minimum allowed rate deviation in 1/1000") \
+ m(+1, u64 tx_allowed_deviation_max, "tx_allowed_deviation_max", "Maximum allowed rate deviation in 1/1000")
+
+#define MLX5E_RL_TABLE_PARAMS_NUM (0 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_COUNT))
+
+#define MLX5E_RL_PARAMS_INDEX(n) \
+ (__offsetof(struct mlx5e_rl_params, n) / sizeof(uint64_t))
+
+struct mlx5e_priv;
+
+/* Indicates channel's state */
+enum {
+ MLX5E_RL_ST_FREE,
+ MLX5E_RL_ST_USED,
+ MLX5E_RL_ST_MODIFY,
+ MLX5E_RL_ST_DESTROY,
+};
+
+struct mlx5e_rl_stats {
+ u64 arg [0];
+ MLX5E_RL_STATS(MLX5E_STATS_VAR)
+};
+
+struct mlx5e_rl_params {
+ u64 arg [0];
+ MLX5E_RL_PARAMS(MLX5E_STATS_VAR)
+ u64 table_arg [0];
+ MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_VAR)
+};
+
+struct mlx5e_rl_channel_param {
+ struct mlx5e_sq_param sq;
+ struct mlx5e_cq_param cq;
+};
+
+struct mlx5e_rl_channel {
+ struct m_snd_tag m_snd_tag;
+ STAILQ_ENTRY(mlx5e_rl_channel) entry;
+ struct mlx5e_sq * volatile sq;
+ struct mlx5e_rl_worker *worker;
+ uint64_t new_rate;
+ uint64_t init_rate;
+ uint64_t last_rate;
+ uint16_t last_burst;
+ uint16_t state;
+};
+
+struct mlx5e_rl_worker {
+ struct mtx mtx;
+ struct cv cv;
+ STAILQ_HEAD(, mlx5e_rl_channel) index_list_head;
+ STAILQ_HEAD(, mlx5e_rl_channel) process_head;
+ struct mlx5e_priv *priv;
+ struct mlx5e_rl_channel *channels;
+ unsigned worker_done;
+};
+
+struct mlx5e_rl_priv_data {
+ struct sx rl_sxlock;
+ struct sysctl_ctx_list ctx;
+ struct mlx5e_rl_channel_param chan_param;
+ struct mlx5e_rl_params param;
+ struct mlx5e_rl_stats stats;
+ struct mlx5_uar sq_uar;
+ struct mlx5e_rl_worker *workers;
+ struct mlx5e_priv *priv;
+ uint64_t *rate_limit_table;
+ unsigned opened;
+ uint32_t tisn;
+};
+
+int mlx5e_rl_init(struct mlx5e_priv *priv);
+void mlx5e_rl_cleanup(struct mlx5e_priv *priv);
+if_snd_tag_alloc_t mlx5e_rl_snd_tag_alloc;
+if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
+if_snd_tag_query_t mlx5e_rl_snd_tag_query;
+if_snd_tag_free_t mlx5e_rl_snd_tag_free;
+
+#endif /* __MLX5_EN_RL_H__ */
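A note on the m(...) lists above: they are X-macros, expanded once per use site. MLX5E_STATS_COUNT turns each entry into its +1 argument (yielding the _NUM counts), MLX5E_STATS_VAR into a struct member, and MLX5E_STATS_DESC into name/description string pairs consumed by mlx5_en_rl.c below. A simplified standalone sketch of the same technique (illustrative names, not the driver's actual helper macros):

    #include <stdint.h>
    typedef uint64_t u64;

    #define PARAMS(m) \
            m(+1, u64 tx_queue_size, "tx_queue_size", "Default send queue size") \
            m(+1, u64 tx_rates_max, "tx_rates_max", "Max number of TX rates")

    #define COUNT(n, field, name, desc) n           /* each entry adds +1 */
    #define VAR(n, field, name, desc)   field;      /* each entry declares a member */
    #define DESC(n, field, name, desc)  name, desc, /* each entry yields two strings */

    #define PARAMS_NUM (0 PARAMS(COUNT))            /* (0 +1 +1) == 2 */

    struct params {
            u64 arg[0];     /* mirrors the driver: indexed access by offset */
            PARAMS(VAR)
    };

    static const char *params_desc[] = { PARAMS(DESC) };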
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index c2c4b0d77449..ed46451cf9ed 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -3507,6 +3507,13 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
ifp->if_capabilities |= IFCAP_LRO;
ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
+#ifdef RATELIMIT
+ ifp->if_capabilities |= IFCAP_TXRTLMT;
+ ifp->if_snd_tag_alloc = mlx5e_rl_snd_tag_alloc;
+ ifp->if_snd_tag_free = mlx5e_rl_snd_tag_free;
+ ifp->if_snd_tag_modify = mlx5e_rl_snd_tag_modify;
+ ifp->if_snd_tag_query = mlx5e_rl_snd_tag_query;
+#endif
/* set TSO limits so that we don't have to drop TX packets */
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
@@ -3588,6 +3595,14 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
random_ether_addr(dev_addr);
if_printf(ifp, "Assigned random MAC address\n");
}
+#ifdef RATELIMIT
+ err = mlx5e_rl_init(priv);
+ if (err) {
+ if_printf(ifp, "%s: mlx5e_rl_init failed, %d\n",
+ __func__, err);
+ goto err_create_mkey;
+ }
+#endif
/* set default MTU */
mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu);
@@ -3673,6 +3688,10 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
return (priv);
+#ifdef RATELIMIT
+err_create_mkey:
+ mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
+#endif
err_dealloc_transport_domain:
mlx5_dealloc_transport_domain(mdev, priv->tdn);
@@ -3715,6 +3734,18 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
/* XXX wait a bit to allow IOCTL handlers to complete */
pause("W", hz);
+#ifdef RATELIMIT
+ /*
+ * The kernel can have reference(s) via the m_snd_tag's into
+ * the ratelimit channels, and these must go away before
+ * detaching:
+ */
+ while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) {
+ if_printf(priv->ifp, "Waiting for all ratelimit connections "
+ "to terminate\n");
+ pause("W", hz);
+ }
+#endif
/* stop watchdog timer */
callout_drain(&priv->watchdog);
@@ -3735,6 +3766,9 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
ether_ifdetach(ifp);
if_free(ifp);
+#ifdef RATELIMIT
+ mlx5e_rl_cleanup(priv);
+#endif
/* destroy all remaining sysctl nodes */
if (priv->sysctl_debug)
sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
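For context, the ifnet methods registered in mlx5e_create_ifp() above are invoked by the kernel when a connection requests pacing. A hedged sketch of the allocation call (field names taken from mlx5e_rl_snd_tag_alloc() in mlx5_en_rl.c below; the surrounding caller is hypothetical):

    #ifdef RATELIMIT
    static int
    example_alloc_snd_tag(struct ifnet *ifp, uint32_t flowid,
        uint64_t bytes_per_sec, struct m_snd_tag **ppmt)
    {
            union if_snd_tag_alloc_params params;

            memset(&params, 0, sizeof(params));
            params.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT;
            params.rate_limit.hdr.flowid = flowid;  /* picks the worker thread */
            params.rate_limit.max_rate = bytes_per_sec;

            if (ifp->if_snd_tag_alloc == NULL)
                    return (EOPNOTSUPP);
            /* on success, *ppmt is attached to outgoing mbufs as m_pkthdr.snd_tag */
            return (ifp->if_snd_tag_alloc(ifp, &params, ppmt));
    }
    #endif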
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
new file mode 100644
index 000000000000..051420373ac7
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
@@ -0,0 +1,1539 @@
+/*-
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "en.h"
+
+#ifdef RATELIMIT
+
+static int mlx5e_rl_open_workers(struct mlx5e_priv *);
+static void mlx5e_rl_close_workers(struct mlx5e_priv *);
+static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
+static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
+ struct sysctl_oid *, const char *name, const char *desc);
+static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
+ struct sysctl_oid *node, const char *name, const char *desc);
+static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
+static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
+
+static void
+mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
+ struct mlx5e_sq_param *param)
+{
+ void *sqc = param->sqc;
+ void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+ uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+
+ MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
+ MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+ MLX5_SET(wq, wq, pd, rl->priv->pdn);
+
+ param->wq.buf_numa_node = 0;
+ param->wq.db_numa_node = 0;
+ param->wq.linear = 1;
+}
+
+static void
+mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
+ struct mlx5e_cq_param *param)
+{
+ void *cqc = param->cqc;
+ uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+
+ MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
+ MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
+ MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
+
+ switch (rl->param.tx_coalesce_mode) {
+ case 0:
+ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+ break;
+ default:
+ if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
+ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
+ else
+ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+ break;
+ }
+}
+
+static void
+mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
+ struct mlx5e_rl_channel_param *cparam)
+{
+ memset(cparam, 0, sizeof(*cparam));
+
+ mlx5e_rl_build_sq_param(rl, &cparam->sq);
+ mlx5e_rl_build_cq_param(rl, &cparam->cq);
+}
+
+static int
+mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
+ struct mlx5e_sq_param *param, int ix)
+{
+ struct mlx5_core_dev *mdev = priv->mdev;
+ void *sqc = param->sqc;
+ void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
+ int err;
+
+ /* Create DMA descriptor TAG */
+ if ((err = -bus_dma_tag_create(
+ bus_get_dma_tag(mdev->pdev->dev.bsddev),
+ 1, /* any alignment */
+ 0, /* no boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */
+ MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */
+ MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */
+ 0, /* flags */
+ NULL, NULL, /* lockfunc, lockfuncarg */
+ &sq->dma_tag)))
+ goto done;
+
+ /* use shared UAR */
+ sq->uar = priv->rl.sq_uar;
+
+ err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
+ &sq->wq_ctrl);
+ if (err)
+ goto err_free_dma_tag;
+
+ sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
+ /*
+ * The sq->bf_buf_size variable is intentionally left zero so
+ * that the doorbell writes will occur at the same memory
+ * location.
+ */
+
+ err = mlx5e_alloc_sq_db(sq);
+ if (err)
+ goto err_sq_wq_destroy;
+
+ sq->mkey_be = cpu_to_be32(priv->mr.key);
+ sq->ifp = priv->ifp;
+ sq->priv = priv;
+
+ return (0);
+
+err_sq_wq_destroy:
+ mlx5_wq_destroy(&sq->wq_ctrl);
+err_free_dma_tag:
+ bus_dma_tag_destroy(sq->dma_tag);
+done:
+ return (err);
+}
+
+static void
+mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
+{
+
+ mlx5e_free_sq_db(sq);
+ mlx5_wq_destroy(&sq->wq_ctrl);
+}
+
+static int
+mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
+ struct mlx5e_sq_param *param, int ix)
+{
+ int err;
+
+ err = mlx5e_rl_create_sq(priv, sq, param, ix);
+ if (err)
+ return (err);
+
+ err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
+ if (err)
+ goto err_destroy_sq;
+
+ err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
+ if (err)
+ goto err_disable_sq;
+
+ return (0);
+
+err_disable_sq:
+ mlx5e_disable_sq(sq);
+err_destroy_sq:
+ mlx5e_rl_destroy_sq(sq);
+
+ return (err);
+}
+
+static void
+mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
+{
+ mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
+ mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
+
+ callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+
+ sq->cev_factor = priv->rl.param.tx_completion_fact;
+
+ /* ensure the TX completion event factor is not zero */
+ if (sq->cev_factor == 0)
+ sq->cev_factor = 1;
+}
+
+static int
+mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
+ struct mlx5e_rl_channel_param *cparam,
+ struct mlx5e_sq *volatile *ppsq)
+{
+ struct mlx5e_priv *priv = rlw->priv;
+ struct mlx5e_sq *sq;
+ int err;
+
+ sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
+
+ /* init mutexes */
+ mlx5e_rl_chan_mtx_init(priv, sq);
+
+ /* open TX completion queue */
+ err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
+ &mlx5e_tx_cq_comp, eq_ix);
+ if (err)
+ goto err_free;
+
+ err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
+ if (err)
+ goto err_close_tx_cq;
+
+ /* store TX channel pointer */
+ *ppsq = sq;
+
+ /* poll TX queue initially */
+ sq->cq.mcq.comp(&sq->cq.mcq);
+
+ return (0);
+
+err_close_tx_cq:
+ mlx5e_close_cq(&sq->cq);
+
+err_free:
+ /* destroy mutexes */
+ mtx_destroy(&sq->lock);
+ mtx_destroy(&sq->comp_lock);
+ free(sq, M_MLX5EN);
+ atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
+ return (err);
+}
+
+static void
+mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
+{
+ struct mlx5e_sq *sq = *ppsq;
+
+ /* check if channel is already closed */
+ if (sq == NULL)
+ return;
+ /* ensure channel pointer is no longer used */
+ *ppsq = NULL;
+
+ /* teardown and destroy SQ */
+ mlx5e_drain_sq(sq);
+ mlx5e_disable_sq(sq);
+ mlx5e_rl_destroy_sq(sq);
+
+ /* close CQ */
+ mlx5e_close_cq(&sq->cq);
+
+ /* destroy mutexes */
+ mtx_destroy(&sq->lock);
+ mtx_destroy(&sq->comp_lock);
+
+ free(sq, M_MLX5EN);
+}
+
+static void
+mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
+{
+ /*
+ * Limit the maximum distance between completion events to
+ * half of the currently set TX queue size.
+ *
+ * The maximum number of queue entries a single IP packet can
+ * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
+ *
+ * The worst-case maximum value is then given below:
+ */
+ uint64_t max = rl->param.tx_queue_size /
+ (2 * MLX5_SEND_WQE_MAX_WQEBBS);
+
+ /*
+ * Update the maximum completion factor value in case the
+ * tx_queue_size field changed. Ensure we don't overflow
+ * 16-bits.
+ */
+ if (max < 1)
+ max = 1;
+ else if (max > 65535)
+ max = 65535;
+ rl->param.tx_completion_fact_max = max;
+
+ /*
+ * Verify that the current TX completion factor is within the
+ * given limits:
+ */
+ if (rl->param.tx_completion_fact < 1)
+ rl->param.tx_completion_fact = 1;
+ else if (rl->param.tx_completion_fact > max)
+ rl->param.tx_completion_fact = max;
+}
+
+static int
+mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
+{
+ struct mlx5e_priv *priv = sq->priv;
+ struct mlx5_core_dev *mdev = priv->mdev;
+
+ void *in;
+ void *sqc;
+ int inlen;
+ int err;
+
+ inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
+ in = mlx5_vzalloc(inlen);
+ if (in == NULL)
+ return (-ENOMEM);
+
+ sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+
+ MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
+ MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
+ MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
+ MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
+ MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
+
+ err = mlx5_core_modify_sq(mdev, in, inlen);
+
+ kvfree(in);
+
+ return (err);
+}
+
+/*
+ * This function searches the configured rate limit table for the
+ * best match, to prevent a single socket-based application from
+ * allocating all the available hardware rates. If the user-selected
+ * rate deviates too much from the closest rate available in the rate
+ * limit table, the unlimited rate is selected.
+ */
+static uint64_t
+mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
+{
+ uint64_t distance = -1ULL;
+ uint64_t diff;
+ uint64_t retval = 0; /* unlimited */
+ uint64_t x;
+
+ /* search for closest rate */
+ for (x = 0; x != rl->param.tx_rates_def; x++) {
+ uint64_t rate = rl->rate_limit_table[x];
+ if (rate == 0)
+ continue;
+
+ if (rate > user_rate)
+ diff = rate - user_rate;
+ else
+ diff = user_rate - rate;
+
+ /* check if distance is smaller than previous rate */
+ if (diff < distance) {
+ distance = diff;
+ retval = rate;
+ }
+ }
+
+ /* range check for multiplication below */
+ if (user_rate > rl->param.tx_limit_max)
+ user_rate = rl->param.tx_limit_max;
+
+ /* fallback to unlimited, if rate deviates too much */
+ if (distance > howmany(user_rate *
+ rl->param.tx_allowed_deviation, 1000ULL))
+ retval = 0;
+
+ return (retval);
+}
+
+/*
+ * This function sets the requested rate for a rate limit channel, in
+ * bits per second. The requested rate will be filtered through the
+ * find best rate function above.
+ */
+static int
+mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
+ struct mlx5e_rl_channel *channel, uint64_t rate)
+{
+ struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
+ struct mlx5e_sq *sq;
+ uint64_t temp;
+ uint16_t index;
+ uint16_t burst;
+ int error;
+
+ if (rate != 0) {
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ MLX5E_RL_RLOCK(rl);
+
+ /* get current burst size in bytes */
+ temp = rl->param.tx_burst_size *
+ MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
+
+ /* limit burst size to 64K currently */
+ if (temp > 65535)
+ temp = 65535;
+ burst = temp;
+
+ /* find best rate */
+ rate = mlx5e_rl_find_best_rate_locked(rl, rate);
+
+ MLX5E_RL_RUNLOCK(rl);
+
+ if (rate == 0) {
+ /* rate doesn't exist, fallback to unlimited */
+ error = EINVAL;
+ index = 0;
+ rate = 0;
+ atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
+ } else {
+ /* get a reference on the new rate */
+ error = -mlx5_rl_add_rate(rlw->priv->mdev,
+ howmany(rate, 1000), burst, &index);
+
+ if (error != 0) {
+ /* adding rate failed, fallback to unlimited */
+ index = 0;
+ rate = 0;
+ atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
+ }
+ }
+ MLX5E_RL_WORKER_LOCK(rlw);
+ } else {
+ index = 0;
+ burst = 0; /* default */
+ }
+
+ /* atomically swap rates */
+ temp = channel->last_rate;
+ channel->last_rate = rate;
+ rate = temp;
+
+ /* atomically swap burst size */
+ temp = channel->last_burst;
+ channel->last_burst = burst;
+ burst = temp;
+
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+ /* put reference on the old rate, if any */
+ if (rate != 0) {
+ mlx5_rl_remove_rate(rlw->priv->mdev,
+ howmany(rate, 1000), burst);
+ }
+
+ /* set new rate */
+ sq = channel->sq;
+ if (sq != NULL) {
+ error = mlx5e_rl_modify_sq(sq, index);
+ if (error != 0)
+ atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
+ } else
+ error = 0;
+ MLX5E_RL_WORKER_LOCK(rlw);
+
+ return (-error);
+}
+
+static void
+mlx5e_rl_worker(void *arg)
+{
+ struct thread *td;
+ struct mlx5e_rl_worker *rlw = arg;
+ struct mlx5e_rl_channel *channel;
+ struct mlx5e_priv *priv;
+ unsigned ix;
+ uint64_t x;
+ int error;
+
+ /* set thread priority */
+ td = curthread;
+
+ thread_lock(td);
+ sched_prio(td, PI_SWI(SWI_NET));
+ thread_unlock(td);
+
+ priv = rlw->priv;
+
+ /* compute completion vector */
+ ix = (rlw - priv->rl.workers) %
+ priv->mdev->priv.eq_table.num_comp_vectors;
+
+ /* TODO bind to CPU */
+
+ /* open all the SQs */
+ MLX5E_RL_WORKER_LOCK(rlw);
+ for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
+ struct mlx5e_rl_channel *channel = rlw->channels + x;
+
+#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
+ if (channel->state == MLX5E_RL_ST_FREE)
+ continue;
+#endif
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ MLX5E_RL_RLOCK(&priv->rl);
+ error = mlx5e_rl_open_channel(rlw, ix,
+ &priv->rl.chan_param, &channel->sq);
+ MLX5E_RL_RUNLOCK(&priv->rl);
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ if (error != 0) {
+ if_printf(priv->ifp,
+ "mlx5e_rl_open_channel failed: %d\n", error);
+ break;
+ }
+ mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
+ }
+ while (1) {
+ if (STAILQ_FIRST(&rlw->process_head) == NULL) {
+ /* check if we are tearing down */
+ if (rlw->worker_done != 0)
+ break;
+ cv_wait(&rlw->cv, &rlw->mtx);
+ }
+ /* check if we are tearing down */
+ if (rlw->worker_done != 0)
+ break;
+ channel = STAILQ_FIRST(&rlw->process_head);
+ if (channel != NULL) {
+ STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
+
+ switch (channel->state) {
+ case MLX5E_RL_ST_MODIFY:
+ channel->state = MLX5E_RL_ST_USED;
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ /* create channel on demand */
+ if (channel->sq == NULL) {
+ MLX5E_RL_RLOCK(&priv->rl);
+ error = mlx5e_rl_open_channel(rlw, ix,
+ &priv->rl.chan_param, &channel->sq);
+ MLX5E_RL_RUNLOCK(&priv->rl);
+
+ if (error != 0) {
+ if_printf(priv->ifp,
+ "mlx5e_rl_open_channel failed: %d\n", error);
+ } else {
+ atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
+ }
+ } else {
+ mlx5e_resume_sq(channel->sq);
+ }
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ /* convert from bytes/s to bits/s and set new rate */
+ error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
+ channel->new_rate * 8ULL);
+ if (error != 0) {
+ if_printf(priv->ifp,
+ "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
+ error);
+ }
+ break;
+
+ case MLX5E_RL_ST_DESTROY:
+ error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
+ if (error != 0) {
+ if_printf(priv->ifp,
+ "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
+ error);
+ }
+ if (channel->sq != NULL) {
+ /*
+ * Make sure all packets are
+ * transmitted before SQ is
+ * returned to free list:
+ */
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+ mlx5e_drain_sq(channel->sq);
+ MLX5E_RL_WORKER_LOCK(rlw);
+ }
+ /* put the channel back into the free list */
+ STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
+ channel->state = MLX5E_RL_ST_FREE;
+ atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
+ break;
+ default:
+ /* NOP */
+ break;
+ }
+ }
+ }
+
+ /* close all the SQs */
+ for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
+ struct mlx5e_rl_channel *channel = rlw->channels + x;
+
+ /* update the initial rate */
+ channel->init_rate = channel->last_rate;
+
+ /* make sure we free up the rate resource */
+ mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
+
+ if (channel->sq != NULL) {
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+ mlx5e_rl_close_channel(&channel->sq);
+ atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
+ MLX5E_RL_WORKER_LOCK(rlw);
+ }
+ }
+
+ rlw->worker_done = 0;
+ cv_broadcast(&rlw->cv);
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ kthread_exit();
+}
+
+static int
+mlx5e_rl_open_tis(struct mlx5e_priv *priv)
+{
+ struct mlx5_core_dev *mdev = priv->mdev;
+ u32 in[MLX5_ST_SZ_DW(create_tis_in)];
+ void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+ memset(in, 0, sizeof(in));
+
+ MLX5_SET(tisc, tisc, prio, 0);
+ MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
+
+ return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
+}
+
+static void
+mlx5e_rl_close_tis(struct mlx5e_priv *priv)
+{
+ mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
+}
+
+static void
+mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
+ struct mlx5_core_dev *mdev)
+{
+ /* ratelimit workers */
+ param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
+ param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
+
+ /* range check */
+ if (param->tx_worker_threads_def == 0 ||
+ param->tx_worker_threads_def > param->tx_worker_threads_max)
+ param->tx_worker_threads_def = param->tx_worker_threads_max;
+
+ /* ratelimit channels */
+ param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
+ param->tx_worker_threads_def;
+ param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
+
+ /* range check */
+ if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
+ param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
+
+ /* set default burst size */
+ param->tx_burst_size = 4; /* MTUs */
+
+ /*
+ * Set maximum burst size
+ *
+ * The burst size is multiplied by the MTU and clamped to the
+ * range 0 ... 65535 bytes inclusively before being fed to the
+ * firmware.
+ *
+ * NOTE: If the burst size or MTU is changed, only ratelimit
+ * connections made after the change will use the new burst
+ * size.
+ */
+ param->tx_burst_size_max = 255;
+
+ /* get firmware rate limits in 1000bit/s and convert them to bit/s */
+ param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
+ param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
+
+ /* ratelimit table size */
+ param->tx_rates_max = mdev->priv.rl_table.max_size;
+
+ /* range check */
+ if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
+ param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
+
+ /* set default number of rates */
+ param->tx_rates_def = param->tx_rates_max;
+
+ /* set maximum allowed rate deviation */
+ if (param->tx_limit_max != 0) {
+ /*
+ * Make sure the deviation multiplication doesn't
+ * overflow unsigned 64-bit:
+ */
+ param->tx_allowed_deviation_max = -1ULL /
+ param->tx_limit_max;
+ }
+ /* set default rate deviation */
+ param->tx_allowed_deviation = 50; /* 5.0% */
+
+ /* channel parameters */
+ param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
+ param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
+ param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
+ param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
+ param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
+}
+
+static const char *mlx5e_rl_params_desc[] = {
+ MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
+};
+
+static const char *mlx5e_rl_table_params_desc[] = {
+ MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
+};
+
+static const char *mlx5e_rl_stats_desc[] = {
+ MLX5E_RL_STATS(MLX5E_STATS_DESC)
+};
+
+int
+mlx5e_rl_init(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rl_priv_data *rl = &priv->rl;
+ struct sysctl_oid *node;
+ struct sysctl_oid *stats;
+ char buf[64];
+ uint64_t i;
+ uint64_t j;
+ int error;
+
+ /* check if there is support for packet pacing */
+ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
+ return (0);
+
+ rl->priv = priv;
+
+ sysctl_ctx_init(&rl->ctx);
+
+ sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
+
+ /* allocate shared UAR for SQs */
+ error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
+ if (error)
+ goto done;
+
+ /* open own TIS domain for ratelimit SQs */
+ error = mlx5e_rl_open_tis(priv);
+ if (error)
+ goto err_uar;
+
+ /* setup default value for parameters */
+ mlx5e_rl_set_default_params(&rl->param, priv->mdev);
+
+ /* update the completion factor */
+ mlx5e_rl_sync_tx_completion_fact(rl);
+
+ /* create root node */
+ node = SYSCTL_ADD_NODE(&rl->ctx,
+ SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
+ "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support");
+
+ if (node != NULL) {
+ /* create SYSCTLs */
+ for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
+ mlx5e_rl_sysctl_add_u64_oid(rl,
+ MLX5E_RL_PARAMS_INDEX(arg[i]),
+ node, mlx5e_rl_params_desc[2 * i],
+ mlx5e_rl_params_desc[2 * i + 1]);
+ }
+
+ stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
+ OID_AUTO, "stats", CTLFLAG_RD, NULL,
+ "Rate limiting statistics");
+ if (stats != NULL) {
+ /* create SYSCTLs */
+ for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
+ mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
+ stats, mlx5e_rl_stats_desc[2 * i],
+ mlx5e_rl_stats_desc[2 * i + 1]);
+ }
+ }
+ }
+
+ /* allocate workers array */
+ rl->workers = malloc(sizeof(rl->workers[0]) *
+ rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
+
+ /* allocate rate limit array */
+ rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
+ rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
+
+ if (node != NULL) {
+ /* create more SYSCTLs */
+ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
+ "A", "Show table of all configured TX rates");
+
+ /* try to fetch rate table from kernel environment */
+ for (i = 0; i != rl->param.tx_rates_def; i++) {
+ /* compute path for tunable */
+ snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
+ device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
+ if (TUNABLE_QUAD_FETCH(buf, &j))
+ mlx5e_rl_tx_limit_add(rl, j);
+ }
+
+ /* setup rate table sysctls */
+ for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
+ mlx5e_rl_sysctl_add_u64_oid(rl,
+ MLX5E_RL_PARAMS_INDEX(table_arg[i]),
+ node, mlx5e_rl_table_params_desc[2 * i],
+ mlx5e_rl_table_params_desc[2 * i + 1]);
+ }
+ }
+
+ for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
+ struct mlx5e_rl_worker *rlw = rl->workers + j;
+
+ rlw->priv = priv;
+
+ cv_init(&rlw->cv, "mlx5-worker-cv");
+ mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
+ STAILQ_INIT(&rlw->index_list_head);
+ STAILQ_INIT(&rlw->process_head);
+
+ rlw->channels = malloc(sizeof(rlw->channels[0]) *
+ rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
+ struct mlx5e_rl_channel *channel = rlw->channels + i;
+ channel->worker = rlw;
+ channel->m_snd_tag.ifp = priv->ifp;
+ STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
+ }
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+ }
+
+ PRIV_LOCK(priv);
+ error = mlx5e_rl_open_workers(priv);
+ PRIV_UNLOCK(priv);
+
+ if (error != 0) {
+ if_printf(priv->ifp,
+ "mlx5e_rl_open_workers failed: %d\n", error);
+ }
+
+ return (0);
+
+err_uar:
+ mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
+done:
+ sysctl_ctx_free(&rl->ctx);
+ sx_destroy(&rl->rl_sxlock);
+ return (error);
+}
+
+static int
+mlx5e_rl_open_workers(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rl_priv_data *rl = &priv->rl;
+ struct thread *rl_thread = NULL;
+ struct proc *rl_proc = NULL;
+ uint64_t j;
+ int error;
+
+ if (priv->gone || rl->opened)
+ return (-EINVAL);
+
+ MLX5E_RL_WLOCK(rl);
+ /* compute channel parameters once */
+ mlx5e_rl_build_channel_param(rl, &rl->chan_param);
+ MLX5E_RL_WUNLOCK(rl);
+
+ for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
+ struct mlx5e_rl_worker *rlw = rl->workers + j;
+
+ /* start worker thread */
+ error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
+ RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
+ if (error != 0) {
+ if_printf(rl->priv->ifp,
+ "kproc_kthread_add failed: %d\n", error);
+ rlw->worker_done = 1;
+ }
+ }
+
+ rl->opened = 1;
+
+ return (0);
+}
+
+static void
+mlx5e_rl_close_workers(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rl_priv_data *rl = &priv->rl;
+ uint64_t y;
+
+ if (rl->opened == 0)
+ return;
+
+ /* tear down worker threads simultaneously */
+ for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
+ struct mlx5e_rl_worker *rlw = rl->workers + y;
+
+ /* tear down worker before freeing SQs */
+ MLX5E_RL_WORKER_LOCK(rlw);
+ if (rlw->worker_done == 0) {
+ rlw->worker_done = 1;
+ cv_broadcast(&rlw->cv);
+ } else {
+ /* XXX thread not started */
+ rlw->worker_done = 0;
+ }
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+ }
+
+ /* wait for worker threads to exit */
+ for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
+ struct mlx5e_rl_worker *rlw = rl->workers + y;
+
+ /* tear down worker before freeing SQs */
+ MLX5E_RL_WORKER_LOCK(rlw);
+ while (rlw->worker_done != 0)
+ cv_wait(&rlw->cv, &rlw->mtx);
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+ }
+
+ rl->opened = 0;
+}
+
+static void
+mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
+{
+ unsigned x;
+
+ MLX5E_RL_WLOCK(rl);
+ for (x = 0; x != rl->param.tx_rates_def; x++)
+ rl->rate_limit_table[x] = 0;
+ MLX5E_RL_WUNLOCK(rl);
+}
+
+void
+mlx5e_rl_cleanup(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rl_priv_data *rl = &priv->rl;
+ uint64_t y;
+
+ /* check if there is support for packet pacing */
+ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
+ return;
+
+ sysctl_ctx_free(&rl->ctx);
+
+ PRIV_LOCK(priv);
+ mlx5e_rl_close_workers(priv);
+ PRIV_UNLOCK(priv);
+
+ mlx5e_rl_reset_rates(rl);
+
+ /* free shared UAR for SQs */
+ mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
+
+ /* close TIS domain */
+ mlx5e_rl_close_tis(priv);
+
+ for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
+ struct mlx5e_rl_worker *rlw = rl->workers + y;
+
+ cv_destroy(&rlw->cv);
+ mtx_destroy(&rlw->mtx);
+ free(rlw->channels, M_MLX5EN);
+ }
+ free(rl->rate_limit_table, M_MLX5EN);
+ free(rl->workers, M_MLX5EN);
+ sx_destroy(&rl->rl_sxlock);
+}
+
+static void
+mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
+ struct mlx5e_rl_channel *channel)
+{
+ STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
+ cv_broadcast(&rlw->cv);
+}
+
+static void
+mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
+{
+ if (channel == NULL)
+ return;
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ switch (channel->state) {
+ case MLX5E_RL_ST_MODIFY:
+ channel->state = MLX5E_RL_ST_DESTROY;
+ break;
+ case MLX5E_RL_ST_USED:
+ channel->state = MLX5E_RL_ST_DESTROY;
+ mlx5e_rlw_queue_channel_locked(rlw, channel);
+ break;
+ default:
+ break;
+ }
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+}
+
+static int
+mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
+{
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ channel->new_rate = rate;
+ switch (channel->state) {
+ case MLX5E_RL_ST_USED:
+ channel->state = MLX5E_RL_ST_MODIFY;
+ mlx5e_rlw_queue_channel_locked(rlw, channel);
+ break;
+ default:
+ break;
+ }
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ return (0);
+}
+
+static int
+mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t *prate)
+{
+ int retval;
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ switch (channel->state) {
+ case MLX5E_RL_ST_USED:
+ *prate = channel->last_rate;
+ retval = 0;
+ break;
+ case MLX5E_RL_ST_MODIFY:
+ retval = EBUSY;
+ break;
+ default:
+ retval = EINVAL;
+ break;
+ }
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ return (retval);
+}
+
+static int
+mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
+ struct mlx5e_rl_channel **pchannel)
+{
+ struct mlx5e_rl_channel *channel;
+ int retval = ENOMEM;
+
+ MLX5E_RL_WORKER_LOCK(rlw);
+ /* Check for available channel in free list */
+ if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
+ retval = 0;
+ /* Remove head index from available list */
+ STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
+ channel->state = MLX5E_RL_ST_USED;
+ atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
+ } else {
+ atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
+ }
+ MLX5E_RL_WORKER_UNLOCK(rlw);
+
+ *pchannel = channel;
+#ifdef RATELIMIT_DEBUG
+ if_printf(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel);
+#endif
+ return (retval);
+}
+
+int
+mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
+ union if_snd_tag_alloc_params *params,
+ struct m_snd_tag **ppmt)
+{
+ struct mlx5e_rl_channel *channel;
+ struct mlx5e_rl_worker *rlw;
+ struct mlx5e_priv *priv;
+ int error;
+
+ priv = ifp->if_softc;
+
+ /* check if there is support for packet pacing or if device is going away */
+ if (!MLX5_CAP_GEN(priv->mdev, qos) ||
+ !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
+ params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
+ return (EOPNOTSUPP);
+
+ /* compute which worker thread this TCP connection belongs to */
+ rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
+ priv->rl.param.tx_worker_threads_def);
+
+ error = mlx5e_find_available_tx_ring_index(rlw, &channel);
+ if (error != 0)
+ goto done;
+
+ error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
+ if (error != 0) {
+ mlx5e_rl_free(rlw, channel);
+ goto done;
+ }
+
+ /* store pointer to mbuf tag */
+ *ppmt = &channel->m_snd_tag;
+done:
+ return (error);
+}
+
+
+int
+mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
+{
+ struct mlx5e_rl_channel *channel =
+ container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);
+
+ return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
+}
+
+int
+mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
+{
+ struct mlx5e_rl_channel *channel =
+ container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);
+
+ return (mlx5e_rl_query(channel->worker, channel, &params->rate_limit.max_rate));
+}
+
+void
+mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
+{
+ struct mlx5e_rl_channel *channel =
+ container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);
+
+ mlx5e_rl_free(channel->worker, channel);
+}
+
+static int
+mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
+{
+ struct mlx5e_rl_priv_data *rl = arg1;
+ struct mlx5e_priv *priv = rl->priv;
+ struct sbuf sbuf;
+ unsigned x;
+ int error;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+
+ PRIV_LOCK(priv);
+
+ sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
+
+ sbuf_printf(&sbuf,
+ "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
+ "\t" "--------------------------------------------\n");
+
+ MLX5E_RL_RLOCK(rl);
+ for (x = 0; x != rl->param.tx_rates_def; x++) {
+ if (rl->rate_limit_table[x] == 0)
+ continue;
+
+ sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
+ x, (unsigned)rl->param.tx_burst_size,
+ (long long)rl->rate_limit_table[x]);
+ }
+ MLX5E_RL_RUNLOCK(rl);
+
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+
+ PRIV_UNLOCK(priv);
+
+ return (error);
+}
+
+static int
+mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
+{
+ uint64_t x;
+ uint64_t y;
+
+ MLX5E_RL_WLOCK(rl);
+ /* compute channel parameters once */
+ mlx5e_rl_build_channel_param(rl, &rl->chan_param);
+ MLX5E_RL_WUNLOCK(rl);
+
+ for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
+ struct mlx5e_rl_worker *rlw = rl->workers + y;
+
+ for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
+ struct mlx5e_rl_channel *channel;
+ struct mlx5e_sq *sq;
+
+ channel = rlw->channels + x;
+ sq = channel->sq;
+
+ if (sq == NULL)
+ continue;
+
+ if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
+ mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
+ rl->param.tx_coalesce_usecs,
+ rl->param.tx_coalesce_pkts,
+ rl->param.tx_coalesce_mode);
+ } else {
+ mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
+ rl->param.tx_coalesce_usecs,
+ rl->param.tx_coalesce_pkts);
+ }
+ }
+ }
+ return (0);
+}
+
+static int
+mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
+{
+ unsigned x;
+ int error;
+
+ if (value < 1000 ||
+ mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
+ return (EINVAL);
+
+ MLX5E_RL_WLOCK(rl);
+ error = ENOMEM;
+
+ /* check if rate already exists */
+ for (x = 0; x != rl->param.tx_rates_def; x++) {
+ if (rl->rate_limit_table[x] != value)
+ continue;
+ error = EEXIST;
+ break;
+ }
+
+ /* check if there is a free rate entry */
+ if (x == rl->param.tx_rates_def) {
+ for (x = 0; x != rl->param.tx_rates_def; x++) {
+ if (rl->rate_limit_table[x] != 0)
+ continue;
+ rl->rate_limit_table[x] = value;
+ error = 0;
+ break;
+ }
+ }
+ MLX5E_RL_WUNLOCK(rl);
+
+ return (error);
+}
+
+static int
+mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
+{
+ unsigned x;
+ int error;
+
+ if (value == 0)
+ return (EINVAL);
+
+ MLX5E_RL_WLOCK(rl);
+
+ /* check if rate already exists */
+ for (x = 0; x != rl->param.tx_rates_def; x++) {
+ if (rl->rate_limit_table[x] != value)
+ continue;
+ /* free up rate */
+ rl->rate_limit_table[x] = 0;
+ break;
+ }
+
+ /* check if there is a free rate entry */
+ if (x == rl->param.tx_rates_def)
+ error = ENOENT;
+ else
+ error = 0;
+ MLX5E_RL_WUNLOCK(rl);
+
+ return (error);
+}
+
+static int
+mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ struct mlx5e_rl_priv_data *rl = arg1;
+ struct mlx5e_priv *priv = rl->priv;
+ unsigned mode_modify;
+ unsigned was_opened;
+ uint64_t value;
+ uint64_t old;
+ int error;
+
+ PRIV_LOCK(priv);
+
+ MLX5E_RL_RLOCK(rl);
+ value = rl->param.arg[arg2];
+ MLX5E_RL_RUNLOCK(rl);
+
+ if (req != NULL) {
+ old = value;
+ error = sysctl_handle_64(oidp, &value, 0, req);
+ if (error || req->newptr == NULL ||
+ value == rl->param.arg[arg2])
+ goto done;
+ } else {
+ old = 0;
+ error = 0;
+ }
+
+ /* check if device is gone */
+ if (priv->gone) {
+ error = ENXIO;
+ goto done;
+ }
+ was_opened = rl->opened;
+ mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
+
+ switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
+ case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
+ if (value > rl->param.tx_worker_threads_max)
+ value = rl->param.tx_worker_threads_max;
+ else if (value < 1)
+ value = 1;
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
+ if (value > rl->param.tx_channels_per_worker_max)
+ value = rl->param.tx_channels_per_worker_max;
+ else if (value < 1)
+ value = 1;
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
+ if (value > rl->param.tx_rates_max)
+ value = rl->param.tx_rates_max;
+ else if (value < 1)
+ value = 1;
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
+ /* range check */
+ if (value < 1)
+ value = 0;
+ else if (value > MLX5E_FLD_MAX(cqc, cq_period))
+ value = MLX5E_FLD_MAX(cqc, cq_period);
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+
+ /* avoid bringing the network interface down and up */
+ if (was_opened)
+ error = mlx5e_rl_refresh_channel_params(rl);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
+ /* import TX coal pkts */
+ if (value < 1)
+ value = 0;
+ else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
+ value = MLX5E_FLD_MAX(cqc, cq_max_count);
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+
+ /* avoid bringing the network interface down and up */
+ if (was_opened)
+ error = mlx5e_rl_refresh_channel_params(rl);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
+ /* network interface must be down */
+ if (was_opened != 0 && mode_modify == 0)
+ mlx5e_rl_close_workers(priv);
+
+ /* import TX coalesce mode */
+ if (value != 0)
+ value = 1;
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+
+ /* restart network interface, if any */
+ if (was_opened != 0) {
+ if (mode_modify == 0)
+ mlx5e_rl_open_workers(priv);
+ else
+ error = mlx5e_rl_refresh_channel_params(rl);
+ }
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
+ /* network interface must be down */
+ if (was_opened)
+ mlx5e_rl_close_workers(priv);
+
+ /* import TX queue size */
+ if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
+ value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
+ else if (value > priv->params_ethtool.tx_queue_size_max)
+ value = priv->params_ethtool.tx_queue_size_max;
+
+ /* store actual TX queue size */
+ value = 1ULL << order_base_2(value);
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+
+ /* verify TX completion factor */
+ mlx5e_rl_sync_tx_completion_fact(rl);
+
+ /* restart network interface, if any */
+ if (was_opened)
+ mlx5e_rl_open_workers(priv);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
+ /* network interface must be down */
+ if (was_opened)
+ mlx5e_rl_close_workers(priv);
+
+ /* store new value */
+ rl->param.arg[arg2] = value;
+
+ /* verify parameter */
+ mlx5e_rl_sync_tx_completion_fact(rl);
+
+ /* restart network interface, if any */
+ if (was_opened)
+ mlx5e_rl_open_workers(priv);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
+ error = mlx5e_rl_tx_limit_add(rl, value);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
+ error = mlx5e_rl_tx_limit_clr(rl, value);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
+ /* range check */
+ if (value > rl->param.tx_allowed_deviation_max)
+ value = rl->param.tx_allowed_deviation_max;
+ else if (value < rl->param.tx_allowed_deviation_min)
+ value = rl->param.tx_allowed_deviation_min;
+
+ MLX5E_RL_WLOCK(rl);
+ rl->param.arg[arg2] = value;
+ MLX5E_RL_WUNLOCK(rl);
+ break;
+
+ case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
+ /* range check */
+ if (value > rl->param.tx_burst_size_max)
+ value = rl->param.tx_burst_size_max;
+ else if (value < rl->param.tx_burst_size_min)
+ value = rl->param.tx_burst_size_min;
+
+ MLX5E_RL_WLOCK(rl);
+ rl->param.arg[arg2] = value;
+ MLX5E_RL_WUNLOCK(rl);
+ break;
+
+ default:
+ break;
+ }
+done:
+ PRIV_UNLOCK(priv);
+ return (error);
+}
+
+static void
+mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
+ struct sysctl_oid *node, const char *name, const char *desc)
+{
+ /*
+ * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
+ * take care of loading default sysctl value from the kernel
+ * environment, if any:
+ */
+ if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
+ /* read-only SYSCTLs */
+ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ name, CTLTYPE_U64 | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
+ } else {
+ if (strstr(name, "_def") != 0) {
+#ifdef RATELIMIT_DEBUG
+ /* tunable read-only advanced SYSCTLs */
+ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ name, CTLTYPE_U64 | CTLFLAG_RDTUN |
+ CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
+#endif
+ } else {
+ /* read-write SYSCTLs */
+ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
+ name, CTLTYPE_U64 | CTLFLAG_RWTUN |
+ CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
+ }
+ }
+}
+
+static void
+mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
+ struct sysctl_oid *node, const char *name, const char *desc)
+{
+ /* read-only SYSCTLs */
+ SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
+ CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
+}
+
+#endif
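The rate table knobs created above can also be driven programmatically; for example, populating one slot through tx_limit_add (a hypothetical userspace snippet assuming device unit 0; per mlx5e_rl_tx_limit_add() the value is in bits per second and must be at least 1000):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t rate = 100000000ULL;   /* 100 Mbit/s, in bits per second */

            /* add the rate to the dev.mce.0.rate_limit table */
            if (sysctlbyname("dev.mce.0.rate_limit.tx_limit_add",
                NULL, NULL, &rate, sizeof(rate)) == -1) {
                    perror("sysctlbyname");
                    return (1);
            }
            return (0);
    }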
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
index b15d2c2128a1..085cdcbb1e0b 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -103,6 +103,25 @@ mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb)
ch = priv->params.num_channels;
+#ifdef RATELIMIT
+ if (mb->m_pkthdr.snd_tag != NULL) {
+ struct mlx5e_sq *sq;
+
+ /* check for route change */
+ if (mb->m_pkthdr.snd_tag->ifp != ifp)
+ return (NULL);
+
+ /* get pointer to sendqueue */
+ sq = container_of(mb->m_pkthdr.snd_tag,
+ struct mlx5e_rl_channel, m_snd_tag)->sq;
+
+ /* check if valid */
+ if (sq != NULL && sq->stopped == 0)
+ return (sq);
+
+ /* FALLTHROUGH */
+ }
+#endif
/* check if flowid is set */
if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) {
#ifdef RSS
@@ -540,8 +559,24 @@ mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb)
sq = mlx5e_select_queue(ifp, mb);
if (unlikely(sq == NULL)) {
- /* Invalid send queue */
+#ifdef RATELIMIT
+ /* Check for route change */
+ if (mb->m_pkthdr.snd_tag != NULL &&
+ mb->m_pkthdr.snd_tag->ifp != ifp) {
+ /* Free mbuf */
+ m_freem(mb);
+
+ /*
+ * Tell upper layers about route change and to
+ * re-transmit this packet:
+ */
+ return (EAGAIN);
+ }
+#endif
+ /* Free mbuf */
m_freem(mb);
+
+ /* Invalid send queue */
return (ENXIO);
}
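A closing note on the transmit contract above: when the send tag's interface no longer matches (a route change), mlx5e_xmit() frees the mbuf and returns EAGAIN so the upper layer can re-allocate a tag and retransmit. A hedged caller-side sketch (hypothetical transport code; it uses the if_snd_tag_free method registered in mlx5e_create_ifp()):

    /* Hypothetical transport-side handling of a paced transmit: */
    error = (ifp->if_transmit)(ifp, mb);    /* the mbuf is consumed on error */
    if (error == EAGAIN) {
            /* Route changed: release the stale tag and let the next
             * transmit allocate a fresh one via if_snd_tag_alloc().
             */
            tag->ifp->if_snd_tag_free(tag);
            tag = NULL;
    }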