HardenedBSD/sys/dev/gve/gve_sysctl.c
Shailend Chand 40097cd67c gve: Fix TX livelock
Before this change the transmit taskqueue would enqueue itself when it
cannot find space on the NIC ring with the hope that eventually space
would be made. This results in the following livelock that only occurs
after passing ~200Gbps of TCP traffic for many hours:

                        100% CPU
┌───────────┐wait on  ┌──────────┐         ┌───────────┐
│user thread│  cpu    │gve xmit  │wait on  │gve cleanup│
│with mbuf  ├────────►│taskqueue ├────────►│taskqueue  │
│uma lock   │         │          │ NIC ring│           │
└───────────┘         └──────────┘  space  └─────┬─────┘
     ▲                                           │
     │      wait on mbuf uma lock                │
     └───────────────────────────────────────────┘

Further details about the livelock are available on
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=281560.

After this change, the transmit taskqueue no longer spins till there is
room on the NIC ring. It instead stops itself and lets the
completion-processing taskqueue wake it up.

Since I'm touching the transmit taskqueue I've also corrected the name
of a counter and also fixed a bug where EINVAL mbufs were not being
freed and were instead living forever on the bufring.

Signed-off-by: Shailend Chand <shailend@google.com>
Reviewed-by: markj
MFC-after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D47138
2024-11-06 15:06:41 +00:00

328 lines
13 KiB
C

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "gve.h"
/*
 * Driver-wide sysctl knobs, all rooted at the static "hw.gve" node declared
 * below. These are global to the driver rather than per device instance.
 */
static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"GVE driver parameters");
/*
 * hw.gve.disable_hw_lro: defaults to false. CTLFLAG_RDTUN makes it read-only
 * through sysctl while still allowing it to be set as a boot-time tunable
 * (see sysctl(9)).
 */
bool gve_disable_hw_lro = false;
SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN,
&gve_disable_hw_lro, 0, "Controls if hardware LRO is used");
/*
 * hw.gve.queue_format: read-only string backed by an 8-byte buffer.
 * NOTE(review): nothing in this file writes the buffer; presumably it is
 * filled in elsewhere in the driver -- confirm.
 */
char gve_queue_format[8];
SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD,
&gve_queue_format, 0, "Queue format being used by the iface");
/*
 * hw.gve.driver_version: read-only string backed by an 8-byte buffer.
 * NOTE(review): as with queue_format, the buffer is not written in this
 * file -- confirm where it is populated.
 */
char gve_version[8];
SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD,
&gve_version, 0, "Driver version");
/*
 * Creates the read-only sysctl node "rxq<id>" under `child` and attaches the
 * statistics of one receive ring to it.
 *
 * ctx:   sysctl context the new OIDs are registered with.
 * child: OID list that becomes the parent of the "rxq<id>" node.
 * rxq:   receive ring whose counters (rxq->stats) and descriptor counts
 *        (rxq->cnt, rxq->fill_cnt) are exported.
 *
 * The sysctl entries store pointers into `rxq`; they do not copy the values.
 */
static void
gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
    struct sysctl_oid_list *child, struct gve_rx_ring *rxq)
{
	struct sysctl_oid *node;
	struct sysctl_oid_list *list;
	struct gve_rxq_stats *stats;
	char namebuf[16];

	/* snprintf bounds the node name to the size of namebuf. */
	snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->com.id);
	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue");
	list = SYSCTL_CHILDREN(node);

	stats = &rxq->stats;

	/* Traffic totals. */
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_bytes", CTLFLAG_RD,
	    &stats->rbytes, "Bytes received");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_packets", CTLFLAG_RD,
	    &stats->rpackets, "Packets received");

	/* Per-fragment receive path counters. */
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_copybreak_cnt",
	    CTLFLAG_RD, &stats->rx_copybreak_cnt,
	    "Total frags with mbufs allocated for copybreak");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_flip_cnt",
	    CTLFLAG_RD, &stats->rx_frag_flip_cnt,
	    "Total frags that allocated mbuf with page flip");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_copy_cnt",
	    CTLFLAG_RD, &stats->rx_frag_copy_cnt,
	    "Total frags with mbuf that copied payload into mbuf");

	/* Drop counters, total and by cause. */
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt",
	    CTLFLAG_RD, &stats->rx_dropped_pkt,
	    "Total rx packets dropped");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_dropped_pkt_desc_err", CTLFLAG_RD,
	    &stats->rx_dropped_pkt_desc_err,
	    "Packets dropped due to descriptor error");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD,
	    &stats->rx_dropped_pkt_buf_post_fail,
	    "Packets dropped due to failure to post enough buffers");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD,
	    &stats->rx_dropped_pkt_mbuf_alloc_fail,
	    "Packets dropped due to failed mbuf allocation");

	/* mbuf handling failures. */
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_mbuf_dmamap_err", CTLFLAG_RD,
	    &stats->rx_mbuf_dmamap_err,
	    "Number of rx mbufs which could not be dma mapped");
	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
	    "rx_mbuf_mclget_null", CTLFLAG_RD,
	    &stats->rx_mbuf_mclget_null,
	    "Number of times when there were no cluster mbufs");

	/*
	 * Descriptor counts are exported through pointers to the ring fields.
	 * The constant-value argument is only consulted when the pointer is
	 * NULL (see sysctl(9)), so 0 is passed for both entries.
	 */
	SYSCTL_ADD_U32(ctx, list, OID_AUTO,
	    "rx_completed_desc", CTLFLAG_RD,
	    &rxq->cnt, 0, "Number of descriptors completed");
	SYSCTL_ADD_U32(ctx, list, OID_AUTO,
	    "num_desc_posted", CTLFLAG_RD,
	    &rxq->fill_cnt, 0,
	    "Total number of descriptors posted");
}
/*
 * Creates the read-only sysctl node "txq<id>" under `child` and attaches the
 * statistics of one transmit ring to it.
 *
 * ctx:   sysctl context the new OIDs are registered with.
 * child: OID list that becomes the parent of the "txq<id>" node.
 * txq:   transmit ring whose counters (txq->stats) and descriptor counts
 *        (txq->req, txq->done) are exported.
 *
 * The sysctl entries store pointers into `txq`; they do not copy the values.
 */
static void
gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
    struct sysctl_oid_list *child, struct gve_tx_ring *txq)
{
	struct sysctl_oid *node;
	struct sysctl_oid_list *tx_list;
	struct gve_txq_stats *stats;
	char namebuf[16];

	/* snprintf bounds the node name to the size of namebuf. */
	snprintf(namebuf, sizeof(namebuf), "txq%d", txq->com.id);
	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue");
	tx_list = SYSCTL_CHILDREN(node);

	stats = &txq->stats;

	/* Descriptor counts, exported through pointers to the ring fields. */
	SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO,
	    "tx_posted_desc", CTLFLAG_RD,
	    &txq->req, 0, "Number of descriptors posted by NIC");
	SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO,
	    "tx_completed_desc", CTLFLAG_RD,
	    &txq->done, 0, "Number of descriptors completed");

	/* Traffic totals. */
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_packets", CTLFLAG_RD,
	    &stats->tpackets, "Packets transmitted");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_tso_packets", CTLFLAG_RD,
	    &stats->tso_packet_cnt, "TSO Packets transmitted");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_bytes", CTLFLAG_RD,
	    &stats->tbytes, "Bytes transmitted");

	/* Delayed and dropped packets, by cause. */
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_delayed_pkt_nospace_device", CTLFLAG_RD,
	    &stats->tx_delayed_pkt_nospace_device,
	    "Packets delayed due to no space in device");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD,
	    &stats->tx_dropped_pkt_nospace_bufring,
	    "Packets dropped due to no space in br ring");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_dropped_pkt_vlan", CTLFLAG_RD,
	    &stats->tx_dropped_pkt_vlan,
	    "Dropped VLAN packets");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_delayed_pkt_nospace_descring", CTLFLAG_RD,
	    &stats->tx_delayed_pkt_nospace_descring,
	    "Packets delayed due to no space in desc ring");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_delayed_pkt_nospace_compring", CTLFLAG_RD,
	    &stats->tx_delayed_pkt_nospace_compring,
	    "Packets delayed due to no space in comp ring");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD,
	    &stats->tx_delayed_pkt_nospace_qpl_bufs,
	    "Packets delayed due to not enough qpl bufs");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_delayed_pkt_tsoerr", CTLFLAG_RD,
	    &stats->tx_delayed_pkt_tsoerr,
	    "TSO packets delayed due to errors in prep");

	/*
	 * mbuf handling counters.
	 *
	 * The OID name "tx_mbuf_collpase" is misspelled, but it is part of the
	 * visible sysctl namespace and is kept as-is so existing consumers of
	 * that name keep working. Only the description text is corrected.
	 */
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_mbuf_collpase", CTLFLAG_RD,
	    &stats->tx_mbuf_collapse,
	    "tx mbufs that had to be collapsed");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_mbuf_defrag", CTLFLAG_RD,
	    &stats->tx_mbuf_defrag,
	    "tx mbufs that had to be defragged");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_mbuf_defrag_err", CTLFLAG_RD,
	    &stats->tx_mbuf_defrag_err,
	    "tx mbufs that failed defrag");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD,
	    &stats->tx_mbuf_dmamap_enomem_err,
	    "tx mbufs that could not be dma-mapped due to low mem");
	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
	    "tx_mbuf_dmamap_err", CTLFLAG_RD,
	    &stats->tx_mbuf_dmamap_err,
	    "tx mbufs that could not be dma-mapped");
}
/*
 * Registers a sysctl subtree for every configured queue of `priv` under
 * `child`: all receive queues first (priv->rx_cfg.num_queues of them), then
 * all transmit queues (priv->tx_cfg.num_queues of them).
 */
static void
gve_setup_queue_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child,
    struct gve_priv *priv)
{
	int qidx;

	for (qidx = 0; qidx < priv->rx_cfg.num_queues; qidx++)
		gve_setup_rxq_sysctl(ctx, child, &priv->rx[qidx]);

	for (qidx = 0; qidx < priv->tx_cfg.num_queues; qidx++)
		gve_setup_txq_sysctl(ctx, child, &priv->tx[qidx]);
}
/*
 * Creates the read-only sysctl node "adminq_stats" under `child` and exports
 * the admin queue counters stored in `priv` beneath it.
 *
 * ctx:   sysctl context the new OIDs are registered with.
 * child: OID list that becomes the parent of the "adminq_stats" node.
 * priv:  driver private state holding the adminq_* counters.
 *
 * The sysctl entries store pointers into `priv`; they do not copy the values.
 */
static void
gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx,
    struct sysctl_oid_list *child, struct gve_priv *priv)
{
	struct sysctl_oid *admin_node;
	struct sysctl_oid_list *admin_list;

	/* Admin queue stats */
	admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "adminq_stats",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue statistics");
	admin_list = SYSCTL_CHILDREN(admin_node);

	/* Aggregate command counters. */
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_prod_cnt", CTLFLAG_RD,
	    &priv->adminq_prod_cnt, 0, "Adminq Commands issued");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cmd_fail", CTLFLAG_RD,
	    &priv->adminq_cmd_fail, 0, "Adminq Failed commands");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_timeouts", CTLFLAG_RD,
	    &priv->adminq_timeouts, 0, "Adminq Timedout commands");

	/* One counter per admin command; the description repeats the name. */
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_describe_device_cnt",
	    CTLFLAG_RD, &priv->adminq_describe_device_cnt, 0,
	    "adminq_describe_device_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
	    "adminq_cfg_device_resources_cnt", CTLFLAG_RD,
	    &priv->adminq_cfg_device_resources_cnt, 0,
	    "adminq_cfg_device_resources_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
	    "adminq_register_page_list_cnt", CTLFLAG_RD,
	    &priv->adminq_register_page_list_cnt, 0,
	    "adminq_register_page_list_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
	    "adminq_unregister_page_list_cnt", CTLFLAG_RD,
	    &priv->adminq_unregister_page_list_cnt, 0,
	    "adminq_unregister_page_list_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_tx_queue_cnt",
	    CTLFLAG_RD, &priv->adminq_create_tx_queue_cnt, 0,
	    "adminq_create_tx_queue_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_rx_queue_cnt",
	    CTLFLAG_RD, &priv->adminq_create_rx_queue_cnt, 0,
	    "adminq_create_rx_queue_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_tx_queue_cnt",
	    CTLFLAG_RD, &priv->adminq_destroy_tx_queue_cnt, 0,
	    "adminq_destroy_tx_queue_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt",
	    CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0,
	    "adminq_destroy_rx_queue_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt",
	    CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0,
	    "adminq_get_ptype_map_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
	    "adminq_dcfg_device_resources_cnt", CTLFLAG_RD,
	    &priv->adminq_dcfg_device_resources_cnt, 0,
	    "adminq_dcfg_device_resources_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
	    "adminq_set_driver_parameter_cnt", CTLFLAG_RD,
	    &priv->adminq_set_driver_parameter_cnt, 0,
	    "adminq_set_driver_parameter_cnt");
	SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO,
	    "adminq_verify_driver_compatibility_cnt", CTLFLAG_RD,
	    &priv->adminq_verify_driver_compatibility_cnt, 0,
	    "adminq_verify_driver_compatibility_cnt");
}
/*
 * Creates the read-only sysctl node "main_stats" under `child` and exports
 * three counters from `priv` beneath it: interface_up_cnt,
 * interface_down_cnt and reset_cnt. The entries point at the fields in
 * `priv` rather than copying their values.
 */
static void
gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx,
    struct sysctl_oid_list *child, struct gve_priv *priv)
{
	struct sysctl_oid *stats_oid;
	struct sysctl_oid_list *stats_children;

	stats_oid = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "main_stats",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Main statistics");
	stats_children = SYSCTL_CHILDREN(stats_oid);

	/* Interface state transitions. */
	SYSCTL_ADD_U32(ctx, stats_children, OID_AUTO, "interface_up_cnt",
	    CTLFLAG_RD, &priv->interface_up_cnt, 0,
	    "Times interface was set to up");
	SYSCTL_ADD_U32(ctx, stats_children, OID_AUTO, "interface_down_cnt",
	    CTLFLAG_RD, &priv->interface_down_cnt, 0,
	    "Times interface was set to down");

	/* Resets. */
	SYSCTL_ADD_U32(ctx, stats_children, OID_AUTO, "reset_cnt",
	    CTLFLAG_RD, &priv->reset_cnt, 0,
	    "Times reset");
}
/*
 * Entry point for sysctl registration: looks up the sysctl context and tree
 * of priv->dev and hangs the per-queue, admin queue and main statistics
 * subtrees off that tree, in that order.
 */
void
gve_setup_sysctl(struct gve_priv *priv)
{
	device_t dev = priv->dev;
	struct sysctl_ctx_list *sysctl_ctx;
	struct sysctl_oid *dev_tree;
	struct sysctl_oid_list *dev_children;

	sysctl_ctx = device_get_sysctl_ctx(dev);
	dev_tree = device_get_sysctl_tree(dev);
	dev_children = SYSCTL_CHILDREN(dev_tree);

	gve_setup_queue_stat_sysctl(sysctl_ctx, dev_children, priv);
	gve_setup_adminq_stat_sysctl(sysctl_ctx, dev_children, priv);
	gve_setup_main_stat_sysctl(sysctl_ctx, dev_children, priv);
}
/*
 * Sums selected per-queue counters across all configured receive and
 * transmit queues of `priv`.
 *
 * Each output is accumulated with "+=", not assigned: the fetched counter
 * values are added to whatever the caller already stored in *rpackets,
 * *rbytes, *rx_dropped_pkt, *tpackets, *tbytes and *tx_dropped_pkt. Callers
 * that want plain totals must zero these before the call.
 */
void
gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets,
    uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets,
    uint64_t *tbytes, uint64_t *tx_dropped_pkt)
{
	int q;

	/* Receive side: packets, bytes and drops. */
	for (q = 0; q < priv->rx_cfg.num_queues; q++) {
		struct gve_rxq_stats *rx = &priv->rx[q].stats;

		*rpackets += counter_u64_fetch(rx->rpackets);
		*rbytes += counter_u64_fetch(rx->rbytes);
		*rx_dropped_pkt += counter_u64_fetch(rx->rx_dropped_pkt);
	}

	/* Transmit side: packets, bytes and drops. */
	for (q = 0; q < priv->tx_cfg.num_queues; q++) {
		struct gve_txq_stats *tx = &priv->tx[q].stats;

		*tpackets += counter_u64_fetch(tx->tpackets);
		*tbytes += counter_u64_fetch(tx->tbytes);
		*tx_dropped_pkt += counter_u64_fetch(tx->tx_dropped_pkt);
	}
}