HardenedBSD/sys/dev/gve/gve_tx.c
Shailend Chand 40097cd67c gve: Fix TX livelock
Before this change, the transmit taskqueue would enqueue itself when it
could not find space on the NIC ring, in the hope that space would
eventually be made. This resulted in the following livelock, which only
occurs after passing ~200Gbps of TCP traffic for many hours:

                        100% CPU
┌───────────┐wait on  ┌──────────┐         ┌───────────┐
│user thread│  cpu    │gve xmit  │wait on  │gve cleanup│
│with mbuf  ├────────►│taskqueue ├────────►│taskqueue  │
│uma lock   │         │          │ NIC ring│           │
└───────────┘         └──────────┘  space  └─────┬─────┘
     ▲                                           │
     │      wait on mbuf uma lock                │
     └───────────────────────────────────────────┘

Further details about the livelock are available on
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=281560.

After this change, the transmit taskqueue no longer spins until there is
room on the NIC ring. Instead it stops itself and lets the
completion-processing taskqueue wake it up.
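
Below is a minimal userspace model of that stop/wake handshake. It is
illustrative only, not driver code: C11 atomics and printf stand in for
the kernel atomics and taskqueue_enqueue(), and the names (ring_space,
try_xmit, xmit_or_stop, cleanup_one_completion) are made up for the
sketch. The ordering, however, mirrors gve_xmit_retry_enobuf_mbuf() and
the tail of gve_tx_cleanup_tq() in the file below.

/*
 * Minimal userspace model of the stop/wake handshake (illustrative
 * only).  The producer publishes "stopped" before its final capacity
 * check; the cleaner publishes freed capacity before checking
 * "stopped".
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int ring_space = 4;	/* models free NIC ring slots */
static atomic_bool stopped = false;	/* models tx->stopped */

/* Claim one slot, or fail the way gve_xmit() fails with ENOBUFS. */
static bool
try_xmit(void)
{
	int space = atomic_load(&ring_space);

	while (space > 0) {
		if (atomic_compare_exchange_weak(&ring_space, &space,
		    space - 1))
			return (true);
	}
	return (false);
}

/* Producer side: the shape of gve_xmit_retry_enobuf_mbuf(). */
static bool
xmit_or_stop(void)
{
	if (try_xmit())
		return (true);

	atomic_store(&stopped, true);
	/*
	 * Either this retry sees the space the cleaner freed, or the
	 * cleaner's later load of "stopped" sees our store and wakes
	 * us: no lost wakeup.
	 */
	atomic_thread_fence(memory_order_seq_cst);
	if (try_xmit()) {
		atomic_store(&stopped, false);
		return (true);
	}
	return (false);		/* stay stopped; the cleaner wakes us */
}

/* Cleaner side: the shape of the tail of gve_tx_cleanup_tq(). */
static void
cleanup_one_completion(void)
{
	atomic_fetch_add(&ring_space, 1);	/* free room first */
	if (atomic_load(&stopped)) {
		atomic_store(&stopped, false);
		printf("wake the xmit taskqueue\n");
	}
}

int
main(void)
{
	while (xmit_or_stop())
		;			/* stops once the ring model fills */
	cleanup_one_completion();	/* wakes the stopped producer */
	return (0);
}

The point of the pattern is that the producer's stop flag and the
cleaner's freed space are each published before the other side's check,
so at least one side always observes the other and a waiting packet is
never stranded.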

Since I'm touching the transmit taskqueue, I've also corrected the name
of a counter and fixed a bug where EINVAL mbufs were not being freed and
instead lived forever on the bufring.

Signed-off-by: Shailend Chand <shailend@google.com>
Reviewed-by: markj
MFC-after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D47138
2024-11-06 15:06:41 +00:00

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2023-2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"
#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182
static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
struct gve_queue_page_list *qpl = tx->com.qpl;
struct gve_tx_fifo *fifo = &tx->fifo;
fifo->size = qpl->num_pages * PAGE_SIZE;
fifo->base = qpl->kva;
atomic_store_int(&fifo->available, fifo->size);
fifo->head = 0;
return (0);
}
static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
if (tx->desc_ring != NULL) {
gve_dma_free_coherent(&tx->desc_ring_mem);
tx->desc_ring = NULL;
}
if (tx->info != NULL) {
free(tx->info, M_GVE);
tx->info = NULL;
}
}
static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
/* Safe to call even if never alloced */
gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
if (mtx_initialized(&tx->ring_mtx))
mtx_destroy(&tx->ring_mtx);
if (com->q_resources != NULL) {
gve_dma_free_coherent(&com->q_resources_mem);
com->q_resources = NULL;
}
if (tx->br != NULL) {
buf_ring_free(tx->br, M_DEVBUF);
tx->br = NULL;
}
if (gve_is_gqi(priv))
gve_tx_free_ring_gqi(priv, i);
else
gve_tx_free_ring_dqo(priv, i);
}
static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
int err;
err = gve_dma_alloc_coherent(priv,
sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
CACHE_LINE_SIZE, &tx->desc_ring_mem);
if (err != 0) {
device_printf(priv->dev,
"Failed to alloc desc ring for tx ring %d", i);
goto abort;
}
tx->desc_ring = tx->desc_ring_mem.cpu_addr;
com->qpl = &priv->qpls[i];
if (com->qpl == NULL) {
device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
err = ENOMEM;
goto abort;
}
err = gve_tx_fifo_init(priv, tx);
if (err != 0)
goto abort;
tx->info = malloc(
sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
M_GVE, M_WAITOK | M_ZERO);
return (0);
abort:
gve_tx_free_ring_gqi(priv, i);
return (err);
}
static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
char mtx_name[16];
int err;
com->priv = priv;
com->id = i;
if (gve_is_gqi(priv))
err = gve_tx_alloc_ring_gqi(priv, i);
else
err = gve_tx_alloc_ring_dqo(priv, i);
if (err != 0)
goto abort;
sprintf(mtx_name, "gvetx%d", i);
mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);
tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
M_WAITOK, &tx->ring_mtx);
gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
PAGE_SIZE, &com->q_resources_mem);
if (err != 0) {
device_printf(priv->dev,
"Failed to alloc queue resources for tx ring %d", i);
goto abort;
}
com->q_resources = com->q_resources_mem.cpu_addr;
return (0);
abort:
gve_tx_free_ring(priv, i);
return (err);
}
int
gve_alloc_tx_rings(struct gve_priv *priv)
{
int err = 0;
int i;
priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
M_GVE, M_WAITOK | M_ZERO);
for (i = 0; i < priv->tx_cfg.num_queues; i++) {
err = gve_tx_alloc_ring(priv, i);
if (err != 0)
goto free_rings;
}
return (0);
free_rings:
while (i--)
gve_tx_free_ring(priv, i);
free(priv->tx, M_GVE);
return (err);
}
void
gve_free_tx_rings(struct gve_priv *priv)
{
int i;
for (i = 0; i < priv->tx_cfg.num_queues; i++)
gve_tx_free_ring(priv, i);
free(priv->tx, M_GVE);
}
static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
struct gve_ring_com *com = &tx->com;
int i;
for (i = 0; i < com->priv->tx_desc_cnt; i++) {
tx->desc_ring[i] = (union gve_tx_desc){};
tx->info[i] = (struct gve_tx_buffer_state){};
}
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
BUS_DMASYNC_PREWRITE);
}
static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_tx_fifo *fifo = &tx->fifo;
tx->req = 0;
tx->done = 0;
tx->mask = priv->tx_desc_cnt - 1;
atomic_store_int(&fifo->available, fifo->size);
fifo->head = 0;
gve_tx_clear_desc_ring(tx);
}
static void
gve_start_tx_ring(struct gve_priv *priv, int i,
void (cleanup) (void *arg, int pending))
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
atomic_store_bool(&tx->stopped, false);
NET_TASK_INIT(&com->cleanup_task, 0, cleanup, tx);
com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
taskqueue_thread_enqueue, &com->cleanup_tq);
taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
device_get_nameunit(priv->dev), i);
TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
device_get_nameunit(priv->dev), i);
}
int
gve_create_tx_rings(struct gve_priv *priv)
{
struct gve_ring_com *com;
struct gve_tx_ring *tx;
int err;
int i;
if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
return (0);
for (i = 0; i < priv->tx_cfg.num_queues; i++) {
if (gve_is_gqi(priv))
gve_clear_tx_ring(priv, i);
else
gve_clear_tx_ring_dqo(priv, i);
}
err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
if (err != 0)
return (err);
bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
BUS_DMASYNC_POSTREAD);
for (i = 0; i < priv->tx_cfg.num_queues; i++) {
tx = &priv->tx[i];
com = &tx->com;
com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);
bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
BUS_DMASYNC_POSTREAD);
com->db_offset = 4 * be32toh(com->q_resources->db_index);
com->counter_idx = be32toh(com->q_resources->counter_index);
if (gve_is_gqi(priv))
gve_start_tx_ring(priv, i, gve_tx_cleanup_tq);
else
gve_start_tx_ring(priv, i, gve_tx_cleanup_tq_dqo);
}
gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
return (0);
}
static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
struct gve_ring_com *com = &tx->com;
if (com->cleanup_tq != NULL) {
taskqueue_quiesce(com->cleanup_tq);
taskqueue_free(com->cleanup_tq);
com->cleanup_tq = NULL;
}
if (tx->xmit_tq != NULL) {
taskqueue_quiesce(tx->xmit_tq);
taskqueue_free(tx->xmit_tq);
tx->xmit_tq = NULL;
}
}
int
gve_destroy_tx_rings(struct gve_priv *priv)
{
int err;
int i;
for (i = 0; i < priv->tx_cfg.num_queues; i++)
gve_stop_tx_ring(priv, i);
if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
if (err != 0)
return (err);
gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
}
return (0);
}
int
gve_tx_intr(void *arg)
{
struct gve_tx_ring *tx = arg;
struct gve_priv *priv = tx->com.priv;
struct gve_ring_com *com = &tx->com;
if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
return (FILTER_STRAY);
gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
return (FILTER_HANDLED);
}
static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
BUS_DMASYNC_POSTREAD);
uint32_t counter = priv->counters[tx->com.counter_idx];
return (be32toh(counter));
}
static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
atomic_add_int(&fifo->available, bytes);
}
void
gve_tx_cleanup_tq(void *arg, int pending)
{
struct gve_tx_ring *tx = arg;
struct gve_priv *priv = tx->com.priv;
uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
uint32_t todo = nic_done - tx->done;
size_t space_freed = 0;
int i, j;
if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
return;
for (j = 0; j < todo; j++) {
uint32_t idx = tx->done & tx->mask;
struct gve_tx_buffer_state *info = &tx->info[idx];
struct mbuf *mbuf = info->mbuf;
tx->done++;
if (mbuf == NULL)
continue;
info->mbuf = NULL;
counter_enter();
counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
counter_u64_add_protected(tx->stats.tpackets, 1);
counter_exit();
m_freem(mbuf);
for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
info->iov[i].iov_len = 0;
info->iov[i].iov_padding = 0;
}
}
gve_tx_free_fifo(&tx->fifo, space_freed);
gve_db_bar_write_4(priv, tx->com.irq_db_offset,
GVE_IRQ_ACK | GVE_IRQ_EVENT);
/*
* Completions born before this barrier MAY NOT cause the NIC to send an
* interrupt but they will still be handled by the enqueue below.
* Completions born after the barrier WILL trigger an interrupt.
*/
mb();
nic_done = gve_tx_load_event_counter(priv, tx);
todo = nic_done - tx->done;
if (todo != 0) {
gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
}
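/*
 * If the xmit taskqueue stopped itself while waiting for ring or fifo
 * space, wake it now that space has been freed above.  This load pairs
 * with the store and fence in gve_xmit_retry_enobuf_mbuf(): either its
 * retry sees the freed space, or this load sees tx->stopped set.
 */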
if (atomic_load_bool(&tx->stopped) && space_freed) {
atomic_store_bool(&tx->stopped, false);
taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
}
}
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
uint64_t iov_offset, uint64_t iov_len)
{
uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
uint64_t first_page = iov_offset / PAGE_SIZE;
struct gve_dma_handle *dma;
uint64_t page;
for (page = first_page; page <= last_page; page++) {
dma = &(qpl->dmas[page]);
bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
}
}
static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
mtd_desc->reserved0 = 0;
mtd_desc->reserved1 = 0;
}
static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
uint16_t l4_hdr_offset, uint32_t desc_cnt,
uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
int csum_offset, uint16_t pkt_len)
{
if (is_tso) {
pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
pkt_desc->l4_csum_offset = csum_offset >> 1;
pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
} else if (has_csum_flag) {
pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
pkt_desc->l4_csum_offset = csum_offset >> 1;
pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
} else {
pkt_desc->type_flags = GVE_TXD_STD;
pkt_desc->l4_csum_offset = 0;
pkt_desc->l4_hdr_offset = 0;
}
pkt_desc->desc_cnt = desc_cnt;
pkt_desc->len = htobe16(pkt_len);
pkt_desc->seg_len = htobe16(first_seg_len);
pkt_desc->seg_addr = htobe64(addr);
}
static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
bool is_tso, uint16_t len, uint64_t addr,
bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
seg_desc->type_flags = GVE_TXD_SEG;
if (is_tso) {
if (is_ipv6)
seg_desc->type_flags |= GVE_TXSF_IPV6;
seg_desc->l3_offset = l3_off >> 1;
seg_desc->mss = htobe16(tso_mss);
}
seg_desc->seg_len = htobe16(len);
seg_desc->seg_addr = htobe64(addr);
}
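/*
 * tx->req and tx->done are free-running counters, so their difference is
 * the number of descriptors posted but not yet completed; subtracting it
 * from the ring size gives the number of free descriptor slots.
 */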
static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
return (tx->mask + 1 - (tx->req - tx->done));
}
static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
return (atomic_load_int(&fifo->available) >= bytes);
}
static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}
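/*
 * Return the number of pad bytes needed so that a "bytes"-sized
 * allocation does not straddle the end of the fifo: zero if it fits
 * before the end, otherwise the distance from the head to the end.
 * For example, with a 4096-byte fifo and the head at 4000, a 200-byte
 * header needs 96 bytes of padding and then lands at offset 0, keeping
 * the headers in one contiguous fragment.
 */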
static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}
static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
uint16_t pkt_len)
{
int pad_bytes, align_hdr_pad;
int bytes;
pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
/* We need to take into account the header alignment padding. */
align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
bytes = align_hdr_pad + pad_bytes + pkt_len;
return (bytes);
}
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
struct gve_tx_iovec iov[2])
{
size_t overflow, padding;
uint32_t aligned_head;
int nfrags = 0;
if (bytes == 0)
return (0);
/*
* This check happens before we know how much padding is needed to
* align to a cacheline boundary for the payload, but that is fine,
* because the FIFO head always starts aligned, and the FIFO's boundaries
* are aligned, so if there is space for the data, there is space for
* the padding to the next alignment.
*/
KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
("Allocating gve tx fifo when there is no room"));
nfrags++;
iov[0].iov_offset = fifo->head;
iov[0].iov_len = bytes;
fifo->head += bytes;
if (fifo->head > fifo->size) {
/*
* If the allocation did not fit in the tail fragment of the
* FIFO, also use the head fragment.
*/
nfrags++;
overflow = fifo->head - fifo->size;
iov[0].iov_len -= overflow;
iov[1].iov_offset = 0; /* Start of fifo */
iov[1].iov_len = overflow;
fifo->head = overflow;
}
/* Re-align to a cacheline boundary */
aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
padding = aligned_head - fifo->head;
iov[nfrags - 1].iov_padding = padding;
atomic_add_int(&fifo->available, -(bytes + padding));
fifo->head = aligned_head;
if (fifo->head == fifo->size)
fifo->head = 0;
return (nfrags);
}
/* The only error this returns is ENOBUFS, when the tx fifo is short of space */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
int pad_bytes, hdr_nfrags, payload_nfrags;
struct gve_tx_pkt_desc *pkt_desc;
struct gve_tx_seg_desc *seg_desc;
struct gve_tx_mtd_desc *mtd_desc;
struct gve_tx_buffer_state *info;
uint32_t idx = tx->req & tx->mask;
struct ether_header *eh;
struct mbuf *mbuf_next;
int payload_iov = 2;
int bytes_required;
struct ip6_hdr *ip6;
struct tcphdr *th;
uint32_t next_idx;
uint8_t l3_off;
struct ip *ip;
int i;
info = &tx->info[idx];
csum_flags = mbuf->m_pkthdr.csum_flags;
pkt_len = mbuf->m_pkthdr.len;
is_tso = csum_flags & CSUM_TSO;
has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
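/* Mbufs with a flow hash also get a metadata descriptor carrying that hash. */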
mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;
eh = mtod(mbuf, struct ether_header *);
KASSERT(eh->ether_type != ETHERTYPE_VLAN,
("VLAN-tagged packets not supported"));
is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
l3_off = ETHER_HDR_LEN;
mbuf_next = m_getptr(mbuf, l3_off, &offset);
if (is_ipv6) {
ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
l4_off = l3_off + sizeof(struct ip6_hdr);
is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
mbuf_next = m_getptr(mbuf, l4_off, &offset);
} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
ip = (struct ip *)(mtodo(mbuf_next, offset));
l4_off = l3_off + (ip->ip_hl << 2);
is_tcp = (ip->ip_p == IPPROTO_TCP);
is_udp = (ip->ip_p == IPPROTO_UDP);
mbuf_next = m_getptr(mbuf, l4_off, &offset);
}
l4_data_off = 0;
if (is_tcp) {
th = (struct tcphdr *)(mtodo(mbuf_next, offset));
l4_data_off = l4_off + (th->th_off << 2);
} else if (is_udp)
l4_data_off = l4_off + sizeof(struct udphdr);
if (has_csum_flag) {
if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
csum_offset = offsetof(struct tcphdr, th_sum);
else
csum_offset = offsetof(struct udphdr, uh_sum);
}
/*
* If this packet is neither a TCP nor a UDP packet, the first segment,
* the one represented by the packet descriptor, will carry the
* spec-stipulated minimum of 182B.
*/
if (l4_data_off != 0)
first_seg_len = l4_data_off;
else
first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);
bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
if (__predict_false(!gve_can_tx(tx, bytes_required))) {
counter_enter();
counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
counter_exit();
return (ENOBUFS);
}
/* So that the cleanup taskqueue can free the mbuf eventually. */
info->mbuf = mbuf;
/*
* We don't want to split the header, so if necessary, pad to the end
* of the fifo and then put the header at the beginning of the fifo.
*/
pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
&info->iov[0]);
KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
&info->iov[payload_iov]);
pkt_desc = &tx->desc_ring[idx].pkt;
gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
1 + mtd_desc_nr + payload_nfrags, first_seg_len,
info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
pkt_len);
m_copydata(mbuf, 0, first_seg_len,
(char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
gve_dma_sync_for_device(tx->com.qpl,
info->iov[hdr_nfrags - 1].iov_offset,
info->iov[hdr_nfrags - 1].iov_len);
copy_offset = first_seg_len;
if (mtd_desc_nr == 1) {
next_idx = (tx->req + 1) & tx->mask;
mtd_desc = &tx->desc_ring[next_idx].mtd;
gve_tx_fill_mtd_desc(mtd_desc, mbuf);
}
for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
seg_desc = &tx->desc_ring[next_idx].seg;
gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);
m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
(char *)tx->fifo.base + info->iov[i].iov_offset);
gve_dma_sync_for_device(tx->com.qpl,
info->iov[i].iov_offset, info->iov[i].iov_len);
copy_offset += info->iov[i].iov_len;
}
tx->req += (1 + mtd_desc_nr + payload_nfrags);
if (is_tso) {
counter_enter();
counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
counter_exit();
}
return (0);
}
static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
struct mbuf **mbuf)
{
if (gve_is_gqi(tx->com.priv))
return (gve_xmit(tx, *mbuf));
if (gve_is_qpl(tx->com.priv))
return (gve_xmit_dqo_qpl(tx, *mbuf));
/*
* gve_xmit_dqo might attempt to defrag the mbuf chain.
* The reference is passed in so that in the case of
* errors, the new mbuf chain is what's put back on the br.
*/
return (gve_xmit_dqo(tx, mbuf));
}
/*
* Has the side-effect of stopping the xmit queue by setting tx->stopped
*/
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
struct mbuf **mbuf)
{
int err;
atomic_store_bool(&tx->stopped, true);
/*
* Room made in the queue BEFORE the barrier will be seen by the
* gve_xmit_mbuf retry below.
*
* If room is made in the queue AFTER the barrier, the cleanup tq
* iteration creating the room will either see a tx->stopped value
* of 0 or the 1 we just wrote:
*
* If it sees a 1, then it would enqueue the xmit tq. Enqueue
* implies a retry on the waiting pkt.
*
* If it sees a 0, then that implies a previous iteration overwrote
* our 1, and that iteration would enqueue the xmit tq. Enqueue
* implies a retry on the waiting pkt.
*/
atomic_thread_fence_seq_cst();
err = gve_xmit_mbuf(tx, mbuf);
if (err == 0)
atomic_store_bool(&tx->stopped, false);
return (err);
}
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
struct gve_priv *priv = tx->com.priv;
struct ifnet *ifp = priv->ifp;
struct mbuf *mbuf;
int err;
while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
(mbuf = drbr_peek(ifp, tx->br)) != NULL) {
err = gve_xmit_mbuf(tx, &mbuf);
/*
* We need to stop this taskqueue when we can't xmit the pkt due
* to lack of space in the NIC ring (ENOBUFS). The retry exists
* to guard against a TOCTTOU bug that could end up freezing the
* queue forever.
*/
if (__predict_false(mbuf != NULL && err == ENOBUFS))
err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);
if (__predict_false(err != 0 && mbuf != NULL)) {
if (err == EINVAL) {
drbr_advance(ifp, tx->br);
m_freem(mbuf);
} else
drbr_putback(ifp, tx->br, mbuf);
break;
}
drbr_advance(ifp, tx->br);
BPF_MTAP(ifp, mbuf);
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
BUS_DMASYNC_PREWRITE);
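/*
 * Write the new producer index to the doorbell so the NIC starts
 * fetching the descriptors just posted.
 */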
if (gve_is_gqi(priv))
gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
else
gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
tx->dqo.desc_tail);
}
}
void
gve_xmit_tq(void *arg, int pending)
{
struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;
GVE_RING_LOCK(tx);
gve_xmit_br(tx);
GVE_RING_UNLOCK(tx);
}
static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
struct ether_header *eh;
eh = mtod(mbuf, struct ether_header *);
return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
struct gve_priv *priv = if_getsoftc(ifp);
struct gve_tx_ring *tx;
bool is_br_empty;
int err;
uint32_t i;
if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
return (ENODEV);
if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
else
i = curcpu % priv->tx_cfg.num_queues;
tx = &priv->tx[i];
if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
counter_enter();
counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
counter_exit();
m_freem(mbuf);
return (ENODEV);
}
is_br_empty = drbr_empty(ifp, tx->br);
err = drbr_enqueue(ifp, tx->br, mbuf);
if (__predict_false(err != 0)) {
if (!atomic_load_bool(&tx->stopped))
taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
counter_enter();
counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
counter_exit();
return (err);
}
/*
* If the mbuf we just enqueued is the only one on the ring, then
* transmit it right away in the interests of low latency.
*/
if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
gve_xmit_br(tx);
GVE_RING_UNLOCK(tx);
} else if (!atomic_load_bool(&tx->stopped))
taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
return (0);
}
void
gve_qflush(if_t ifp)
{
struct gve_priv *priv = if_getsoftc(ifp);
struct gve_tx_ring *tx;
int i;
for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
tx = &priv->tx[i];
if (drbr_empty(ifp, tx->br) == 0) {
GVE_RING_LOCK(tx);
drbr_flush(ifp, tx->br);
GVE_RING_UNLOCK(tx);
}
}
if_qflush(ifp);
}