HardenedBSD/sys/dev/gve/gve_tx_dqo.c
Shailend Chand 40097cd67c gve: Fix TX livelock
Before this change, the transmit taskqueue would enqueue itself when it
could not find space on the NIC ring, in the hope that space would
eventually be made. This resulted in the following livelock, which only
occurs after passing ~200 Gbps of TCP traffic for many hours:

                        100% CPU
┌───────────┐wait on  ┌──────────┐         ┌───────────┐
│user thread│  cpu    │gve xmit  │wait on  │gve cleanup│
│with mbuf  ├────────►│taskqueue ├────────►│taskqueue  │
│uma lock   │         │          │ NIC ring│           │
└───────────┘         └──────────┘  space  └─────┬─────┘
     ▲                                           │
     │      wait on mbuf uma lock                │
     └───────────────────────────────────────────┘

Further details about the livelock are available on
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=281560.

After this change, the transmit taskqueue no longer spins until there is
room on the NIC ring. Instead, it stops itself and lets the
completion-processing taskqueue wake it up.
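
The shape of the handoff, as a minimal sketch rather than the literal diff
(the stop side presumably lives in the bufring drain path outside this
file; the wake side can be seen in gve_tx_cleanup_dqo below):

    /* xmit taskqueue: park instead of re-enqueueing itself */
    if (gve_xmit_dqo(tx, &mbuf) == ENOBUFS) {
            atomic_store_bool(&tx->stopped, true);
            /* the mbuf stays on the bufring until we are woken */
    }

    /* completion-processing taskqueue: after making ring space */
    atomic_thread_fence_seq_cst();
    if (atomic_load_bool(&tx->stopped) && work_done) {
            atomic_store_bool(&tx->stopped, false);
            taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
    }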

Since I'm touching the transmit taskqueue, I've also corrected the name
of a counter and fixed a bug where EINVAL mbufs were not being freed and
instead lived forever on the bufring.

Signed-off-by: Shailend Chand <shailend@google.com>
Reviewed-by: markj
MFC-after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D47138
2024-11-06 15:06:41 +00:00


/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "opt_inet6.h"
#include "gve.h"
#include "gve_dqo.h"
static void
gve_unmap_packet(struct gve_tx_ring *tx,
struct gve_tx_pending_pkt_dqo *pending_pkt)
{
bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap);
}
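/*
 * Frees any mbufs still held by pending packets: the DMA map is unloaded in
 * RDA mode, while in QPL mode the buffer list is simply dropped. Called from
 * ring teardown and from gve_clear_tx_ring_dqo.
 */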
static void
gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx)
{
struct gve_tx_pending_pkt_dqo *pending_pkt;
int i;
for (i = 0; i < tx->dqo.num_pending_pkts; i++) {
pending_pkt = &tx->dqo.pending_pkts[i];
if (!pending_pkt->mbuf)
continue;
if (gve_is_qpl(tx->com.priv)) {
pending_pkt->qpl_buf_head = -1;
pending_pkt->num_qpl_bufs = 0;
} else
gve_unmap_packet(tx, pending_pkt);
m_freem(pending_pkt->mbuf);
pending_pkt->mbuf = NULL;
}
}
void
gve_tx_free_ring_dqo(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
int j;
if (tx->dqo.desc_ring != NULL) {
gve_dma_free_coherent(&tx->desc_ring_mem);
tx->dqo.desc_ring = NULL;
}
if (tx->dqo.compl_ring != NULL) {
gve_dma_free_coherent(&tx->dqo.compl_ring_mem);
tx->dqo.compl_ring = NULL;
}
if (tx->dqo.pending_pkts != NULL) {
gve_free_tx_mbufs_dqo(tx);
if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) {
for (j = 0; j < tx->dqo.num_pending_pkts; j++)
if (tx->dqo.pending_pkts[j].state !=
GVE_PACKET_STATE_UNALLOCATED)
bus_dmamap_destroy(tx->dqo.buf_dmatag,
tx->dqo.pending_pkts[j].dmamap);
}
free(tx->dqo.pending_pkts, M_GVE);
tx->dqo.pending_pkts = NULL;
}
if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag)
bus_dma_tag_destroy(tx->dqo.buf_dmatag);
if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) {
free(tx->dqo.qpl_bufs, M_GVE);
tx->dqo.qpl_bufs = NULL;
}
}
static int
gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx)
{
struct gve_priv *priv = tx->com.priv;
int err;
int j;
/*
* DMA tag for mapping Tx mbufs
* The maxsize, nsegments, and maxsegsize params should match
* the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c.
*/
err = bus_dma_tag_create(
bus_get_dma_tag(priv->dev), /* parent */
1, 0, /* alignment, bounds */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
GVE_TSO_MAXSIZE_DQO, /* maxsize */
GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */
GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
NULL, /* lockfunc */
NULL, /* lockarg */
&tx->dqo.buf_dmatag);
if (err != 0) {
device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n",
__func__, err);
return (err);
}
for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
err = bus_dmamap_create(tx->dqo.buf_dmatag, 0,
&tx->dqo.pending_pkts[j].dmamap);
if (err != 0) {
device_printf(priv->dev,
"err in creating pending pkt dmamap %d: %d",
j, err);
return (err);
}
tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
}
return (0);
}
int
gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
uint16_t num_pending_pkts;
int err;
/* Descriptor ring */
err = gve_dma_alloc_coherent(priv,
sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt,
CACHE_LINE_SIZE, &tx->desc_ring_mem);
if (err != 0) {
device_printf(priv->dev,
"Failed to alloc desc ring for tx ring %d", i);
goto abort;
}
tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr;
/* Completion ring */
err = gve_dma_alloc_coherent(priv,
sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt,
CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem);
if (err != 0) {
device_printf(priv->dev,
"Failed to alloc compl ring for tx ring %d", i);
goto abort;
}
tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr;
/*
* pending_pkts array
*
* The max number of pending packets determines the maximum number of
* descriptors which may be written to the completion queue.
*
* We must set the number small enough to make sure we never overrun the
* completion queue.
*/
num_pending_pkts = priv->tx_desc_cnt;
/*
* Reserve space for descriptor completions, which will be reported at
* most every GVE_TX_MIN_RE_INTERVAL packets.
*/
num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL;
tx->dqo.num_pending_pkts = num_pending_pkts;
tx->dqo.pending_pkts = malloc(
sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts,
M_GVE, M_WAITOK | M_ZERO);
if (gve_is_qpl(priv)) {
int qpl_buf_cnt;
tx->com.qpl = &priv->qpls[i];
qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
tx->com.qpl->num_pages;
tx->dqo.qpl_bufs = malloc(
sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt,
M_GVE, M_WAITOK | M_ZERO);
} else
gve_tx_alloc_rda_fields_dqo(tx);
return (0);
abort:
gve_tx_free_ring_dqo(priv, i);
return (err);
}
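/*
 * Derives a non-zero path hash from the mbuf's flowid, when one is present,
 * and records it in the Tx metadata passed to the NIC.
 */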
static void
gve_extract_tx_metadata_dqo(const struct mbuf *mbuf,
struct gve_tx_metadata_dqo *metadata)
{
uint32_t hash = mbuf->m_pkthdr.flowid;
uint16_t path_hash;
metadata->version = GVE_TX_METADATA_VERSION_DQO;
if (hash) {
path_hash = hash ^ (hash >> 16);
path_hash &= (1 << 15) - 1;
if (__predict_false(path_hash == 0))
path_hash = ~path_hash;
metadata->path_hash = path_hash;
}
}
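/*
 * Writes data descriptors for a buffer, splitting it into chunks of at most
 * GVE_TX_MAX_BUF_SIZE_DQO bytes. end_of_packet is set only on the last chunk
 * of the packet's last buffer.
 */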
static void
gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx,
uint32_t *desc_idx, uint32_t len, uint64_t addr,
int16_t compl_tag, bool eop, bool csum_enabled)
{
while (len > 0) {
struct gve_tx_pkt_desc_dqo *desc =
&tx->dqo.desc_ring[*desc_idx].pkt;
uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO);
bool cur_eop = eop && cur_len == len;
*desc = (struct gve_tx_pkt_desc_dqo){
.buf_addr = htole64(addr),
.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
.end_of_packet = cur_eop,
.checksum_offload_enable = csum_enabled,
.compl_tag = htole16(compl_tag),
.buf_size = cur_len,
};
addr += cur_len;
len -= cur_len;
*desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
}
}
static void
gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata,
int header_len)
{
*desc = (struct gve_tx_tso_context_desc_dqo){
.header_len = header_len,
.cmd_dtype = {
.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
.tso = 1,
},
.flex0 = metadata->bytes[0],
.flex5 = metadata->bytes[5],
.flex6 = metadata->bytes[6],
.flex7 = metadata->bytes[7],
.flex8 = metadata->bytes[8],
.flex9 = metadata->bytes[9],
.flex10 = metadata->bytes[10],
.flex11 = metadata->bytes[11],
};
desc->tso_total_len = mbuf->m_pkthdr.len - header_len;
desc->mss = mbuf->m_pkthdr.tso_segsz;
}
static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
const struct gve_tx_metadata_dqo *metadata)
{
*desc = (struct gve_tx_general_context_desc_dqo){
.flex0 = metadata->bytes[0],
.flex1 = metadata->bytes[1],
.flex2 = metadata->bytes[2],
.flex3 = metadata->bytes[3],
.flex4 = metadata->bytes[4],
.flex5 = metadata->bytes[5],
.flex6 = metadata->bytes[6],
.flex7 = metadata->bytes[7],
.flex8 = metadata->bytes[8],
.flex9 = metadata->bytes[9],
.flex10 = metadata->bytes[10],
.flex11 = metadata->bytes[11],
.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
};
}
#define PULLUP_HDR(m, len) \
do { \
if (__predict_false((m)->m_len < (len))) { \
(m) = m_pullup((m), (len)); \
if ((m) == NULL) \
return (EINVAL); \
} \
} while (0)
static int
gve_prep_tso(struct mbuf *mbuf, int *header_len)
{
uint8_t l3_off, l4_off = 0;
struct ether_header *eh;
struct tcphdr *th;
u_short csum;
PULLUP_HDR(mbuf, sizeof(*eh));
eh = mtod(mbuf, struct ether_header *);
KASSERT(eh->ether_type != ETHERTYPE_VLAN,
("VLAN-tagged packets not supported"));
l3_off = ETHER_HDR_LEN;
#ifdef INET6
if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) {
struct ip6_hdr *ip6;
PULLUP_HDR(mbuf, l3_off + sizeof(*ip6));
ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off));
l4_off = l3_off + sizeof(struct ip6_hdr);
csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP,
/*csum=*/0);
} else
#endif
if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
struct ip *ip;
PULLUP_HDR(mbuf, l3_off + sizeof(*ip));
ip = (struct ip *)(mtodo(mbuf, l3_off));
l4_off = l3_off + (ip->ip_hl << 2);
csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(IPPROTO_TCP));
}
PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr));
th = (struct tcphdr *)(mtodo(mbuf, l4_off));
*header_len = l4_off + (th->th_off << 2);
/*
* Hardware requires the th->th_sum to not include the TCP payload,
* hence we recompute the csum with it excluded.
*/
th->th_sum = csum;
return (0);
}
static int
gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf,
bool is_tso, uint32_t *desc_idx)
{
struct gve_tx_general_context_desc_dqo *gen_desc;
struct gve_tx_tso_context_desc_dqo *tso_desc;
struct gve_tx_metadata_dqo metadata;
int header_len;
int err;
metadata = (struct gve_tx_metadata_dqo){0};
gve_extract_tx_metadata_dqo(mbuf, &metadata);
if (is_tso) {
err = gve_prep_tso(mbuf, &header_len);
if (__predict_false(err)) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_delayed_pkt_tsoerr, 1);
counter_exit();
return (err);
}
tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx;
gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len);
*desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
counter_enter();
counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
counter_exit();
}
gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx;
gve_tx_fill_general_ctx_desc(gen_desc, &metadata);
*desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
return (0);
}
static int
gve_map_mbuf_dqo(struct gve_tx_ring *tx,
struct mbuf **mbuf, bus_dmamap_t dmamap,
bus_dma_segment_t *segs, int *nsegs, int attempt)
{
struct mbuf *m_new = NULL;
int err;
err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap,
*mbuf, segs, nsegs, BUS_DMA_NOWAIT);
switch (err) {
case __predict_true(0):
break;
case EFBIG:
if (__predict_false(attempt > 0))
goto abort;
counter_enter();
counter_u64_add_protected(
tx->stats.tx_mbuf_collapse, 1);
counter_exit();
/* Try m_collapse before m_defrag */
m_new = m_collapse(*mbuf, M_NOWAIT,
GVE_TX_MAX_DATA_DESCS_DQO);
if (m_new == NULL) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_mbuf_defrag, 1);
counter_exit();
m_new = m_defrag(*mbuf, M_NOWAIT);
}
if (__predict_false(m_new == NULL)) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_mbuf_defrag_err, 1);
counter_exit();
m_freem(*mbuf);
*mbuf = NULL;
err = ENOMEM;
goto abort;
} else {
*mbuf = m_new;
return (gve_map_mbuf_dqo(tx, mbuf, dmamap,
segs, nsegs, ++attempt));
}
case ENOMEM:
counter_enter();
counter_u64_add_protected(
tx->stats.tx_mbuf_dmamap_enomem_err, 1);
counter_exit();
goto abort;
default:
goto abort;
}
return (0);
abort:
counter_enter();
counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1);
counter_exit();
return (err);
}
static uint32_t
num_avail_desc_ring_slots(const struct gve_tx_ring *tx)
{
uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) &
tx->dqo.desc_mask;
return (tx->dqo.desc_mask - num_used);
}
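/*
 * The free list of pending packets is split in two: a consumer list owned by
 * the transmit path and a producer list appended to by the completion path.
 * When the consumer list runs dry, the transmit path steals the entire
 * producer list with an atomic swap.
 */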
static struct gve_tx_pending_pkt_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
int32_t index = tx->dqo.free_pending_pkts_csm;
struct gve_tx_pending_pkt_dqo *pending_pkt;
/*
* No pending packets available in the consumer list,
* try to steal the producer list.
*/
if (__predict_false(index == -1)) {
tx->dqo.free_pending_pkts_csm = atomic_swap_32(
&tx->dqo.free_pending_pkts_prd, -1);
index = tx->dqo.free_pending_pkts_csm;
if (__predict_false(index == -1))
return (NULL);
}
pending_pkt = &tx->dqo.pending_pkts[index];
/* Remove pending_pkt from the consumer list */
tx->dqo.free_pending_pkts_csm = pending_pkt->next;
pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
return (pending_pkt);
}
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
struct gve_tx_pending_pkt_dqo *pending_pkt)
{
int index = pending_pkt - tx->dqo.pending_pkts;
int32_t old_head;
pending_pkt->state = GVE_PACKET_STATE_FREE;
/* Add pending_pkt to the producer list */
while (true) {
old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd);
pending_pkt->next = old_head;
if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd,
old_head, index))
break;
}
}
/*
* Has the side-effect of retrieving the value of the last desc index
* processed by the NIC. hw_tx_head is written to by the completions-processing
* taskqueue upon receiving descriptor-completions.
*/
static bool
gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs)
{
if (needed_descs <= num_avail_desc_ring_slots(tx))
return (true);
tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head);
if (needed_descs > num_avail_desc_ring_slots(tx)) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_delayed_pkt_nospace_descring, 1);
counter_exit();
return (false);
}
return (true);
}
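/*
 * Sets report_event on the packet's last descriptor if at least
 * GVE_TX_MIN_RE_INTERVAL descriptors have been posted since the previous
 * report event. The descriptor completions this requests are what advance
 * hw_tx_head.
 */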
static void
gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx)
{
uint32_t last_report_event_interval;
uint32_t last_desc_idx;
last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask;
last_report_event_interval =
(last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask;
if (__predict_false(last_report_event_interval >=
GVE_TX_MIN_RE_INTERVAL)) {
tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true;
tx->dqo.last_re_idx = last_desc_idx;
}
}
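/*
 * Checks the cached count of produced QPL buffers first, refreshing it from
 * the atomic counter written by the completion path only on a miss.
 */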
static bool
gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs)
{
uint32_t available = tx->dqo.qpl_bufs_produced_cached -
tx->dqo.qpl_bufs_consumed;
if (__predict_true(available >= num_bufs))
return (true);
tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32(
&tx->dqo.qpl_bufs_produced);
available = tx->dqo.qpl_bufs_produced_cached -
tx->dqo.qpl_bufs_consumed;
if (__predict_true(available >= num_bufs))
return (true);
return (false);
}
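/*
 * Pops one QPL buffer index off the consumer free list, stealing the producer
 * list if the consumer list is empty. Returns -1 if no buffer is available.
 */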
static int32_t
gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx)
{
int32_t buf = tx->dqo.free_qpl_bufs_csm;
if (__predict_false(buf == -1)) {
tx->dqo.free_qpl_bufs_csm = atomic_swap_32(
&tx->dqo.free_qpl_bufs_prd, -1);
buf = tx->dqo.free_qpl_bufs_csm;
if (__predict_false(buf == -1))
return (-1);
}
tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf];
tx->dqo.qpl_bufs_consumed++;
return (buf);
}
/*
* Tx buffer i corresponds to
* qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO
* qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO
*/
static void
gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx,
int32_t index, void **va, bus_addr_t *dma_addr)
{
int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) <<
GVE_TX_BUF_SHIFT_DQO;
*va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset;
*dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset;
}
static struct gve_dma_handle *
gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index)
{
int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
return (&tx->com.qpl->dmas[page_id]);
}
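/*
 * Copies the mbuf into QPL bounce buffers and writes one packet descriptor
 * per buffer used. The consumed buffers are linked off the pending packet so
 * that they can be reaped once its completion arrives.
 */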
static void
gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx,
struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt,
bool csum_enabled, int16_t completion_tag,
uint32_t *desc_idx)
{
int32_t pkt_len = mbuf->m_pkthdr.len;
struct gve_dma_handle *dma;
uint32_t copy_offset = 0;
int32_t prev_buf = -1;
uint32_t copy_len;
bus_addr_t addr;
int32_t buf;
void *va;
MPASS(pkt->num_qpl_bufs == 0);
MPASS(pkt->qpl_buf_head == -1);
while (copy_offset < pkt_len) {
buf = gve_tx_alloc_qpl_buf(tx);
/* We already checked for availability */
MPASS(buf != -1);
gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr);
copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset);
m_copydata(mbuf, copy_offset, copy_len, va);
copy_offset += copy_len;
dma = gve_get_page_dma_handle(tx, buf);
bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
gve_tx_fill_pkt_desc_dqo(tx, desc_idx,
copy_len, addr, completion_tag,
/*eop=*/copy_offset == pkt_len,
csum_enabled);
/* Link all the qpl bufs for a packet */
if (prev_buf == -1)
pkt->qpl_buf_head = buf;
else
tx->dqo.qpl_bufs[prev_buf] = buf;
prev_buf = buf;
pkt->num_qpl_bufs++;
}
tx->dqo.qpl_bufs[buf] = -1;
}
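/*
 * QPL transmit path: the packet is copied into preregistered bounce buffers,
 * so no per-packet DMA mapping is needed. Returns ENOBUFS when descriptor
 * slots, QPL buffers, or pending-packet slots are exhausted.
 */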
int
gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
uint32_t desc_idx = tx->dqo.desc_tail;
struct gve_tx_pending_pkt_dqo *pkt;
int total_descs_needed;
int16_t completion_tag;
bool has_csum_flag;
int csum_flags;
bool is_tso;
int nsegs;
int err;
csum_flags = mbuf->m_pkthdr.csum_flags;
has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
is_tso = csum_flags & CSUM_TSO;
nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO);
/* Check if we have enough room in the desc ring */
total_descs_needed = 1 + /* general_ctx_desc */
nsegs + /* pkt_desc */
(is_tso ? 1 : 0); /* tso_ctx_desc */
if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
return (ENOBUFS);
if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1);
counter_exit();
return (ENOBUFS);
}
pkt = gve_alloc_pending_packet(tx);
if (pkt == NULL) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_delayed_pkt_nospace_compring, 1);
counter_exit();
return (ENOBUFS);
}
completion_tag = pkt - tx->dqo.pending_pkts;
pkt->mbuf = mbuf;
err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
if (err)
goto abort;
gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt,
has_csum_flag, completion_tag, &desc_idx);
/* Remember the index of the last desc written */
tx->dqo.desc_tail = desc_idx;
/*
* Request a descriptor completion on the last descriptor of the
* packet if we are allowed to by the HW enforced interval.
*/
gve_tx_request_desc_compl(tx, desc_idx);
tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
return (0);
abort:
pkt->mbuf = NULL;
gve_free_pending_packet(tx, pkt);
return (err);
}
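/*
 * RDA transmit path: the mbuf chain is DMA-mapped directly. Descriptor room
 * is checked twice: with a lower bound before the expensive mapping, and with
 * the real segment count afterwards.
 */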
int
gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr)
{
bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO];
uint32_t desc_idx = tx->dqo.desc_tail;
struct gve_tx_pending_pkt_dqo *pkt;
struct mbuf *mbuf = *mbuf_ptr;
int total_descs_needed;
int16_t completion_tag;
bool has_csum_flag;
int csum_flags;
bool is_tso;
int nsegs;
int err;
int i;
csum_flags = mbuf->m_pkthdr.csum_flags;
has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
is_tso = csum_flags & CSUM_TSO;
/*
* This mbuf might end up needing more than 1 pkt desc.
* The actual number, `nsegs` is known only after the
* expensive gve_map_mbuf_dqo call. This check beneath
* exists to fail early when the desc ring is really full.
*/
total_descs_needed = 1 + /* general_ctx_desc */
1 + /* pkt_desc */
(is_tso ? 1 : 0); /* tso_ctx_desc */
if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
return (ENOBUFS);
pkt = gve_alloc_pending_packet(tx);
if (pkt == NULL) {
counter_enter();
counter_u64_add_protected(
tx->stats.tx_delayed_pkt_nospace_compring, 1);
counter_exit();
return (ENOBUFS);
}
completion_tag = pkt - tx->dqo.pending_pkts;
err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap,
segs, &nsegs, /*attempt=*/0);
if (err)
goto abort;
mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */
pkt->mbuf = mbuf;
total_descs_needed = 1 + /* general_ctx_desc */
nsegs + /* pkt_desc */
(is_tso ? 1 : 0); /* tso_ctx_desc */
if (__predict_false(
!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) {
err = ENOBUFS;
goto abort_with_dma;
}
err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
if (err)
goto abort_with_dma;
bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE);
for (i = 0; i < nsegs; i++) {
gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
segs[i].ds_len, segs[i].ds_addr,
completion_tag, /*eop=*/i == (nsegs - 1),
has_csum_flag);
}
/* Remember the index of the last desc written */
tx->dqo.desc_tail = desc_idx;
/*
* Request a descriptor completion on the last descriptor of the
* packet if we are allowed to by the HW enforced interval.
*/
gve_tx_request_desc_compl(tx, desc_idx);
tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
return (0);
abort_with_dma:
gve_unmap_packet(tx, pkt);
abort:
pkt->mbuf = NULL;
gve_free_pending_packet(tx, pkt);
return (err);
}
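/*
 * Returns a completed packet's chain of QPL buffers to the producer free list
 * and publishes the new produced count to the transmit path.
 */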
static void
gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx,
struct gve_tx_pending_pkt_dqo *pkt)
{
int32_t buf = pkt->qpl_buf_head;
struct gve_dma_handle *dma;
int32_t qpl_buf_tail;
int32_t old_head;
int i;
for (i = 0; i < pkt->num_qpl_bufs; i++) {
dma = gve_get_page_dma_handle(tx, buf);
bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE);
qpl_buf_tail = buf;
buf = tx->dqo.qpl_bufs[buf];
}
MPASS(buf == -1);
buf = qpl_buf_tail;
while (true) {
old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd);
tx->dqo.qpl_bufs[buf] = old_head;
/*
* The "rel" ensures that the update to dqo.free_qpl_bufs_prd
* is visible only after the linked list from this pkt is
* attached above to old_head.
*/
if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd,
old_head, pkt->qpl_buf_head))
break;
}
/*
* The "rel" ensures that the update to dqo.qpl_bufs_produced is
* visible only after the update to dqo.free_qpl_bufs_prd above.
*/
atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs);
pkt->qpl_buf_head = -1;
pkt->num_qpl_bufs = 0;
}
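/*
 * Validates the completion tag, releases the packet's buffers (QPL chain or
 * DMA map), frees the mbuf, and returns the packet length so the caller can
 * account for the transmitted bytes.
 */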
static uint64_t
gve_handle_packet_completion(struct gve_priv *priv,
struct gve_tx_ring *tx, uint16_t compl_tag)
{
struct gve_tx_pending_pkt_dqo *pending_pkt;
int32_t pkt_len;
if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) {
device_printf(priv->dev, "Invalid TX completion tag: %d\n",
compl_tag);
return (0);
}
pending_pkt = &tx->dqo.pending_pkts[compl_tag];
/* Packet is allocated but not pending data completion. */
if (__predict_false(pending_pkt->state !=
GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
device_printf(priv->dev,
"No pending data completion: %d\n", compl_tag);
return (0);
}
pkt_len = pending_pkt->mbuf->m_pkthdr.len;
if (gve_is_qpl(priv))
gve_reap_qpl_bufs_dqo(tx, pending_pkt);
else
gve_unmap_packet(tx, pending_pkt);
m_freem(pending_pkt->mbuf);
pending_pkt->mbuf = NULL;
gve_free_pending_packet(tx, pending_pkt);
return (pkt_len);
}
int
gve_tx_intr_dqo(void *arg)
{
struct gve_tx_ring *tx = arg;
struct gve_priv *priv = tx->com.priv;
struct gve_ring_com *com = &tx->com;
if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
return (FILTER_STRAY);
/* Interrupts are automatically masked */
taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
return (FILTER_HANDLED);
}
static void
gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx)
{
struct gve_ring_com *com = &tx->com;
int i;
for (i = 0; i < com->priv->tx_desc_cnt; i++)
tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){};
bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
BUS_DMASYNC_PREWRITE);
}
static void
gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx)
{
struct gve_ring_com *com = &tx->com;
int entries;
int i;
entries = com->priv->tx_desc_cnt;
for (i = 0; i < entries; i++)
tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){};
bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map,
BUS_DMASYNC_PREWRITE);
}
void
gve_clear_tx_ring_dqo(struct gve_priv *priv, int i)
{
struct gve_tx_ring *tx = &priv->tx[i];
int j;
tx->dqo.desc_head = 0;
tx->dqo.desc_tail = 0;
tx->dqo.desc_mask = priv->tx_desc_cnt - 1;
tx->dqo.last_re_idx = 0;
tx->dqo.compl_head = 0;
tx->dqo.compl_mask = priv->tx_desc_cnt - 1;
atomic_store_32(&tx->dqo.hw_tx_head, 0);
tx->dqo.cur_gen_bit = 0;
gve_free_tx_mbufs_dqo(tx);
for (j = 0; j < tx->dqo.num_pending_pkts - 1; j++) {
tx->dqo.pending_pkts[j].next = j + 1;
tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
}
tx->dqo.pending_pkts[tx->dqo.num_pending_pkts - 1].next = -1;
tx->dqo.free_pending_pkts_csm = 0;
atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1);
if (gve_is_qpl(priv)) {
int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
tx->com.qpl->num_pages;
for (j = 0; j < qpl_buf_cnt - 1; j++)
tx->dqo.qpl_bufs[j] = j + 1;
tx->dqo.qpl_bufs[j] = -1;
tx->dqo.free_qpl_bufs_csm = 0;
atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1);
atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt);
tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt;
tx->dqo.qpl_bufs_consumed = 0;
}
gve_tx_clear_desc_ring_dqo(tx);
gve_tx_clear_compl_ring_dqo(tx);
}
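/*
 * Processes up to a budget of completion descriptors: descriptor completions
 * advance hw_tx_head, packet completions free the corresponding mbufs. Once
 * room has been made, a stopped transmit taskqueue is woken back up.
 */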
static bool
gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget)
{
struct gve_tx_compl_desc_dqo *compl_desc;
uint64_t bytes_done = 0;
uint64_t pkts_done = 0;
uint16_t compl_tag;
int work_done = 0;
uint16_t tx_head;
uint16_t type;
while (work_done < budget) {
bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map,
BUS_DMASYNC_POSTREAD);
compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head];
if (compl_desc->generation == tx->dqo.cur_gen_bit)
break;
/*
* Prevent generation bit from being read after the rest of the
* descriptor.
*/
rmb();
type = compl_desc->type;
if (type == GVE_COMPL_TYPE_DQO_DESC) {
/* This is the last descriptor fetched by HW plus one */
tx_head = le16toh(compl_desc->tx_head);
atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head);
} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
compl_tag = le16toh(compl_desc->completion_tag);
bytes_done += gve_handle_packet_completion(priv,
tx, compl_tag);
pkts_done++;
}
tx->dqo.compl_head = (tx->dqo.compl_head + 1) &
tx->dqo.compl_mask;
/* Flip the generation bit when we wrap around */
tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0;
work_done++;
}
/*
* Waking the xmit taskqueue has to occur after room has been made in
* the queue.
*/
atomic_thread_fence_seq_cst();
if (atomic_load_bool(&tx->stopped) && work_done) {
atomic_store_bool(&tx->stopped, false);
taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
}
tx->done += work_done; /* tx->done is just a sysctl counter */
counter_enter();
counter_u64_add_protected(tx->stats.tbytes, bytes_done);
counter_u64_add_protected(tx->stats.tpackets, pkts_done);
counter_exit();
return (work_done == budget);
}
void
gve_tx_cleanup_tq_dqo(void *arg, int pending)
{
struct gve_tx_ring *tx = arg;
struct gve_priv *priv = tx->com.priv;
if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
return;
if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) {
taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
return;
}
gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset,
GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
}