diff --git a/UPDATING b/UPDATING
index 75e0e1cf2fc1..e66fd7ebd575 100644
--- a/UPDATING
+++ b/UPDATING
@@ -22,6 +22,14 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8.x IS SLOW:
 	to maximize performance.  (To disable malloc debugging, run
 	ln -s aj /etc/malloc.conf.)
 
+20081121:
+	__FreeBSD_version 800054 adds memory barriers to
+	<machine/atomic.h>, new interfaces to ifnet to facilitate
+	multiple hardware transmit queues for cards that support
+	them, and a lock-less ring-buffer implementation to
+	enable drivers to more efficiently manage queueing of
+	packets.
+
 20081117:
 	A new version of ZFS (version 13) has been merged to -HEAD.
 	This version has zpool attribute "listsnapshots" off by
diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h
index 0edbfff52ff7..d2a3846172d9 100644
--- a/sys/amd64/include/atomic.h
+++ b/sys/amd64/include/atomic.h
@@ -32,6 +32,10 @@
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
+#define	mb()	__asm__ __volatile__ ("mfence;": : :"memory")
+#define	wmb()	__asm__ __volatile__ ("sfence;": : :"memory")
+#define	rmb()	__asm__ __volatile__ ("lfence;": : :"memory")
+
 /*
  * Various simple operations on memory, each of which is atomic in the
  * presence of interrupts and multiple processors.
diff --git a/sys/arm/include/atomic.h b/sys/arm/include/atomic.h
index 6a33699b9fc8..72e198d3369b 100644
--- a/sys/arm/include/atomic.h
+++ b/sys/arm/include/atomic.h
@@ -47,6 +47,10 @@
 #include <machine/cpuconf.h>
 #endif
 
+#define mb()
+#define wmb()
+#define rmb()
+
 #ifndef I32_bit
 #define I32_bit (1 << 7)	/* IRQ disable */
 #endif
diff --git a/sys/conf/files b/sys/conf/files
index 6ab4068f2cfa..02c4a339ed5f 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1807,6 +1807,7 @@ kern/subr_acl_posix1e.c	standard
 kern/subr_autoconf.c	standard
 kern/subr_blist.c	standard
 kern/subr_bus.c		standard
+kern/subr_bufring.c	standard
 kern/subr_clist.c	standard
 kern/subr_clock.c	standard
 kern/subr_devstat.c	standard
diff --git a/sys/dev/bce/if_bcereg.h b/sys/dev/bce/if_bcereg.h
index b05cbef8279b..001d3411b647 100644
--- a/sys/dev/bce/if_bcereg.h
+++ b/sys/dev/bce/if_bcereg.h
@@ -557,6 +557,8 @@
 #endif /* BCE_DEBUG */
 
 
+
+#if __FreeBSD_version < 800054
 #if defined(__i386__) || defined(__amd64__)
 #define mb()	__asm volatile("mfence" ::: "memory")
 #define wmb()	__asm volatile("sfence" ::: "memory")
@@ -566,6 +568,7 @@
 #define rmb()
 #define wmb()
 #endif
+#endif
 
 /****************************************************************************/
 /*  Device identification definitions.
*/ diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h index 2f274ebf69af..1f37e3a7fe93 100644 --- a/sys/dev/cxgb/cxgb_adapter.h +++ b/sys/dev/cxgb/cxgb_adapter.h @@ -41,6 +41,7 @@ $FreeBSD$ #include #include #include +#include #include #include @@ -258,7 +259,7 @@ struct sge_txq { * mbuf touches */ struct mbuf_head cleanq; - struct buf_ring txq_mr; + struct buf_ring *txq_mr; struct mbuf *immpkt; uint32_t txq_drops; uint32_t txq_skipped; diff --git a/sys/dev/cxgb/cxgb_multiq.c b/sys/dev/cxgb/cxgb_multiq.c index 81c9ba2f62c5..bcae1232cb16 100644 --- a/sys/dev/cxgb/cxgb_multiq.c +++ b/sys/dev/cxgb/cxgb_multiq.c @@ -129,7 +129,7 @@ cxgb_pcpu_enqueue_packet_(struct sge_qset *qs, struct mbuf *m) return (ENXIO); } txq = &qs->txq[TXQ_ETH]; - err = buf_ring_enqueue(&txq->txq_mr, m); + err = buf_ring_enqueue(txq->txq_mr, m); if (err) { txq->txq_drops++; m_freem(m); @@ -194,14 +194,11 @@ cxgb_dequeue_packet(struct sge_txq *txq, struct mbuf **m_vec) } sc = qs->port->adapter; - m = buf_ring_dequeue(&txq->txq_mr); + m = buf_ring_dequeue_sc(txq->txq_mr); if (m == NULL) return (0); count = 1; - KASSERT(m->m_type == MT_DATA, - ("m=%p is bad mbuf type %d from ring cons=%d prod=%d", m, - m->m_type, txq->txq_mr.br_cons, txq->txq_mr.br_prod)); m_vec[0] = m; if (m->m_pkthdr.tso_segsz > 0 || m->m_pkthdr.len > TX_WR_SIZE_MAX || m->m_next != NULL || (cxgb_pcpu_tx_coalesce == 0)) { @@ -209,14 +206,14 @@ cxgb_dequeue_packet(struct sge_txq *txq, struct mbuf **m_vec) } size = m->m_pkthdr.len; - for (m = buf_ring_peek(&txq->txq_mr); m != NULL; - m = buf_ring_peek(&txq->txq_mr)) { + for (m = buf_ring_peek(txq->txq_mr); m != NULL; + m = buf_ring_peek(txq->txq_mr)) { if (m->m_pkthdr.tso_segsz > 0 || size + m->m_pkthdr.len > TX_WR_SIZE_MAX || m->m_next != NULL) break; - buf_ring_dequeue(&txq->txq_mr); + buf_ring_dequeue_sc(txq->txq_mr); size += m->m_pkthdr.len; m_vec[count++] = m; @@ -367,7 +364,7 @@ cxgb_pcpu_free(struct sge_qset *qs) mtx_lock(&txq->lock); while ((m = mbufq_dequeue(&txq->sendq)) != NULL) m_freem(m); - while ((m = buf_ring_dequeue(&txq->txq_mr)) != NULL) + while ((m = buf_ring_dequeue_sc(txq->txq_mr)) != NULL) m_freem(m); t3_free_tx_desc_all(txq); @@ -429,7 +426,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush) initerr = ENXIO; else if (immpkt) { - if (!buf_ring_empty(&txq->txq_mr)) + if (!buf_ring_empty(txq->txq_mr)) initerr = cxgb_pcpu_enqueue_packet_(qs, immpkt); else txq->immpkt = immpkt; @@ -460,7 +457,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush) } stopped = isset(&qs->txq_stopped, TXQ_ETH); - flush = (((!buf_ring_empty(&txq->txq_mr) || (!IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) && !stopped) || txq->immpkt); + flush = (((!buf_ring_empty(txq->txq_mr) || (!IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) && !stopped) || txq->immpkt); max_desc = tx_flush ? 
TX_ETH_Q_SIZE : TX_START_MAX_DESC; if (cxgb_debug) @@ -471,7 +468,7 @@ cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush) if ((tx_flush && flush && err == 0) && - (!buf_ring_empty(&txq->txq_mr) || + (!buf_ring_empty(txq->txq_mr) || !IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) { struct thread *td = curthread; @@ -521,7 +518,7 @@ cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *immpkt) txq = &qs->txq[TXQ_ETH]; if (((sc->tunq_coalesce == 0) || - (buf_ring_count(&txq->txq_mr) >= TX_WR_COUNT_MAX) || + (buf_ring_count(txq->txq_mr) >= TX_WR_COUNT_MAX) || (cxgb_pcpu_tx_coalesce == 0)) && mtx_trylock(&txq->lock)) { if (cxgb_debug) printf("doing immediate transmit\n"); @@ -529,12 +526,12 @@ cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *immpkt) txq->flags |= TXQ_TRANSMITTING; err = cxgb_pcpu_start_(qs, immpkt, FALSE); txq->flags &= ~TXQ_TRANSMITTING; - resid = (buf_ring_count(&txq->txq_mr) > 64) || (desc_reclaimable(txq) > 64); + resid = (buf_ring_count(txq->txq_mr) > 64) || (desc_reclaimable(txq) > 64); mtx_unlock(&txq->lock); } else if (immpkt) { if (cxgb_debug) printf("deferred coalesce=%jx ring_count=%d mtx_owned=%d\n", - sc->tunq_coalesce, buf_ring_count(&txq->txq_mr), mtx_owned(&txq->lock)); + sc->tunq_coalesce, buf_ring_count(txq->txq_mr), mtx_owned(&txq->lock)); err = cxgb_pcpu_enqueue_packet_(qs, immpkt); } @@ -586,7 +583,7 @@ cxgb_pcpu_start_proc(void *arg) if ((qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { idleticks = hz; - if (!buf_ring_empty(&txq->txq_mr) || + if (!buf_ring_empty(txq->txq_mr) || !mbufq_empty(&txq->sendq)) cxgb_pcpu_free(qs); goto done; @@ -611,11 +608,13 @@ cxgb_pcpu_start_proc(void *arg) mtx_unlock(&qs->rspq.lock); } #endif - if ((!buf_ring_empty(&txq->txq_mr)) && err == 0) { + if ((!buf_ring_empty(txq->txq_mr)) && err == 0) { +#if 0 if (cxgb_debug) printf("head=%p cons=%d prod=%d\n", txq->sendq.head, txq->txq_mr.br_cons, txq->txq_mr.br_prod); +#endif continue; } done: diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h index ba37cb94d419..d0a72580d060 100644 --- a/sys/dev/cxgb/cxgb_osdep.h +++ b/sys/dev/cxgb/cxgb_osdep.h @@ -156,9 +156,6 @@ struct t3_mbuf_hdr { #if defined(__i386__) || defined(__amd64__) -#define mb() __asm volatile("mfence":::"memory") -#define rmb() __asm volatile("lfence":::"memory") -#define wmb() __asm volatile("sfence" ::: "memory") #define smp_mb() mb() #define L1_CACHE_BYTES 128 @@ -179,163 +176,11 @@ extern void kdb_backtrace(void); #else /* !i386 && !amd64 */ -#define mb() -#define rmb() -#define wmb() #define smp_mb() #define prefetch(x) #define L1_CACHE_BYTES 32 #endif -struct buf_ring { - caddr_t *br_ring; - volatile uint32_t br_cons; - volatile uint32_t br_prod; - int br_size; - struct mtx br_lock; -}; - -struct buf_ring *buf_ring_alloc(int count, int flags); -void buf_ring_free(struct buf_ring *); - -static __inline int -buf_ring_count(struct buf_ring *mr) -{ - int size = mr->br_size; - uint32_t mask = size - 1; - - return ((size + mr->br_prod - mr->br_cons) & mask); -} - -static __inline int -buf_ring_empty(struct buf_ring *mr) -{ - return (mr->br_cons == mr->br_prod); -} - -static __inline int -buf_ring_full(struct buf_ring *mr) -{ - uint32_t mask; - - mask = mr->br_size - 1; - return (mr->br_cons == ((mr->br_prod + 1) & mask)); -} - -/* - * The producer and consumer are independently locked - * this relies on the consumer providing his own serialization - * - */ -static __inline void * -buf_ring_dequeue(struct buf_ring *mr) -{ - uint32_t prod, cons, mask; - caddr_t *ring, m; - - ring = (caddr_t 
*)mr->br_ring; - mask = mr->br_size - 1; - cons = mr->br_cons; - mb(); - prod = mr->br_prod; - m = NULL; - if (cons != prod) { - m = ring[cons]; - ring[cons] = NULL; - mr->br_cons = (cons + 1) & mask; - mb(); - } - return (m); -} - -#ifdef DEBUG_BUFRING -static __inline void -__buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line) -{ - int i; - - for (i = 0; i < mr->br_size; i++) - if (m == mr->br_ring[i]) - panic("%s:%d m=%p present prod=%d cons=%d idx=%d", file, - line, m, mr->br_prod, mr->br_cons, i); -} - -static __inline void -buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line) -{ - mtx_lock(&mr->br_lock); - __buf_ring_scan(mr, m, file, line); - mtx_unlock(&mr->br_lock); -} - -#else -static __inline void -__buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line) -{ -} - -static __inline void -buf_ring_scan(struct buf_ring *mr, void *m, char *file, int line) -{ -} -#endif - -static __inline int -__buf_ring_enqueue(struct buf_ring *mr, void *m, char *file, int line) -{ - - uint32_t prod, cons, mask; - int err; - - mask = mr->br_size - 1; - prod = mr->br_prod; - mb(); - cons = mr->br_cons; - __buf_ring_scan(mr, m, file, line); - if (((prod + 1) & mask) != cons) { - KASSERT(mr->br_ring[prod] == NULL, ("overwriting entry")); - mr->br_ring[prod] = m; - mb(); - mr->br_prod = (prod + 1) & mask; - err = 0; - } else - err = ENOBUFS; - - return (err); -} - -static __inline int -buf_ring_enqueue_(struct buf_ring *mr, void *m, char *file, int line) -{ - int err; - - mtx_lock(&mr->br_lock); - err = __buf_ring_enqueue(mr, m, file, line); - mtx_unlock(&mr->br_lock); - - return (err); -} - -#define buf_ring_enqueue(mr, m) buf_ring_enqueue_((mr), (m), __FILE__, __LINE__) - - -static __inline void * -buf_ring_peek(struct buf_ring *mr) -{ - int prod, cons, mask; - caddr_t *ring, m; - - ring = (caddr_t *)mr->br_ring; - mask = mr->br_size - 1; - cons = mr->br_cons; - prod = mr->br_prod; - m = NULL; - if (cons != prod) - m = ring[cons]; - - return (m); -} - #define DBG_RX (1 << 0) static const int debug_flags = DBG_RX; diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c index 7781a55ee76e..0c49dd1d1fc7 100644 --- a/sys/dev/cxgb/cxgb_sge.c +++ b/sys/dev/cxgb/cxgb_sge.c @@ -1719,10 +1719,8 @@ t3_free_qset(adapter_t *sc, struct sge_qset *q) t3_free_tx_desc_all(&q->txq[TXQ_ETH]); for (i = 0; i < SGE_TXQ_PER_SET; i++) - if (q->txq[i].txq_mr.br_ring != NULL) { - free(q->txq[i].txq_mr.br_ring, M_DEVBUF); - mtx_destroy(&q->txq[i].txq_mr.br_lock); - } + if (q->txq[i].txq_mr != NULL) + buf_ring_free(q->txq[i].txq_mr, M_DEVBUF); for (i = 0; i < SGE_RXQ_PER_SET; ++i) { if (q->fl[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); @@ -1885,7 +1883,6 @@ t3_free_tx_desc(struct sge_txq *q, int reclaimable) txsd->flags &= ~TX_SW_DESC_MAPPED; } m_freem_iovec(&txsd->mi); - buf_ring_scan(&q->txq_mr, txsd->mi.mi_base, __FILE__, __LINE__); txsd->mi.mi_base = NULL; /* * XXX check for cache hit rate here @@ -2285,14 +2282,12 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, int i, header_size, ret = 0; for (i = 0; i < SGE_TXQ_PER_SET; i++) { - if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *), - M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) { + + if ((q->txq[i].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size, + M_DEVBUF, M_WAITOK, &q->txq[i].lock)) == NULL) { device_printf(sc->dev, "failed to allocate mbuf ring\n"); goto err; } - q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0; - q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size; - 
mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF); } init_qset_cntxt(q, id); @@ -3509,12 +3504,14 @@ t3_add_configured_sysctls(adapter_t *sc) SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "sendqlen", CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen, 0, "#tunneled packets waiting to be sent"); +#if 0 SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod, 0, "#tunneled packets queue producer index"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons, 0, "#tunneled packets queue consumer index"); +#endif SYSCTL_ADD_INT(ctx, txqpoidlist, OID_AUTO, "processed", CTLFLAG_RD, &qs->txq[TXQ_ETH].processed, 0, "#tunneled packets processed by the card"); diff --git a/sys/dev/cxgb/sys/cxgb_support.c b/sys/dev/cxgb/sys/cxgb_support.c index f0c595677be6..55cf0d516687 100644 --- a/sys/dev/cxgb/sys/cxgb_support.c +++ b/sys/dev/cxgb/sys/cxgb_support.c @@ -303,33 +303,3 @@ free: uma_zfree(zone, vec[i]); } -struct buf_ring * -buf_ring_alloc(int count, int flags) -{ - struct buf_ring *br; - - KASSERT(powerof2(count), ("buf ring must be size power of 2")); - - br = malloc(sizeof(struct buf_ring), M_DEVBUF, flags|M_ZERO); - if (br == NULL) - return (NULL); - - br->br_ring = malloc(sizeof(caddr_t)*count, M_DEVBUF, flags|M_ZERO); - if (br->br_ring == NULL) { - free(br, M_DEVBUF); - return (NULL); - } - - mtx_init(&br->br_lock, "buf ring", NULL, MTX_DUPOK|MTX_DEF); - br->br_size = count; - br->br_prod = br->br_cons = 0; - - return (br); -} - -void -buf_ring_free(struct buf_ring *br) -{ - free(br->br_ring, M_DEVBUF); - free(br, M_DEVBUF); -} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c index 1882bcede001..62ffef5871bb 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c @@ -90,7 +90,7 @@ static int __cxio_init_resource_fifo(struct buf_ring **fifo, u32 rarray[16]; mtx_init(fifo_lock, "cxio fifo", NULL, MTX_DEF|MTX_DUPOK); - *fifo = buf_ring_alloc(nr, M_NOWAIT); + *fifo = buf_ring_alloc(nr, M_DEVBUF, M_NOWAIT, fifo_lock); if (*fifo == NULL) return (-ENOMEM); #if 0 @@ -122,7 +122,7 @@ static int __cxio_init_resource_fifo(struct buf_ring **fifo, buf_ring_enqueue(*fifo, (void *) (uintptr_t)i); #if 0 for (i = 0; i < skip_low + skip_high; i++) - buf_ring_dequeue(*fifo); + buf_ring_dequeue_sc(*fifo); #endif return 0; } @@ -149,7 +149,8 @@ static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) mtx_init(&rdev_p->rscp->qpid_fifo_lock, "qpid fifo", NULL, MTX_DEF); - rdev_p->rscp->qpid_fifo = buf_ring_alloc(T3_MAX_NUM_QP, M_NOWAIT); + rdev_p->rscp->qpid_fifo = buf_ring_alloc(T3_MAX_NUM_QP, M_DEVBUF, + M_NOWAIT, &rdev_p->rscp->qpid_fifo_lock); if (rdev_p->rscp->qpid_fifo == NULL) return (-ENOMEM); @@ -168,7 +169,7 @@ int cxio_hal_init_rhdl_resource(u32 nr_rhdl) void cxio_hal_destroy_rhdl_resource(void) { - buf_ring_free(rhdl_fifo); + buf_ring_free(rhdl_fifo, M_DEVBUF); } #endif @@ -202,11 +203,11 @@ int cxio_hal_init_resource(struct cxio_rdev *rdev_p, goto pdid_err; return 0; pdid_err: - buf_ring_free(rscp->cqid_fifo); + buf_ring_free(rscp->cqid_fifo, M_DEVBUF); cqid_err: - buf_ring_free(rscp->qpid_fifo); + buf_ring_free(rscp->qpid_fifo, M_DEVBUF); qpid_err: - buf_ring_free(rscp->tpt_fifo); + buf_ring_free(rscp->tpt_fifo, M_DEVBUF); tpt_err: return (-ENOMEM); } @@ -219,7 +220,7 @@ static u32 cxio_hal_get_resource(struct buf_ring *fifo, struct mtx *lock) u32 entry; 
mtx_lock(lock); - entry = (u32)(uintptr_t)buf_ring_dequeue(fifo); + entry = (u32)(uintptr_t)buf_ring_dequeue_sc(fifo); mtx_unlock(lock); return entry; } @@ -276,10 +277,10 @@ void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) { - buf_ring_free(rscp->tpt_fifo); - buf_ring_free(rscp->cqid_fifo); - buf_ring_free(rscp->qpid_fifo); - buf_ring_free(rscp->pdid_fifo); + buf_ring_free(rscp->tpt_fifo, M_DEVBUF); + buf_ring_free(rscp->cqid_fifo, M_DEVBUF); + buf_ring_free(rscp->qpid_fifo, M_DEVBUF); + buf_ring_free(rscp->pdid_fifo, M_DEVBUF); free(rscp, M_DEVBUF); } diff --git a/sys/dev/mxge/if_mxge_var.h b/sys/dev/mxge/if_mxge_var.h index 21127862dc74..e57c637f300b 100644 --- a/sys/dev/mxge/if_mxge_var.h +++ b/sys/dev/mxge/if_mxge_var.h @@ -279,6 +279,8 @@ struct mxge_media_type /* implement our own memory barriers, since bus_space_barrier cannot handle write-combining regions */ +#if __FreeBSD_version < 800053 + #if defined (__GNUC__) #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__ #define mb() __asm__ __volatile__ ("sfence;": : :"memory") @@ -293,6 +295,8 @@ struct mxge_media_type #error "unknown compiler" #endif +#endif + static inline void mxge_pio_copy(volatile void *to_v, void *from_v, size_t size) { diff --git a/sys/dev/nxge/xge-osdep.h b/sys/dev/nxge/xge-osdep.h index 15adfe7b3847..e8f4aba2d057 100644 --- a/sys/dev/nxge/xge-osdep.h +++ b/sys/dev/nxge/xge-osdep.h @@ -242,8 +242,12 @@ typedef xge_pci_info_t *pci_cfg_h; mtx_unlock_flags(lockp, flags); \ } +#if __FreeBSD_version > 800053 /* Write memory barrier */ +#define xge_os_wmb() wmb() +#else #define xge_os_wmb() +#endif /* Delay (in micro seconds) */ #define xge_os_udelay(us) DELAY(us) diff --git a/sys/i386/include/atomic.h b/sys/i386/include/atomic.h index 3662a0f2f3ce..f6bcf0c1650b 100644 --- a/sys/i386/include/atomic.h +++ b/sys/i386/include/atomic.h @@ -32,6 +32,21 @@ #error this file needs sys/cdefs.h as a prerequisite #endif + +#if defined(I686_CPU) +#define mb() __asm__ __volatile__ ("mfence;": : :"memory") +#define wmb() __asm__ __volatile__ ("sfence;": : :"memory") +#define rmb() __asm__ __volatile__ ("lfence;": : :"memory") +#else +/* + * do we need a serializing instruction? + */ +#define mb() +#define wmb() +#define rmb() +#endif + + /* * Various simple operations on memory, each of which is atomic in the * presence of interrupts and multiple processors. diff --git a/sys/i386/include/xen/xen-os.h b/sys/i386/include/xen/xen-os.h index 98341b65b264..dac071afce7e 100644 --- a/sys/i386/include/xen/xen-os.h +++ b/sys/i386/include/xen/xen-os.h @@ -165,15 +165,6 @@ do { \ #define spin_unlock_irqrestore mtx_unlock_irqrestore -#ifndef mb -#define mb() __asm__ __volatile__("lock; addl $0, 0(%%esp)": : :"memory") -#endif -#ifndef rmb -#define rmb() mb() -#endif -#ifndef wmb -#define wmb() barrier() -#endif #ifdef SMP #define smp_mb() mb() #define smp_rmb() rmb() diff --git a/sys/ia64/include/atomic.h b/sys/ia64/include/atomic.h index 631193f43725..fdfcb9ebaf3f 100644 --- a/sys/ia64/include/atomic.h +++ b/sys/ia64/include/atomic.h @@ -29,6 +29,10 @@ #ifndef _MACHINE_ATOMIC_H_ #define _MACHINE_ATOMIC_H_ +#define mb() +#define wmb() +#define rmb() + /* * Various simple arithmetic on memory which is atomic in the presence * of interrupts and SMP safe. 
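Now that every architecture's <machine/atomic.h> provides mb()/wmb()/rmb() (with the driver-local copies above either removed or guarded by __FreeBSD_version), the macros are meant to be paired in the usual producer/consumer way: the writer orders its payload store before the store that publishes it, and the reader orders the index load before the payload load. Below is a minimal single-producer/single-consumer sketch of that pairing; the demo_* names and layout are invented for illustration and are not part of this patch.

	#include <sys/param.h>
	#include <sys/errno.h>
	#include <machine/atomic.h>	/* mb(), rmb(), wmb() as of 800054 */

	struct demo_ring {
		void		*slots[256];
		volatile uint32_t prod;		/* written only by the producer */
		volatile uint32_t cons;		/* written only by the consumer */
	};

	static int
	demo_put(struct demo_ring *r, void *item)
	{
		uint32_t prod = r->prod;

		if (((prod + 1) & 255) == r->cons)
			return (ENOBUFS);		/* full */
		r->slots[prod] = item;
		wmb();				/* publish the payload ... */
		r->prod = (prod + 1) & 255;	/* ... before the new index */
		return (0);
	}

	static void *
	demo_get(struct demo_ring *r)
	{
		uint32_t cons = r->cons;
		void *item;

		if (cons == r->prod)
			return (NULL);		/* empty */
		rmb();				/* read the index before the payload */
		item = r->slots[cons];
		r->cons = (cons + 1) & 255;
		return (item);
	}

The buf_ring implementation added below follows the same discipline, with an additional atomic_cmpset_int() on the head indices so that multiple producers (or consumers) can race safely.
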
diff --git a/sys/kern/subr_bufring.c b/sys/kern/subr_bufring.c
new file mode 100644
index 000000000000..63938eaca444
--- /dev/null
+++ b/sys/kern/subr_bufring.c
@@ -0,0 +1,68 @@
+/**************************************************************************
+ *
+ * Copyright (c) 2007,2008 Kip Macy kmacy@freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. The name of Kip Macy nor the names of other
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ ***************************************************************************/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/ktr.h>
+#include <sys/buf_ring.h>
+
+
+struct buf_ring *
+buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *lock)
+{
+	struct buf_ring *br;
+
+	KASSERT(powerof2(count), ("buf ring must be size power of 2"));
+
+	br = malloc(sizeof(struct buf_ring) + count*sizeof(caddr_t),
+	    type, flags|M_ZERO);
+	if (br == NULL)
+		return (NULL);
+#ifdef DEBUG_BUFRING
+	br->br_lock = lock;
+#endif
+	br->br_prod_size = br->br_cons_size = count;
+	br->br_prod_mask = br->br_cons_mask = count-1;
+	br->br_prod_head = br->br_cons_head = 0;
+	br->br_prod_tail = br->br_cons_tail = 0;
+
+	return (br);
+}
+
+void
+buf_ring_free(struct buf_ring *br, struct malloc_type *type)
+{
+	free(br, type);
+}
diff --git a/sys/net/if.c b/sys/net/if.c
index f44262c77e81..e159a21ebc60 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -113,10 +113,11 @@ static int	ifconf(u_long, caddr_t);
 static void	if_freemulti(struct ifmultiaddr *);
 static void	if_grow(void);
 static void	if_init(void *);
-static void	if_qflush(struct ifaltq *);
+static void	if_qflush(struct ifnet *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static void	if_slowtimo(void *);
+static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 static int	if_rtdel(struct radix_node *, void *);
@@ -126,6 +127,7 @@ static void	if_start_deferred(void *context, int pending);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
+
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
@@ -481,6 +483,28 @@ if_free_type(struct ifnet *ifp, u_char type)
 	free(ifp, M_IFNET);
 };
 
+void
+ifq_attach(struct ifaltq *ifq, struct ifnet *ifp)
+{
+
+	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
+
+	if (ifq->ifq_maxlen == 0)
+		ifq->ifq_maxlen = ifqmaxlen;
+
+	ifq->altq_type = 0;
+	ifq->altq_disc = NULL;
+	ifq->altq_flags &= ALTQF_CANTCHANGE;
+	ifq->altq_tbr = NULL;
+	ifq->altq_ifp = ifp;
+}
+
+void
+ifq_detach(struct ifaltq *ifq)
+{
+	mtx_destroy(&ifq->ifq_mtx);
+}
+
 /*
  * Perform generic interface initalization tasks and attach the interface
  * to the list of "active" interfaces.
@@ -522,7 +546,8 @@ if_attach(struct ifnet *ifp)
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_data.ifi_epoch = time_uptime;
 	ifp->if_data.ifi_datalen = sizeof(struct if_data);
-
+	ifp->if_transmit = if_transmit;
+	ifp->if_qflush = if_qflush;
 #ifdef MAC
 	mac_ifnet_init(ifp);
 	mac_ifnet_create(ifp);
@@ -534,7 +559,7 @@ if_attach(struct ifnet *ifp)
 		make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d",
 		    net_cdevsw.d_name, ifp->if_index);
 
-	mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
+	ifq_attach(&ifp->if_snd, ifp);
 
 	/*
 	 * create a Link Level name for this device
@@ -572,19 +597,6 @@ if_attach(struct ifnet *ifp)
 	TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 	ifp->if_broadcastaddr = NULL; /* reliably crash if used uninitialized */
 
-	/*
-	 * XXX: why do we warn about this? We're correcting it and most
-	 * drivers just set the value the way we do.
-	 */
-	if (ifp->if_snd.ifq_maxlen == 0) {
-		if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
-		ifp->if_snd.ifq_maxlen = ifqmaxlen;
-	}
-	ifp->if_snd.altq_type = 0;
-	ifp->if_snd.altq_disc = NULL;
-	ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
-	ifp->if_snd.altq_tbr = NULL;
-	ifp->if_snd.altq_ifp = ifp;
 
 	IFNET_WLOCK();
 	TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
@@ -826,7 +838,7 @@ if_detach(struct ifnet *ifp)
 	KNOTE_UNLOCKED(&ifp->if_klist, NOTE_EXIT);
 	knlist_clear(&ifp->if_klist, 0);
 	knlist_destroy(&ifp->if_klist);
-	mtx_destroy(&ifp->if_snd.ifq_mtx);
+	ifq_detach(&ifp->if_snd);
 	IF_AFDATA_DESTROY(ifp);
 	splx(s);
 }
@@ -1377,7 +1389,8 @@ if_unroute(struct ifnet *ifp, int flag, int fam)
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
-	if_qflush(&ifp->if_snd);
+	ifp->if_qflush(ifp);
+
 #ifdef DEV_CARP
 	if (ifp->if_carp)
 		carp_carpdev_state(ifp->if_carp);
@@ -1507,10 +1520,12 @@ if_up(struct ifnet *ifp)
  * Flush an interface queue.
  */
 static void
-if_qflush(struct ifaltq *ifq)
+if_qflush(struct ifnet *ifp)
 {
 	struct mbuf *m, *n;
-
+	struct ifaltq *ifq;
+
+	ifq = &ifp->if_snd;
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
@@ -2801,6 +2816,19 @@ if_start_deferred(void *context, int pending)
 	(ifp->if_start)(ifp);
 }
 
+/*
+ * Backwards compatibility interface for drivers
+ * that have not implemented it
+ */
+static int
+if_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+	int error;
+
+	IFQ_HANDOFF(ifp, m, error);
+	return (error);
+}
+
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 3f7a0089b5b0..c6bcdf7e8ee8 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -186,7 +186,11 @@ struct ifnet {
 						/* protected by if_addr_mtx */
 	void	*if_pf_kif;
 	void	*if_lagg;		/* lagg glue */
-	void	*if_pspare[10];		/* multiq/TOE 3; vimage 3; general use 4 */
+	void	*if_pspare[8];		/* multiq/TOE 3; vimage 3; general use 4 */
+	void	(*if_qflush)		/* flush any queues */
+		(struct ifnet *);
+	int	(*if_transmit)		/* initiate output routine */
+		(struct ifnet *, struct mbuf *);
 	int	if_ispare[2];		/* general use 2 */
 };
 
@@ -686,6 +690,9 @@ int	ifioctl(struct socket *, u_long, caddr_t, struct thread *);
 int	ifpromisc(struct ifnet *, int);
 struct	ifnet *ifunit(const char *);
 
+void	ifq_attach(struct ifaltq *, struct ifnet *ifp);
+void	ifq_detach(struct ifaltq *);
+
 struct	ifaddr *ifa_ifwithaddr(struct sockaddr *);
 struct	ifaddr *ifa_ifwithbroadaddr(struct sockaddr *);
 struct	ifaddr *ifa_ifwithdstaddr(struct sockaddr *);
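With if_transmit and if_qflush now hanging off struct ifnet, a driver with several hardware rings can take over queueing itself instead of funnelling everything through if_snd, while drivers that do nothing keep the if_transmit()/if_qflush() defaults installed by if_attach() above. The fragment below is only a sketch of the intended shape and is not part of the patch: the foo_* names, softc layout and the foo_select_queue()/foo_txq_drain() helpers are invented (foo_txq_drain() is sketched after the buf_ring.h diff below).

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/mbuf.h>
	#include <sys/buf_ring.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <net/if_var.h>

	struct foo_txq {
		struct mtx	 ft_mtx;
		struct buf_ring	*ft_br;
	};

	struct foo_softc {
		struct ifnet	*fs_ifp;
		int		 fs_nqueues;
		struct foo_txq	 fs_txq[4];
	};

	static int	foo_select_queue(struct foo_softc *, struct mbuf *);	/* e.g. a flowid hash */
	static void	foo_txq_drain(struct foo_txq *);

	static int
	foo_transmit(struct ifnet *ifp, struct mbuf *m)
	{
		struct foo_softc *sc = ifp->if_softc;
		struct foo_txq *txq = &sc->fs_txq[foo_select_queue(sc, m)];
		int error;

		error = buf_ring_enqueue(txq->ft_br, m);	/* lock-free from any context */
		if (error != 0) {
			m_freem(m);
			return (error);
		}
		if (mtx_trylock(&txq->ft_mtx)) {		/* drain unless someone already is */
			foo_txq_drain(txq);
			mtx_unlock(&txq->ft_mtx);
		}
		return (0);
	}

	static void
	foo_qflush(struct ifnet *ifp)
	{
		struct foo_softc *sc = ifp->if_softc;
		struct mbuf *m;
		int i;

		for (i = 0; i < sc->fs_nqueues; i++) {
			struct foo_txq *txq = &sc->fs_txq[i];

			mtx_lock(&txq->ft_mtx);
			while ((m = buf_ring_dequeue_sc(txq->ft_br)) != NULL)
				m_freem(m);
			mtx_unlock(&txq->ft_mtx);
		}
	}

	/* Called from the hypothetical foo_attach(), before if_attach(ifp). */
	static void
	foo_set_ifnet_methods(struct ifnet *ifp)
	{
		ifp->if_transmit = foo_transmit;
		ifp->if_qflush = foo_qflush;
	}
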
diff --git a/sys/powerpc/include/atomic.h b/sys/powerpc/include/atomic.h
index d515a6a571ce..4ac9f0c3d960 100644
--- a/sys/powerpc/include/atomic.h
+++ b/sys/powerpc/include/atomic.h
@@ -39,6 +39,10 @@
 #define	__ATOMIC_BARRIER					\
     __asm __volatile("sync" : : : "memory")
 
+#define mb()	__ATOMIC_BARRIER
+#define wmb()	mb()
+#define rmb()	mb()
+
 /*
  * atomic_add(p, v)
  * { *p += v; }
diff --git a/sys/sparc64/include/atomic.h b/sys/sparc64/include/atomic.h
index fe36791e55d6..d663fbc457ee 100644
--- a/sys/sparc64/include/atomic.h
+++ b/sys/sparc64/include/atomic.h
@@ -40,6 +40,10 @@
 #define	__ASI_ATOMIC	ASI_P
 #endif
 
+#define mb()	__asm__ __volatile__ ("membar #MemIssue": : :"memory")
+#define wmb()	mb()
+#define rmb()	mb()
+
 /*
  * Various simple arithmetic on memory which is atomic in the presence
  * of interrupts and multiple processors.  See atomic(9) for details.
diff --git a/sys/sun4v/include/atomic.h b/sys/sun4v/include/atomic.h
index fe36791e55d6..c5005fabdf8e 100644
--- a/sys/sun4v/include/atomic.h
+++ b/sys/sun4v/include/atomic.h
@@ -33,6 +33,10 @@
 
 #include <machine/cpufunc.h>
 
+#define mb()	__asm__ __volatile__ ("membar #MemIssue": : :"memory")
+#define wmb()	mb()
+#define rmb()	mb()
+
 /* Userland needs different ASI's. */
 #ifdef _KERNEL
 #define	__ASI_ATOMIC	ASI_N
diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h
new file mode 100644
index 000000000000..d77814a51264
--- /dev/null
+++ b/sys/sys/buf_ring.h
@@ -0,0 +1,250 @@
+/**************************************************************************
+ *
+ * Copyright (c) 2007,2008 Kip Macy kmacy@freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. The name of Kip Macy nor the names of other
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ ***************************************************************************/
+
+#ifndef	_SYS_BUF_RING_H_
+#define	_SYS_BUF_RING_H_
+
+#include <machine/cpu.h>
+
+#if defined(INVARIANTS) && !defined(DEBUG_BUFRING)
+#define DEBUG_BUFRING 1
+#endif
+
+#ifdef DEBUG_BUFRING
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#endif
+
+struct buf_ring {
+	volatile uint32_t	br_prod_head;
+	volatile uint32_t	br_prod_tail;
+	int			br_prod_size;
+	int			br_prod_mask;
+	/*
+	 * Pad out to next L2 cache line
+	 */
+	uint64_t		_pad0[14];
+
+	volatile uint32_t	br_cons_head;
+	volatile uint32_t	br_cons_tail;
+	int			br_cons_size;
+	int			br_cons_mask;
+
+	/*
+	 * Pad out to next L2 cache line
+	 */
+	uint64_t		_pad1[14];
+#ifdef DEBUG_BUFRING
+	struct mtx		*br_lock;
+#endif
+	void			*br_ring[0];
+};
+
+
+static __inline int
+buf_ring_enqueue(struct buf_ring *br, void *buf)
+{
+	uint32_t prod_head, prod_next;
+	uint32_t cons_tail;
+	int success;
+#ifdef DEBUG_BUFRING
+	int i;
+	for (i = br->br_cons_head; i != br->br_prod_head;
+	     i = ((i + 1) & br->br_cons_mask))
+		if (br->br_ring[i] == buf)
+			panic("buf=%p already enqueued at %d prod=%d cons=%d",
+			    buf, i, br->br_prod_tail, br->br_cons_tail);
+#endif
+	critical_enter();
+	do {
+		prod_head = br->br_prod_head;
+		cons_tail = br->br_cons_tail;
+
+		prod_next = (prod_head + 1) & br->br_prod_mask;
+
+		if (prod_next == cons_tail) {
+			critical_exit();
+			return (ENOSPC);
+		}
+
+		success = atomic_cmpset_int(&br->br_prod_head, prod_head,
+		    prod_next);
+	} while (success == 0);
+#ifdef DEBUG_BUFRING
+	if (br->br_ring[prod_head] != NULL)
+		panic("dangling value in enqueue");
+#endif
+	br->br_ring[prod_head] = buf;
+	wmb();
+
+	/*
+	 * If there are other enqueues in progress
+	 * that preceded us, we need to wait for them
+	 * to complete
+	 */
+	while (br->br_prod_tail != prod_head)
+		cpu_spinwait();
+	br->br_prod_tail = prod_next;
+	mb();
+	critical_exit();
+	return (0);
+}
+
+/*
+ * multi-consumer safe dequeue
+ *
+ */
+static __inline void *
+buf_ring_dequeue_mc(struct buf_ring *br)
+{
+	uint32_t cons_head, cons_next;
+	uint32_t prod_tail;
+	void *buf;
+	int success;
+
+	critical_enter();
+	do {
+		cons_head = br->br_cons_head;
+		prod_tail = br->br_prod_tail;
+
+		cons_next = (cons_head + 1) & br->br_cons_mask;
+
+		if (cons_head == prod_tail) {
+			critical_exit();
+			return (NULL);
+		}
+
+		success = atomic_cmpset_int(&br->br_cons_head, cons_head,
+		    cons_next);
+	} while (success == 0);
+
+	buf = br->br_ring[cons_head];
+#ifdef DEBUG_BUFRING
+	br->br_ring[cons_head] = NULL;
+#endif
+	mb();
+
+	/*
+	 * If there are other dequeues in progress
+	 * that preceded us, we need to wait for them
+	 * to complete
+	 */
+	while (br->br_cons_tail != cons_head)
+		cpu_spinwait();
+
+	br->br_cons_tail = cons_next;
+	mb();
+	critical_exit();
+
+	return (buf);
+}
+
+/*
+ * Single-Consumer dequeue for uses where dequeue
+ * is protected by a lock
+ */
+static __inline void *
+buf_ring_dequeue_sc(struct buf_ring *br)
+{
+	uint32_t cons_head, cons_next;
+	uint32_t prod_tail;
+	void *buf;
+
+	critical_enter();
+	cons_head = br->br_cons_head;
+	prod_tail = br->br_prod_tail;
+
+	cons_next = (cons_head + 1) & br->br_cons_mask;
+
+	if (cons_head == prod_tail) {
+		critical_exit();
+		return (NULL);
+	}
+
+	br->br_cons_head = cons_next;
+	buf = br->br_ring[cons_head];
+	mb();
+
+#ifdef DEBUG_BUFRING
+	br->br_ring[cons_head] = NULL;
+	if (!mtx_owned(br->br_lock))
+		panic("lock not held on single consumer dequeue");
+	if (br->br_cons_tail != cons_head)
+		panic("inconsistent list cons_tail=%d cons_head=%d",
+		    br->br_cons_tail, cons_head);
+#endif
+	br->br_cons_tail = cons_next;
+	mb();
+	critical_exit();
+	return (buf);
+}
+
+static __inline void *
+buf_ring_peek(struct buf_ring *br)
+{
+
+#ifdef DEBUG_BUFRING
+	if ((br->br_lock != NULL) && !mtx_owned(br->br_lock))
+		panic("lock not held on single consumer dequeue");
+#endif
+	mb();
+	return (br->br_ring[br->br_cons_tail]);
+}
+
+static __inline int
+buf_ring_full(struct buf_ring *br)
+{
+
+	return (((br->br_prod_head + 1) & br->br_prod_mask) == br->br_cons_tail);
+}
+
+static __inline int
+buf_ring_empty(struct buf_ring *br)
+{
+
+	return (br->br_cons_head == br->br_prod_tail);
+}
+
+static __inline int
+buf_ring_count(struct buf_ring *br)
+{
+
+	return ((br->br_prod_size + br->br_prod_tail - br->br_cons_tail)
+	    & br->br_prod_mask);
+}
+
+struct buf_ring *buf_ring_alloc(int count, struct malloc_type *type, int flags,
+    struct mtx *);
+void buf_ring_free(struct buf_ring *br, struct malloc_type *type);
+
+
+
+#endif
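Typical use of the API above, continuing the hypothetical foo_* driver sketched after the if_var.h diff: the ring is allocated with the consumer's mutex (which buf_ring only consults under DEBUG_BUFRING), producers call buf_ring_enqueue() with no lock held, and the single consumer peeks and dequeues while holding that mutex, which is essentially what the cxgb conversion earlier in this patch does. The foo_encap() helper standing in for the real descriptor-filling routine is invented.

	#include <sys/malloc.h>		/* M_DEVBUF */

	static int	foo_encap(struct foo_txq *, struct mbuf *);	/* hypothetical */

	static void
	foo_txq_init(struct foo_txq *txq, int count)	/* count must be a power of 2 */
	{

		mtx_init(&txq->ft_mtx, "foo txq", NULL, MTX_DEF);
		txq->ft_br = buf_ring_alloc(count, M_DEVBUF, M_WAITOK, &txq->ft_mtx);
	}

	static void
	foo_txq_drain(struct foo_txq *txq)
	{
		struct mbuf *m;

		mtx_assert(&txq->ft_mtx, MA_OWNED);
		while (!buf_ring_empty(txq->ft_br)) {
			m = buf_ring_peek(txq->ft_br);
			if (foo_encap(txq, m) != 0)
				break;		/* out of descriptors; leave m queued */
			(void)buf_ring_dequeue_sc(txq->ft_br);
		}
	}

	static void
	foo_txq_destroy(struct foo_txq *txq)
	{
		struct mbuf *m;

		mtx_lock(&txq->ft_mtx);
		while ((m = buf_ring_dequeue_sc(txq->ft_br)) != NULL)
			m_freem(m);
		mtx_unlock(&txq->ft_mtx);
		buf_ring_free(txq->ft_br, M_DEVBUF);
		mtx_destroy(&txq->ft_mtx);
	}

One property worth keeping in mind: buf_ring_count() computes (size + prod_tail - cons_tail) & mask, so a ring created with count N holds at most N - 1 buffers; for example, a 4096-entry ring reports 4095 when buf_ring_full() is true.
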
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 2724293f027d..e5721f10a46b 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -57,7 +57,7 @@
  * is created, otherwise 1.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 800053	/* Master, propagated to newvers */
+#define __FreeBSD_version 800054	/* Master, propagated to newvers */
 
 #ifndef LOCORE
 #include <sys/types.h>
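Because the version bump above advertises the new primitives, driver code that has to build on both older and newer kernels can key its private fallbacks off __FreeBSD_version, which is what the bce, mxge and nxge hunks in this patch do. A sketch of that idiom (the values mirror the guards used above; this block is not itself part of the patch):

	#include <sys/param.h>		/* __FreeBSD_version */

	#if __FreeBSD_version >= 800054
	#include <machine/atomic.h>	/* kernel now supplies mb(), rmb(), wmb() */
	#else
	/* Older kernel: keep a driver-private definition. */
	#if defined(__i386__) || defined(__amd64__)
	#define mb()	__asm volatile("mfence" ::: "memory")
	#define rmb()	__asm volatile("lfence" ::: "memory")
	#define wmb()	__asm volatile("sfence" ::: "memory")
	#else
	#define mb()
	#define rmb()
	#define wmb()
	#endif
	#endif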