From 68b8534bdfeb5078e84d668124e7585e43b03502 Mon Sep 17 00:00:00 2001
From: Luigi Rizzo
Date: Thu, 17 Nov 2011 12:17:39 +0000
Subject: [PATCH] Bring in support for netmap, a framework for very efficient
 packet I/O from userspace, capable of line rate at 10G, see
 http://info.iet.unipi.it/~luigi/netmap/

At this time I am bringing in only the generic code (sys/dev/netmap/
plus two headers under sys/net/), and some sample applications in
tools/tools/netmap. There is also a manpage in share/man/man4 [1].

In order to make use of the framework you need to build a kernel
with "device netmap", and patch individual drivers with the code
that you can find in sys/dev/netmap/head.diff. The file will go away
as the relevant pieces are committed to the various device drivers,
which should happen in a few days after talking to the driver
maintainers.

Netmap support is available at the moment for the Intel 10G and 1G
cards (ixgbe, em/lem/igb) and for the Realtek 1G card ("re"). I have
partial patches for "bge" and am starting to work on "cxgbe".
Hopefully the changes are trivial enough that interested third
parties can submit their own patches. Interested people can contact
me for advice on how to add netmap support to specific devices.

CREDITS: Netmap has been developed by Luigi Rizzo and other
collaborators at the Universita` di Pisa, and supported by the EU
project CHANGE (http://www.change-project.eu/). The code is
distributed under a BSD Copyright.

[1] In my opinion it is a bad idea to have all manpages in one
directory. We should place kernel documentation in the same dir
that contains the code, which would make it much simpler to keep
doc and code in sync, reduce the clutter in share/man/, and is
incidentally the policy used for all of the userspace code.
Makefiles and doc tools can be trivially adjusted to find the
manpages in the relevant subdirs.
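A minimal sketch of the userspace workflow, following the API documented
in the netmap.4 page included below (the interface name "em0" is only an
example, and all error checking is omitted): open /dev/netmap, bind the
descriptor to an interface with NIOCREGIF, mmap the shared region, then
use poll() and the ring's cur/avail fields to move packets.

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <poll.h>
    #include <string.h>
    #include <strings.h>
    #include <net/netmap.h>
    #include <net/netmap_user.h>

    int
    main(void)
    {
            struct nmreq nmr;
            struct netmap_if *nifp;
            struct netmap_ring *rx;
            struct pollfd fds;
            char *p;
            int fd;

            fd = open("/dev/netmap", O_RDWR);
            bzero(&nmr, sizeof(nmr));
            strcpy(nmr.nr_name, "em0");     /* example interface name */
            ioctl(fd, NIOCREGIF, &nmr);     /* put the NIC in netmap mode */
            p = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
                MAP_SHARED, fd, 0);
            nifp = NETMAP_IF(p, nmr.nr_offset);
            rx = NETMAP_RXRING(nifp, 0);    /* first hardware rx ring */

            fds.fd = fd;
            fds.events = POLLIN;
            for (;;) {
                    poll(&fds, 1, -1);      /* wait for incoming packets */
                    while (rx->avail > 0) {
                            u_int i = rx->cur;
                            char *buf = NETMAP_BUF(rx, rx->slot[i].buf_idx);

                            /* ... process rx->slot[i].len bytes at buf ... */
                            rx->cur = NETMAP_RING_NEXT(rx, i);
                            rx->avail--;
                    }
            }
    }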
--- share/man/man4/Makefile | 1 + share/man/man4/netmap.4 | 300 +++++ sys/dev/netmap/head.diff | 654 +++++++++++ sys/dev/netmap/if_em_netmap.h | 383 +++++++ sys/dev/netmap/if_igb_netmap.h | 378 +++++++ sys/dev/netmap/if_lem_netmap.h | 344 ++++++ sys/dev/netmap/if_re_netmap.h | 415 +++++++ sys/dev/netmap/ixgbe_netmap.h | 376 ++++++ sys/dev/netmap/netmap.c | 1762 +++++++++++++++++++++++++++++ sys/dev/netmap/netmap_kern.h | 221 ++++ sys/net/netmap.h | 281 +++++ sys/net/netmap_user.h | 98 ++ tools/tools/README | 1 + tools/tools/netmap/Makefile | 25 + tools/tools/netmap/README | 11 + tools/tools/netmap/bridge.c | 456 ++++++++ tools/tools/netmap/click-test.cfg | 19 + tools/tools/netmap/pcap.c | 761 +++++++++++++ tools/tools/netmap/pkt-gen.c | 1021 +++++++++++++++++ 19 files changed, 7507 insertions(+) create mode 100644 share/man/man4/netmap.4 create mode 100644 sys/dev/netmap/head.diff create mode 100644 sys/dev/netmap/if_em_netmap.h create mode 100644 sys/dev/netmap/if_igb_netmap.h create mode 100644 sys/dev/netmap/if_lem_netmap.h create mode 100644 sys/dev/netmap/if_re_netmap.h create mode 100644 sys/dev/netmap/ixgbe_netmap.h create mode 100644 sys/dev/netmap/netmap.c create mode 100644 sys/dev/netmap/netmap_kern.h create mode 100644 sys/net/netmap.h create mode 100644 sys/net/netmap_user.h create mode 100644 tools/tools/netmap/Makefile create mode 100644 tools/tools/netmap/README create mode 100644 tools/tools/netmap/bridge.c create mode 100644 tools/tools/netmap/click-test.cfg create mode 100644 tools/tools/netmap/pcap.c create mode 100644 tools/tools/netmap/pkt-gen.c diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index e5fa955f6cc9..0d5a780970e4 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -253,6 +253,7 @@ MAN= aac.4 \ net80211.4 \ netgraph.4 \ netintro.4 \ + netmap.4 \ ${_nfe.4} \ ${_nfsmb.4} \ ng_async.4 \ diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4 new file mode 100644 index 000000000000..8b646f9fa070 --- /dev/null +++ b/share/man/man4/netmap.4 @@ -0,0 +1,300 @@ +.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" This document is derived in part from the enet man page (enet.4) +.\" distributed with 4.3BSD Unix. +.\" +.\" $FreeBSD$ +.\" $Id: netmap.4 9662 2011-11-16 13:18:06Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $ +.\" +.Dd November 16, 2011 +.Dt NETMAP 4 +.Os +.Sh NAME +.Nm netmap +.Nd a framework for fast packet I/O +.Sh SYNOPSIS +.Cd device netmap +.Sh DESCRIPTION +.Nm +is a framework for fast and safe access to network devices +(reaching 14.88 Mpps at less than 1 GHz). +.Nm +uses memory mapped buffers and metadata +(buffer indexes and lengths) to communicate with the kernel, +which is in charge of validating information through +.Pa ioctl() +and +.Pa select()/poll(). +.Nm +can exploit the parallelism in multiqueue devices and +multicore systems. +.Pp +.Pp +.Nm +requires explicit support in device drivers. +For a list of supported devices, see the end of this manual page. +.Sh OPERATION +.Nm +clients must first open the +.Pa open("/dev/netmap") , +and then issue an +.Pa ioctl(...,NIOCREGIF,...) +to bind the file descriptor to a network device. +.Pp +When a device is put in +.Nm +mode, its data path is disconnected from the host stack. +The processes owning the file descriptor +can exchange packets with the device, or with the host stack, +through an mmapped memory region that contains pre-allocated +buffers and metadata. +.Pp +Non blocking I/O is done with special +.Pa ioctl()'s , +whereas the file descriptor can be passed to +.Pa select()/poll() +to be notified about incoming packet or available transmit buffers. +.Ss Data structures +All data structures for all devices in +.Nm +mode are in a memory +region shared by the kernel and all processes +who open +.Pa /dev/netmap +(NOTE: visibility may be restricted in future implementations). +All references between the shared data structure +are relative (offsets or indexes). Some macros help converting +them into actual pointers. +.Pp +The data structures in shared memory are the following: +.Pp +.Bl -tag -width XXX +.It Dv struct netmap_if (one per interface) +indicates the number of rings supported by an interface, their +sizes, and the offsets of the +.Pa netmap_rings +associated to the interface. +The offset of a +.Pa struct netmap_if +in the shared memory region is indicated by the +.Pa nr_offset +field in the structure returned by the +.Pa NIOCREGIF +(see below). +.Bd -literal +struct netmap_if { + char ni_name[IFNAMSIZ]; /* name of the interface. */ + const u_int ni_num_queues; /* number of hw ring pairs */ + const ssize_t ring_ofs[]; /* offset of tx and rx rings */ +}; +.Ed +.It Dv struct netmap_ring (one per ring) +contains the index of the current read or write slot (cur), +the number of slots available for reception or transmission (avail), +and an array of +.Pa slots +describing the buffers. +There is one ring pair for each of the N hardware ring pairs +supported by the card (numbered 0..N-1), plus +one ring pair (numbered N) for packets from/to the host stack. +.Bd -literal +struct netmap_ring { + const ssize_t buf_ofs; + const uint32_t num_slots; /* number of slots in the ring. */ + uint32_t avail; /* number of usable slots */ + uint32_t cur; /* 'current' index for the user side */ + + const uint16_t nr_buf_size; + uint16_t flags; + struct netmap_slot slot[0]; /* array of slots. */ +} +.Ed +.It Dv struct netmap_slot (one per packet) +contains the metadata for a packet: a buffer index (buf_idx), +a buffer length (len), and some flags. 
+.Bd -literal
+struct netmap_slot {
+    uint32_t buf_idx; /* buffer index */
+    uint16_t len;     /* packet length */
+    uint16_t flags;   /* buf changed, etc. */
+#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
+#define NS_REPORT      0x0002 /* tell hw to report results
+                               * e.g. by generating an interrupt
+                               */
+};
+.Ed
+.It Dv packet buffers
+are fixed size (approximately 2k) buffers allocated by the kernel
+that contain packet data. Buffer addresses are computed through
+macros.
+.El
+.Pp
+Some macros support the access to objects in the shared memory
+region. In particular:
+.Bd -literal
+struct netmap_if *nifp;
+...
+struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
+struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
+int i = txring->slot[txring->cur].buf_idx;
+char *buf = NETMAP_BUF(txring, i);
+.Ed
+.Ss IOCTLS
+.Pp
+.Nm
+supports some ioctls to synchronize the state of the rings
+between the kernel and the user processes, plus some
+to query and configure the interface.
+The former do not require any argument, whereas the latter
+use a
+.Pa struct nmreq
+defined as follows:
+.Bd -literal
+struct nmreq {
+    char     nr_name[IFNAMSIZ];
+    uint32_t nr_offset;   /* nifp offset in the shared region */
+    uint32_t nr_memsize;  /* size of the shared region */
+    uint32_t nr_numdescs; /* descriptors per queue */
+    uint16_t nr_numqueues;
+    uint16_t nr_ringid;   /* ring(s) we care about */
+#define NETMAP_HW_RING    0x4000 /* low bits indicate one hw ring */
+#define NETMAP_SW_RING    0x2000 /* we process the sw ring */
+#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
+#define NETMAP_RING_MASK  0xfff  /* the actual ring number */
+};
+
+.Ed
+A device descriptor obtained through
+.Pa /dev/netmap
+also supports the ioctls supported by network devices.
+.Pp
+The netmap-specific
+.Xr ioctl 2
+command codes below are defined in
+.In net/netmap.h
+and are:
+.Bl -tag -width XXXX
+.It Dv NIOCGINFO
+returns information about the interface named in nr_name.
+On return, nr_memsize indicates the size of the shared netmap
+memory region (this is device-independent),
+nr_numdescs indicates how many buffers are in a ring, and
+nr_numqueues indicates the number of rings supported by the hardware.
+.Pp
+If the device does not support netmap, the ioctl returns EINVAL.
+.It Dv NIOCREGIF
+puts the interface named in nr_name into netmap mode, disconnecting
+it from the host stack, and/or defines which rings are controlled
+through this file descriptor.
+On return, it gives the same info as NIOCGINFO, and nr_ringid
+indicates the identity of the rings controlled through the file
+descriptor.
+.Pp
+Possible values for nr_ringid are
+.Bl -tag -width XXXXX
+.It 0
+default, all hardware rings
+.It NETMAP_SW_RING
+the ``host rings'' connecting to the host stack
+.It NETMAP_HW_RING + i
+the i-th hardware ring
+.El
+By default, a
+.Nm poll
+or
+.Nm select
+call pushes out any pending packets on the transmit ring, even if
+no write events are specified.
+The feature can be disabled by or-ing
+.Nm NETMAP_NO_TX_POLL
+into nr_ringid.
+You should normally keep this feature enabled unless you are using
+separate file descriptors for the send and receive rings, because
+otherwise packets are pushed out only when NIOCTXSYNC is called
+or the send queue is full.
+.Pp
+.Pa NIOCREGIF
+can be used multiple times to change the association of a
+file descriptor to a ring pair, always within the same device.
+.It Dv NIOCUNREGIF
+brings an interface back to normal mode.
+.It Dv NIOCTXSYNC
+tells the hardware of new packets to transmit, and updates the
+number of slots available for transmission.
+.It Dv NIOCRXSYNC
+tells the hardware of consumed packets, and asks for newly available
+packets.
+.El
+.Ss SYSTEM CALLS
+.Nm
+uses
+.Nm select
+and
+.Nm poll
+to wake up processes when significant events occur.
+.Sh EXAMPLES
+The following code implements a traffic generator
+.Pp
+.Bd -literal -compact
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+struct netmap_if *nifp;
+struct netmap_ring *ring;
+struct nmreq nmr;
+
+fd = open("/dev/netmap", O_RDWR);
+bzero(&nmr, sizeof(nmr));
+strcpy(nmr.nr_name, "ix0");
+ioctl(fd, NIOCREGIF, &nmr);
+p = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+nifp = NETMAP_IF(p, nmr.nr_offset);
+ring = NETMAP_TXRING(nifp, 0);
+fds.fd = fd;
+fds.events = POLLOUT;
+for (;;) {
+    poll(&fds, 1, -1);
+    while (ring->avail > 0) {
+        i = ring->cur;
+        buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+        ... prepare packet in buf ...
+        ring->slot[i].len = ... packet length ...
+        ring->cur = NETMAP_RING_NEXT(ring, i);
+        ring->avail--;
+    }
+}
+.Ed
+.Sh SUPPORTED INTERFACES
+.Nm
+supports the following interfaces:
+.Xr em 4 ,
+.Xr ixgbe 4 ,
+.Xr re 4
+.Sh AUTHORS
+The
+.Nm
+framework has been designed and implemented by
+.An Luigi Rizzo
+and
+.An Matteo Landi
+in 2011 at the Universita` di Pisa.
diff --git a/sys/dev/netmap/head.diff b/sys/dev/netmap/head.diff
new file mode 100644
index 000000000000..51a8e34e74d1
--- /dev/null
+++ b/sys/dev/netmap/head.diff
@@ -0,0 +1,654 @@
+Index: conf/NOTES
+===================================================================
+--- conf/NOTES (revision 227552)
++++ conf/NOTES (working copy)
+@@ -799,6 +799,12 @@
+ # option. DHCP requires bpf.
+ device bpf
+ 
++# The `netmap' device implements memory-mapped access to network
++# devices from userspace, enabling wire-speed packet capture and
++# generation even at 10Gbit/s. Requires support in the device
++# driver. Supported drivers are ixgbe, e1000, re.
++device netmap
++
+ # The `disc' device implements a minimal network interface,
+ # which throws away all packets sent and never receives any. It is
+ # included for testing and benchmarking purposes.
+Index: conf/files
+===================================================================
+--- conf/files (revision 227552)
++++ conf/files (working copy)
+@@ -1507,6 +1507,7 @@
+ dev/my/if_my.c optional my
+ dev/ncv/ncr53c500.c optional ncv
+ dev/ncv/ncr53c500_pccard.c optional ncv pccard
++dev/netmap/netmap.c optional netmap
+ dev/nge/if_nge.c optional nge
+ dev/nxge/if_nxge.c optional nxge
+ dev/nxge/xgehal/xgehal-device.c optional nxge
+Index: conf/options
+===================================================================
+--- conf/options (revision 227552)
++++ conf/options (working copy)
+@@ -689,6 +689,7 @@
+ 
+ # various 'device presence' options.
+ DEV_BPF opt_bpf.h ++DEV_NETMAP opt_global.h + DEV_MCA opt_mca.h + DEV_CARP opt_carp.h + DEV_SPLASH opt_splash.h +Index: dev/e1000/if_igb.c +=================================================================== +--- dev/e1000/if_igb.c (revision 227552) ++++ dev/e1000/if_igb.c (working copy) +@@ -369,6 +369,9 @@ + &igb_rx_process_limit, 0, + "Maximum number of received packets to process at a time, -1 means unlimited"); + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ + /********************************************************************* + * Device identification routine + * +@@ -664,6 +667,9 @@ + adapter->led_dev = led_create(igb_led_func, adapter, + device_get_nameunit(dev)); + ++#ifdef DEV_NETMAP ++ igb_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("igb_attach: end"); + + return (0); +@@ -742,6 +748,9 @@ + + callout_drain(&adapter->timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(adapter->ifp); ++#endif /* DEV_NETMAP */ + igb_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -3212,6 +3221,10 @@ + struct adapter *adapter = txr->adapter; + struct igb_tx_buffer *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old descriptor contents */ + IGB_TX_LOCK(txr); +@@ -3231,6 +3244,13 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(txr->txtag, txbuf->map, ++ NMB(slot), adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + /* clear the watch index */ + txbuf->next_eop = -1; + } +@@ -3626,6 +3646,19 @@ + + IGB_TX_LOCK_ASSERT(txr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); ++ IGB_TX_UNLOCK(txr); ++ IGB_CORE_LOCK(adapter); ++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET); ++ IGB_CORE_UNLOCK(adapter); ++ IGB_TX_LOCK(txr); // the caller is supposed to own the lock ++ return FALSE; ++ } ++#endif /* DEV_NETMAP */ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IGB_QUEUE_IDLE; + return FALSE; +@@ -3949,6 +3982,10 @@ + bus_dma_segment_t pseg[1], hseg[1]; + struct lro_ctrl *lro = &rxr->lro; + int rsize, nsegs, error = 0; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp), ++ NR_RX, rxr->me, 0); ++#endif + + adapter = rxr->adapter; + dev = adapter->dev; +@@ -3974,6 +4011,18 @@ + struct mbuf *mh, *mp; + + rxbuf = &rxr->rx_buffers[j]; ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(rxr->ptag, ++ rxbuf->pmap, NMB(slot), ++ adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].read.pkt_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + if (rxr->hdr_split == FALSE) + goto skip_head; + +@@ -4436,6 +4485,19 @@ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); ++ IGB_RX_UNLOCK(rxr); ++ IGB_CORE_LOCK(adapter); ++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET); ++ IGB_CORE_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + /* Main clean loop */ + for (i = rxr->next_to_check; count != 0;) { + struct mbuf *sendmp, *mh, *mp; +Index: dev/e1000/if_lem.c 
+=================================================================== +--- dev/e1000/if_lem.c (revision 227552) ++++ dev/e1000/if_lem.c (working copy) +@@ -316,6 +316,10 @@ + /* Global used in WOL setup with multiport cards */ + static int global_quad_port_a = 0; + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -646,6 +650,9 @@ + adapter->led_dev = led_create(lem_led_func, adapter, + device_get_nameunit(dev)); + ++#ifdef DEV_NETMAP ++ lem_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("lem_attach: end"); + + return (0); +@@ -724,6 +731,9 @@ + callout_drain(&adapter->timer); + callout_drain(&adapter->tx_fifo_timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ + lem_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -2637,6 +2647,9 @@ + lem_setup_transmit_structures(struct adapter *adapter) + { + struct em_buffer *tx_buffer; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_TX, 0, 0); ++#endif + + /* Clear the old ring contents */ + bzero(adapter->tx_desc_base, +@@ -2650,6 +2663,15 @@ + bus_dmamap_unload(adapter->txtag, tx_buffer->map); + m_freem(tx_buffer->m_head); + tx_buffer->m_head = NULL; ++#ifdef DEV_NETMAP ++ if (slot) { ++ /* reload the map for netmap mode */ ++ netmap_load_map(adapter->txtag, ++ tx_buffer->map, NMB(slot), ++ NA(adapter->ifp)->buff_size); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + tx_buffer->next_eop = -1; + } + +@@ -2951,6 +2973,12 @@ + + EM_TX_LOCK_ASSERT(adapter); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET); ++ return; ++ } ++#endif /* DEV_NETMAP */ + if (adapter->num_tx_desc_avail == adapter->num_tx_desc) + return; + +@@ -3181,6 +3209,9 @@ + { + struct em_buffer *rx_buffer; + int i, error; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_RX, 0, 0); ++#endif + + /* Reset descriptor ring */ + bzero(adapter->rx_desc_base, +@@ -3200,6 +3231,18 @@ + + /* Allocate new ones. 
*/ + for (i = 0; i < adapter->num_rx_desc; i++) { ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(adapter->rxtag, ++ rx_buffer->map, NMB(slot), ++ NA(adapter->ifp)->buff_size); ++ /* Update descriptor */ ++ adapter->rx_desc_base[i].buffer_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + error = lem_get_buf(adapter, i); + if (error) + return (error); +@@ -3407,6 +3450,14 @@ + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_POSTREAD); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings[0].si, PI_NET); ++ EM_RX_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + if (!((current_desc->status) & E1000_RXD_STAT_DD)) { + if (done != NULL) + *done = rx_sent; +Index: dev/e1000/if_em.c +=================================================================== +--- dev/e1000/if_em.c (revision 227552) ++++ dev/e1000/if_em.c (working copy) +@@ -399,6 +399,10 @@ + /* Global used in WOL setup with multiport cards */ + static int global_quad_port_a = 0; + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -714,6 +718,9 @@ + + adapter->led_dev = led_create(em_led_func, adapter, + device_get_nameunit(dev)); ++#ifdef DEV_NETMAP ++ em_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + + INIT_DEBUGOUT("em_attach: end"); + +@@ -785,6 +792,10 @@ + ether_ifdetach(adapter->ifp); + callout_drain(&adapter->timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ ++ + em_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -3213,6 +3224,10 @@ + struct adapter *adapter = txr->adapter; + struct em_buffer *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old descriptor contents */ + EM_TX_LOCK(txr); +@@ -3232,6 +3247,16 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ /* reload the map for netmap mode */ ++ netmap_load_map(txr->txtag, ++ txbuf->map, NMB(slot), ++ adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ ++ + /* clear the watch index */ + txbuf->next_eop = -1; + } +@@ -3682,6 +3707,12 @@ + struct ifnet *ifp = adapter->ifp; + + EM_TX_LOCK_ASSERT(txr); ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[txr->me].si, PI_NET); ++ return (FALSE); ++ } ++#endif /* DEV_NETMAP */ + + /* No work, make sure watchdog is off */ + if (txr->tx_avail == adapter->num_tx_desc) { +@@ -3978,6 +4009,33 @@ + if (++j == adapter->num_rx_desc) + j = 0; + } ++#ifdef DEV_NETMAP ++ { ++ /* slot is NULL if we are not in netmap mode */ ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_RX, rxr->me, rxr->next_to_check); ++ /* ++ * we need to restore all buffer addresses in the ring as they might ++ * be in the wrong state if we are exiting from netmap mode. ++ */ ++ for (j = 0; j != adapter->num_rx_desc; ++j) { ++ void *addr; ++ rxbuf = &rxr->rx_buffers[j]; ++ if (rxbuf->m_head == NULL && !slot) ++ continue; ++ addr = slot ? NMB(slot) : rxbuf->m_head->m_data; ++ // XXX load or reload ? 
++ netmap_load_map(rxr->rxtag, rxbuf->map, addr, adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].buffer_addr = htole64(vtophys(addr)); ++ bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); ++ if (slot) ++ slot++; ++ } ++ /* Setup our descriptor indices */ ++ NA(adapter->ifp)->rx_rings[rxr->me].nr_hwcur = rxr->next_to_check; ++ } ++#endif /* DEV_NETMAP */ + + fail: + rxr->next_to_refresh = i; +@@ -4247,6 +4305,14 @@ + + EM_RX_LOCK(rxr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings[rxr->me].si, PI_NET); ++ EM_RX_UNLOCK(rxr); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + for (i = rxr->next_to_check, processed = 0; count != 0;) { + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) +Index: dev/re/if_re.c +=================================================================== +--- dev/re/if_re.c (revision 227552) ++++ dev/re/if_re.c (working copy) +@@ -291,6 +291,10 @@ + static void re_setwol (struct rl_softc *); + static void re_clrwol (struct rl_softc *); + ++#ifdef DEV_NETMAP ++#include ++#endif /* !DEV_NETMAP */ ++ + #ifdef RE_DIAG + static int re_diag (struct rl_softc *); + #endif +@@ -1583,6 +1587,9 @@ + */ + ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); + ++#ifdef DEV_NETMAP ++ re_netmap_attach(sc); ++#endif /* DEV_NETMAP */ + #ifdef RE_DIAG + /* + * Perform hardware diagnostic on the original RTL8169. +@@ -1778,6 +1785,9 @@ + bus_dma_tag_destroy(sc->rl_ldata.rl_stag); + } + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ + if (sc->rl_parent_tag) + bus_dma_tag_destroy(sc->rl_parent_tag); + +@@ -1952,6 +1962,9 @@ + sc->rl_ldata.rl_tx_desc_cnt * sizeof(struct rl_desc)); + for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) + sc->rl_ldata.rl_tx_desc[i].tx_m = NULL; ++#ifdef DEV_NETMAP ++ re_netmap_tx_init(sc); ++#endif /* DEV_NETMAP */ + /* Set EOR. 
*/ + desc = &sc->rl_ldata.rl_tx_list[sc->rl_ldata.rl_tx_desc_cnt - 1]; + desc->rl_cmdstat |= htole32(RL_TDESC_CMD_EOR); +@@ -1979,6 +1992,9 @@ + if ((error = re_newbuf(sc, i)) != 0) + return (error); + } ++#ifdef DEV_NETMAP ++ re_netmap_rx_init(sc); ++#endif /* DEV_NETMAP */ + + /* Flush the RX descriptors */ + +@@ -2035,6 +2051,12 @@ + RL_LOCK_ASSERT(sc); + + ifp = sc->rl_ifp; ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings->si, PI_NET); ++ return 0; ++ } ++#endif /* DEV_NETMAP */ + if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) + jumbo = 1; + else +@@ -2276,6 +2298,12 @@ + return; + + ifp = sc->rl_ifp; ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET); ++ return; ++ } ++#endif /* DEV_NETMAP */ + /* Invalidate the TX descriptor list */ + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, +@@ -2794,6 +2822,20 @@ + + sc = ifp->if_softc; + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_kring *kring = &NA(ifp)->tx_rings[0]; ++ if (sc->rl_ldata.rl_tx_prodidx != kring->nr_hwcur) { ++ /* kick the tx unit */ ++ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); ++#ifdef RE_TX_MODERATION ++ CSR_WRITE_4(sc, RL_TIMERCNT, 1); ++#endif ++ sc->rl_watchdog_timer = 5; ++ } ++ return; ++ } ++#endif /* DEV_NETMAP */ + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING || (sc->rl_flags & RL_FLAG_LINK) == 0) + return; +Index: dev/ixgbe/ixgbe.c +=================================================================== +--- dev/ixgbe/ixgbe.c (revision 227552) ++++ dev/ixgbe/ixgbe.c (working copy) +@@ -313,6 +313,10 @@ + static int fdir_pballoc = 1; + #endif + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -578,6 +582,9 @@ + + ixgbe_add_hw_stats(adapter); + ++#ifdef DEV_NETMAP ++ ixgbe_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("ixgbe_attach: end"); + return (0); + err_late: +@@ -652,6 +659,9 @@ + + ether_ifdetach(adapter->ifp); + callout_drain(&adapter->timer); ++#ifdef DEV_NETMAP ++ netmap_detach(adapter->ifp); ++#endif /* DEV_NETMAP */ + ixgbe_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(adapter->ifp); +@@ -1719,6 +1729,7 @@ + if (++i == adapter->num_tx_desc) + i = 0; + ++ // XXX should we sync each buffer ? 
+ txbuf->m_head = NULL; + txbuf->eop_index = -1; + } +@@ -2813,6 +2824,10 @@ + struct adapter *adapter = txr->adapter; + struct ixgbe_tx_buf *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old ring contents */ + IXGBE_TX_LOCK(txr); +@@ -2832,6 +2847,13 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(txr->txtag, txbuf->map, ++ NMB(slot), adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + /* Clear the EOP index */ + txbuf->eop_index = -1; + } +@@ -3310,6 +3332,20 @@ + + mtx_assert(&txr->tx_mtx, MA_OWNED); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); ++ IXGBE_TX_UNLOCK(txr); ++ IXGBE_CORE_LOCK(adapter); ++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET); ++ IXGBE_CORE_UNLOCK(adapter); ++ IXGBE_TX_LOCK(txr); // the caller is supposed to own the lock ++ return (FALSE); ++ } ++#endif /* DEV_NETMAP */ ++ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IXGBE_QUEUE_IDLE; + return FALSE; +@@ -3698,6 +3734,10 @@ + bus_dma_segment_t pseg[1], hseg[1]; + struct lro_ctrl *lro = &rxr->lro; + int rsize, nsegs, error = 0; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp), ++ NR_RX, rxr->me, 0); ++#endif /* DEV_NETMAP */ + + adapter = rxr->adapter; + ifp = adapter->ifp; +@@ -3721,6 +3761,18 @@ + struct mbuf *mh, *mp; + + rxbuf = &rxr->rx_buffers[j]; ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(rxr->ptag, ++ rxbuf->pmap, NMB(slot), ++ adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].read.pkt_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + /* + ** Don't allocate mbufs if not + ** doing header split, its wasteful +@@ -4148,6 +4200,18 @@ + + IXGBE_RX_LOCK(rxr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); ++ IXGBE_RX_UNLOCK(rxr); ++ IXGBE_CORE_LOCK(adapter); ++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET); ++ IXGBE_CORE_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ + for (i = rxr->next_to_check; count != 0;) { + struct mbuf *sendmp, *mh, *mp; + u32 rsc, ptype; diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h new file mode 100644 index 000000000000..0e220e755d68 --- /dev/null +++ b/sys/dev/netmap/if_em_netmap.h @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_em_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap changes for if_em. + */ + +#include +#include +#include +#include /* vtophys ? */ +#include + +static void em_netmap_block_tasks(struct adapter *); +static void em_netmap_unblock_tasks(struct adapter *); +static int em_netmap_reg(struct ifnet *, int onoff); +static int em_netmap_txsync(void *, u_int, int); +static int em_netmap_rxsync(void *, u_int, int); +static void em_netmap_lock_wrapper(void *, int, u_int); + +static void +em_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = em_netmap_txsync; + na.nm_rxsync = em_netmap_rxsync; + na.nm_lock = em_netmap_lock_wrapper; + na.nm_register = em_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +em_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + EM_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + EM_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + EM_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + EM_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + EM_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + EM_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +static void +em_netmap_block_tasks(struct adapter *adapter) +{ + if (adapter->msix > 1) { /* MSIX */ + int i; + struct tx_ring *txr = adapter->tx_rings; + struct rx_ring *rxr = adapter->rx_rings; + + for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + taskqueue_block(txr->tq); + taskqueue_drain(txr->tq, &txr->tx_task); + taskqueue_block(rxr->tq); + taskqueue_drain(rxr->tq, &rxr->rx_task); + } + } else { /* legacy */ + taskqueue_block(adapter->tq); + taskqueue_drain(adapter->tq, &adapter->link_task); + taskqueue_drain(adapter->tq, &adapter->que_task); + } +} + + +static void +em_netmap_unblock_tasks(struct adapter *adapter) +{ + if (adapter->msix > 1) { + struct tx_ring *txr = adapter->tx_rings; + struct rx_ring *rxr = adapter->rx_rings; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + taskqueue_unblock(txr->tq); + taskqueue_unblock(rxr->tq); + } + } else { /* legacy */ + taskqueue_unblock(adapter->tq); + } +} + +/* + * register-unregister routine + */ +static int +em_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (na == NULL) + 
return EINVAL; /* no netmap support here */ + + em_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + em_netmap_block_tasks(adapter); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit for later restore. + * XXX also if_start and if_qflush ? + */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + em_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + em_init_locked(adapter); /* also enable intr */ + + } + em_netmap_unblock_tasks(adapter); + return (error); +} + +/* + * Reconcile hardware and user view of the transmit ring, see + * ixgbe.c for details. + */ +static int +em_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions TODO + * + * instead of using TDH, we could read the transmitted status bit. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) { /* XXX can happen */ + D("TDH wrap %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - txr->next_to_clean; + if (delta) { + /* new transmissions were completed, increment + ring->nr_hwavail. */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_tx_desc *curr = &txr->tx_base[j]; + struct em_buffer *txbuf = &txr->tx_buffers[j]; + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_TXD_CMD_RS : 0; + void *addr = NMB(slot); + int len = slot->len; + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + EM_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->upper.data = 0; + curr->lower.data = + htole32( + adapter->txd_cmd | + (E1000_TXD_CMD_EOP | flags) | + slot->len); + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), + ring->cur); + } + if (do_lock) + EM_TX_UNLOCK(txr); + return 0; +} + +/* + * Reconcile kernel and user view of the receive ring, see ixgbe.c + */ +static int +em_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_RX_LOCK(rxr); + /* XXX check sync modes */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* acknowledge all the received packets. */ + j = rxr->next_to_check; + for (n = 0; ; n++) { + struct e1000_rx_desc *curr = &rxr->rx_base[j]; + + if ((curr->status & E1000_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->length); + bus_dmamap_sync(rxr->tag, rxr->rx_buffers[j].map, + BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + } + + /* skip past packets that userspace has already processed: + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_rx_desc *curr = &rxr->rx_base[j]; + struct em_buffer *rxbuf = &rxr->rx_buffers[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + EM_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->status = 0; + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(rxr->rxtag, rxbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->rxtag, rxbuf->map, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + EM_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h new file mode 100644 index 000000000000..0c147063b211 --- /dev/null +++ b/sys/dev/netmap/if_igb_netmap.h @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_igb_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap modifications for igb + * contribured by Ahmed Kooli + */ + +#include +#include +#include +#include /* vtophys ? */ +#include + +static int igb_netmap_reg(struct ifnet *, int onoff); +static int igb_netmap_txsync(void *, u_int, int); +static int igb_netmap_rxsync(void *, u_int, int); +static void igb_netmap_lock_wrapper(void *, int, u_int); + + +static void +igb_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = igb_netmap_txsync; + na.nm_rxsync = igb_netmap_rxsync; + na.nm_lock = igb_netmap_lock_wrapper; + na.nm_register = igb_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +igb_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + IGB_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + IGB_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + IGB_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + IGB_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + IGB_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + IGB_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +/* + * support for netmap register/unregisted. We are already under core lock. + * only called on the first init or the last unregister. 
+ */ +static int +igb_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + igb_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it later */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + igb_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + igb_init_locked(adapter); /* also enables intr */ + } + return (error); +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows, subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + * + * Check parameters in the struct netmap_ring. + * We don't use avail, only check for bogus values. + * Make sure cur is valid, and same goes for buffer indexes and lengths. + * To avoid races, read the values once, and never use those from + * the ring afterwards. + */ +static int +igb_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IGB_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions. TODO + * + * Instead of reading from the TDH register, we could and try to check + * the status bit of descriptor packets. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) /* XXX can it happen ? */ + j -= kring->nkr_num_slots; + int delta = j - txr->next_to_clean; + if (delta) { + /* new tx were completed */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + u32 olinfo_status = 0; + n = 0; + + /* 82575 needs the queue index added */ + if (adapter->hw.mac.type == e1000_82575) + olinfo_status |= txr->me << 4; + + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct igb_tx_buffer *txbuf = &txr->tx_buffers[j]; + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? 
+ E1000_ADVTXD_DCMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + IGB_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->read.buffer_addr = htole64(vtophys(addr)); + curr->read.olinfo_status = + htole32(olinfo_status | + (len<< E1000_ADVTXD_PAYLEN_SHIFT)); + curr->read.cmd_type_len = + htole32(len | E1000_ADVTXD_DTYP_DATA | + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = k; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + /* Set the watchdog */ + txr->queue_status = IGB_QUEUE_WORKING; + txr->watchdog_time = ticks; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), k); + } + if (do_lock) + IGB_TX_UNLOCK(txr); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. + */ +static int +igb_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IGB_RX_LOCK(rxr); + + /* Sync the ring. */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + j = rxr->next_to_check; + for (n = 0; ; n++) { + union e1000_adv_rx_desc *curr = &rxr->rx_base[j]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); + + if ((staterr & E1000_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->wb.upper.length); + + bus_dmamap_sync(rxr->ptag, + rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + if (kring->nr_hwavail >= lim - 10) { + ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail); + } + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. 
*/ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + union e1000_adv_rx_desc *curr = &rxr->rx_base[j]; + struct igb_rx_buf *rxbuf = rxr->rx_buffers + j; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + IGB_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(vtophys(addr)); + if (slot->flags & NS_BUF_CHANGED) { + netmap_reload_map(rxr->ptag, rxbuf->pmap, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->ptag, rxbuf->pmap, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + IGB_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h new file mode 100644 index 000000000000..a8f34989bcc4 --- /dev/null +++ b/sys/dev/netmap/if_lem_netmap.h @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_lem_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap support for if_lem.c + */ + +#include +#include +#include +#include /* vtophys ? 
*/ +#include + +static int lem_netmap_reg(struct ifnet *, int onoff); +static int lem_netmap_txsync(void *, u_int, int); +static int lem_netmap_rxsync(void *, u_int, int); +static void lem_netmap_lock_wrapper(void *, int, u_int); + + +SYSCTL_NODE(_dev, OID_AUTO, lem, CTLFLAG_RW, 0, "lem card"); + +static void +lem_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = lem_netmap_txsync; + na.nm_rxsync = lem_netmap_rxsync; + na.nm_lock = lem_netmap_lock_wrapper; + na.nm_register = lem_netmap_reg; + na.buff_size = MCLBYTES; + netmap_attach(&na, 1); +} + + +static void +lem_netmap_lock_wrapper(void *_a, int what, u_int ringid) +{ + struct adapter *adapter = _a; + + /* only one ring here so ignore the ringid */ + switch (what) { + case NETMAP_CORE_LOCK: + EM_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + EM_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + EM_TX_LOCK(adapter); + break; + case NETMAP_TX_UNLOCK: + EM_TX_UNLOCK(adapter); + break; + case NETMAP_RX_LOCK: + EM_RX_LOCK(adapter); + break; + case NETMAP_RX_UNLOCK: + EM_RX_UNLOCK(adapter); + break; + } +} + + +/* + * Reconcile kernel and user view of the transmit ring. see ixgbe.c + */ +static int +lem_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[0]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_TX_LOCK(adapter); + bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions TODO + * + * instead of using TDH, we could read the transmitted status bit. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (j >= kring->nkr_num_slots) { /* can it happen ? */ + D("bad TDH %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - adapter->next_tx_to_clean; + if (delta) { + if (delta < 0) + delta += kring->nkr_num_slots; + adapter->next_tx_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_tx_desc *curr = &adapter->tx_desc_base[j]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_TXD_CMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + EM_TX_UNLOCK(adapter); + return netmap_ring_reinit(kring); + } + + curr->upper.data = 0; + /* always interrupt. 
XXX make it conditional */ + curr->lower.data = + htole32( adapter->txd_cmd | len | + (E1000_TXD_CMD_EOP | flags) ); + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(adapter->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(adapter->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), ring->cur); + } + if (do_lock) + EM_TX_UNLOCK(adapter); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. see ixgbe.c + */ +static int +lem_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[0]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_RX_LOCK(adapter); + /* XXX check sync modes */ + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* acknowldge all the received packets. */ + j = adapter->next_rx_desc_to_check; + for (n = 0; ; n++) { + struct e1000_rx_desc *curr = &adapter->rx_desc_base[j]; + int len = le16toh(adapter->rx_desc_base[j].length) - 4; // CRC + + if ((curr->status & E1000_RXD_STAT_DD) == 0) + break; + + if (len < 0) { + D("bogus pkt size at %d", j); + len = 0; + } + ring->slot[j].len = len; + bus_dmamap_sync(adapter->rxtag, adapter->rx_buffer_area[j].map, + BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + adapter->next_rx_desc_to_check = j; + kring->nr_hwavail += n; + } + + /* skip past packets that userspace has already processed, + * making them available for reception. We don't need to set + * the length as it is the same for all slots. + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[j]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + EM_RX_UNLOCK(adapter); + return netmap_ring_reinit(kring); + } + curr = &adapter->rx_desc_base[j]; + curr->status = 0; + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(adapter->rxtag, rxbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(adapter->rxtag, rxbuf->map, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? 
lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), j); + } + + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + EM_RX_UNLOCK(adapter); + return 0; +} + + +/* + * Register/unregister routine + */ +static int +lem_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + lem_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + /* lem_netmap_block_tasks(adapter); */ +#ifndef EM_LEGACY_IRQ + taskqueue_block(adapter->tq); + taskqueue_drain(adapter->tq, &adapter->rxtx_task); + taskqueue_drain(adapter->tq, &adapter->link_task); +#endif /* !EM_LEGCY_IRQ */ + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it when exiting. + * XXX what about if_start and if_qflush ? + */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + lem_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore non-netmap mode */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + lem_init_locked(adapter); /* also enables intr */ + } + +#ifndef EM_LEGACY_IRQ + taskqueue_unblock(adapter->tq); +#endif /* !EM_LEGCY_IRQ */ + + return (error); +} diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h new file mode 100644 index 000000000000..efccf3a795bc --- /dev/null +++ b/sys/dev/netmap/if_re_netmap.h @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_re_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap support for if_re + */ + +#include +#include +#include +#include /* vtophys ? 
*/ +#include + +static int re_netmap_reg(struct ifnet *, int onoff); +static int re_netmap_txsync(void *, u_int, int); +static int re_netmap_rxsync(void *, u_int, int); +static void re_netmap_lock_wrapper(void *, int, u_int); + +static void +re_netmap_attach(struct rl_softc *sc) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = sc->rl_ifp; + na.separate_locks = 0; + na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt; + na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt; + na.nm_txsync = re_netmap_txsync; + na.nm_rxsync = re_netmap_rxsync; + na.nm_lock = re_netmap_lock_wrapper; + na.nm_register = re_netmap_reg; + na.buff_size = MCLBYTES; + netmap_attach(&na, 1); +} + + +/* + * wrapper to export locks to the generic code + * We should not use the tx/rx locks + */ +static void +re_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct rl_softc *adapter = _a; + + switch (what) { + case NETMAP_CORE_LOCK: + RL_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + RL_UNLOCK(adapter); + break; + + case NETMAP_TX_LOCK: + case NETMAP_RX_LOCK: + case NETMAP_TX_UNLOCK: + case NETMAP_RX_UNLOCK: + D("invalid lock call %d, no tx/rx locks here", what); + break; + } +} + + +/* + * support for netmap register/unregisted. We are already under core lock. + * only called on the first register or the last unregister. + */ +static int +re_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct rl_softc *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + re_stop(adapter); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit and restore it */ + na->if_transmit = ifp->if_transmit; + /* XXX if_start and if_qflush ??? */ + ifp->if_transmit = netmap_start; + + re_init_locked(adapter); + + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + re_init_locked(adapter); /* also enables intr */ + } + return (error); + +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows (translating the -1 to nkr_num_slots - 1), + * subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. 
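A minimal standalone sketch of the bookkeeping described in the comment above, with the kring fields reduced to a plain struct. The names mirror nr_hwcur/nr_hwavail but this is an illustration only, not part of the patch (the real handlers also cope with NR_REINIT and the idle-ring special case):

    #include <stdint.h>

    /* Simplified stand-in for the per-ring transmit counters. */
    struct tx_count {
        uint32_t num_slots;   /* ring size */
        uint32_t hwcur;       /* last slot already handed to the NIC */
        uint32_t hwavail;     /* free slots known to the kernel */
    };

    /*
     * Userspace filled slots up to (but excluding) 'cur': compute how many,
     * with wrap-around, and bring the kernel view up to date.
     */
    static uint32_t
    tx_slots_consumed(struct tx_count *t, uint32_t cur)
    {
        uint32_t n = (cur >= t->hwcur) ?
            cur - t->hwcur : cur + t->num_slots - t->hwcur;

        t->hwavail -= n;      /* these slots now belong to the NIC */
        t->hwcur = cur;       /* catch up with userspace */
        return (n);           /* descriptors to post before writing the tail register */
    }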
+ */ +static int +re_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct rl_softc *sc = a; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + RL_LOCK(sc); + + /* Sync the TX descriptor list */ + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* record completed transmissions */ + for (n = 0, j = sc->rl_ldata.rl_tx_considx; + j != sc->rl_ldata.rl_tx_prodidx; + n++, j = RL_TX_DESC_NXT(sc, j)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[j].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = j; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwavail += n; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + /* we trust prodidx, not hwcur */ + j = kring->nr_hwcur = sc->rl_ldata.rl_tx_prodidx; + if (j != k) { /* we have new packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[j]; + int cmd = slot->len | RL_TDESC_CMD_EOF | + RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; + void *addr = NMB(slot); + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + RL_UNLOCK(sc); + return netmap_ring_reinit(kring); + } + + if (j == lim) /* mark end of ring */ + cmd |= RL_TDESC_CMD_EOR; + + if (slot->flags & NS_BUF_CHANGED) { + uint64_t paddr = vtophys(addr); + desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(sc->rl_ldata.rl_tx_mtag, + txd[j].tx_dmamap, addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + slot->flags &= ~NS_REPORT; + desc->rl_cmdstat = htole32(cmd); + bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, + txd[j].tx_dmamap, BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + sc->rl_ldata.rl_tx_prodidx = kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + + /* start ? */ + CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); + } + if (do_lock) + RL_UNLOCK(sc); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. 
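Unlike the e1000/ixgbe handlers, which read the hardware head register to find completed transmissions, re_netmap_txsync() above reclaims descriptors by walking from rl_tx_considx until it meets one still owned by the NIC. A compact sketch of that scan, with stand-in names (OWN plays the role of RL_TDESC_STAT_OWN; not part of the patch):

    #include <stdint.h>

    #define OWN 0x80000000u   /* stand-in for RL_TDESC_STAT_OWN */

    /* Count descriptors released by the NIC between considx and prodidx. */
    static int
    reclaim_tx(const uint32_t *cmdstat, int considx, int prodidx, int nslots)
    {
        int n = 0, j = considx;

        while (j != prodidx && (cmdstat[j] & OWN) == 0) {
            n++;
            j = (j + 1) % nslots;   /* the driver uses RL_TX_DESC_NXT() */
        }
        return (n);                 /* added to rl_tx_free and kring->nr_hwavail */
    }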
+ */ +static int +re_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct rl_softc *sc = a; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + RL_LOCK(sc); + /* XXX check sync modes */ + bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* + * The device uses all the buffers in the ring, so we need + * another termination condition in addition to RL_RDESC_STAT_OWN + * cleared (all buffers could have it cleared. The easiest one + * is to limit the amount of data reported up to 'lim' + */ + j = sc->rl_ldata.rl_rx_prodidx; + for (n = kring->nr_hwavail; n < lim ; n++) { + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[j]; + uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); + uint32_t total_len; + + if ((rxstat & RL_RDESC_STAT_OWN) != 0) + break; + total_len = rxstat & sc->rl_rxlenmask; + /* XXX subtract crc */ + total_len = (total_len < 4) ? 0 : total_len - 4; + kring->ring->slot[j].len = total_len; + /* sync was in re_newbuf() */ + bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, BUS_DMASYNC_POSTREAD); + j = RL_RX_DESC_NXT(sc, j); + } + if (n != kring->nr_hwavail) { + sc->rl_ldata.rl_rx_prodidx = j; + sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; + kring->nr_hwavail = n; + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[j]; + int cmd = na->buff_size | RL_RDESC_CMD_OWN; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + RL_UNLOCK(sc); + return netmap_ring_reinit(kring); + } + + if (j == lim) /* mark end of ring */ + cmd |= RL_RDESC_CMD_EOR; + + desc->rl_cmdstat = htole32(cmd); + slot->flags &= ~NS_REPORT; + if (slot->flags & NS_BUF_CHANGED) { + uint64_t paddr = vtophys(addr); + desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, BUS_DMASYNC_PREREAD); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = k; + /* Flush the RX DMA ring */ + + bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + RL_UNLOCK(sc); + return 0; +} + +static void +re_netmap_tx_init(struct rl_softc *sc) +{ + struct rl_txdesc *txd; + struct rl_desc *desc; + int i; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + + /* slot is NULL if we are not in netmap mode */ + if (!slot) + return; + /* in netmap mode, overwrite addresses and maps */ + txd = sc->rl_ldata.rl_tx_desc; + desc = sc->rl_ldata.rl_tx_list; + + for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) { + void *addr = NMB(slot+i); + uint64_t paddr = vtophys(addr); + + desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_load_map(sc->rl_ldata.rl_tx_mtag, + txd[i].tx_dmamap, addr, na->buff_size); + } +} + +static void +re_netmap_rx_init(struct rl_softc *sc) +{ + /* slot is NULL if we are not in netmap mode */ + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); + struct rl_desc *desc = sc->rl_ldata.rl_rx_list; + uint32_t cmdstat; + int i; + + if (!slot) + return; + + for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { + void *addr = NMB(slot+i); + uint64_t paddr = vtophys(addr); + + desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + cmdstat = slot[i].len = na->buff_size; // XXX + if (i == sc->rl_ldata.rl_rx_desc_cnt - 1) + cmdstat |= RL_RDESC_CMD_EOR; + desc[i].rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN); + + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + sc->rl_ldata.rl_rx_desc[i].rx_dmamap, + addr, na->buff_size); + } +} diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h new file mode 100644 index 000000000000..a4d5491d67f1 --- /dev/null +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
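All three drivers touched by this patch (lem and re above, ixgbe below) hook into netmap with the same small piece of glue: fill a struct netmap_adapter with the ring sizes and the txsync/rxsync/lock/register callbacks, then call netmap_attach(). For a hypothetical driver "foo" the glue would look roughly like this (a sketch only; the foo_* names and softc fields are invented for illustration):

    static void
    foo_netmap_attach(struct foo_softc *sc)
    {
        struct netmap_adapter na;

        bzero(&na, sizeof(na));
        na.ifp = sc->foo_ifp;
        na.separate_locks = 1;            /* 0 if the driver has one global lock */
        na.num_tx_desc = sc->foo_num_tx_desc;
        na.num_rx_desc = sc->foo_num_rx_desc;
        na.nm_txsync = foo_netmap_txsync;
        na.nm_rxsync = foo_netmap_rxsync;
        na.nm_lock = foo_netmap_lock_wrapper;
        na.nm_register = foo_netmap_reg;
        na.buff_size = MCLBYTES;
        netmap_attach(&na, sc->foo_num_queues);   /* 1 for single-queue cards */
    }

The structure is copied by netmap_attach(), so it can live on the stack exactly as it does in the existing drivers.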
+ */ + +/* + * $FreeBSD$ + * $Id: ixgbe_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap modifications for ixgbe + */ + +#include +#include +// #include +// #include /* vtophys ? */ +#include + +static int ixgbe_netmap_reg(struct ifnet *, int onoff); +static int ixgbe_netmap_txsync(void *, u_int, int); +static int ixgbe_netmap_rxsync(void *, u_int, int); +static void ixgbe_netmap_lock_wrapper(void *, int, u_int); + + +SYSCTL_NODE(_dev, OID_AUTO, ixgbe, CTLFLAG_RW, 0, "ixgbe card"); + +static void +ixgbe_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = ixgbe_netmap_txsync; + na.nm_rxsync = ixgbe_netmap_rxsync; + na.nm_lock = ixgbe_netmap_lock_wrapper; + na.nm_register = ixgbe_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +ixgbe_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + IXGBE_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + IXGBE_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + IXGBE_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + IXGBE_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +/* + * support for netmap register/unregisted. We are already under core lock. + * only called on the first init or the last unregister. + */ +static int +ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + ixgbe_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it later */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + ixgbe_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + ixgbe_init_locked(adapter); /* also enables intr */ + } + return (error); +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows, subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + * + * Check parameters in the struct netmap_ring. + * We don't use avail, only check for bogus values. 
+ * Make sure cur is valid, and same goes for buffer indexes and lengths. + * To avoid races, read the values once, and never use those from + * the ring afterwards. + */ +static int +ixgbe_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n = 0, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IXGBE_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[j]; + union ixgbe_adv_tx_desc *curr = &txr->tx_base[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + IXGBE_TXD_CMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + IXGBE_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->read.buffer_addr = htole64(vtophys(addr)); + curr->read.olinfo_status = 0; + curr->read.cmd_type_len = + htole32(txr->txd_cmd | len | + (IXGBE_ADVTXD_DTYP_DATA | + IXGBE_ADVTXD_DCMD_IFCS | + IXGBE_TXD_CMD_EOP | flags) ); + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = k; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), k); + } + + if (n == 0 || kring->nr_hwavail < 1) { + /* record completed transmissions. TODO + * + * The datasheet discourages the use of TDH to find out the + * number of sent packets; the right way to do so, is to check + * the DD bit inside the status of a packet descriptor. On the + * other hand, we avoid to set the `report status' bit for + * *all* outgoing packets (kind of interrupt mitigation), + * consequently the DD bit is not guaranteed to be set for all + * the packets: thats way, for the moment we continue to use + * TDH. + */ + j = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) { /* XXX can happen */ + D("TDH wrap %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - txr->next_to_clean; + if (delta) { + /* new transmissions were completed, increment + ring->nr_hwavail. */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + ring->avail = kring->nr_hwavail; + } + } + + if (do_lock) + IXGBE_TX_UNLOCK(txr); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). 
+ * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. + */ +static int +ixgbe_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IXGBE_RX_LOCK(rxr); + /* XXX check sync modes */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + j = rxr->next_to_check; + for (n = 0; ; n++) { + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); + + if ((staterr & IXGBE_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->wb.upper.length); + bus_dmamap_sync(rxr->ptag, + rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + if (kring->nr_hwavail >= lim - 10) { + ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail); + } + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j]; + struct ixgbe_rx_buf *rxbuf = rxr->rx_buffers + j; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + IXGBE_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(vtophys(addr)); + if (slot->flags & NS_BUF_CHANGED) { + netmap_reload_map(rxr->ptag, rxbuf->pmap, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->ptag, rxbuf->pmap, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + IXGBE_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c new file mode 100644 index 000000000000..7645a4e6e32b --- /dev/null +++ b/sys/dev/netmap/netmap.c @@ -0,0 +1,1762 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap.c 9662 2011-11-16 13:18:06Z luigi $ + * + * This module supports memory mapped access to network devices, + * see netmap(4). + * + * The module uses a large, memory pool allocated by the kernel + * and accessible as mmapped memory by multiple userspace threads/processes. + * The memory pool contains packet buffers and "netmap rings", + * i.e. user-accessible copies of the interface's queues. + * + * Access to the network card works like this: + * 1. a process/thread issues one or more open() on /dev/netmap, to create + * select()able file descriptor on which events are reported. + * 2. on each descriptor, the process issues an ioctl() to identify + * the interface that should report events to the file descriptor. + * 3. on each descriptor, the process issues an mmap() request to + * map the shared memory region within the process' address space. + * The list of interesting queues is indicated by a location in + * the shared memory region. + * 4. using the functions in the netmap(4) userspace API, a process + * can look up the occupation state of a queue, access memory buffers, + * and retrieve received packets or enqueue packets to transmit. + * 5. using some ioctl()s the process can synchronize the userspace view + * of the queue with the actual status in the kernel. This includes both + * receiving the notification of new packets, and transmitting new + * packets on the output interface. + * 6. select() or poll() can be used to wait for events on individual + * transmit or receive queues (or all queues for a given interface). + */ + +#include /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include /* cdevsw struct */ +#include /* uio struct */ +#include +#include /* struct socket */ +#include +#include /* PROT_EXEC */ +#include +#include /* vtophys */ +#include /* vtophys */ +#include /* sockaddrs */ +#include +#include +#include +#include +#include /* BIOCIMMEDIATE */ +#include +#include +#include /* bus_dmamap_* */ + +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + +/* + * lock and unlock for the netmap memory allocator + */ +#define NMA_LOCK() mtx_lock(&netmap_mem_d->nm_mtx); +#define NMA_UNLOCK() mtx_unlock(&netmap_mem_d->nm_mtx); + +/* + * Default amount of memory pre-allocated by the module. 
+ * We start with a large size and then shrink our demand + * according to what is avalable when the module is loaded. + * At the moment the block is contiguous, but we can easily + * restrict our demand to smaller units (16..64k) + */ +#define NETMAP_MEMORY_SIZE (64 * 1024 * PAGE_SIZE) +static void * netmap_malloc(size_t size, const char *msg); +static void netmap_free(void *addr, const char *msg); + +/* + * Allocator for a pool of packet buffers. For each buffer we have + * one entry in the bitmap to signal the state. Allocation scans + * the bitmap, but since this is done only on attach, we are not + * too worried about performance + * XXX if we need to allocate small blocks, a translation + * table is used both for kernel virtual address and physical + * addresses. + */ +struct netmap_buf_pool { + u_int total_buffers; /* total buffers. */ + u_int free; + u_int bufsize; + char *base; /* buffer base address */ + uint32_t *bitmap; /* one bit per buffer, 1 means free */ +}; +struct netmap_buf_pool nm_buf_pool; +/* XXX move these two vars back into netmap_buf_pool */ +u_int netmap_total_buffers; +char *netmap_buffer_base; + +/* user-controlled variables */ +int netmap_verbose; + +static int no_timestamp; /* don't timestamp on rxsync */ + +SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); +SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, + CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, + CTLFLAG_RW, &no_timestamp, 0, "no_timestamp"); +SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers, + CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers"); +SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers, + CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers"); + +/* + * Allocate n buffers from the ring, and fill the slot. + * Buffer 0 is the 'junk' buffer. + */ +static void +netmap_new_bufs(struct netmap_buf_pool *p, struct netmap_slot *slot, u_int n) +{ + uint32_t bi = 0; /* index in the bitmap */ + uint32_t mask, j, i = 0; /* slot counter */ + + if (n > p->free) { + D("only %d out of %d buffers available", i, n); + return; + } + /* termination is guaranteed by p->free */ + while (i < n && p->free > 0) { + uint32_t cur = p->bitmap[bi]; + if (cur == 0) { /* bitmask is fully used */ + bi++; + continue; + } + /* locate a slot */ + for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ; + p->bitmap[bi] &= ~mask; /* slot in use */ + p->free--; + slot[i].buf_idx = bi*32+j; + slot[i].len = p->bufsize; + slot[i].flags = NS_BUF_CHANGED; + i++; + } + ND("allocated %d buffers, %d available", n, p->free); +} + + +static void +netmap_free_buf(struct netmap_buf_pool *p, uint32_t i) +{ + uint32_t pos, mask; + if (i >= p->total_buffers) { + D("invalid free index %d", i); + return; + } + pos = i / 32; + mask = 1 << (i % 32); + if (p->bitmap[pos] & mask) { + D("slot %d already free", i); + return; + } + p->bitmap[pos] |= mask; + p->free++; +} + + +/* Descriptor of the memory objects handled by our memory allocator. */ +struct netmap_mem_obj { + TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the + chain. */ + int nmo_used; /* flag set on used memory objects. */ + size_t nmo_size; /* size of the memory area reserved for the + object. */ + void *nmo_data; /* pointer to the memory area. */ +}; + +/* Wrap our memory objects to make them ``chainable``. */ +TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj); + + +/* Descriptor of our custom memory allocator. */ +struct netmap_mem_d { + struct mtx nm_mtx; /* lock used to handle the chain of memory + objects. 
*/ + struct netmap_mem_obj_h nm_molist; /* list of memory objects */ + size_t nm_size; /* total amount of memory used for rings etc. */ + size_t nm_totalsize; /* total amount of allocated memory + (the difference is used for buffers) */ + size_t nm_buf_start; /* offset of packet buffers. + This is page-aligned. */ + size_t nm_buf_len; /* total memory for buffers */ + void *nm_buffer; /* pointer to the whole pre-allocated memory + area. */ +}; + + +/* Structure associated to each thread which registered an interface. */ +struct netmap_priv_d { + struct netmap_if *np_nifp; /* netmap interface descriptor. */ + + struct ifnet *np_ifp; /* device for which we hold a reference */ + int np_ringid; /* from the ioctl */ + u_int np_qfirst, np_qlast; /* range of rings to scan */ + uint16_t np_txpoll; +}; + + +static struct cdev *netmap_dev; /* /dev/netmap character device. */ +static struct netmap_mem_d *netmap_mem_d; /* Our memory allocator. */ + + +static d_mmap_t netmap_mmap; +static d_ioctl_t netmap_ioctl; +static d_poll_t netmap_poll; + +#ifdef NETMAP_KEVENT +static d_kqfilter_t netmap_kqfilter; +#endif + +static struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_mmap = netmap_mmap, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, +#ifdef NETMAP_KEVENT + .d_kqfilter = netmap_kqfilter, +#endif +}; + +#ifdef NETMAP_KEVENT +static int netmap_kqread(struct knote *, long); +static int netmap_kqwrite(struct knote *, long); +static void netmap_kqdetach(struct knote *); + +static struct filterops netmap_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = netmap_kqdetach, + .f_event = netmap_kqread, +}; + +static struct filterops netmap_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = netmap_kqdetach, + .f_event = netmap_kqwrite, +}; + +/* + * support for the kevent() system call. + * + * This is the kevent filter, and is executed each time a new event + * is triggered on the device. This function execute some operation + * depending on the received filter. + * + * The implementation should test the filters and should implement + * filter operations we are interested on (a full list in /sys/event.h). + * + * On a match we should: + * - set kn->kn_fop + * - set kn->kn_hook + * - call knlist_add() to deliver the event to the application. + * + * Return 0 if the event should be delivered to the application. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + /* declare variables needed to read/write */ + + switch(kn->kn_filter) { + case EVFILT_READ: + if (netmap_verbose) + D("%s kqfilter: EVFILT_READ" ifp->if_xname); + + /* read operations */ + kn->kn_fop = &netmap_read_filterops; + break; + + case EVFILT_WRITE: + if (netmap_verbose) + D("%s kqfilter: EVFILT_WRITE" ifp->if_xname); + + /* write operations */ + kn->kn_fop = &netmap_write_filterops; + break; + + default: + if (netmap_verbose) + D("%s kqfilter: invalid filter" ifp->if_xname); + return(EINVAL); + } + + kn->kn_hook = 0;// + knlist_add(&netmap_sc->tun_rsel.si_note, kn, 0); + + return (0); +} +#endif /* NETMAP_KEVENT */ + +/* + * File descriptor's private data destructor. + * + * Call nm_register(ifp,0) to stop netmap mode on the interface and + * revert to normal operation. We expect that np_ifp has not gone. 
+ */ +static void +netmap_dtor(void *data) +{ + struct netmap_priv_d *priv = data; + struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = NA(ifp); + struct netmap_if *nifp = priv->np_nifp; + + if (0) + printf("%s starting for %p ifp %p\n", __FUNCTION__, priv, + priv ? priv->np_ifp : NULL); + + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + + na->refcount--; + if (na->refcount <= 0) { /* last instance */ + u_int i; + + D("deleting last netmap instance for %s", ifp->if_xname); + /* + * there is a race here with *_netmap_task() and + * netmap_poll(), which don't run under NETMAP_CORE_LOCK. + * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP + * (aka NETMAP_DELETING(na)) are a unique marker that the + * device is dying. + * Before destroying stuff we sleep a bit, and then complete + * the job. NIOCREG should realize the condition and + * loop until they can continue; the other routines + * should check the condition at entry and quit if + * they cannot run. + */ + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + tsleep(na, 0, "NIOCUNREG", 4); + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ + /* Wake up any sleeping threads. netmap_poll will + * then return POLLERR + */ + for (i = 0; i < na->num_queues + 2; i++) { + selwakeuppri(&na->tx_rings[i].si, PI_NET); + selwakeuppri(&na->rx_rings[i].si, PI_NET); + } + /* release all buffers */ + NMA_LOCK(); + for (i = 0; i < na->num_queues + 1; i++) { + int j, lim; + struct netmap_ring *ring; + + ND("tx queue %d", i); + ring = na->tx_rings[i].ring; + lim = na->tx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(&nm_buf_pool, + ring->slot[j].buf_idx); + + ND("rx queue %d", i); + ring = na->rx_rings[i].ring; + lim = na->rx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(&nm_buf_pool, + ring->slot[j].buf_idx); + } + NMA_UNLOCK(); + netmap_free(na->tx_rings[0].ring, "shadow rings"); + wakeup(na); + } + netmap_free(nifp, "nifp"); + + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + if_rele(ifp); + + bzero(priv, sizeof(*priv)); /* XXX for safety */ + free(priv, M_DEVBUF); +} + + + +/* + * Create and return a new ``netmap_if`` object, and possibly also + * rings and packet buffors. + * + * Return NULL on failure. + */ +static void * +netmap_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + struct netmap_ring *ring; + char *buff; + u_int i, len, ofs; + u_int n = na->num_queues + 1; /* shorthand, include stack queue */ + + /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + */ + len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t); + nifp = netmap_malloc(len, "nifp"); + if (nifp == NULL) + return (NULL); + + /* initialize base fields */ + *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues; + strncpy(nifp->ni_name, ifname, IFNAMSIZ); + + (na->refcount)++; /* XXX atomic ? we are under lock */ + if (na->refcount > 1) + goto final; + + /* + * If this is the first instance, allocate the shadow rings and + * buffers for this card (one for each hw queue, one for the host). + * The rings are contiguous, but have variable size. 
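Everything in the shared region is addressed through relative offsets (ring_ofs[] from the netmap_if, buf_ofs from each ring), so after a single mmap() a userspace process reaches rings and buffers with plain pointer arithmetic; this is essentially what the helper macros in sys/net/netmap_user.h do. A hedged sketch of that arithmetic:

    #include <stddef.h>
    #include <stdint.h>
    #include <net/netmap.h>

    /*
     * 'mem' is the start of the mmapped region and 'off' the nr_offset
     * returned by NIOCREGIF; buf_size matches na->buff_size (MCLBYTES here).
     */
    static struct netmap_ring *
    nth_tx_ring(void *mem, uint32_t off, unsigned int i)
    {
        struct netmap_if *nifp = (struct netmap_if *)((char *)mem + off);

        return ((struct netmap_ring *)((char *)nifp + nifp->ring_ofs[i]));
    }

    static void *
    slot_buffer(struct netmap_ring *ring, uint32_t buf_idx, size_t buf_size)
    {
        return ((char *)ring + ring->buf_ofs + (size_t)buf_idx * buf_size);
    }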
+ * The entire block is reachable at + * na->tx_rings[0].ring + */ + + len = n * (2 * sizeof(struct netmap_ring) + + (na->num_tx_desc + na->num_rx_desc) * + sizeof(struct netmap_slot) ); + buff = netmap_malloc(len, "shadow rings"); + if (buff == NULL) { + D("failed to allocate %d bytes for %s shadow ring", + len, ifname); +error: + (na->refcount)--; + netmap_free(nifp, "nifp, rings failed"); + return (NULL); + } + /* do we have the bufers ? we are in need of num_tx_desc buffers for + * each tx ring and num_tx_desc buffers for each rx ring. */ + len = n * (na->num_tx_desc + na->num_rx_desc); + NMA_LOCK(); + if (nm_buf_pool.free < len) { + NMA_UNLOCK(); + netmap_free(buff, "not enough bufs"); + goto error; + } + /* + * in the kring, store the pointers to the shared rings + * and initialize the rings. We are under NMA_LOCK(). + */ + ofs = 0; + for (i = 0; i < n; i++) { + struct netmap_kring *kring; + int numdesc; + + /* Transmit rings */ + kring = &na->tx_rings[i]; + numdesc = na->num_tx_desc; + bzero(kring, sizeof(*kring)); + kring->na = na; + + ring = kring->ring = (struct netmap_ring *)(buff + ofs); + *(ssize_t *)(uintptr_t)&ring->buf_ofs = + nm_buf_pool.base - (char *)ring; + ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs); + *(int *)(int *)(uintptr_t)&ring->num_slots = + kring->nkr_num_slots = numdesc; + + /* + * IMPORTANT: + * Always keep one slot empty, so we can detect new + * transmissions comparing cur and nr_hwcur (they are + * the same only if there are no new transmissions). + */ + ring->avail = kring->nr_hwavail = numdesc - 1; + ring->cur = kring->nr_hwcur = 0; + netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc); + + ofs += sizeof(struct netmap_ring) + + numdesc * sizeof(struct netmap_slot); + + /* Receive rings */ + kring = &na->rx_rings[i]; + numdesc = na->num_rx_desc; + bzero(kring, sizeof(*kring)); + kring->na = na; + + ring = kring->ring = (struct netmap_ring *)(buff + ofs); + *(ssize_t *)(uintptr_t)&ring->buf_ofs = + nm_buf_pool.base - (char *)ring; + ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs); + *(int *)(int *)(uintptr_t)&ring->num_slots = + kring->nkr_num_slots = numdesc; + ring->cur = kring->nr_hwcur = 0; + ring->avail = kring->nr_hwavail = 0; /* empty */ + netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc); + ofs += sizeof(struct netmap_ring) + + numdesc * sizeof(struct netmap_slot); + } + NMA_UNLOCK(); + for (i = 0; i < n+1; i++) { + // XXX initialize the selrecord structs. + } +final: + /* + * fill the slots for the rx and tx queues. They contain the offset + * between the ring and nifp, so the information is usable in + * userspace to reach the ring from the nifp. + */ + for (i = 0; i < n; i++) { + char *base = (char *)nifp; + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = + (char *)na->tx_rings[i].ring - base; + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] = + (char *)na->rx_rings[i].ring - base; + } + return (nifp); +} + + +/* + * mmap(2) support for the "netmap" device. + * + * Expose all the memory previously allocated by our custom memory + * allocator: this way the user has only to issue a single mmap(2), and + * can work on all the data structures flawlessly. + * + * Return 0 on success, -1 otherwise. + */ +static int +#if __FreeBSD_version < 900000 +netmap_mmap(__unused struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr, + int nprot) +#else +netmap_mmap(__unused struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, + int nprot, __unused vm_memattr_t *memattr) +#endif +{ + if (nprot & PROT_EXEC) + return (-1); // XXX -1 or EINVAL ? 
+ ND("request for offset 0x%x", (uint32_t)offset); + *paddr = vtophys(netmap_mem_d->nm_buffer) + offset; + + return (0); +} + + +/* + * handler for synchronization of the queues from/to the host + */ +static void +netmap_sync_to_host(struct netmap_adapter *na) +{ + struct netmap_kring *kring = &na->tx_rings[na->num_queues]; + struct netmap_ring *ring = kring->ring; + struct mbuf *head = NULL, *tail = NULL, *m; + u_int n, lim = kring->nkr_num_slots - 1; + + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0); + + /* Take packets from hwcur to cur and pass them up. + * In case of no buffers we give up. At the end of the loop, + * the queue is drained in all cases. + */ + for (n = kring->nr_hwcur; n != ring->cur;) { + struct netmap_slot *slot = &ring->slot[n]; + + n = (n == lim) ? 0 : n + 1; + if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { + D("bad pkt at %d len %d", n, slot->len); + continue; + } + m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL); + + if (m == NULL) + break; + if (tail) + tail->m_nextpkt = m; + else + head = m; + tail = m; + m->m_nextpkt = NULL; + } + kring->nr_hwcur = ring->cur; + kring->nr_hwavail = ring->avail = lim; + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + /* send packets up, outside the lock */ + while ((m = head) != NULL) { + head = head->m_nextpkt; + m->m_nextpkt = NULL; + m->m_pkthdr.rcvif = na->ifp; + if (netmap_verbose & NM_VERB_HOST) + D("sending up pkt %p size %d", m, m->m_pkthdr.len); + (na->ifp->if_input)(na->ifp, m); + } +} + +/* + * This routine also does the selrecord if called from the poll handler + * (we know because td != NULL). + */ +static void +netmap_sync_from_host(struct netmap_adapter *na, struct thread *td) +{ + struct netmap_kring *kring = &na->rx_rings[na->num_queues]; + struct netmap_ring *ring = kring->ring; + int delta; + + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0); + + /* skip past packets processed by userspace, + * and then sync cur/avail with hwcur/hwavail + */ + delta = ring->cur - kring->nr_hwcur; + if (delta < 0) + delta += kring->nkr_num_slots; + kring->nr_hwavail -= delta; + kring->nr_hwcur = ring->cur; + ring->avail = kring->nr_hwavail; + if (ring->avail == 0 && td) + selrecord(td, &kring->si); + if (ring->avail && (netmap_verbose & NM_VERB_HOST)) + D("%d pkts from stack", ring->avail); + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0); +} + + +/* + * get a refcounted reference to an interface. + * Return ENXIO if the interface does not exist, EINVAL if netmap + * is not supported by the interface. + * If successful, hold a reference. + */ +static int +get_ifp(const char *name, struct ifnet **ifp) +{ + *ifp = ifunit_ref(name); + if (*ifp == NULL) + return (ENXIO); + /* can do this if the capability exists and if_pspare[0] + * points to the netmap descriptor. + */ + if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) + return 0; /* valid pointer, we hold the refcount */ + if_rele(*ifp); + return EINVAL; // not NETMAP capable +} + + +/* + * Error routine called when txsync/rxsync detects an error. + * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Return 1 on reinit. 
+ */ +int +netmap_ring_reinit(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int i, lim = kring->nkr_num_slots - 1; + int errors = 0; + + D("called for %s", kring->na->ifp->if_xname); + if (ring->cur > lim) + errors++; + for (i = 0; i <= lim; i++) { + u_int idx = ring->slot[i].buf_idx; + u_int len = ring->slot[i].len; + if (idx < 2 || idx >= netmap_total_buffers) { + if (!errors++) + D("bad buffer at slot %d idx %d len %d ", i, idx, len); + ring->slot[i].buf_idx = 0; + ring->slot[i].len = 0; + } else if (len > NETMAP_BUF_SIZE) { + ring->slot[i].len = 0; + if (!errors++) + D("bad len %d at slot %d idx %d", + len, i, idx); + } + } + if (errors) { + int pos = kring - kring->na->tx_rings; + int n = kring->na->num_queues + 2; + + D("total %d errors", errors); + errors++; + D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d", + kring->na->ifp->if_xname, + pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + ring->cur, kring->nr_hwcur, + ring->avail, kring->nr_hwavail); + ring->cur = kring->nr_hwcur; + ring->avail = kring->nr_hwavail; + ring->flags |= NR_REINIT; + kring->na->flags |= NR_REINIT; + } + return (errors ? 1 : 0); +} + +/* + * Clean the reinit flag for our rings. + * XXX at the moment, clear for all rings + */ +static void +netmap_clean_reinit(struct netmap_adapter *na) +{ + //struct netmap_kring *kring; + u_int i; + + na->flags &= ~NR_REINIT; + D("--- NR_REINIT reset on %s", na->ifp->if_xname); + for (i = 0; i < na->num_queues + 1; i++) { + na->tx_rings[i].ring->flags &= ~NR_REINIT; + na->rx_rings[i].ring->flags &= ~NR_REINIT; + } +} + +/* + * Set the ring ID. For devices with a single queue, a request + * for all rings is the same as a single ring. + */ +static int +netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +{ + struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = NA(ifp); + void *adapter = na->ifp->if_softc; /* shorthand */ + u_int i = ringid & NETMAP_RING_MASK; + /* first time we don't lock */ + int need_lock = (priv->np_qfirst != priv->np_qlast); + + if ( (ringid & NETMAP_HW_RING) && i >= na->num_queues) { + D("invalid ring id %d", i); + return (EINVAL); + } + if (need_lock) + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + priv->np_ringid = ringid; + if (ringid & NETMAP_SW_RING) { + priv->np_qfirst = na->num_queues; + priv->np_qlast = na->num_queues + 1; + } else if (ringid & NETMAP_HW_RING) { + priv->np_qfirst = i; + priv->np_qlast = i + 1; + } else { + priv->np_qfirst = 0; + priv->np_qlast = na->num_queues; + } + priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; + if (need_lock) + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + if (ringid & NETMAP_SW_RING) + D("ringid %s set to SW RING", ifp->if_xname); + else if (ringid & NETMAP_HW_RING) + D("ringid %s set to HW RING %d", ifp->if_xname, + priv->np_qfirst); + else + D("ringid %s set to all %d HW RINGS", ifp->if_xname, + priv->np_qlast); + return 0; +} + +/* + * ioctl(2) support for the "netmap" device. + * + * Following a list of accepted commands: + * - NIOCGINFO + * - SIOCGIFADDR just for convenience + * - NIOCREGIF + * - NIOCUNREGIF + * - NIOCTXSYNC + * - NIOCRXSYNC + * + * Return 0 on success, errno otherwise. 
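From userspace the commands listed above are used in a fixed sequence: open /dev/netmap, NIOCREGIF to bind the descriptor to an interface, mmap() the shared region, then poll() or NIOCTXSYNC/NIOCRXSYNC to move packets. A sketch of the setup step, with error handling trimmed (field names follow struct nmreq from sys/net/netmap.h):

    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <net/netmap.h>
    #include <net/netmap_user.h>

    /* Bind a new descriptor to 'ifname' and map the shared region. */
    static struct netmap_if *
    netmap_open_if(const char *ifname, int *fdp, void **memp)
    {
        struct nmreq req;
        int fd = open("/dev/netmap", O_RDWR);

        memset(&req, 0, sizeof(req));
        strncpy(req.nr_name, ifname, sizeof(req.nr_name));
        req.nr_ringid = 0;                        /* all hardware rings */
        if (ioctl(fd, NIOCREGIF, &req) == -1)
            return (NULL);                        /* not netmap capable */
        *memp = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
            MAP_SHARED, fd, 0);
        *fdp = fd;
        return ((struct netmap_if *)((char *)*memp + req.nr_offset));
    }

The sample tools in tools/tools/netmap (pkt-gen, bridge) follow this same pattern.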
+ */ +static int +netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, + __unused int fflag, __unused struct thread *td) +{ + struct netmap_priv_d *priv = NULL; + struct ifnet *ifp; + struct nmreq *nmr = (struct nmreq *) data; + struct netmap_adapter *na; + void *adapter; + int error; + u_int i; + struct netmap_if *nifp; + + error = devfs_get_cdevpriv((void **)&priv); + if (error != ENOENT && error != 0) + return (error); + + error = 0; /* Could be ENOENT */ + switch (cmd) { + case NIOCGINFO: /* return capabilities etc */ + /* memsize is always valid */ + nmr->nr_memsize = netmap_mem_d->nm_totalsize; + nmr->nr_offset = 0; + nmr->nr_numrings = 0; + nmr->nr_numslots = 0; + if (nmr->nr_name[0] == '\0') /* just get memory info */ + break; + error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */ + if (error) + break; + na = NA(ifp); /* retrieve netmap_adapter */ + nmr->nr_numrings = na->num_queues; + nmr->nr_numslots = na->num_tx_desc; + if_rele(ifp); /* return the refcount */ + break; + + case NIOCREGIF: + if (priv != NULL) /* thread already registered */ + return netmap_set_ringid(priv, nmr->nr_ringid); + /* find the interface and a reference */ + error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ + if (error) + break; + na = NA(ifp); /* retrieve netmap adapter */ + adapter = na->ifp->if_softc; /* shorthand */ + /* + * Allocate the private per-thread structure. + * XXX perhaps we can use a blocking malloc ? + */ + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) { + error = ENOMEM; + if_rele(ifp); /* return the refcount */ + break; + } + + + for (i = 10; i > 0; i--) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + if (!NETMAP_DELETING(na)) + break; + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + tsleep(na, 0, "NIOCREGIF", hz/10); + } + if (i == 0) { + D("too many NIOCREGIF attempts, give up"); + error = EINVAL; + free(priv, M_DEVBUF); + if_rele(ifp); /* return the refcount */ + break; + } + + priv->np_ifp = ifp; /* store the reference */ + error = netmap_set_ringid(priv, nmr->nr_ringid); + if (error) + goto error; + priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na); + if (nifp == NULL) { /* allocation failed */ + error = ENOMEM; + } else if (ifp->if_capenable & IFCAP_NETMAP) { + /* was already set */ + } else { + /* Otherwise set the card in netmap mode + * and make it use the shared buffers. + */ + error = na->nm_register(ifp, 1); /* mode on */ + if (error) { + /* + * do something similar to netmap_dtor(). + */ + netmap_free(na->tx_rings[0].ring, "rings, reg.failed"); + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = NULL; + na->refcount--; + netmap_free(nifp, "nifp, rings failed"); + nifp = NULL; + } + } + + if (error) { /* reg. failed, release priv and ref */ +error: + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + free(priv, M_DEVBUF); + if_rele(ifp); /* return the refcount */ + break; + } + + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + error = devfs_set_cdevpriv(priv, netmap_dtor); + + if (error != 0) { + /* could not assign the private storage for the + * thread, call the destructor explicitly. 
+ */ + netmap_dtor(priv); + break; + } + + /* return the offset of the netmap_if object */ + nmr->nr_numrings = na->num_queues; + nmr->nr_numslots = na->num_tx_desc; + nmr->nr_memsize = netmap_mem_d->nm_totalsize; + nmr->nr_offset = + ((char *) nifp - (char *) netmap_mem_d->nm_buffer); + break; + + case NIOCUNREGIF: + if (priv == NULL) + return (ENXIO); + + /* the interface is unregistered inside the + destructor of the private data. */ + devfs_clear_cdevpriv(); + break; + + case NIOCTXSYNC: + case NIOCRXSYNC: + if (priv == NULL) + return (ENXIO); + ifp = priv->np_ifp; /* we have a reference */ + na = NA(ifp); /* retrieve netmap adapter */ + adapter = ifp->if_softc; /* shorthand */ + + if (na->flags & NR_REINIT) + netmap_clean_reinit(na); + + if (priv->np_qfirst == na->num_queues) { + /* queues to/from host */ + if (cmd == NIOCTXSYNC) + netmap_sync_to_host(na); + else + netmap_sync_from_host(na, NULL); + return error; + } + + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + if (cmd == NIOCTXSYNC) { + struct netmap_kring *kring = &na->tx_rings[i]; + if (netmap_verbose & NM_VERB_TXSYNC) + D("sync tx ring %d cur %d hwcur %d", + i, kring->ring->cur, + kring->nr_hwcur); + na->nm_txsync(adapter, i, 1 /* do lock */); + if (netmap_verbose & NM_VERB_TXSYNC) + D("after sync tx ring %d cur %d hwcur %d", + i, kring->ring->cur, + kring->nr_hwcur); + } else { + na->nm_rxsync(adapter, i, 1 /* do lock */); + microtime(&na->rx_rings[i].ring->ts); + } + } + + break; + + case BIOCIMMEDIATE: + case BIOCGHDRCMPLT: + case BIOCSHDRCMPLT: + case BIOCSSEESENT: + D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); + break; + + default: + { + /* + * allow device calls + */ + struct socket so; + bzero(&so, sizeof(so)); + error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ + if (error) + break; + so.so_vnet = ifp->if_vnet; + // so->so_proto not null. + error = ifioctl(&so, cmd, data, td); + if_rele(ifp); + } + } + + return (error); +} + + +/* + * select(2) and poll(2) handlers for the "netmap" device. + * + * Can be called for one or more queues. + * Return true the event mask corresponding to ready events. + * If there are no ready events, do a selrecord on either individual + * selfd or on the global one. + * Device-dependent parts (locking and sync of tx/rx rings) + * are done through callbacks. + */ +static int +netmap_poll(__unused struct cdev *dev, int events, struct thread *td) +{ + struct netmap_priv_d *priv = NULL; + struct netmap_adapter *na; + struct ifnet *ifp; + struct netmap_kring *kring; + u_int i, check_all, want_tx, want_rx, revents = 0; + void *adapter; + + if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + return POLLERR; + + ifp = priv->np_ifp; + // XXX check for deleting() ? + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) + return POLLERR; + + if (netmap_verbose & 0x8000) + D("device %s events 0x%x", ifp->if_xname, events); + want_tx = events & (POLLOUT | POLLWRNORM); + want_rx = events & (POLLIN | POLLRDNORM); + + adapter = ifp->if_softc; + na = NA(ifp); /* retrieve netmap adapter */ + + /* pending reinit, report up as a poll error. Pending + * reads and writes are lost. 
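netmap_poll() below is what makes the classic userspace receive loop work: block in poll() waiting for POLLIN, then walk the ring from cur for avail slots and return them by advancing cur. A sketch, assuming the descriptor and mapping were obtained as in the earlier setup example (NETMAP_RXRING, NETMAP_BUF and NETMAP_RING_NEXT are the helpers from sys/net/netmap_user.h):

    #include <poll.h>
    #include <net/netmap.h>
    #include <net/netmap_user.h>

    /* Receive loop on hardware ring 0. */
    static void
    rx_loop(int fd, struct netmap_if *nifp)
    {
        struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        for (;;) {
            poll(&pfd, 1, -1);      /* the kernel runs rxsync on our behalf */
            while (ring->avail > 0) {
                struct netmap_slot *slot = &ring->slot[ring->cur];
                const char *payload = NETMAP_BUF(ring, slot->buf_idx);

                /* ... inspect payload[0 .. slot->len - 1] here ... */
                (void)payload;
                ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
                ring->avail--;
            }
        }
    }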
+ */ + if (na->flags & NR_REINIT) { + netmap_clean_reinit(na); + revents |= POLLERR; + } + /* how many queues we are scanning */ + i = priv->np_qfirst; + if (i == na->num_queues) { /* from/to host */ + if (priv->np_txpoll || want_tx) { + /* push any packets up, then we are always ready */ + kring = &na->tx_rings[i]; + netmap_sync_to_host(na); + revents |= want_tx; + } + if (want_rx) { + kring = &na->rx_rings[i]; + if (kring->ring->avail == 0) + netmap_sync_from_host(na, td); + if (kring->ring->avail > 0) { + revents |= want_rx; + } + } + return (revents); + } + + /* + * check_all is set if the card has more than one queue and + * the client is polling all of them. If true, we sleep on + * the "global" selfd, otherwise we sleep on individual selfd + * (we can only sleep on one of them per direction). + * The interrupt routine in the driver should always wake on + * the individual selfd, and also on the global one if the card + * has more than one ring. + * + * If the card has only one lock, we just use that. + * If the card has separate ring locks, we just use those + * unless we are doing check_all, in which case the whole + * loop is wrapped by the global lock. + * We acquire locks only when necessary: if poll is called + * when buffers are available, we can just return without locks. + * + * rxsync() is only called if we run out of buffers on a POLLIN. + * txsync() is called if we run out of buffers on POLLOUT, or + * there are pending packets to send. The latter can be disabled + * passing NETMAP_NO_TX_POLL in the NIOCREG call. + */ + check_all = (i + 1 != priv->np_qlast); + + /* + * core_lock indicates what to do with the core lock. + * The core lock is used when either the card has no individual + * locks, or it has individual locks but we are cheking all + * rings so we need the core lock to avoid missing wakeup events. + * + * It has three possible states: + * NO_CL we don't need to use the core lock, e.g. + * because we are protected by individual locks. + * NEED_CL we need the core lock. In this case, when we + * call the lock routine, move to LOCKED_CL + * to remember to release the lock once done. + * LOCKED_CL core lock is set, so we need to release it. + */ + enum {NO_CL, NEED_CL, LOCKED_CL }; + int core_lock = (check_all || !na->separate_locks) ? + NEED_CL:NO_CL; + /* + * We start with a lock free round which is good if we have + * data available. If this fails, then lock and call the sync + * routines. + */ + for (i = priv->np_qfirst; want_rx && i < priv->np_qlast; i++) { + kring = &na->rx_rings[i]; + if (kring->ring->avail > 0) { + revents |= want_rx; + want_rx = 0; /* also breaks the loop */ + } + } + for (i = priv->np_qfirst; want_tx && i < priv->np_qlast; i++) { + kring = &na->tx_rings[i]; + if (kring->ring->avail > 0) { + revents |= want_tx; + want_tx = 0; /* also breaks the loop */ + } + } + + /* + * If we to push packets out (priv->np_txpoll) or want_tx is + * still set, we do need to run the txsync calls (on all rings, + * to avoid that the tx rings stall). 
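+ * For example, a receive-only client can avoid this extra txsync
+ * work on every poll() by setting the flag at registration time
+ * (illustrative sketch; "req" is the struct nmreq used for the
+ * NIOCREGIF ioctl):
+ *
+ *	req.nr_ringid = NETMAP_NO_TX_POLL;	// all hw rings, no tx poll
+ *	ioctl(fd, NIOCREGIF, &req);
+ *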
+ */ + if (priv->np_txpoll || want_tx) { + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + kring = &na->tx_rings[i]; + if (!want_tx && kring->ring->cur == kring->nr_hwcur) + continue; + if (core_lock == NEED_CL) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + core_lock = LOCKED_CL; + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_TX_LOCK, i); + if (netmap_verbose & NM_VERB_TXSYNC) + D("send %d on %s %d", + kring->ring->cur, + ifp->if_xname, i); + if (na->nm_txsync(adapter, i, 0 /* no lock */)) + revents |= POLLERR; + + if (want_tx) { + if (kring->ring->avail > 0) { + /* stop at the first ring. We don't risk + * starvation. + */ + revents |= want_tx; + want_tx = 0; + } else if (!check_all) + selrecord(td, &kring->si); + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_TX_UNLOCK, i); + } + } + + /* + * now if want_rx is still set we need to lock and rxsync. + * Do it on all rings because otherwise we starve. + */ + if (want_rx) { + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + kring = &na->rx_rings[i]; + if (core_lock == NEED_CL) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + core_lock = LOCKED_CL; + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_RX_LOCK, i); + + if (na->nm_rxsync(adapter, i, 0 /* no lock */)) + revents |= POLLERR; + if (no_timestamp == 0 || + kring->ring->flags & NR_TIMESTAMP) + microtime(&kring->ring->ts); + + if (kring->ring->avail > 0) + revents |= want_rx; + else if (!check_all) + selrecord(td, &kring->si); + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_RX_UNLOCK, i); + } + } + if (check_all && revents == 0) { + i = na->num_queues + 1; /* the global queue */ + if (want_tx) + selrecord(td, &na->tx_rings[i].si); + if (want_rx) + selrecord(td, &na->rx_rings[i].si); + } + if (core_lock == LOCKED_CL) + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + + return (revents); +} + +/*------- driver support routines ------*/ + +/* + * Initialize a ``netmap_adapter`` object created by driver on attach. + * We allocate a block of memory with room for a struct netmap_adapter + * plus two sets of N+2 struct netmap_kring (where N is the number + * of hardware rings): + * krings 0..N-1 are for the hardware queues. + * kring N is for the host stack queue + * kring N+1 is only used for the selinfo for all queues. + * Return 0 on success, ENOMEM otherwise. + */ +int +netmap_attach(struct netmap_adapter *na, int num_queues) +{ + int n = num_queues + 2; + int size = sizeof(*na) + 2 * n * sizeof(struct netmap_kring); + void *buf; + struct ifnet *ifp = na->ifp; + + if (ifp == NULL) { + D("ifp not set, giving up"); + return EINVAL; + } + na->refcount = 0; + na->num_queues = num_queues; + + buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); + if (buf) { + ifp->if_pspare[0] = buf; + na->tx_rings = (void *)((char *)buf + sizeof(*na)); + na->rx_rings = na->tx_rings + n; + bcopy(na, buf, sizeof(*na)); + ifp->if_capabilities |= IFCAP_NETMAP; + } + D("%s for %s", buf ? "ok" : "failed", ifp->if_xname); + + return (buf ? 0 : ENOMEM); +} + + +/* + * Free the allocated memory linked to the given ``netmap_adapter`` + * object. 
+ */ +void +netmap_detach(struct ifnet *ifp) +{ + u_int i; + struct netmap_adapter *na = NA(ifp); + + if (!na) + return; + + for (i = 0; i < na->num_queues + 2; i++) { + knlist_destroy(&na->tx_rings[i].si.si_note); + knlist_destroy(&na->rx_rings[i].si.si_note); + } + bzero(na, sizeof(*na)); + ifp->if_pspare[0] = NULL; + free(na, M_DEVBUF); +} + + +/* + * intercept packets coming from the network stack and present + * them to netmap as incoming packets on a separate ring. + * We are not locked when called. + */ +int +netmap_start(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_adapter *na = NA(ifp); + u_int i, len, n = na->num_queues; + int error = EBUSY; + struct netmap_kring *kring = &na->rx_rings[n]; + struct netmap_slot *slot; + + len = m->m_pkthdr.len; + if (netmap_verbose & NM_VERB_HOST) + D("%s packet %d len %d from the stack", ifp->if_xname, + kring->nr_hwcur + kring->nr_hwavail, len); + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + if (kring->nr_hwavail >= (int)kring->nkr_num_slots - 1) { + D("stack ring %s full\n", ifp->if_xname); + goto done; /* no space */ + } + if (len > na->buff_size) { + D("drop packet size %d > %d", len, na->buff_size); + goto done; /* too long for us */ + } + + /* compute the insert position */ + i = kring->nr_hwcur + kring->nr_hwavail; + if (i >= kring->nkr_num_slots) + i -= kring->nkr_num_slots; + slot = &kring->ring->slot[i]; + m_copydata(m, 0, len, NMB(slot)); + slot->len = len; + kring->nr_hwavail++; + if (netmap_verbose & NM_VERB_HOST) + D("wake up host ring %s %d", na->ifp->if_xname, na->num_queues); + selwakeuppri(&kring->si, PI_NET); + error = 0; +done: + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + /* release the mbuf in either cases of success or failure. As an + * alternative, put the mbuf in a free list and free the list + * only when really necessary. + */ + m_freem(m); + + return (error); +} + + +/* + * netmap_reset() is called by the driver routines when reinitializing + * a ring. The driver is in charge of locking to protect the kring. + * If netmap mode is not set just return NULL. + * Otherwise set NR_REINIT (in the ring and in na) to signal + * that a ring has been reinitialized, + * set cur = hwcur = 0 and avail = hwavail = num_slots - 1 . + * IT IS IMPORTANT to leave one slot free even in the tx ring because + * we rely on cur=hwcur only for empty rings. + * These are good defaults but can be overridden later in the device + * specific code if, after a reinit, the ring does not start from 0 + * (e.g. if_em.c does this). + * + * XXX we shouldn't be touching the ring, but there is a + * race anyways and this is our best option. + * + * XXX setting na->flags makes the syscall code faster, as there is + * only one place to check. On the other hand, we will need a better + * way to notify multiple threads that rings have been reset. + * One way is to increment na->rst_count at each ring reset. + * Each thread in its own priv structure will keep a matching counter, + * and on a reset will acknowledge and clean its own rings. + */ +struct netmap_slot * +netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, + u_int new_cur) +{ + struct netmap_kring *kring; + struct netmap_ring *ring; + struct netmap_slot *slot; + u_int i; + + if (na == NULL) + return NULL; /* no netmap support here */ + if (!(na->ifp->if_capenable & IFCAP_NETMAP)) + return NULL; /* nothing to reinitialize */ + kring = tx == NR_TX ? 
na->tx_rings + n : na->rx_rings + n; + ring = kring->ring; + if (tx == NR_TX) { + /* + * The last argument is the new value of next_to_clean. + * + * In the TX ring, we have P pending transmissions (from + * next_to_clean to nr_hwcur) followed by nr_hwavail free slots. + * Generally we can use all the slots in the ring so + * P = ring_size - nr_hwavail hence (modulo ring_size): + * next_to_clean == nr_hwcur + nr_hwavail + * + * If, upon a reset, nr_hwavail == ring_size and next_to_clean + * does not change we have nothing to report. Otherwise some + * pending packets may be lost, or newly injected packets will. + */ + /* if hwcur does not change, nothing to report. + * otherwise remember the change so perhaps we can + * shift the block at the next reinit + */ + if (new_cur == kring->nr_hwcur && + kring->nr_hwavail == kring->nkr_num_slots - 1) { + /* all ok */ + D("+++ NR_REINIT ok on %s TX[%d]", na->ifp->if_xname, n); + } else { + D("+++ NR_REINIT set on %s TX[%d]", na->ifp->if_xname, n); + } + ring->flags |= NR_REINIT; + na->flags |= NR_REINIT; + ring->avail = kring->nr_hwavail = kring->nkr_num_slots - 1; + ring->cur = kring->nr_hwcur = new_cur; + } else { + /* + * The last argument is the next free slot. + * In the RX ring we have nr_hwavail full buffers starting + * from nr_hwcur. + * If nr_hwavail == 0 and nr_hwcur does not change we are ok + * otherwise we might be in trouble as the buffers are + * changing. + */ + if (new_cur == kring->nr_hwcur && kring->nr_hwavail == 0) { + /* all ok */ + D("+++ NR_REINIT ok on %s RX[%d]", na->ifp->if_xname, n); + } else { + D("+++ NR_REINIT set on %s RX[%d]", na->ifp->if_xname, n); + } + ring->flags |= NR_REINIT; + na->flags |= NR_REINIT; + ring->avail = kring->nr_hwavail = 0; /* no data */ + ring->cur = kring->nr_hwcur = new_cur; + } + + slot = ring->slot; + /* + * Check that buffer indexes are correct. If we find a + * bogus value we are a bit in trouble because we cannot + * recover easily. Best we can do is (probably) persistently + * reset the ring. + */ + for (i = 0; i < kring->nkr_num_slots; i++) { + if (slot[i].buf_idx >= netmap_total_buffers) { + D("invalid buf_idx %d at slot %d", slot[i].buf_idx, i); + slot[i].buf_idx = 0; /* XXX reset */ + } + /* XXX we don't really need to set the length */ + slot[i].len = 0; + } + /* wakeup possible waiters, both on the ring and on the global + * selfd. Perhaps a bit early now but the device specific + * routine is locked so hopefully we won't have a race. + */ + selwakeuppri(&kring->si, PI_NET); + selwakeuppri(&kring[na->num_queues + 1 - n].si, PI_NET); + return kring->ring->slot; +} + +static void +ns_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, + __unused int nseg, __unused int error) +{ +} + +/* unload a bus_dmamap and create a new one. Used when the + * buffer in the slot is changed. + * XXX buflen is probably not needed, buffers have constant size. + */ +void +netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen) +{ + bus_addr_t paddr; + bus_dmamap_unload(tag, map); + bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr, + BUS_DMA_NOWAIT); +} + +void +netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen) +{ + bus_addr_t paddr; + bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr, + BUS_DMA_NOWAIT); +} + +/*------ netmap memory allocator -------*/ +/* + * Request for a chunk of memory. 
+ * + * Memory objects are arranged into a list, hence we need to walk this + * list until we find an object with the needed amount of data free. + * This sounds like a completely inefficient implementation, but given + * the fact that data allocation is done once, we can handle it + * flawlessly. + * + * Return NULL on failure. + */ +static void * +netmap_malloc(size_t size, __unused const char *msg) +{ + struct netmap_mem_obj *mem_obj, *new_mem_obj; + void *ret = NULL; + + NMA_LOCK(); + TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) { + if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size) + continue; + + new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP, + M_WAITOK | M_ZERO); + TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next); + + new_mem_obj->nmo_used = 1; + new_mem_obj->nmo_size = size; + new_mem_obj->nmo_data = mem_obj->nmo_data; + memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size); + + mem_obj->nmo_size -= size; + mem_obj->nmo_data = (char *) mem_obj->nmo_data + size; + if (mem_obj->nmo_size == 0) { + TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, + nmo_next); + free(mem_obj, M_NETMAP); + } + + ret = new_mem_obj->nmo_data; + + break; + } + NMA_UNLOCK(); + ND("%s: %d bytes at %p", msg, size, ret); + + return (ret); +} + +/* + * Return the memory to the allocator. + * + * While freeing a memory object, we try to merge adjacent chunks in + * order to reduce memory fragmentation. + */ +static void +netmap_free(void *addr, const char *msg) +{ + size_t size; + struct netmap_mem_obj *cur, *prev, *next; + + if (addr == NULL) { + D("NULL addr for %s", msg); + return; + } + + NMA_LOCK(); + TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) { + if (cur->nmo_data == addr && cur->nmo_used) + break; + } + if (cur == NULL) { + NMA_UNLOCK(); + D("invalid addr %s %p", msg, addr); + return; + } + + size = cur->nmo_size; + cur->nmo_used = 0; + + /* merge current chunk of memory with the previous one, + if present. */ + prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next); + if (prev && prev->nmo_used == 0) { + TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next); + prev->nmo_size += cur->nmo_size; + free(cur, M_NETMAP); + cur = prev; + } + + /* merge with the next one */ + next = TAILQ_NEXT(cur, nmo_next); + if (next && next->nmo_used == 0) { + TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next); + cur->nmo_size += next->nmo_size; + free(next, M_NETMAP); + } + NMA_UNLOCK(); + ND("freed %s %d bytes at %p", msg, size, addr); +} + + +/* + * Initialize the memory allocator. + * + * Create the descriptor for the memory , allocate the pool of memory + * and initialize the list of memory objects with a single chunk + * containing the whole pre-allocated memory marked as free. + * + * Start with a large size, then halve as needed if we fail to + * allocate the block. While halving, always add one extra page + * because buffers 0 and 1 are used for special purposes. + * Return 0 on success, errno otherwise. 
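+ * As a rough illustrative example (the actual arena size is
+ * NETMAP_MEMORY_SIZE, possibly halved at init time): with a 64 MB
+ * arena, about 1/200 of it (~320 KB) is set aside for netmap_if
+ * and ring structures, and the remainder holds on the order of
+ * 32000 buffers of NETMAP_BUF_SIZE (2 KB) each.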
+ */ +static int +netmap_memory_init(void) +{ + struct netmap_mem_obj *mem_obj; + void *buf = NULL; + int i, n, sz = NETMAP_MEMORY_SIZE; + int extra_sz = 0; // space for rings and two spare buffers + + for (; !buf && sz >= 1<<20; sz >>=1) { + extra_sz = sz/200; + extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1); + buf = contigmalloc(sz + extra_sz, + M_NETMAP, + M_WAITOK | M_ZERO, + 0, /* low address */ + -1UL, /* high address */ + PAGE_SIZE, /* alignment */ + 0 /* boundary */ + ); + } + if (buf == NULL) + return (ENOMEM); + sz += extra_sz; + netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP, + M_WAITOK | M_ZERO); + mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL, + MTX_DEF); + TAILQ_INIT(&netmap_mem_d->nm_molist); + netmap_mem_d->nm_buffer = buf; + netmap_mem_d->nm_totalsize = sz; + + /* + * A buffer takes 2k, a slot takes 8 bytes + ring overhead, + * so the ratio is 200:1. In other words, we can use 1/200 of + * the memory for the rings, and the rest for the buffers, + * and be sure we never run out. + */ + netmap_mem_d->nm_size = sz/200; + netmap_mem_d->nm_buf_start = + (netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); + netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start; + + nm_buf_pool.base = netmap_mem_d->nm_buffer; + nm_buf_pool.base += netmap_mem_d->nm_buf_start; + netmap_buffer_base = nm_buf_pool.base; + D("netmap_buffer_base %p (offset %d)", + netmap_buffer_base, netmap_mem_d->nm_buf_start); + /* number of buffers, they all start as free */ + + netmap_total_buffers = nm_buf_pool.total_buffers = + netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE; + nm_buf_pool.bufsize = NETMAP_BUF_SIZE; + + D("Have %d MB, use %dKB for rings, %d buffers at %p", + (sz >> 20), (netmap_mem_d->nm_size >> 10), + nm_buf_pool.total_buffers, nm_buf_pool.base); + + /* allocate and initialize the bitmap. Entry 0 is considered + * always busy (used as default when there are no buffers left). + */ + n = (nm_buf_pool.total_buffers + 31) / 32; + nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, + M_WAITOK | M_ZERO); + nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */ + for (i = 1; i < n; i++) + nm_buf_pool.bitmap[i] = ~0; + nm_buf_pool.free = nm_buf_pool.total_buffers - 2; + + mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP, + M_WAITOK | M_ZERO); + TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next); + mem_obj->nmo_used = 0; + mem_obj->nmo_size = netmap_mem_d->nm_size; + mem_obj->nmo_data = netmap_mem_d->nm_buffer; + + return (0); +} + + +/* + * Finalize the memory allocator. + * + * Free all the memory objects contained inside the list, and deallocate + * the pool of memory; finally free the memory allocator descriptor. + */ +static void +netmap_memory_fini(void) +{ + struct netmap_mem_obj *mem_obj; + + while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) { + mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist); + TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next); + if (mem_obj->nmo_used == 1) { + printf("netmap: leaked %d bytes at %p\n", + mem_obj->nmo_size, + mem_obj->nmo_data); + } + free(mem_obj, M_NETMAP); + } + contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP); + // XXX mutex_destroy(nm_mtx); + free(netmap_mem_d, M_NETMAP); +} + + +/* + * Module loader. + * + * Create the /dev/netmap device and initialize all global + * variables. + * + * Return 0 on success, errno on failure. 
+ */ +static int +netmap_init(void) +{ + int error; + + + error = netmap_memory_init(); + if (error != 0) { + printf("netmap: unable to initialize the memory allocator."); + return (error); + } + printf("netmap: loaded module with %d Mbytes\n", + netmap_mem_d->nm_totalsize >> 20); + + netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, + "netmap"); + + return (0); +} + + +/* + * Module unloader. + * + * Free all the memory, and destroy the ``/dev/netmap`` device. + */ +static void +netmap_fini(void) +{ + destroy_dev(netmap_dev); + + netmap_memory_fini(); + + printf("netmap: unloaded module.\n"); +} + + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h new file mode 100644 index 000000000000..5434609c447b --- /dev/null +++ b/sys/dev/netmap/netmap_kern.h @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap_kern.h 9662 2011-11-16 13:18:06Z luigi $ + * + * The header contains the definitions of constants and function + * prototypes used only in kernelspace. + */ + +#ifndef _NET_NETMAP_KERN_H_ +#define _NET_NETMAP_KERN_H_ + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_NETMAP); +#endif + +#define ND(format, ...) +#define D(format, ...) \ + do { \ + struct timeval __xxts; \ + microtime(&__xxts); \ + printf("%03d.%06d %s [%d] " format "\n",\ + (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +struct netmap_adapter; + +/* + * private, kernel view of a ring. + * + * XXX 20110627-todo + * The index in the NIC and netmap ring is offset by nkr_hwofs slots. + * This is so that, on a reset, buffers owned by userspace are not + * modified by the kernel. 
In particular: + * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides + * the next empty buffer as known by the hardware (next_to_check or so). + * TX rings: hwcur + hwofs coincides with next_to_send + */ +struct netmap_kring { + struct netmap_ring *ring; + u_int nr_hwcur; + int nr_hwavail; + u_int nr_kflags; + u_int nkr_num_slots; + + u_int nkr_hwofs; /* offset between NIC and netmap ring */ + struct netmap_adapter *na; // debugging + struct selinfo si; /* poll/select wait queue */ +}; + +/* + * This struct is part of and extends the 'struct adapter' (or + * equivalent) device descriptor. It contains all fields needed to + * support netmap operation. + */ +struct netmap_adapter { + int refcount; /* number of user-space descriptors using this + interface, which is equal to the number of + struct netmap_if objs in the mapped region. */ + + int separate_locks; /* set if the interface suports different + locks for rx, tx and core. */ + + u_int num_queues; /* number of tx/rx queue pairs: this is + a duplicate field needed to simplify the + signature of ``netmap_detach``. */ + + u_int num_tx_desc; /* number of descriptor in each queue */ + u_int num_rx_desc; + u_int buff_size; + + u_int flags; /* NR_REINIT */ + /* tx_rings and rx_rings are private but allocated + * as a contiguous chunk of memory. Each array has + * N+1 entries, for the adapter queues and for the host queue. + */ + struct netmap_kring *tx_rings; /* array of TX rings. */ + struct netmap_kring *rx_rings; /* array of RX rings. */ + + /* copy of if_qflush and if_transmit pointers, to intercept + * packets from the network stack when netmap is active. + * XXX probably if_qflush is not necessary. + */ + void (*if_qflush)(struct ifnet *); + int (*if_transmit)(struct ifnet *, struct mbuf *); + + /* references to the ifnet and device routines, used by + * the generic netmap functions. + */ + struct ifnet *ifp; /* adapter is ifp->if_softc */ + + int (*nm_register)(struct ifnet *, int onoff); + void (*nm_lock)(void *, int what, u_int ringid); + int (*nm_txsync)(void *, u_int ring, int lock); + int (*nm_rxsync)(void *, u_int ring, int lock); +}; + +/* + * The combination of "enable" (ifp->if_capabilities &IFCAP_NETMAP) + * and refcount gives the status of the interface, namely: + * + * enable refcount Status + * + * FALSE 0 normal operation + * FALSE != 0 -- (impossible) + * TRUE 1 netmap mode + * TRUE 0 being deleted. + */ + +#define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \ + ( (_na)->ifp->if_capenable & IFCAP_NETMAP) ) + +/* + * parameters for (*nm_lock)(adapter, what, index) + */ +enum { + NETMAP_NO_LOCK = 0, + NETMAP_CORE_LOCK, NETMAP_CORE_UNLOCK, + NETMAP_TX_LOCK, NETMAP_TX_UNLOCK, + NETMAP_RX_LOCK, NETMAP_RX_UNLOCK, +}; + +/* + * The following are support routines used by individual drivers to + * support netmap operation. + * + * netmap_attach() initializes a struct netmap_adapter, allocating the + * struct netmap_ring's and the struct selinfo. + * + * netmap_detach() frees the memory allocated by netmap_attach(). + * + * netmap_start() replaces the if_transmit routine of the interface, + * and is used to intercept packets coming from the stack. + * + * netmap_load_map/netmap_reload_map are helper routines to set/reset + * the dmamap for a packet buffer + * + * netmap_reset() is a helper routine to be called in the driver + * when reinitializing a ring. 
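+ *
+ * At attach time a driver typically fills a struct netmap_adapter
+ * on the stack and hands it to netmap_attach(), which copies it.
+ * A minimal sketch for a hypothetical "foo" driver (the foo_* and
+ * sc-> names are placeholders, not part of this commit):
+ *
+ *	struct netmap_adapter na;
+ *
+ *	bzero(&na, sizeof(na));
+ *	na.ifp = sc->ifp;
+ *	na.separate_locks = 0;
+ *	na.num_tx_desc = sc->num_tx_desc;
+ *	na.num_rx_desc = sc->num_rx_desc;
+ *	na.nm_register = foo_netmap_reg;
+ *	na.nm_lock = foo_netmap_lock;
+ *	na.nm_txsync = foo_netmap_txsync;
+ *	na.nm_rxsync = foo_netmap_rxsync;
+ *	netmap_attach(&na, num_queues);
+ *
+ * and calls netmap_detach(ifp) from its detach routine.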
+ */ +int netmap_attach(struct netmap_adapter *, int); +void netmap_detach(struct ifnet *); +int netmap_start(struct ifnet *, struct mbuf *); +enum txrx { NR_RX = 0, NR_TX = 1 }; +struct netmap_slot *netmap_reset(struct netmap_adapter *na, + enum txrx tx, int n, u_int new_cur); +void netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen); +void netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen); +int netmap_ring_reinit(struct netmap_kring *); + +/* + * XXX eventually, get rid of netmap_total_buffers and netmap_buffer_base + * in favour of the structure + */ +// struct netmap_buf_pool; +// extern struct netmap_buf_pool nm_buf_pool; +extern u_int netmap_total_buffers; +extern char *netmap_buffer_base; +extern int netmap_verbose; // XXX debugging +enum { /* verbose flags */ + NM_VERB_ON = 1, /* generic verbose */ + NM_VERB_HOST = 0x2, /* verbose host stack */ + NM_VERB_RXSYNC = 0x10, /* verbose on rxsync/txsync */ + NM_VERB_TXSYNC = 0x20, + NM_VERB_RXINTR = 0x100, /* verbose on rx/tx intr (driver) */ + NM_VERB_TXINTR = 0x200, + NM_VERB_NIC_RXSYNC = 0x1000, /* verbose on rx/tx intr (driver) */ + NM_VERB_NIC_TXSYNC = 0x2000, +}; + +/* + * return a pointer to the struct netmap adapter from the ifp + */ +#define NA(_ifp) ((struct netmap_adapter *)(_ifp)->if_pspare[0]) + + +/* + * return the address of a buffer. + * XXX this is a special version with hardwired 2k bufs + * On error return netmap_buffer_base which is detected as a bad pointer. + */ +static inline char * +NMB(struct netmap_slot *slot) +{ + uint32_t i = slot->buf_idx; + return (i >= netmap_total_buffers) ? netmap_buffer_base : +#if NETMAP_BUF_SIZE == 2048 + netmap_buffer_base + (i << 11); +#else + netmap_buffer_base + (i *NETMAP_BUF_SIZE); +#endif +} + +#endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/net/netmap.h b/sys/net/netmap.h new file mode 100644 index 000000000000..be9c686a49ed --- /dev/null +++ b/sys/net/netmap.h @@ -0,0 +1,281 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * 3. Neither the name of the authors nor the names of their contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * This header contains the definitions of the constants and the + * structures needed by the ``netmap'' module, both kernel and + * userspace. + */ + +#ifndef _NET_NETMAP_H_ +#define _NET_NETMAP_H_ + +/* + * --- Netmap data structures --- + * + * The data structures used by netmap are shown below. Those in + * capital letters are in an mmapp()ed area shared with userspace, + * while others are private to the kernel. + * Shared structures do not contain pointers but only relative + * offsets, so that addressing is portable between kernel and userspace. + * + * The 'softc' of each interface is extended with a struct netmap_adapter + * containing information to support netmap operation. In addition to + * the fixed fields, it has two pointers to reach the arrays of + * 'struct netmap_kring' which in turn reaches the various + * struct netmap_ring, shared with userspace. + + + softc ++----------------+ +| standard fields| +| if_pspare[0] ----------+ ++----------------+ | + | ++----------------+<------+ +|(netmap_adapter)| +| | netmap_kring +| tx_rings *--------------------------------->+-------------+ +| | netmap_kring | ring *---------> ... +| rx_rings *---------->+--------------+ | nr_hwcur | ++----------------+ | ring *-------+ | nr_hwavail | + | nr_hwcur | | | selinfo | + | nr_hwavail | | +-------------+ + | selinfo | | | ... | + +--------------+ | (na_num_rings+1 entries) + | .... | | | | + (na_num_rings+1 entries) +-------------+ + | | | + +--------------+ | + | NETMAP_RING + +---->+-------------+ + / | cur | + NETMAP_IF (nifp, one per file desc.) / | avail | + +---------------+ / | buf_ofs | + | ni_num_queues | / +=============+ + | | / | buf_idx | slot[0] + | | / | len, flags | + | | / +-------------+ + +===============+ / | buf_idx | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | len, flags | + | txring_ofs[1] | +-------------+ + (num_rings+1 entries) (nr_num_slots entries) + | txring_ofs[n] | | buf_idx | slot[n-1] + +---------------+ | len, flags | + | rxring_ofs[0] | +-------------+ + | rxring_ofs[1] | + (num_rings+1 entries) + | txring_ofs[n] | + +---------------+ + + * The NETMAP_RING is the shadow ring that mirrors the NIC rings. + * Each slot has the index of a buffer, its length and some flags. + * In user space, the buffer address is computed as + * (char *)ring + buf_ofs + index*MAX_BUF_SIZE + * In the kernel, buffers do not necessarily need to be contiguous, + * and the virtual and physical addresses are derived through + * a lookup table. When userspace wants to use a different buffer + * in a location, it must set the NS_BUF_CHANGED flag to make + * sure that the kernel recomputes updates the hardware ring and + * other fields (bus_dmamap, etc.) as needed. + * + * Normally the driver is not requested to report the result of + * transmissions (this can dramatically speed up operation). + * However the user may request to report completion by setting + * NS_REPORT. 
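+ *
+ * A minimal transmit loop from userspace (illustrative only;
+ * "ring" is a pointer obtained with NETMAP_TXRING(), "fd" is the
+ * /dev/netmap descriptor, and build_packet() is a placeholder
+ * that writes a frame into the buffer and returns its length):
+ *
+ *	while (ring->avail > 0) {
+ *		struct netmap_slot *slot = &ring->slot[ring->cur];
+ *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
+ *
+ *		slot->len = build_packet(buf);
+ *		ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
+ *		ring->avail--;
+ *	}
+ *	ioctl(fd, NIOCTXSYNC, NULL);	// tell the NIC to send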
+ */ +struct netmap_slot { + uint32_t buf_idx; /* buffer index */ + uint16_t len; /* packet length, to be copied to/from the hw ring */ + uint16_t flags; /* buf changed, etc. */ +#define NS_BUF_CHANGED 0x0001 /* must resync the map, buffer changed */ +#define NS_REPORT 0x0002 /* ask the hardware to report results + * e.g. by generating an interrupt + */ +}; + +/* + * Netmap representation of a TX or RX ring (also known as "queue"). + * This is a queue implemented as a fixed-size circular array. + * At the software level, two fields are important: avail and cur. + * + * In TX rings: + * avail indicates the number of slots available for transmission. + * It is decremented by the application when it appends a + * packet, and set to nr_hwavail (see below) on a + * NIOCTXSYNC to reflect the actual state of the queue + * (keeping track of completed transmissions). + * cur indicates the empty slot to use for the next packet + * to send (i.e. the "tail" of the queue). + * It is incremented by the application. + * + * The kernel side of netmap uses two additional fields in its own + * private ring structure, netmap_kring: + * nr_hwcur is a copy of nr_cur on an NIOCTXSYNC. + * nr_hwavail is the number of slots known as available by the + * hardware. It is updated on an INTR (inc by the + * number of packets sent) and on a NIOCTXSYNC + * (decrease by nr_cur - nr_hwcur) + * A special case, nr_hwavail is -1 if the transmit + * side is idle (no pending transmits). + * + * In RX rings: + * avail is the number of packets available (possibly 0). + * It is decremented by the software when it consumes + * a packet, and set to nr_hwavail on a NIOCRXSYNC + * cur indicates the first slot that contains a packet + * (the "head" of the queue). + * It is incremented by the software when it consumes + * a packet. + * + * The kernel side of netmap uses two additional fields in the kring: + * nr_hwcur is a copy of nr_cur on an NIOCRXSYNC + * nr_hwavail is the number of packets available. It is updated + * on INTR (inc by the number of new packets arrived) + * and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur). + * + * DATA OWNERSHIP/LOCKING: + * The netmap_ring is owned by the user program and it is only + * accessed or modified in the upper half of the kernel during + * a system call. + * + * The netmap_kring is only modified by the upper half of the kernel. + */ +struct netmap_ring { + /* + * nr_buf_base_ofs is meant to be used through macros. + * It contains the offset of the buffer region from this + * descriptor. + */ + const ssize_t buf_ofs; + const uint32_t num_slots; /* number of slots in the ring. */ + uint32_t avail; /* number of usable slots */ + uint32_t cur; /* 'current' r/w position */ + + const uint16_t nr_buf_size; + uint16_t flags; + /* + * When a ring is reinitialized, the kernel sets kflags. + * On exit from a syscall, if the flag is found set, we + * also reinitialize the nr_* variables. The kflag is then + * unconditionally copied to nr_flags and cleared. + */ +#define NR_REINIT 0x0001 /* ring reinitialized! */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + + struct timeval ts; /* time of last *sync() */ + + /* the slots follow. This struct has variable size */ + struct netmap_slot slot[0]; /* array of slots. */ +}; + + +/* + * Netmap representation of an interface and its queue(s). + * There is one netmap_if for each file descriptor on which we want + * to select/poll. We assume that on each interface has the same number + * of receive and transmit queues. 
+ * select/poll operates on one or all pairs depending on the value of + * nmr_queueid passed on the ioctl. + */ +struct netmap_if { + char ni_name[IFNAMSIZ]; /* name of the interface. */ + const u_int ni_version; /* API version, currently unused */ + const u_int ni_num_queues; /* number of queue pairs (TX/RX). */ + const u_int ni_rx_queues; /* if zero, use ni_num_queues */ + /* + * the following array contains the offset of the + * each netmap ring from this structure. The first num_queues+1 + * refer to the tx rings, the next n+1 refer to the rx rings. + * The area is filled up by the kernel on NIOCREG, + * and then only read by userspace code. + * entries 0..ni_num_queues-1 indicate the hardware queues, + * entry ni_num_queues is the queue from/to the stack. + */ + const ssize_t ring_ofs[0]; +}; + +#ifndef IFCAP_NETMAP /* this should go in net/if.h */ +#define IFCAP_NETMAP 0x100000 +#endif + +#ifndef NIOCREGIF +/* + * ioctl names and related fields + * + * NIOCGINFO takes a struct ifreq, the interface name is the input, + * the outputs are number of queues and number of descriptor + * for each queue (useful to set number of threads etc.). + * + * NIOCREGIF takes an interface name within a struct ifreq, + * and activates netmap mode on the interface (if possible). + * + * NIOCUNREGIF unregisters the interface associated to the fd. + * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid + */ + +/* + * struct nmreq overlays a struct ifreq + */ +struct nmreq { + char nr_name[IFNAMSIZ]; + uint32_t nr_version; /* API version (unused) */ + uint32_t nr_offset; /* nifp offset in the shared region */ + uint32_t nr_memsize; /* size of the shared region */ + uint32_t nr_numslots; /* descriptors per queue */ + uint16_t nr_numrings; + uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ +#define NETMAP_SW_RING 0x2000 /* we process the sw ring */ +#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */ +#define NETMAP_RING_MASK 0xfff /* the ring number */ +}; + +/* + * default buf size is 2048, but it may make sense to have + * it shorter for better cache usage. + */ + +#define NETMAP_BUF_SIZE (2048) +#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ +#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ +#define NIOCUNREGIF _IO('i', 147) /* interface unregister */ +#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ +#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ +#endif /* !NIOCREGIF */ + +#endif /* _NET_NETMAP_H_ */ diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h new file mode 100644 index 000000000000..c9443b89e43f --- /dev/null +++ b/sys/net/netmap_user.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * 3. 
Neither the name of the authors nor the names of their contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap_user.h 9495 2011-10-18 15:28:23Z luigi $ + * + * This header contains the macros used to manipulate netmap structures + * and packets in userspace. See netmap(4) for more information. + * + * The address of the struct netmap_if, say nifp, is determined + * by the value returned from ioctl(.., NIOCREG, ...) and the mmap + * region: + * ioctl(fd, NIOCREG, &req); + * mem = mmap(0, ... ); + * nifp = NETMAP_IF(mem, req.nr_nifp); + * (so simple, we could just do it manually) + * + * From there: + * struct netmap_ring *NETMAP_TXRING(nifp, index) + * struct netmap_ring *NETMAP_RXRING(nifp, index) + * we can access ring->nr_cur, ring->nr_avail, ring->nr_flags + * + * ring->slot[i] gives us the i-th slot (we can access + * directly plen, flags, bufindex) + * + * char *buf = NETMAP_BUF(ring, index) returns a pointer to + * the i-th buffer + * + * Since rings are circular, we have macros to compute the next index + * i = NETMAP_RING_NEXT(ring, i); + */ + +#ifndef _NET_NETMAP_USER_H_ +#define _NET_NETMAP_USER_H_ + +#define NETMAP_IF(b, o) (struct netmap_if *)((char *)(b) + (o)) + +#define NETMAP_TXRING(nifp, index) \ + ((struct netmap_ring *)((char *)(nifp) + \ + (nifp)->ring_ofs[index] ) ) + +#define NETMAP_RXRING(nifp, index) \ + ((struct netmap_ring *)((char *)(nifp) + \ + (nifp)->ring_ofs[index + (nifp)->ni_num_queues+1] ) ) + +#if NETMAP_BUF_SIZE != 2048 +#error cannot handle odd size +#define NETMAP_BUF(ring, index) \ + ((char *)(ring) + (ring)->buf_ofs + ((index)*NETMAP_BUF_SIZE)) +#else +#define NETMAP_BUF(ring, index) \ + ((char *)(ring) + (ring)->buf_ofs + ((index)<<11)) +#endif + +#define NETMAP_RING_NEXT(r, i) \ + ((i)+1 == (r)->num_slots ? 0 : (i) + 1 ) + +/* + * Return 1 if the given tx ring is empty. + * + * @r netmap_ring descriptor pointer. + * Special case, a negative value in hwavail indicates that the + * transmit queue is idle. + * XXX revise + */ +#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1) + +#endif /* _NET_NETMAP_USER_H_ */ diff --git a/tools/tools/README b/tools/tools/README index 253b2e08f739..9c3db2fc5364 100644 --- a/tools/tools/README +++ b/tools/tools/README @@ -50,6 +50,7 @@ mfc Merge a directory from HEAD to a branch where it does not mid Create a Message-ID database for mailing lists. mwl Tools specific to the Marvell 88W8363 support ncpus Count the number of processors +netmap Test applications for netmap(4) notescheck Check for missing devices and options in NOTES files. 
npe Tools specific to the Intel IXP4XXX NPE device nxge A diagnostic tool for the nxge(4) driver diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile new file mode 100644 index 000000000000..4b682e52a311 --- /dev/null +++ b/tools/tools/netmap/Makefile @@ -0,0 +1,25 @@ +# +# $FreeBSD$ +# +# For multiple programs using a single source file each, +# we can just define 'progs' and create custom targets. +PROGS = pkt-gen bridge testpcap libnetmap.so + +CLEANFILES = $(PROGS) pcap.o +NO_MAN= +CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys +CFLAGS += -Wextra + +LDFLAGS += -lpthread -lpcap + +.include +.include + +all: $(PROGS) + +testpcap: pcap.c libnetmap.so + $(CC) $(CFLAGS) -L. -lnetmap -o ${.TARGET} pcap.c + +libnetmap.so: pcap.c + $(CC) $(CFLAGS) -fpic -c ${.ALLSRC} + $(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o} diff --git a/tools/tools/netmap/README b/tools/tools/netmap/README new file mode 100644 index 000000000000..9a1ba6096188 --- /dev/null +++ b/tools/tools/netmap/README @@ -0,0 +1,11 @@ +$FreeBSD$ + +This directory contains examples that use netmap + + pkt-gen a packet sink/source using the netmap API + + bridge a two-port jumper wire, also using the native API + + testpcap a jumper wire using libnetmap (or libpcap) + + click* various click examples diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c new file mode 100644 index 000000000000..2385a0811fb5 --- /dev/null +++ b/tools/tools/netmap/bridge.c @@ -0,0 +1,456 @@ +/* + * (C) 2011 Luigi Rizzo, Matteo Landi + * + * BSD license + * + * A netmap client to bridge two network interfaces + * (or one interface and the host stack). + * + * $FreeBSD$ + */ + +#include +#include /* signal */ +#include +#include +#include /* strcmp */ +#include /* open */ +#include /* close */ + +#include /* le64toh */ +#include /* PROT_* */ +#include /* ioctl */ +#include +#include +#include /* sockaddr.. */ +#include /* ntohs */ + +#include /* ifreq */ +#include +#include +#include + +#include /* sockaddr_in */ + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +int verbose = 0; + +/* debug support */ +#define ND(format, ...) {} +#define D(format, ...) 
do { \ + if (!verbose) break; \ + struct timeval _xxts; \ + gettimeofday(&_xxts, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " format "\n", \ + (int)_xxts.tv_sec %1000, (int)_xxts.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + + +char *version = "$Id: bridge.c 9642 2011-11-07 21:39:47Z luigi $"; + +static int do_abort = 0; + +/* + * info on a ring we handle + */ +struct my_ring { + const char *ifname; + int fd; + char *mem; /* userspace mmap address */ + u_int memsize; + u_int queueid; + u_int begin, end; /* first..last+1 rings to check */ + struct netmap_if *nifp; + struct netmap_ring *tx, *rx; /* shortcuts */ + + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; +}; + +static void +sigint_h(__unused int sig) +{ + do_abort = 1; + signal(SIGINT, SIG_DFL); +} + + +static int +do_ioctl(struct my_ring *me, int what) +{ + struct ifreq ifr; + int error; + + bzero(&ifr, sizeof(ifr)); + strncpy(ifr.ifr_name, me->ifname, sizeof(ifr.ifr_name)); + switch (what) { + case SIOCSIFFLAGS: + ifr.ifr_flagshigh = me->if_flags >> 16; + ifr.ifr_flags = me->if_flags & 0xffff; + break; + case SIOCSIFCAP: + ifr.ifr_reqcap = me->if_reqcap; + ifr.ifr_curcap = me->if_curcap; + break; + } + error = ioctl(me->fd, what, &ifr); + if (error) { + D("ioctl error %d", what); + return error; + } + switch (what) { + case SIOCGIFFLAGS: + me->if_flags = (ifr.ifr_flagshigh << 16) | + (0xffff & ifr.ifr_flags); + if (verbose) + D("flags are 0x%x", me->if_flags); + break; + + case SIOCGIFCAP: + me->if_reqcap = ifr.ifr_reqcap; + me->if_curcap = ifr.ifr_curcap; + if (verbose) + D("curcap are 0x%x", me->if_curcap); + break; + } + return 0; +} + +/* + * open a device. if me->mem is null then do an mmap. + */ +static int +netmap_open(struct my_ring *me, int ringid) +{ + int fd, err, l; + struct nmreq req; + + me->fd = fd = open("/dev/netmap", O_RDWR); + if (fd < 0) { + D("Unable to open /dev/netmap"); + return (-1); + } + bzero(&req, sizeof(req)); + strncpy(req.nr_name, me->ifname, sizeof(req.nr_name)); + req.nr_ringid = ringid; + err = ioctl(fd, NIOCGINFO, &req); + if (err) { + D("cannot get info on %s", me->ifname); + goto error; + } + me->memsize = l = req.nr_memsize; + if (verbose) + D("memsize is %d MB", l>>20); + err = ioctl(fd, NIOCREGIF, &req); + if (err) { + D("Unable to register %s", me->ifname); + goto error; + } + + if (me->mem == NULL) { + me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + if (me->mem == MAP_FAILED) { + D("Unable to mmap"); + me->mem = NULL; + goto error; + } + } + + me->nifp = NETMAP_IF(me->mem, req.nr_offset); + me->queueid = ringid; + if (ringid & NETMAP_SW_RING) { + me->begin = req.nr_numrings; + me->end = me->begin + 1; + } else if (ringid & NETMAP_HW_RING) { + me->begin = ringid & NETMAP_RING_MASK; + me->end = me->begin + 1; + } else { + me->begin = 0; + me->end = req.nr_numrings; + } + me->tx = NETMAP_TXRING(me->nifp, me->begin); + me->rx = NETMAP_RXRING(me->nifp, me->begin); + return (0); +error: + close(me->fd); + return -1; +} + + +static int +netmap_close(struct my_ring *me) +{ + D(""); + if (me->mem) + munmap(me->mem, me->memsize); + ioctl(me->fd, NIOCUNREGIF, NULL); + close(me->fd); + return (0); +} + + +/* + * move up to 'limit' pkts from rxring to txring swapping buffers. + */ +static int +process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, + u_int limit, const char *msg) +{ + u_int j, k, m = 0; + + /* print a warning if any of the ring flags is set (e.g. 
NM_REINIT) */ + if (rxring->flags || txring->flags) + D("%s rxflags %x txflags %x", + msg, rxring->flags, txring->flags); + j = rxring->cur; /* RX */ + k = txring->cur; /* TX */ + if (rxring->avail < limit) + limit = rxring->avail; + if (txring->avail < limit) + limit = txring->avail; + m = limit; + while (limit-- > 0) { + struct netmap_slot *rs = &rxring->slot[j]; + struct netmap_slot *ts = &txring->slot[k]; + uint32_t pkt; + + /* swap packets */ + if (ts->buf_idx < 2 || rs->buf_idx < 2) { + D("wrong index rx[%d] = %d -> tx[%d] = %d", + j, rs->buf_idx, k, ts->buf_idx); + sleep(2); + } + pkt = ts->buf_idx; + ts->buf_idx = rs->buf_idx; + rs->buf_idx = pkt; + + /* copy the packet lenght. */ + if (rs->len < 14 || rs->len > 2048) + D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k); + else if (verbose > 1) + D("send len %d rx[%d] -> tx[%d]", rs->len, j, k); + ts->len = rs->len; + + /* report the buffer change. */ + ts->flags |= NS_BUF_CHANGED; + rs->flags |= NS_BUF_CHANGED; + j = NETMAP_RING_NEXT(rxring, j); + k = NETMAP_RING_NEXT(txring, k); + } + rxring->avail -= m; + txring->avail -= m; + rxring->cur = j; + txring->cur = k; + if (verbose && m > 0) + D("sent %d packets to %p", m, txring); + + return (m); +} + +/* move packts from src to destination */ +static int +move(struct my_ring *src, struct my_ring *dst, u_int limit) +{ + struct netmap_ring *txring, *rxring; + u_int m = 0, si = src->begin, di = dst->begin; + const char *msg = (src->queueid & NETMAP_SW_RING) ? + "host->net" : "net->host"; + + while (si < src->end && di < dst->end) { + rxring = NETMAP_RXRING(src->nifp, si); + txring = NETMAP_TXRING(dst->nifp, di); + ND("txring %p rxring %p", txring, rxring); + if (rxring->avail == 0) { + si++; + continue; + } + if (txring->avail == 0) { + di++; + continue; + } + m += process_rings(rxring, txring, limit, msg); + } + + return (m); +} + +/* + * how many packets on this set of queues ? + */ +static int +howmany(struct my_ring *me, int tx) +{ + u_int i, tot = 0; + + ND("me %p begin %d end %d", me, me->begin, me->end); + for (i = me->begin; i < me->end; i++) { + struct netmap_ring *ring = tx ? + NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); + tot += ring->avail; + } + if (0 && verbose && tot && !tx) + D("ring %s %s %s has %d avail at %d", + me->ifname, tx ? "tx": "rx", + me->end > me->nifp->ni_num_queues ? + "host":"net", + tot, NETMAP_TXRING(me->nifp, me->begin)->cur); + return tot; +} + +/* + * bridge [-v] if1 [if2] + * + * If only one name, or the two interfaces are the same, + * bridges userland and the adapter. Otherwise bridge + * two intefaces. + */ +int +main(int argc, char **argv) +{ + struct pollfd pollfd[2]; + int i; + u_int burst = 1024; + struct my_ring me[2]; + + fprintf(stderr, "%s %s built %s %s\n", + argv[0], version, __DATE__, __TIME__); + + bzero(me, sizeof(me)); + + while (argc > 1 && !strcmp(argv[1], "-v")) { + verbose++; + argv++; + argc--; + } + + if (argc < 2 || argc > 4) { + D("Usage: %s IFNAME1 [IFNAME2 [BURST]]", argv[0]); + return (1); + } + + /* setup netmap interface #1. */ + me[0].ifname = argv[1]; + if (argc == 2 || !strcmp(argv[1], argv[2])) { + D("same interface, endpoint 0 goes to host"); + i = NETMAP_SW_RING; + me[1].ifname = argv[1]; + } else { + /* two different interfaces. 
Take all rings on if1 */ + i = 0; // all hw rings + me[1].ifname = argv[2]; + } + if (netmap_open(me, i)) + return (1); + me[1].mem = me[0].mem; /* copy the pointer, so only one mmap */ + if (netmap_open(me+1, 0)) + return (1); + + /* if bridging two interfaces, set promisc mode */ + if (i != NETMAP_SW_RING) { + do_ioctl(me, SIOCGIFFLAGS); + if ((me[0].if_flags & IFF_UP) == 0) { + D("%s is down, bringing up...", me[0].ifname); + me[0].if_flags |= IFF_UP; + } + me[0].if_flags |= IFF_PPROMISC; + do_ioctl(me, SIOCSIFFLAGS); + + do_ioctl(me+1, SIOCGIFFLAGS); + me[1].if_flags |= IFF_PPROMISC; + do_ioctl(me+1, SIOCSIFFLAGS); + + /* also disable checksums etc. */ + do_ioctl(me, SIOCGIFCAP); + me[0].if_reqcap = me[0].if_curcap; + me[0].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); + do_ioctl(me+0, SIOCSIFCAP); + } + do_ioctl(me+1, SIOCGIFFLAGS); + if ((me[1].if_flags & IFF_UP) == 0) { + D("%s is down, bringing up...", me[1].ifname); + me[1].if_flags |= IFF_UP; + } + do_ioctl(me+1, SIOCSIFFLAGS); + + do_ioctl(me+1, SIOCGIFCAP); + me[1].if_reqcap = me[1].if_curcap; + me[1].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); + do_ioctl(me+1, SIOCSIFCAP); + if (argc > 3) + burst = atoi(argv[3]); /* packets burst size. */ + + /* setup poll(2) variables. */ + memset(pollfd, 0, sizeof(pollfd)); + for (i = 0; i < 2; i++) { + pollfd[i].fd = me[i].fd; + pollfd[i].events = (POLLIN); + } + + D("Wait 2 secs for link to come up..."); + sleep(2); + D("Ready to go, %s 0x%x/%d <-> %s 0x%x/%d.", + me[0].ifname, me[0].queueid, me[0].nifp->ni_num_queues, + me[1].ifname, me[1].queueid, me[1].nifp->ni_num_queues); + + /* main loop */ + signal(SIGINT, sigint_h); + while (!do_abort) { + int n0, n1, ret; + pollfd[0].events = pollfd[1].events = 0; + pollfd[0].revents = pollfd[1].revents = 0; + n0 = howmany(me, 0); + n1 = howmany(me + 1, 0); + if (n0) + pollfd[1].events |= POLLOUT; + else + pollfd[0].events |= POLLIN; + if (n1) + pollfd[0].events |= POLLOUT; + else + pollfd[1].events |= POLLIN; + ret = poll(pollfd, 2, 2500); + if (ret <= 0 || verbose) + D("poll %s [0] ev %x %x rx %d@%d tx %d," + " [1] ev %x %x rx %d@%d tx %d", + ret <= 0 ? 
"timeout" : "ok", + pollfd[0].events, + pollfd[0].revents, + howmany(me, 0), + me[0].rx->cur, + howmany(me, 1), + pollfd[1].events, + pollfd[1].revents, + howmany(me+1, 0), + me[1].rx->cur, + howmany(me+1, 1) + ); + if (ret < 0) + continue; + if (pollfd[0].revents & POLLERR) { + D("error on fd0, rxcur %d@%d", + me[0].rx->avail, me[0].rx->cur); + } + if (pollfd[1].revents & POLLERR) { + D("error on fd1, rxcur %d@%d", + me[1].rx->avail, me[1].rx->cur); + } + if (pollfd[0].revents & POLLOUT) { + move(me + 1, me, burst); + // XXX we don't need the ioctl */ + // ioctl(me[0].fd, NIOCTXSYNC, NULL); + } + if (pollfd[1].revents & POLLOUT) { + move(me, me + 1, burst); + // XXX we don't need the ioctl */ + // ioctl(me[1].fd, NIOCTXSYNC, NULL); + } + } + D("exiting"); + netmap_close(me + 1); + netmap_close(me + 0); + + return (0); +} diff --git a/tools/tools/netmap/click-test.cfg b/tools/tools/netmap/click-test.cfg new file mode 100644 index 000000000000..fc5759f88b1e --- /dev/null +++ b/tools/tools/netmap/click-test.cfg @@ -0,0 +1,19 @@ +// +// $FreeBSD$ +// +// A sample test configuration for click +// +// +// create a switch + +myswitch :: EtherSwitch; + +// two input devices + +c0 :: FromDevice(ix0, PROMISC true); +c1 :: FromDevice(ix1, PROMISC true); + +// and now pass packets around + +c0[0] -> [0]sw[0] -> Queue(10000) -> ToDevice(ix0); +c1[0] -> [1]sw[1] -> Queue(10000) -> ToDevice(ix1); diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c new file mode 100644 index 000000000000..f010b839bfb2 --- /dev/null +++ b/tools/tools/netmap/pcap.c @@ -0,0 +1,761 @@ +/* + * (C) 2011 Luigi Rizzo + * + * BSD license + * + * A simple library that maps some pcap functions onto netmap + * This is not 100% complete but enough to let tcpdump, trafshow + * and other apps work. + * + * $FreeBSD$ + */ + +#include +#include /* signal */ +#include +#include +#include /* strcmp */ +#include /* open */ +#include /* close */ + +#include /* le64toh */ +#include /* PROT_* */ +#include /* ioctl */ +#include +#include +#include /* sockaddr.. */ +#include /* ntohs */ + +#include /* ifreq */ +#include +#include +#include + +#include /* sockaddr_in */ + +#include +#include + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +char *version = "$Id$"; +int verbose = 0; + +/* debug support */ +#define ND(format, ...) do {} while (0) +#define D(format, ...) do { \ + if (verbose) \ + fprintf(stderr, "--- %s [%d] " format "\n", \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + + +/* + * We redefine here a number of structures that are in pcap.h + * so we can compile this file without the system header. + */ +#ifndef PCAP_ERRBUF_SIZE +#define PCAP_ERRBUF_SIZE 128 + +/* + * Each packet is accompanied by a header including the timestamp, + * captured size and actual size. + */ +struct pcap_pkthdr { + struct timeval ts; /* time stamp */ + uint32_t caplen; /* length of portion present */ + uint32_t len; /* length this packet (off wire) */ +}; + +typedef struct pcap_if pcap_if_t; + +/* + * Representation of an interface address. + */ +struct pcap_addr { + struct pcap_addr *next; + struct sockaddr *addr; /* address */ + struct sockaddr *netmask; /* netmask for the above */ + struct sockaddr *broadaddr; /* broadcast addr for the above */ + struct sockaddr *dstaddr; /* P2P dest. 
address for the above */ +}; + +struct pcap_if { + struct pcap_if *next; + char *name; /* name to hand to "pcap_open_live()" */ + char *description; /* textual description of interface, or NULL */ + struct pcap_addr *addresses; + uint32_t flags; /* PCAP_IF_ interface flags */ +}; + +/* + * We do not support stats (yet) + */ +struct pcap_stat { + u_int ps_recv; /* number of packets received */ + u_int ps_drop; /* number of packets dropped */ + u_int ps_ifdrop; /* drops by interface XXX not yet supported */ +#ifdef WIN32 + u_int bs_capt; /* number of packets that reach the app. */ +#endif /* WIN32 */ +}; + +typedef void pcap_t; +typedef enum { + PCAP_D_INOUT = 0, + PCAP_D_IN, + PCAP_D_OUT +} pcap_direction_t; + + + +typedef void (*pcap_handler)(u_char *user, + const struct pcap_pkthdr *h, const u_char *bytes); + +char errbuf[PCAP_ERRBUF_SIZE]; + +pcap_t *pcap_open_live(const char *device, int snaplen, + int promisc, int to_ms, char *errbuf); + +int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf); +void pcap_close(pcap_t *p); +int pcap_get_selectable_fd(pcap_t *p); +int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user); +int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf); +int pcap_setdirection(pcap_t *p, pcap_direction_t d); +char *pcap_lookupdev(char *errbuf); +int pcap_inject(pcap_t *p, const void *buf, size_t size); +int pcap_fileno(pcap_t *p); + +struct eproto { + const char *s; + u_short p; +}; +#endif /* !PCAP_ERRBUF_SIZE */ + +#ifdef __PIC__ +/* + * build as a shared library + */ + +char pcap_version[] = "libnetmap version 0.3"; + +/* + * Our equivalent of pcap_t + */ +struct my_ring { + struct nmreq nmr; + + int fd; + char *mem; /* userspace mmap address */ + u_int memsize; + u_int queueid; + u_int begin, end; /* first..last+1 rings to check */ + struct netmap_if *nifp; + + int snaplen; + char *errbuf; + int promisc; + int to_ms; + + struct pcap_pkthdr hdr; + + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; + + struct pcap_stat st; + + char msg[PCAP_ERRBUF_SIZE]; +}; + + +static int +do_ioctl(struct my_ring *me, int what) +{ + struct ifreq ifr; + int error; + + bzero(&ifr, sizeof(ifr)); + strncpy(ifr.ifr_name, me->nmr.nr_name, sizeof(ifr.ifr_name)); + switch (what) { + case SIOCSIFFLAGS: + D("call SIOCSIFFLAGS 0x%x", me->if_flags); + ifr.ifr_flagshigh = (me->if_flags >> 16) & 0xffff; + ifr.ifr_flags = me->if_flags & 0xffff; + break; + case SIOCSIFCAP: + ifr.ifr_reqcap = me->if_reqcap; + ifr.ifr_curcap = me->if_curcap; + break; + } + error = ioctl(me->fd, what, &ifr); + if (error) { + D("ioctl 0x%x error %d", what, error); + return error; + } + switch (what) { + case SIOCSIFFLAGS: + case SIOCGIFFLAGS: + me->if_flags = (ifr.ifr_flagshigh << 16) | + (0xffff & ifr.ifr_flags); + D("flags are L 0x%x H 0x%x 0x%x", + (uint16_t)ifr.ifr_flags, + (uint16_t)ifr.ifr_flagshigh, me->if_flags); + break; + + case SIOCGIFCAP: + me->if_reqcap = ifr.ifr_reqcap; + me->if_curcap = ifr.ifr_curcap; + D("curcap are 0x%x", me->if_curcap); + break; + } + return 0; +} + + +/* + * open a device. if me->mem is null then do an mmap. 
+ */ +static int +netmap_open(struct my_ring *me, int ringid) +{ + int fd, err, l; + u_int i; + struct nmreq req; + + me->fd = fd = open("/dev/netmap", O_RDWR); + if (fd < 0) { + D("Unable to open /dev/netmap"); + return (-1); + } + bzero(&req, sizeof(req)); + strncpy(req.nr_name, me->nmr.nr_name, sizeof(req.nr_name)); + req.nr_ringid = ringid; + err = ioctl(fd, NIOCGINFO, &req); + if (err) { + D("cannot get info on %s", me->nmr.nr_name); + goto error; + } + me->memsize = l = req.nr_memsize; + ND("memsize is %d MB", l>>20); + err = ioctl(fd, NIOCREGIF, &req); + if (err) { + D("Unable to register %s", me->nmr.nr_name); + goto error; + } + + if (me->mem == NULL) { + me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + if (me->mem == MAP_FAILED) { + D("Unable to mmap"); + me->mem = NULL; + goto error; + } + } + + me->nifp = NETMAP_IF(me->mem, req.nr_offset); + me->queueid = ringid; + if (ringid & NETMAP_SW_RING) { + me->begin = req.nr_numrings; + me->end = me->begin + 1; + } else if (ringid & NETMAP_HW_RING) { + me->begin = ringid & NETMAP_RING_MASK; + me->end = me->begin + 1; + } else { + me->begin = 0; + me->end = req.nr_numrings; + } + /* request timestamps for packets */ + for (i = me->begin; i < me->end; i++) { + struct netmap_ring *ring = NETMAP_RXRING(me->nifp, i); + ring->flags = NR_TIMESTAMP; + } + //me->tx = NETMAP_TXRING(me->nifp, 0); + return (0); +error: + close(me->fd); + return -1; +} + +/* + * There is a set of functions that tcpdump expects even if probably + * not used + */ +struct eproto eproto_db[] = { + { "ip", ETHERTYPE_IP }, + { "arp", ETHERTYPE_ARP }, + { (char *)0, 0 } +}; + + +int +pcap_findalldevs(pcap_if_t **alldevsp, __unused char *errbuf) +{ + struct ifaddrs *i_head, *i; + pcap_if_t *top = NULL, *cur; + struct pcap_addr *tail = NULL; + int l; + + D("listing all devs"); + *alldevsp = NULL; + i_head = NULL; + + if (getifaddrs(&i_head)) { + D("cannot get if addresses"); + return -1; + } + for (i = i_head; i; i = i->ifa_next) { + //struct ifaddrs *ifa; + struct pcap_addr *pca; + //struct sockaddr *sa; + + D("got interface %s", i->ifa_name); + if (!top || strcmp(top->name, i->ifa_name)) { + /* new interface */ + l = sizeof(*top) + strlen(i->ifa_name) + 1; + cur = calloc(1, l); + if (cur == NULL) { + D("no space for if descriptor"); + continue; + } + cur->name = (char *)(cur + 1); + //cur->flags = i->ifa_flags; + strcpy(cur->name, i->ifa_name); + cur->description = NULL; + cur->next = top; + top = cur; + tail = NULL; + } + /* now deal with addresses */ + D("%s addr family %d len %d %s %s", + top->name, + i->ifa_addr->sa_family, i->ifa_addr->sa_len, + i->ifa_netmask ? "Netmask" : "", + i->ifa_broadaddr ? "Broadcast" : ""); + l = sizeof(struct pcap_addr) + + (i->ifa_addr ? i->ifa_addr->sa_len:0) + + (i->ifa_netmask ? i->ifa_netmask->sa_len:0) + + (i->ifa_broadaddr? 
i->ifa_broadaddr->sa_len:0); + pca = calloc(1, l); + if (pca == NULL) { + D("no space for if addr"); + continue; + } +#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len)) + pca->addr = (struct sockaddr *)(pca + 1); + bcopy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len); + if (i->ifa_netmask) { + pca->netmask = SA_NEXT(pca->addr); + bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len); + if (i->ifa_broadaddr) { + pca->broadaddr = SA_NEXT(pca->netmask); + bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len); + } + } + if (tail == NULL) { + top->addresses = pca; + } else { + tail->next = pca; + } + tail = pca; + + } + freeifaddrs(i_head); + *alldevsp = top; + return 0; +} + +void pcap_freealldevs(__unused pcap_if_t *alldevs) +{ + D("unimplemented"); +} + +char * +pcap_lookupdev(char *buf) +{ + D("%s", buf); + strcpy(buf, "/dev/netmap"); + return buf; +} + +pcap_t * +pcap_create(const char *source, char *errbuf) +{ + D("src %s (call open liveted)", source); + return pcap_open_live(source, 0, 1, 100, errbuf); +} + +int +pcap_activate(pcap_t *p) +{ + D("pcap %p running", p); + return 0; +} + +int +pcap_can_set_rfmon(__unused pcap_t *p) +{ + D(""); + return 0; /* no we can't */ +} + +int +pcap_set_snaplen(pcap_t *p, int snaplen) +{ + struct my_ring *me = p; + + D("len %d", snaplen); + me->snaplen = snaplen; + return 0; +} + +int +pcap_snapshot(pcap_t *p) +{ + struct my_ring *me = p; + + D("len %d", me->snaplen); + return me->snaplen; +} + +int +pcap_lookupnet(const char *device, uint32_t *netp, + uint32_t *maskp, __unused char *errbuf) +{ + + D("device %s", device); + inet_aton("10.0.0.255", (struct in_addr *)netp); + inet_aton("255.255.255.0",(struct in_addr *) maskp); + return 0; +} + +int +pcap_set_promisc(pcap_t *p, int promisc) +{ + struct my_ring *me = p; + + D("promisc %d", promisc); + if (do_ioctl(me, SIOCGIFFLAGS)) + D("SIOCGIFFLAGS failed"); + if (promisc) { + me->if_flags |= IFF_PPROMISC; + } else { + me->if_flags &= ~IFF_PPROMISC; + } + if (do_ioctl(me, SIOCSIFFLAGS)) + D("SIOCSIFFLAGS failed"); + return 0; +} + +int +pcap_set_timeout(pcap_t *p, int to_ms) +{ + struct my_ring *me = p; + + D("%d ms", to_ms); + me->to_ms = to_ms; + return 0; +} + +struct bpf_program; + +int +pcap_compile(__unused pcap_t *p, __unused struct bpf_program *fp, + const char *str, __unused int optimize, __unused uint32_t netmask) +{ + D("%s", str); + return 0; +} + +int +pcap_setfilter(__unused pcap_t *p, __unused struct bpf_program *fp) +{ + D(""); + return 0; +} + +int +pcap_datalink(__unused pcap_t *p) +{ + D(""); + return 1; // ethernet +} + +const char * +pcap_datalink_val_to_name(int dlt) +{ + D("%d", dlt); + return "DLT_EN10MB"; +} + +const char * +pcap_datalink_val_to_description(int dlt) +{ + D("%d", dlt); + return "Ethernet link"; +} + +struct pcap_stat; +int +pcap_stats(pcap_t *p, struct pcap_stat *ps) +{ + struct my_ring *me = p; + ND(""); + + me->st.ps_recv += 10; + *ps = me->st; + sprintf(me->msg, "stats not supported"); + return -1; +}; + +char * +pcap_geterr(pcap_t *p) +{ + struct my_ring *me = p; + + D(""); + return me->msg; +} + +pcap_t * +pcap_open_live(const char *device, __unused int snaplen, + int promisc, int to_ms, __unused char *errbuf) +{ + struct my_ring *me; + + D("request to open %s", device); + me = calloc(1, sizeof(*me)); + if (me == NULL) { + D("failed to allocate struct for %s", device); + return NULL; + } + strncpy(me->nmr.nr_name, device, sizeof(me->nmr.nr_name)); + if (netmap_open(me, 0)) { + D("error opening %s", device); + free(me); + 
return NULL; + } + me->to_ms = to_ms; + if (do_ioctl(me, SIOCGIFFLAGS)) + D("SIOCGIFFLAGS failed"); + if (promisc) { + me->if_flags |= IFF_PPROMISC; + if (do_ioctl(me, SIOCSIFFLAGS)) + D("SIOCSIFFLAGS failed"); + } + if (do_ioctl(me, SIOCGIFCAP)) + D("SIOCGIFCAP failed"); + me->if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); + if (do_ioctl(me, SIOCSIFCAP)) + D("SIOCSIFCAP failed"); + + return (pcap_t *)me; +} + +void +pcap_close(pcap_t *p) +{ + struct my_ring *me = p; + + D(""); + if (!me) + return; + if (me->mem) + munmap(me->mem, me->memsize); + /* restore original flags ? */ + ioctl(me->fd, NIOCUNREGIF, NULL); + close(me->fd); + bzero(me, sizeof(*me)); + free(me); +} + +int +pcap_fileno(pcap_t *p) +{ + struct my_ring *me = p; + D("returns %d", me->fd); + return me->fd; +} + +int +pcap_get_selectable_fd(pcap_t *p) +{ + struct my_ring *me = p; + + ND(""); + return me->fd; +} + +int +pcap_setnonblock(__unused pcap_t *p, int nonblock, __unused char *errbuf) +{ + D("mode is %d", nonblock); + return 0; /* ignore */ +} + +int +pcap_setdirection(__unused pcap_t *p, __unused pcap_direction_t d) +{ + D(""); + return 0; /* ignore */ +}; + +int +pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) +{ + struct my_ring *me = p; + int got = 0; + u_int si; + + ND("cnt %d", cnt); + /* scan all rings */ + for (si = me->begin; si < me->end; si++) { + struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si); + ND("ring has %d pkts", ring->avail); + if (ring->avail == 0) + continue; + me->hdr.ts = ring->ts; + while ((cnt == -1 || cnt != got) && ring->avail > 0) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + if (idx < 2) { + D("%s bogus RX index %d at offset %d", + me->nifp->ni_name, idx, i); + sleep(2); + } + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + me->hdr.len = me->hdr.caplen = ring->slot[i].len; + // D("call %p len %d", p, me->hdr.len); + callback(user, &me->hdr, buf); + ring->cur = NETMAP_RING_NEXT(ring, i); + ring->avail--; + got++; + } + } + return got; +} + +int +pcap_inject(pcap_t *p, const void *buf, size_t size) +{ + struct my_ring *me = p; + u_int si; + + ND("cnt %d", cnt); + /* scan all rings */ + for (si = me->begin; si < me->end; si++) { + struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si); + + ND("ring has %d pkts", ring->avail); + if (ring->avail == 0) + continue; + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + if (idx < 2) { + D("%s bogus TX index %d at offset %d", + me->nifp->ni_name, idx, i); + sleep(2); + } + u_char *dst = (u_char *)NETMAP_BUF(ring, idx); + ring->slot[i].len = size; + bcopy(buf, dst, size); + ring->cur = NETMAP_RING_NEXT(ring, i); + ring->avail--; + // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL); + return size; + } + errno = ENOBUFS; + return -1; +} + +int +pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user) +{ + struct my_ring *me = p; + struct pollfd fds[1]; + int i; + + ND("cnt %d", cnt); + memset(fds, 0, sizeof(fds)); + fds[0].fd = me->fd; + fds[0].events = (POLLIN); + + while (cnt == -1 || cnt > 0) { + if (poll(fds, 1, me->to_ms) <= 0) { + D("poll error/timeout"); + continue; + } + i = pcap_dispatch(p, cnt, callback, user); + if (cnt > 0) + cnt -= i; + } + return 0; +} + +#endif /* __PIC__ */ + +#ifndef __PIC__ +void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf) +{ + pcap_inject((pcap_t *)user, buf, h->caplen); +} + +/* + * a simple pcap test program, bridge between two interfaces. 
+ */ +int +main(int argc, char **argv) +{ + pcap_t *p0, *p1; + int burst = 1024; + struct pollfd pollfd[2]; + + fprintf(stderr, "%s %s built %s %s\n", + argv[0], version, __DATE__, __TIME__); + + while (argc > 1 && !strcmp(argv[1], "-v")) { + verbose++; + argv++; + argc--; + } + + if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) { + D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]); + return (1); + } + if (argc > 3) + burst = atoi(argv[3]); + + p0 = pcap_open_live(argv[1], 0, 1, 100, NULL); + p1 = pcap_open_live(argv[2], 0, 1, 100, NULL); + D("%s", version); + D("open returns %p %p", p0, p1); + if (!p0 || !p1) + return(1); + bzero(pollfd, sizeof(pollfd)); + pollfd[0].fd = pcap_fileno(p0); + pollfd[1].fd = pcap_fileno(p1); + pollfd[0].events = pollfd[1].events = POLLIN; + for (;;) { + /* do i need to reset ? */ + pollfd[0].revents = pollfd[1].revents = 0; + int ret = poll(pollfd, 2, 1000); + if (ret <= 0 || verbose) + D("poll %s [0] ev %x %x [1] ev %x %x", + ret <= 0 ? "timeout" : "ok", + pollfd[0].events, + pollfd[0].revents, + pollfd[1].events, + pollfd[1].revents); + if (ret < 0) + continue; + if (pollfd[0].revents & POLLIN) + pcap_dispatch(p0, burst, do_send, p1); + if (pollfd[1].revents & POLLIN) + pcap_dispatch(p1, burst, do_send, p0); + } + + return (0); +} +#endif /* !__PIC__ */ diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c new file mode 100644 index 000000000000..747bd9dde00b --- /dev/null +++ b/tools/tools/netmap/pkt-gen.c @@ -0,0 +1,1021 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: pkt-gen.c 9638 2011-11-07 18:07:43Z luigi $ + * + * Example program to show how to build a multithreaded packet + * source/sink using the netmap device. + * + * In this example we create a programmable number of threads + * to take care of all the queues of the interface used to + * send or receive traffic. 
+ * + */ + +const char *default_payload="netmap pkt-gen Luigi Rizzo and Matteo Landi\n" + "http://info.iet.unipi.it/~luigi/netmap/ "; + +#include +#include /* pthread_* */ +#include /* pthread w/ affinity */ +#include /* signal */ +#include +#include +#include /* strcmp */ +#include /* open */ +#include /* close */ +#include /* getifaddrs */ + +#include /* PROT_* */ +#include /* ioctl */ +#include +#include /* sockaddr.. */ +#include /* ntohs */ +#include +#include /* cpu_set */ +#include /* sysctl */ +#include /* timersub */ + +#include +#include /* ifreq */ +#include /* LLADDR */ + +#include +#include +#include + +#include +#include +#include + + +static inline int min(int a, int b) { return a < b ? a : b; } + +/* debug support */ +#define D(format, ...) \ + fprintf(stderr, "%s [%d] " format "\n", \ + __FUNCTION__, __LINE__, ##__VA_ARGS__) + +#ifndef EXPERIMENTAL +#define EXPERIMENTAL 0 +#endif + +int verbose = 0; +#define MAX_QUEUES 64 /* no need to limit */ + +#define SKIP_PAYLOAD 1 /* do not check payload. */ + +#if EXPERIMENTAL +/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */ +#define netmap_rdtsc(t) \ + do { \ + u_int __regs[4]; \ + \ + do_cpuid(0, __regs); \ + (t) = rdtsc(); \ + } while (0) + +static __inline void +do_cpuid(u_int ax, u_int *p) +{ + __asm __volatile("cpuid" + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); +} + +static __inline uint64_t +rdtsc(void) +{ + uint64_t rv; + + __asm __volatile("rdtsc" : "=A" (rv)); + return (rv); +} +#define MAX_SAMPLES 100000 +#endif /* EXPERIMENTAL */ + + +struct pkt { + struct ether_header eh; + struct ip ip; + struct udphdr udp; + uint8_t body[NETMAP_BUF_SIZE]; +} __attribute__((__packed__)); + +/* + * global arguments for all threads + */ +struct glob_arg { + const char *src_ip; + const char *dst_ip; + const char *src_mac; + const char *dst_mac; + int pkt_size; + int burst; + int npackets; /* total packets to send */ + int nthreads; + int cpus; + int use_pcap; + pcap_t *p; +}; + +struct mystat { + uint64_t containers[8]; +}; + +/* + * Arguments for a new thread. The same structure is used by + * the source and the sink + */ +struct targ { + struct glob_arg *g; + int used; + int completed; + int fd; + struct nmreq nmr; + struct netmap_if *nifp; + uint16_t qfirst, qlast; /* range of queues to scan */ + uint64_t count; + struct timeval tic, toc; + int me; + pthread_t thread; + int affinity; + + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + u_int dst_mac_range; + u_int src_mac_range; + uint32_t dst_ip; + uint32_t src_ip; + u_int dst_ip_range; + u_int src_ip_range; + + struct pkt pkt; +}; + + +static struct targ *targs; +static int global_nthreads; + +/* control-C handler */ +static void +sigint_h(__unused int sig) +{ + for (int i = 0; i < global_nthreads; i++) { + /* cancel active threads. */ + if (targs[i].used == 0) + continue; + + D("Cancelling thread #%d\n", i); + pthread_cancel(targs[i].thread); + targs[i].used = 0; + } + + signal(SIGINT, SIG_DFL); +} + + +/* sysctl wrapper to return the number of active CPUs */ +static int +system_ncpus(void) +{ + int mib[2], ncpus; + size_t len; + + mib[0] = CTL_HW; + mib[1] = HW_NCPU; + len = sizeof(mib); + sysctl(mib, 2, &ncpus, &len, NULL, 0); + + return (ncpus); +} + +/* + * locate the src mac address for our interface, put it + * into the user-supplied buffer. return 0 if ok, -1 on error. 
+ */ +static int +source_hwaddr(const char *ifname, char *buf) +{ + struct ifaddrs *ifaphead, *ifap; + int l = sizeof(ifap->ifa_name); + + if (getifaddrs(&ifaphead) != 0) { + D("getifaddrs %s failed", ifname); + return (-1); + } + + for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) { + struct sockaddr_dl *sdl = + (struct sockaddr_dl *)ifap->ifa_addr; + uint8_t *mac; + + if (!sdl || sdl->sdl_family != AF_LINK) + continue; + if (strncmp(ifap->ifa_name, ifname, l) != 0) + continue; + mac = (uint8_t *)LLADDR(sdl); + sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x", + mac[0], mac[1], mac[2], + mac[3], mac[4], mac[5]); + if (verbose) + D("source hwaddr %s", buf); + break; + } + freeifaddrs(ifaphead); + return ifap ? 0 : 1; +} + + +/* set the thread affinity. */ +static int +setaffinity(pthread_t me, int i) +{ + cpuset_t cpumask; + + if (i == -1) + return 0; + + /* Set thread affinity affinity.*/ + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) { + D("Unable to set affinity"); + return 1; + } + return 0; +} + +/* Compute the checksum of the given ip header. */ +static uint16_t +checksum(const void *data, uint16_t len) +{ + const uint8_t *addr = data; + uint32_t sum = 0; + + while (len > 1) { + sum += addr[0] * 256 + addr[1]; + addr += 2; + len -= 2; + } + + if (len == 1) + sum += *addr * 256; + + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + + sum = htons(sum); + + return ~sum; +} + +/* + * Fill a packet with some payload. + */ +static void +initialize_packet(struct targ *targ) +{ + struct pkt *pkt = &targ->pkt; + struct ether_header *eh; + struct ip *ip; + struct udphdr *udp; + uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(*ip); + int i, l, l0 = strlen(default_payload); + char *p; + + for (i = 0; i < paylen;) { + l = min(l0, paylen - i); + bcopy(default_payload, pkt->body + i, l); + i += l; + } + pkt->body[i-1] = '\0'; + + udp = &pkt->udp; + udp->uh_sport = htons(1234); + udp->uh_dport = htons(4321); + udp->uh_ulen = htons(paylen); + udp->uh_sum = 0; // checksum(udp, sizeof(*udp)); + + ip = &pkt->ip; + ip->ip_v = IPVERSION; + ip->ip_hl = 5; + ip->ip_id = 0; + ip->ip_tos = IPTOS_LOWDELAY; + ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh)); + ip->ip_id = 0; + ip->ip_off = htons(IP_DF); /* Don't fragment */ + ip->ip_ttl = IPDEFTTL; + ip->ip_p = IPPROTO_UDP; + inet_aton(targ->g->src_ip, (struct in_addr *)&ip->ip_src); + inet_aton(targ->g->dst_ip, (struct in_addr *)&ip->ip_dst); + targ->dst_ip = ip->ip_dst.s_addr; + targ->src_ip = ip->ip_src.s_addr; + p = index(targ->g->src_ip, '-'); + if (p) { + targ->dst_ip_range = atoi(p+1); + D("dst-ip sweep %d addresses", targ->dst_ip_range); + } + ip->ip_sum = checksum(ip, sizeof(*ip)); + + eh = &pkt->eh; + bcopy(ether_aton(targ->g->src_mac), targ->src_mac, 6); + bcopy(targ->src_mac, eh->ether_shost, 6); + p = index(targ->g->src_mac, '-'); + if (p) + targ->src_mac_range = atoi(p+1); + + bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6); + bcopy(targ->dst_mac, eh->ether_dhost, 6); + p = index(targ->g->dst_mac, '-'); + if (p) + targ->dst_mac_range = atoi(p+1); + eh->ether_type = htons(ETHERTYPE_IP); +} + +/* Check the payload of the packet for errors (use it for debug). + * Look for consecutive ascii representations of the size of the packet. + */ +static void +check_payload(char *p, int psize) +{ + char temp[64]; + int n_read, size, sizelen; + + /* get the length in ASCII of the length of the packet. 
*/
+	sizelen = sprintf(temp, "%d", psize) + 1; // include a whitespace
+
+	/* dummy payload. */
+	p += 14; /* skip packet header. */
+	n_read = 14;
+	while (psize - n_read >= sizelen) {
+		sscanf(p, "%d", &size);
+		if (size != psize) {
+			D("Read %d instead of %d", size, psize);
+			break;
+		}
+
+		p += sizelen;
+		n_read += sizelen;
+	}
+}
+
+
+/*
+ * create and enqueue a batch of packets on a ring.
+ * On the last one set NS_REPORT to tell the driver to generate
+ * an interrupt when done.
+ */
+static int
+send_packets(struct netmap_ring *ring, struct pkt *pkt,
+	int size, u_int count, int fill_all)
+{
+	u_int sent, cur = ring->cur;
+
+	if (ring->avail < count)
+		count = ring->avail;
+
+	for (sent = 0; sent < count; sent++) {
+		struct netmap_slot *slot = &ring->slot[cur];
+		char *p = NETMAP_BUF(ring, slot->buf_idx);
+
+		if (fill_all)
+			memcpy(p, pkt, size);
+
+		slot->len = size;
+		if (sent == count - 1)
+			slot->flags |= NS_REPORT;
+		cur = NETMAP_RING_NEXT(ring, cur);
+	}
+	ring->avail -= sent;
+	ring->cur = cur;
+
+	return (sent);
+}
+
+static void *
+sender_body(void *data)
+{
+	struct targ *targ = (struct targ *) data;
+
+	struct pollfd fds[1];
+	struct netmap_if *nifp = targ->nifp;
+	struct netmap_ring *txring;
+	int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
+	int fill_all = 1;
+
+	if (setaffinity(targ->thread, targ->affinity))
+		goto quit;
+	/* setup poll(2) mechanism. */
+	memset(fds, 0, sizeof(fds));
+	fds[0].fd = targ->fd;
+	fds[0].events = (POLLOUT);
+
+	/* main loop. */
+	gettimeofday(&targ->tic, NULL);
+	if (targ->g->use_pcap) {
+		int size = targ->g->pkt_size;
+		void *pkt = &targ->pkt;
+		pcap_t *p = targ->g->p;
+
+		for (; sent < n; sent++) {
+			if (pcap_inject(p, pkt, size) == -1)
+				break;
+		}
+	} else {
+		while (sent < n) {
+
+			/*
+			 * wait for available room in the send queue(s)
+			 */
+			if (poll(fds, 1, 2000) <= 0) {
+				D("poll error/timeout on queue %d\n", targ->me);
+				goto quit;
+			}
+			/*
+			 * scan our queues and send on those with room
+			 */
+			if (sent > 100000)
+				fill_all = 0;
+			for (i = targ->qfirst; i < targ->qlast; i++) {
+				int m, limit = MIN(n - sent, targ->g->burst);
+
+				txring = NETMAP_TXRING(nifp, i);
+				if (txring->avail == 0)
+					continue;
+				m = send_packets(txring, &targ->pkt, targ->g->pkt_size,
+					limit, fill_all);
+				sent += m;
+				targ->count = sent;
+			}
+		}
+		/* Tell the interface that we have new packets. */
+		ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+
+		/* final part: wait for all the TX queues to be empty. */
+		for (i = targ->qfirst; i < targ->qlast; i++) {
+			txring = NETMAP_TXRING(nifp, i);
+			while (!NETMAP_TX_RING_EMPTY(txring)) {
+				ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+				usleep(1); /* wait 1 tick */
+			}
+		}
+	}
+
+	gettimeofday(&targ->toc, NULL);
+	targ->completed = 1;
+	targ->count = sent;
+
+quit:
+	/* reset the ``used`` flag. */
+	targ->used = 0;
+
+	return (NULL);
+}
+
+
+static void
+receive_pcap(u_char *user, __unused const struct pcap_pkthdr * h,
+	__unused const u_char * bytes)
+{
+	int *count = (int *)user;
+	(*count)++;
+}
+
+static int
+receive_packets(struct netmap_ring *ring, u_int limit, int skip_payload)
+{
+	u_int cur, rx;
+
+	cur = ring->cur;
+	if (ring->avail < limit)
+		limit = ring->avail;
+	for (rx = 0; rx < limit; rx++) {
+		struct netmap_slot *slot = &ring->slot[cur];
+		char *p = NETMAP_BUF(ring, slot->buf_idx);
+
+		if (!skip_payload)
+			check_payload(p, slot->len);
+
+		cur = NETMAP_RING_NEXT(ring, cur);
+	}
+	ring->avail -= rx;
+	ring->cur = cur;
+
+	return (rx);
+}
+
+static void *
+receiver_body(void *data)
+{
+	struct targ *targ = (struct targ *) data;
+	struct pollfd fds[1];
+	struct netmap_if *nifp = targ->nifp;
+	struct netmap_ring *rxring;
+	int i, received = 0;
+
+	if (setaffinity(targ->thread, targ->affinity))
+		goto quit;
+
+	/* setup poll(2) mechanism. */
+	memset(fds, 0, sizeof(fds));
+	fds[0].fd = targ->fd;
+	fds[0].events = (POLLIN);
+
+	/* unbounded wait for the first packet. */
+	for (;;) {
+		i = poll(fds, 1, 1000);
+		if (i > 0 && !(fds[0].revents & POLLERR))
+			break;
+		D("waiting for initial packets, poll returns %d %d", i, fds[0].revents);
+	}
+
+	/* main loop, exit after 1s silence */
+	gettimeofday(&targ->tic, NULL);
+	if (targ->g->use_pcap) {
+		for (;;) {
+			pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL);
+		}
+	} else {
+		while (1) {
+			/* Once we start to receive packets, wait at most 1 second
+			   before quitting. */
+			if (poll(fds, 1, 1 * 1000) <= 0) {
+				gettimeofday(&targ->toc, NULL);
+				targ->toc.tv_sec -= 1; /* Subtract the timeout time. */
+				break;
+			}
+
+			for (i = targ->qfirst; i < targ->qlast; i++) {
+				int m;
+
+				rxring = NETMAP_RXRING(nifp, i);
+				if (rxring->avail == 0)
+					continue;
+
+				m = receive_packets(rxring, targ->g->burst,
+					SKIP_PAYLOAD);
+				received += m;
+				targ->count = received;
+			}
+
+			// tell the card we have read the data
+			//ioctl(fds[0].fd, NIOCRXSYNC, NULL);
+		}
+	}
+
+	targ->completed = 1;
+	targ->count = received;
+
+quit:
+	/* reset the ``used`` flag. */
+	targ->used = 0;
+
+	return (NULL);
+}
+
+static void
+tx_output(uint64_t sent, int size, double delta)
+{
+	double amount = 8.0 * (1.0 * size * sent) / delta;
+	double pps = sent / delta;
+	char units[4] = { '\0', 'K', 'M', 'G' };
+	int aunit = 0, punit = 0;
+
+	while (amount >= 1000) {
+		amount /= 1000;
+		aunit += 1;
+	}
+	while (pps >= 1000) {
+		pps /= 1000;
+		punit += 1;
+	}
+
+	printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
+		sent, size, delta);
+	printf("Speed: %.2f%cpps.
Bandwidth: %.2f%cbps.\n", + pps, units[punit], amount, units[aunit]); +} + + +static void +rx_output(uint64_t received, double delta) +{ + + double pps = received / delta; + char units[4] = { '\0', 'K', 'M', 'G' }; + int punit = 0; + + while (pps >= 1000) { + pps /= 1000; + punit += 1; + } + + printf("Received %llu packets, in %.2f seconds.\n", received, delta); + printf("Speed: %.2f%cpps.\n", pps, units[punit]); +} + +static void +usage(void) +{ + const char *cmd = "pkt-gen"; + fprintf(stderr, + "Usage:\n" + "%s arguments\n" + "\t-i interface interface name\n" + "\t-t pkts_to_send also forces send mode\n" + "\t-r pkts_to_receive also forces receive mode\n" + "\t-l pkts_size in bytes excluding CRC\n" + "\t-d dst-ip end with %%n to sweep n addresses\n" + "\t-s src-ip end with %%n to sweep n addresses\n" + "\t-D dst-mac end with %%n to sweep n addresses\n" + "\t-S src-mac end with %%n to sweep n addresses\n" + "\t-b burst size testing, mostly\n" + "\t-c cores cores to use\n" + "\t-p threads processes/threads to use\n" + "\t-T report_ms milliseconds between reports\n" + "\t-w wait_for_link_time in seconds\n" + "", + cmd); + + exit(0); +} + + +int +main(int arc, char **argv) +{ + int i, fd; + + struct glob_arg g; + + struct nmreq nmr; + void *mmap_addr; /* the mmap address */ + void *(*td_body)(void *) = receiver_body; + int ch; + int report_interval = 1000; /* report interval */ + char *ifname = NULL; + int wait_link = 2; + int devqueues = 1; /* how many device queues */ + + bzero(&g, sizeof(g)); + + g.src_ip = "10.0.0.1"; + g.dst_ip = "10.1.0.1"; + g.dst_mac = "ff:ff:ff:ff:ff:ff"; + g.src_mac = NULL; + g.pkt_size = 60; + g.burst = 512; // default + g.nthreads = 1; + g.cpus = 1; + + while ( (ch = getopt(arc, argv, + "i:t:r:l:d:s:D:S:b:c:p:T:w:v")) != -1) { + switch(ch) { + default: + D("bad option %c %s", ch, optarg); + usage(); + break; + case 'i': /* interface */ + ifname = optarg; + break; + case 't': /* send */ + td_body = sender_body; + g.npackets = atoi(optarg); + break; + case 'r': /* receive */ + td_body = receiver_body; + g.npackets = atoi(optarg); + break; + case 'l': /* pkt_size */ + g.pkt_size = atoi(optarg); + break; + case 'd': + g.dst_ip = optarg; + break; + case 's': + g.src_ip = optarg; + break; + case 'T': /* report interval */ + report_interval = atoi(optarg); + break; + case 'w': + wait_link = atoi(optarg); + break; + case 'b': /* burst */ + g.burst = atoi(optarg); + break; + case 'c': + g.cpus = atoi(optarg); + break; + case 'p': + g.nthreads = atoi(optarg); + break; + + case 'P': + g.use_pcap = 1; + break; + + case 'D': /* destination mac */ + g.dst_mac = optarg; + { + struct ether_addr *mac = ether_aton(g.dst_mac); + D("ether_aton(%s) gives %p", g.dst_mac, mac); + } + break; + case 'S': /* source mac */ + g.src_mac = optarg; + break; + case 'v': + verbose++; + } + } + + if (ifname == NULL) { + D("missing ifname"); + usage(); + } + { + int n = system_ncpus(); + if (g.cpus < 0 || g.cpus > n) { + D("%d cpus is too high, have only %d cpus", g.cpus, n); + usage(); + } + if (g.cpus == 0) + g.cpus = n; + } + if (g.pkt_size < 16 || g.pkt_size > 1536) { + D("bad pktsize %d\n", g.pkt_size); + usage(); + } + + bzero(&nmr, sizeof(nmr)); + /* + * Open the netmap device to fetch the number of queues of our + * interface. + * + * The first NIOCREGIF also detaches the card from the + * protocol stack and may cause a reset of the card, + * which in turn may take some time for the PHY to + * reconfigure. 
+ */ + fd = open("/dev/netmap", O_RDWR); + if (fd == -1) { + D("Unable to open /dev/netmap"); + // fail later + } else { + if ((ioctl(fd, NIOCGINFO, &nmr)) == -1) { + D("Unable to get if info without name"); + } else { + D("map size is %d Kb", nmr.nr_memsize >> 10); + } + bzero(&nmr, sizeof(nmr)); + strncpy(nmr.nr_name, ifname, sizeof(nmr.nr_name)); + if ((ioctl(fd, NIOCGINFO, &nmr)) == -1) { + D("Unable to get if info for %s", ifname); + } + devqueues = nmr.nr_numrings; + } + + /* validate provided nthreads. */ + if (g.nthreads < 1 || g.nthreads > devqueues) { + D("bad nthreads %d, have %d queues", g.nthreads, devqueues); + // continue, fail later + } + + if (td_body == sender_body && g.src_mac == NULL) { + static char mybuf[20] = "ff:ff:ff:ff:ff:ff"; + /* retrieve source mac address. */ + if (source_hwaddr(ifname, mybuf) == -1) { + D("Unable to retrieve source mac"); + // continue, fail later + } + g.src_mac = mybuf; + } + + /* + * Map the netmap shared memory: instead of issuing mmap() + * inside the body of the threads, we prefer to keep this + * operation here to simplify the thread logic. + */ + D("mmapping %d Kbytes", nmr.nr_memsize>>10); + mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize, + PROT_WRITE | PROT_READ, + MAP_SHARED, fd, 0); + if (mmap_addr == MAP_FAILED) { + D("Unable to mmap %d KB", nmr.nr_memsize >> 10); + // continue, fail later + } + + /* + * Register the interface on the netmap device: from now on, + * we can operate on the network interface without any + * interference from the legacy network stack. + * + * We decide to put the first interface registration here to + * give time to cards that take a long time to reset the PHY. + */ + if (ioctl(fd, NIOCREGIF, &nmr) == -1) { + D("Unable to register interface %s", ifname); + //continue, fail later + } + + + /* Print some debug information. */ + fprintf(stdout, + "%s %s: %d queues, %d threads and %d cpus.\n", + (td_body == sender_body) ? "Sending on" : "Receiving from", + ifname, + devqueues, + g.nthreads, + g.cpus); + if (td_body == sender_body) { + fprintf(stdout, "%s -> %s (%s -> %s)\n", + g.src_ip, g.dst_ip, + g.src_mac, g.dst_mac); + } + + /* Exit if something went wrong. */ + if (fd < 0) { + D("aborting"); + usage(); + } + + + /* Wait for PHY reset. */ + D("Wait %d secs for phy reset", wait_link); + sleep(wait_link); + D("Ready..."); + + /* Install ^C handler. */ + global_nthreads = g.nthreads; + signal(SIGINT, sigint_h); + + if (g.use_pcap) { + // XXX g.p = pcap_open_live(..); + } + + targs = calloc(g.nthreads, sizeof(*targs)); + /* + * Now create the desired number of threads, each one + * using a single descriptor. + */ + for (i = 0; i < g.nthreads; i++) { + struct netmap_if *tnifp; + struct nmreq tifreq; + int tfd; + + if (g.use_pcap) { + tfd = -1; + tnifp = NULL; + } else { + /* register interface. */ + tfd = open("/dev/netmap", O_RDWR); + if (tfd == -1) { + D("Unable to open /dev/netmap"); + continue; + } + + bzero(&tifreq, sizeof(tifreq)); + strncpy(tifreq.nr_name, ifname, sizeof(tifreq.nr_name)); + tifreq.nr_ringid = (g.nthreads > 1) ? (i | NETMAP_HW_RING) : 0; + + /* + * if we are acting as a receiver only, do not touch the transmit ring. + * This is not the default because many apps may use the interface + * in both directions, but a pure receiver does not. 
+			 */
+			if (td_body == receiver_body) {
+				tifreq.nr_ringid |= NETMAP_NO_TX_POLL;
+			}
+
+			if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
+				D("Unable to register %s", ifname);
+				continue;
+			}
+			tnifp = NETMAP_IF(mmap_addr, tifreq.nr_offset);
+		}
+		/* start threads. */
+		bzero(&targs[i], sizeof(targs[i]));
+		targs[i].g = &g;
+		targs[i].used = 1;
+		targs[i].completed = 0;
+		targs[i].fd = tfd;
+		targs[i].nmr = tifreq;
+		targs[i].nifp = tnifp;
+		targs[i].qfirst = (g.nthreads > 1) ? i : 0;
+		targs[i].qlast = (g.nthreads > 1) ? i+1 : tifreq.nr_numrings;
+		targs[i].me = i;
+		targs[i].affinity = g.cpus ? i % g.cpus : -1;
+		if (td_body == sender_body) {
+			/* initialize the packet to send. */
+			initialize_packet(&targs[i]);
+		}
+
+		if (pthread_create(&targs[i].thread, NULL, td_body,
+			&targs[i]) == -1) {
+			D("Unable to create thread %d", i);
+			targs[i].used = 0;
+		}
+	}
+
+	{
+		uint64_t my_count = 0, prev = 0;
+		uint64_t count = 0;
+		double delta_t;
+		struct timeval tic, toc;
+
+		gettimeofday(&toc, NULL);
+		for (;;) {
+			struct timeval now, delta;
+			uint64_t pps;
+			int done = 0;
+
+			delta.tv_sec = report_interval/1000;
+			delta.tv_usec = (report_interval%1000)*1000;
+			select(0, NULL, NULL, NULL, &delta);
+			gettimeofday(&now, NULL);
+			timersub(&now, &toc, &toc);
+			my_count = 0;
+			for (i = 0; i < g.nthreads; i++) {
+				my_count += targs[i].count;
+				if (targs[i].used == 0)
+					done++;
+			}
+			pps = toc.tv_sec* 1000000 + toc.tv_usec;
+			if (pps < 10000)
+				continue;
+			pps = (my_count - prev)*1000000 / pps;
+			D("%llu pps", pps);
+			prev = my_count;
+			toc = now;
+			if (done == g.nthreads)
+				break;
+		}
+
+		timerclear(&tic);
+		timerclear(&toc);
+		for (i = 0; i < g.nthreads; i++) {
+			/*
+			 * Join active threads, unregister interfaces and close
+			 * file descriptors.
+			 */
+			pthread_join(targs[i].thread, NULL);
+			ioctl(targs[i].fd, NIOCUNREGIF, &targs[i].nmr);
+			close(targs[i].fd);
+
+			if (targs[i].completed == 0)
+				continue;
+
+			/*
+			 * Collect the threads' output and extract information about
+			 * how long it took to send all the packets.
+			 */
+			count += targs[i].count;
+			if (!timerisset(&tic) || timercmp(&targs[i].tic, &tic, <))
+				tic = targs[i].tic;
+			if (!timerisset(&toc) || timercmp(&targs[i].toc, &toc, >))
+				toc = targs[i].toc;
+		}
+
+		/* print output. */
+		timersub(&toc, &tic, &toc);
+		delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
+		if (td_body == sender_body)
+			tx_output(count, g.pkt_size, delta_t);
+		else
+			rx_output(count, delta_t);
+	}
+
+	ioctl(fd, NIOCUNREGIF, &nmr);
+	munmap(mmap_addr, nmr.nr_memsize);
+	close(fd);
+
+	return (0);
+}
+/* end of file */
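
For reference, the essential userspace sequence that bridge.c, pcap.c and pkt-gen.c above all follow (open /dev/netmap, NIOCGINFO, NIOCREGIF, mmap the shared region, then poll and walk the rings) can be reduced to the minimal receive loop below. This is only an illustrative sketch distilled from the code in this patch, using the nmreq fields, ioctls and NETMAP_* macros as defined in sys/net/netmap.h and sys/net/netmap_user.h here; it is not part of the committed files, and error handling, NIOCUNREGIF and munmap cleanup are omitted for brevity.

/* minimal netmap receiver sketch (illustration only, not part of the patch) */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/poll.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

int
main(int argc, char **argv)
{
	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct pollfd fds;
	char *mem;
	int fd;

	if (argc != 2)
		return (1);
	fd = open("/dev/netmap", O_RDWR);
	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, argv[1], sizeof(req.nr_name));
	ioctl(fd, NIOCGINFO, &req);	/* learn nr_memsize for this interface */
	ioctl(fd, NIOCREGIF, &req);	/* detach the interface from the host stack */
	mem = mmap(0, req.nr_memsize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, req.nr_offset);
	ring = NETMAP_RXRING(nifp, 0);	/* look at the first hardware RX ring only */

	fds.fd = fd;
	fds.events = POLLIN;
	for (;;) {
		if (poll(&fds, 1, 1000) <= 0)
			continue;
		while (ring->avail > 0) {	/* drain what the kernel made visible */
			u_int cur = ring->cur;
			char *buf = NETMAP_BUF(ring, ring->slot[cur].buf_idx);

			printf("received %d bytes at %p\n",
			    ring->slot[cur].len, (void *)buf);
			ring->cur = NETMAP_RING_NEXT(ring, cur);
			ring->avail--;
		}
	}
	/* not reached: a real program would NIOCUNREGIF, munmap and close here */
}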